filepuff-mcp/internal/parser/docextract.go

// Package parser provides documentation extraction for multiple languages.
package parser

import (
	"regexp"
	"sort"
	"strings"

	"github.com/lukaszraczylo/mcp-filepuff/pkg/protocol"
	sitter "github.com/smacker/go-tree-sitter"
)

// DocComment represents an extracted documentation comment.
//
// The extractors in this file fill StartLine and EndLine from tree-sitter row
// numbers plus one. Tags is only populated by the JSDoc and Doxygen parsers,
// which join repeated tags with "\n"; the other extractors leave it nil.
type DocComment struct {
	Tags      map[string]string // tag name (without "@" or "\") -> tag text
	Text      string            // cleaned documentation text
	Raw       string            // source text of the comment(s), joined with "\n"
	Style     CommentStyle      // syntactic form of the comment
	StartLine int               // line on which the documentation starts
	EndLine   int               // line on which the documentation ends
}

// CommentStyle identifies the syntactic form of an extracted comment.
// Its values are the CommentStyle* constants.
type CommentStyle string

// Comment styles reported in DocComment.Style.
const (
	CommentStyleLine      CommentStyle = "line"      // // comment
	CommentStyleBlock     CommentStyle = "block"     // /* comment */
	CommentStyleJSDoc     CommentStyle = "jsdoc"     // /** comment */
	CommentStyleDoxygen   CommentStyle = "doxygen"   // /** comment */, /// comment or //! comment (also Rust ///)
	CommentStyleDocstring CommentStyle = "docstring" // """comment""" or '''comment''' (also Elixir @doc strings)
	CommentStyleHash      CommentStyle = "hash"      // # comment (Python, Elixir)
)

// ExtractDocComment returns the documentation attached to n, using the
// extraction rules of lang. It returns nil when n is nil, when lang has no
// extractor, or when the selected extractor finds no documentation.
func ExtractDocComment(n *sitter.Node, content []byte, lang protocol.Language) *DocComment {
	// Pick the language-specific extractor first; unknown languages leave it nil.
	var extract func(*sitter.Node, []byte) *DocComment

	switch lang {
	case protocol.LangGo:
		extract = extractGoDocComment
	case protocol.LangTypeScript, protocol.LangJavaScript:
		extract = extractJSDocComment
	case protocol.LangPython:
		extract = extractPythonDocComment
	case protocol.LangC, protocol.LangCpp:
		extract = extractCDocComment
	case protocol.LangElixir:
		extract = extractElixirDocComment
	case protocol.LangRust:
		extract = extractRustDocComment
	}

	if n == nil || extract == nil {
		return nil
	}
	return extract(n, content)
}

// extractGoDocComment builds the documentation for a Go declaration from the
// // or /* */ comments that directly precede it. It returns nil when there are
// no such comments or when none of them contains any text.
func extractGoDocComment(n *sitter.Node, content []byte) *DocComment {
	nodes := collectPrecedingComments(n, content, []string{"comment"})
	if len(nodes) == 0 {
		return nil
	}

	rawTexts := make([]string, 0, len(nodes))
	cleanTexts := make([]string, 0, len(nodes))
	for _, node := range nodes {
		src := GetNodeText(node, content)
		rawTexts = append(rawTexts, src)

		// Comments that are empty after cleaning contribute to Raw only.
		if body := cleanGoComment(src); body != "" {
			cleanTexts = append(cleanTexts, body)
		}
	}

	if len(cleanTexts) == 0 {
		return nil
	}

	first, last := nodes[0], nodes[len(nodes)-1]
	return &DocComment{
		Text:      strings.Join(cleanTexts, "\n"),
		Raw:       strings.Join(rawTexts, "\n"),
		Style:     detectCommentStyle(rawTexts[0]),
		Tags:      nil, // Go doc comments carry no JSDoc-style tags
		StartLine: int(first.StartPoint().Row) + 1,
		EndLine:   int(last.EndPoint().Row) + 1,
	}
}

// extractJSDocComment extracts documentation for a JavaScript/TypeScript node.
//
// Among the comments directly preceding n, the JSDoc block ("/** ... */")
// closest to n wins and is parsed into description text plus tags. When no
// JSDoc block is present, all preceding comments are cleaned and joined, and
// the reported style is derived from the first of them (line or block).
// It returns nil when there are no preceding comments or they hold no text.
func extractJSDocComment(n *sitter.Node, content []byte) *DocComment {
	comments := collectPrecedingComments(n, content, []string{"comment"})
	if len(comments) == 0 {
		return nil
	}

	// Scan from the declaration outwards so the nearest JSDoc block is chosen.
	for i := len(comments) - 1; i >= 0; i-- {
		node := comments[i]
		text := GetNodeText(node, content)
		if !strings.HasPrefix(strings.TrimSpace(text), "/**") {
			continue
		}

		cleaned, tags := parseJSDoc(text)
		return &DocComment{
			Text:      cleaned,
			Raw:       text,
			Style:     CommentStyleJSDoc,
			Tags:      tags,
			StartLine: int(node.StartPoint().Row) + 1,
			EndLine:   int(node.EndPoint().Row) + 1,
		}
	}

	// Fall back to regular comments.
	var parts []string
	var raw []string
	for _, c := range comments {
		text := GetNodeText(c, content)
		raw = append(raw, text)

		if cleaned := cleanJSComment(text); cleaned != "" {
			parts = append(parts, cleaned)
		}
	}

	if len(parts) == 0 {
		return nil
	}

	return &DocComment{
		Text: strings.Join(parts, "\n"),
		Raw:  strings.Join(raw, "\n"),
		// Report the real form of the comment ("/* */" vs "//") instead of
		// always claiming a line comment, matching the Go and C extractors.
		Style:     detectCommentStyle(raw[0]),
		Tags:      nil,
		StartLine: int(comments[0].StartPoint().Row) + 1,
		EndLine:   int(comments[len(comments)-1].EndPoint().Row) + 1,
	}
}

// extractPythonDocComment extracts documentation for a Python definition.
//
// A string literal that forms the first statement of the node's "body" field
// is returned as a docstring. Otherwise the "#" comments directly preceding
// the node are used. Nodes without a "body" field yield nil.
func extractPythonDocComment(n *sitter.Node, content []byte) *DocComment {
	block := n.ChildByFieldName("body")
	if block == nil {
		return nil
	}

	// A docstring is an expression statement in first position whose
	// expression is a string literal.
	var literal *sitter.Node
	if block.NamedChildCount() > 0 {
		stmt := block.NamedChild(0)
		if stmt != nil && stmt.Type() == "expression_statement" && stmt.NamedChildCount() > 0 {
			if expr := stmt.NamedChild(0); expr != nil && expr.Type() == "string" {
				literal = expr
			}
		}
	}
	if literal != nil {
		src := GetNodeText(literal, content)
		return &DocComment{
			Text:      cleanPythonDocstring(src),
			Raw:       src,
			Style:     CommentStyleDocstring,
			Tags:      nil,
			StartLine: int(literal.StartPoint().Row) + 1,
			EndLine:   int(literal.EndPoint().Row) + 1,
		}
	}

	// No docstring: use the "#" comments placed before the definition.
	nodes := collectPrecedingComments(n, content, []string{"comment"})
	if len(nodes) == 0 {
		return nil
	}

	rawLines := make([]string, 0, len(nodes))
	textLines := make([]string, 0, len(nodes))
	for _, node := range nodes {
		src := GetNodeText(node, content)
		rawLines = append(rawLines, src)

		// Drop one leading "#" and the whitespace around the remaining text.
		stripped := strings.TrimPrefix(strings.TrimSpace(src), "#")
		if stripped = strings.TrimSpace(stripped); stripped != "" {
			textLines = append(textLines, stripped)
		}
	}

	if len(textLines) == 0 {
		return nil
	}

	first, last := nodes[0], nodes[len(nodes)-1]
	return &DocComment{
		Text:      strings.Join(textLines, "\n"),
		Raw:       strings.Join(rawLines, "\n"),
		Style:     CommentStyleHash,
		Tags:      nil,
		StartLine: int(first.StartPoint().Row) + 1,
		EndLine:   int(last.EndPoint().Row) + 1,
	}
}

// extractCDocComment extracts C/C++ documentation comments (Doxygen style).
//
// Among the comments directly preceding n, the Doxygen comment closest to n
// is preferred: either a single "/** ... */" block, or a run of consecutive
// "///" / "//!" line comments, which is treated as one documentation block.
// When no Doxygen comment is present, all preceding comments are cleaned and
// joined. It returns nil when there is nothing to report.
func extractCDocComment(n *sitter.Node, content []byte) *DocComment {
	comments := collectPrecedingComments(n, content, []string{"comment"})
	if len(comments) == 0 {
		return nil
	}

	isLineDoc := func(c *sitter.Node) bool {
		t := strings.TrimSpace(GetNodeText(c, content))
		return strings.HasPrefix(t, "///") || strings.HasPrefix(t, "//!")
	}
	isBlockDoc := func(c *sitter.Node) bool {
		return strings.HasPrefix(strings.TrimSpace(GetNodeText(c, content)), "/**")
	}

	// Find the Doxygen comment nearest to the declaration.
	last := -1
	for i := len(comments) - 1; i >= 0; i-- {
		if isBlockDoc(comments[i]) || isLineDoc(comments[i]) {
			last = i
			break
		}
	}

	if last >= 0 {
		// A "///" block normally arrives as one comment node per line
		// (assumes the grammar splits line comments that way), so widen the
		// selection to the whole run instead of keeping only its last line.
		first := last
		if isLineDoc(comments[last]) {
			for first > 0 && isLineDoc(comments[first-1]) {
				first--
			}
		}

		var parts []string
		var raw []string
		tags := make(map[string]string)
		for _, c := range comments[first : last+1] {
			text := GetNodeText(c, content)
			raw = append(raw, text)

			// Parse node by node so every line gets its marker stripped,
			// then merge the results; repeated tags are joined with "\n".
			desc, nodeTags := parseDoxygen(text)
			if desc != "" {
				parts = append(parts, desc)
			}
			for name, val := range nodeTags {
				if existing, ok := tags[name]; ok {
					tags[name] = existing + "\n" + val
				} else {
					tags[name] = val
				}
			}
		}

		return &DocComment{
			Text:      strings.Join(parts, "\n"),
			Raw:       strings.Join(raw, "\n"),
			Style:     CommentStyleDoxygen,
			Tags:      tags,
			StartLine: int(comments[first].StartPoint().Row) + 1,
			EndLine:   int(comments[last].EndPoint().Row) + 1,
		}
	}

	// Fall back to regular comments.
	var parts []string
	var raw []string
	for _, c := range comments {
		text := GetNodeText(c, content)
		raw = append(raw, text)

		if cleaned := cleanCComment(text); cleaned != "" {
			parts = append(parts, cleaned)
		}
	}

	if len(parts) == 0 {
		return nil
	}

	return &DocComment{
		Text:      strings.Join(parts, "\n"),
		Raw:       strings.Join(raw, "\n"),
		Style:     detectCommentStyle(raw[0]),
		Tags:      nil,
		StartLine: int(comments[0].StartPoint().Row) + 1,
		EndLine:   int(comments[len(comments)-1].EndPoint().Row) + 1,
	}
}

// collectPrecedingComments returns, in source order, the comment nodes that
// sit directly above n.
//
// It walks n's previous siblings and keeps those whose type is listed in
// commentTypes. The walk stops at the first sibling that
//   - is not a comment,
//   - is separated from the node below it by at least one blank line, or
//   - is a trailing comment, i.e. starts on the row where the preceding
//     non-comment sibling ends ("x := 1 // note"); such a comment belongs to
//     that sibling, not to n.
//
// The second parameter is unused and kept for signature compatibility.
func collectPrecedingComments(n *sitter.Node, _ []byte, commentTypes []string) []*sitter.Node {
	isComment := func(nodeType string) bool {
		for _, ct := range commentTypes {
			if nodeType == ct {
				return true
			}
		}
		return false
	}

	var comments []*sitter.Node

	// belowStartRow is the first row of the node directly below the candidate.
	belowStartRow := int(n.StartPoint().Row)

	for prev := n.PrevSibling(); prev != nil; prev = prev.PrevSibling() {
		if !isComment(prev.Type()) {
			break
		}

		// More than one row of distance means a blank line in between.
		if belowStartRow-int(prev.EndPoint().Row) > 1 {
			break
		}

		// Skip comments trailing another statement on the same row.
		if before := prev.PrevSibling(); before != nil &&
			!isComment(before.Type()) &&
			before.EndPoint().Row == prev.StartPoint().Row {
			break
		}

		comments = append(comments, prev)
		belowStartRow = int(prev.StartPoint().Row)
	}

	// Collected nearest-first; flip once instead of prepending on every step.
	for i, j := 0, len(comments)-1; i < j; i, j = i+1, j-1 {
		comments[i], comments[j] = comments[j], comments[i]
	}

	return comments
}

// detectCommentStyle classifies a comment by its opening marker, ignoring
// surrounding whitespace. More specific markers are tested first ("/**"
// before "/*", "///" and "//!" before "//"). Text with no recognised marker
// is reported as a line comment.
func detectCommentStyle(comment string) CommentStyle {
	body := strings.TrimSpace(comment)
	startsWith := func(markers ...string) bool {
		for _, m := range markers {
			if strings.HasPrefix(body, m) {
				return true
			}
		}
		return false
	}

	switch {
	case startsWith("/**"):
		return CommentStyleJSDoc
	case startsWith("///", "//!"):
		return CommentStyleDoxygen
	case startsWith("/*"):
		return CommentStyleBlock
	case startsWith("//"):
		return CommentStyleLine
	case startsWith("#"):
		return CommentStyleHash
	case startsWith(`"""`, `'''`):
		return CommentStyleDocstring
	default:
		return CommentStyleLine
	}
}

// cleanGoComment strips the comment markers from a single // or /* */
// comment and returns the remaining text without surrounding whitespace.
// Block comment bodies are further normalised by cleanBlockComment. Input
// that is neither form is returned trimmed.
func cleanGoComment(comment string) string {
	trimmed := strings.TrimSpace(comment)

	switch {
	case strings.HasPrefix(trimmed, "//"):
		return strings.TrimSpace(trimmed[len("//"):])

	case strings.HasPrefix(trimmed, "/*") && strings.HasSuffix(trimmed, "*/"):
		inner := strings.TrimPrefix(trimmed, "/*")
		inner = strings.TrimSuffix(inner, "*/")
		return cleanBlockComment(inner)

	default:
		return trimmed
	}
}

// cleanJSComment strips comment markers from a JavaScript/TypeScript comment.
// It delegates to cleanGoComment, which handles the same // and /* */ forms.
func cleanJSComment(comment string) string {
	cleaned := cleanGoComment(comment)
	return cleaned
}

// cleanCComment strips comment markers from a C/C++ comment.
// It delegates to cleanGoComment, which handles the same // and /* */ forms.
func cleanCComment(comment string) string {
	cleaned := cleanGoComment(comment)
	return cleaned
}

// cleanBlockComment normalises the inside of a block comment (delimiters
// already removed). Every line is trimmed, loses one leading "*" if present,
// and is trimmed again; empty lines at the start and end are dropped while
// empty lines in the middle are kept.
func cleanBlockComment(comment string) string {
	rows := strings.Split(comment, "\n")
	for i, row := range rows {
		row = strings.TrimPrefix(strings.TrimSpace(row), "*")
		rows[i] = strings.TrimSpace(row)
	}

	// Narrow [lo, hi) until it no longer starts or ends with an empty row.
	lo, hi := 0, len(rows)
	for lo < hi && rows[lo] == "" {
		lo++
	}
	for hi > lo && rows[hi-1] == "" {
		hi--
	}

	return strings.Join(rows[lo:hi], "\n")
}

// jsDocTagPattern matches a JSDoc tag line such as "@param x the value",
// capturing the tag name and the rest of the line. Compiled once at package
// scope rather than on every parseJSDoc call.
var jsDocTagPattern = regexp.MustCompile(`^\s*\*?\s*@(\w+)\s*(.*)$`)

// parseJSDoc parses a JSDoc comment ("/** ... */") into its description and
// its tags.
//
// The description is made of every non-empty line that is not a tag line,
// joined with "\n". Tags are keyed by name without the "@"; when a tag occurs
// several times its values are joined with "\n" in order of appearance. Note
// that a line following a tag but not starting with "@" is treated as
// description text, not as a continuation of the tag. The returned map is
// never nil.
func parseJSDoc(comment string) (string, map[string]string) {
	comment = strings.TrimSpace(comment)

	// In "/**/" the opening and closing delimiters share a star; stripping
	// them one after the other would leave a stray "/" as the description.
	if comment == "/**/" {
		comment = ""
	}

	// Remove /** and */
	comment = strings.TrimPrefix(comment, "/**")
	comment = strings.TrimSuffix(comment, "*/")

	var descLines []string
	tags := make(map[string]string)

	for _, line := range strings.Split(comment, "\n") {
		// Drop indentation and the decorative leading "*" of each line.
		line = strings.TrimSpace(line)
		line = strings.TrimPrefix(line, "*")
		line = strings.TrimSpace(line)

		matches := jsDocTagPattern.FindStringSubmatch(line)
		if matches == nil {
			if line != "" {
				descLines = append(descLines, line)
			}
			continue
		}

		tagName := matches[1]
		tagValue := strings.TrimSpace(matches[2])
		if existing, ok := tags[tagName]; ok {
			tags[tagName] = existing + "\n" + tagValue
		} else {
			tags[tagName] = tagValue
		}
	}

	return strings.Join(descLines, "\n"), tags
}

// doxygenTagPattern matches a Doxygen command line introduced by "@" or "\"
// (for example "@param x" or "\return"), capturing the command name and the
// rest of the line. Compiled once at package scope rather than on every
// parseDoxygen call.
var doxygenTagPattern = regexp.MustCompile(`^\s*\*?\s*[@\\](\w+)\s*(.*)$`)

// parseDoxygen parses a Doxygen comment into its description and its tags.
//
// Accepted forms are a "/** ... */" block and "///" or "//!" line comments.
// Line comments may span several lines separated by "\n"; the marker is
// removed from every line, not only the first. The description is made of
// every non-empty line that is not a tag line, joined with "\n". Tags are
// keyed by name without the "@" or "\"; repeated tags have their values
// joined with "\n" in order of appearance. The returned map is never nil.
func parseDoxygen(comment string) (string, map[string]string) {
	comment = strings.TrimSpace(comment)

	switch {
	case comment == "/**/":
		// Opening and closing delimiters share a star here; stripping them
		// one after the other would leave a stray "/" as the description.
		comment = ""
	case strings.HasPrefix(comment, "/**"):
		comment = strings.TrimPrefix(comment, "/**")
		comment = strings.TrimSuffix(comment, "*/")
	}

	var descLines []string
	tags := make(map[string]string)

	for _, line := range strings.Split(comment, "\n") {
		line = strings.TrimSpace(line)

		// Strip the line-comment marker, if any, from this line.
		if rest, ok := strings.CutPrefix(line, "///"); ok {
			line = rest
		} else if rest, ok := strings.CutPrefix(line, "//!"); ok {
			line = rest
		}

		// Drop indentation and the decorative leading "*" of block lines.
		line = strings.TrimSpace(line)
		line = strings.TrimPrefix(line, "*")
		line = strings.TrimSpace(line)

		matches := doxygenTagPattern.FindStringSubmatch(line)
		if matches == nil {
			if line != "" {
				descLines = append(descLines, line)
			}
			continue
		}

		tagName := matches[1]
		tagValue := strings.TrimSpace(matches[2])
		if existing, ok := tags[tagName]; ok {
			tags[tagName] = existing + "\n" + tagValue
		} else {
			tags[tagName] = tagValue
		}
	}

	return strings.Join(descLines, "\n"), tags
}

// FormatDocComment formats a DocComment for display.
//
// The result is the description text, followed (when tags exist) by a blank
// line and one "@name value" line per tag value. Parameter tags come first,
// then return tags, then all remaining tags in alphabetical order so the
// output is the same on every call. A tag whose value holds several entries
// separated by "\n" is written as one line per entry. It returns "" when doc
// is nil or has no description text.
func FormatDocComment(doc *DocComment) string {
	if doc == nil || doc.Text == "" {
		return ""
	}

	var sb strings.Builder
	sb.WriteString(doc.Text)

	if len(doc.Tags) > 0 {
		sb.WriteString("\n\n")

		writeTag := func(name, val string) {
			for _, line := range strings.Split(val, "\n") {
				sb.WriteString("@" + name + " " + line + "\n")
			}
		}

		// Order: params, returns, other.
		leading := []string{
			"param", "parameter", "arg", "argument",
			"return", "returns", "retval",
		}
		written := make(map[string]bool, len(leading))
		for _, name := range leading {
			written[name] = true
			if val, ok := doc.Tags[name]; ok {
				writeTag(name, val)
			}
		}

		// Map iteration order is random; sort the rest for stable output.
		rest := make([]string, 0, len(doc.Tags))
		for name := range doc.Tags {
			if !written[name] {
				rest = append(rest, name)
			}
		}
		sort.Strings(rest)
		for _, name := range rest {
			writeTag(name, doc.Tags[name])
		}
	}

	return strings.TrimSpace(sb.String())
}

// cleanPythonDocstring returns the text of a Python string literal used as a
// docstring, without its quoting and without surrounding whitespace.
//
// It accepts triple-quoted ("""...""" / '''...''') and single-quoted
// ("..." / '...') literals, optionally preceded by up to two string-prefix
// letters such as r or u. Escape sequences inside the literal are left as
// written.
func cleanPythonDocstring(doc string) string {
	doc = strings.TrimSpace(doc)

	// Skip string-prefix letters (r"...", u'...', Rb"..."), but only when
	// they are immediately followed by a quote.
	i := 0
	for i < len(doc) && i < 2 && strings.IndexByte("rRuUbBfF", doc[i]) >= 0 {
		i++
	}
	if i > 0 && i < len(doc) && (doc[i] == '"' || doc[i] == '\'') {
		doc = doc[i:]
	}

	switch {
	case strings.HasPrefix(doc, `"""`):
		doc = strings.TrimSuffix(strings.TrimPrefix(doc, `"""`), `"""`)
	case strings.HasPrefix(doc, `'''`):
		doc = strings.TrimSuffix(strings.TrimPrefix(doc, `'''`), `'''`)
	case len(doc) >= 2 && (doc[0] == '"' || doc[0] == '\'') && doc[len(doc)-1] == doc[0]:
		// One-line docstring written with ordinary quotes.
		doc = doc[1 : len(doc)-1]
	}

	return strings.TrimSpace(doc)
}

// extractRustDocComment extracts Rust outer documentation comments ("///").
//
// Attributes such as #[derive(...)] may sit between the documentation and
// the item, so the search for comments starts above any "attribute_item"
// siblings directly preceding n. Only line comments starting with exactly
// three slashes count as documentation; "//" and "////" comments are
// ignored. The "///" marker and one following space are removed from each
// line. It returns nil when no documentation comment is found.
func extractRustDocComment(n *sitter.Node, content []byte) *DocComment {
	// Move the starting point above the item's attributes.
	anchor := n
	for p := n.PrevSibling(); p != nil && p.Type() == "attribute_item"; p = p.PrevSibling() {
		anchor = p
	}

	comments := collectPrecedingComments(anchor, content, []string{"line_comment"})
	if len(comments) == 0 {
		return nil
	}

	var parts []string
	var raw []string
	var first, last *sitter.Node

	for _, c := range comments {
		text := GetNodeText(c, content)
		trimmed := strings.TrimSpace(text)

		// Four or more slashes make an ordinary comment, not documentation.
		if !strings.HasPrefix(trimmed, "///") || strings.HasPrefix(trimmed, "////") {
			continue
		}

		if first == nil {
			first = c
		}
		last = c
		raw = append(raw, text)

		// Drop the marker and the single space that conventionally follows
		// it; deeper indentation (e.g. code examples) is preserved.
		cleaned := strings.TrimPrefix(trimmed, "///")
		cleaned = strings.TrimPrefix(cleaned, " ")
		parts = append(parts, cleaned)
	}

	if first == nil {
		return nil
	}

	return &DocComment{
		Text:      strings.Join(parts, "\n"),
		Raw:       strings.Join(raw, "\n"),
		Style:     CommentStyleDoxygen,
		Tags:      nil,
		StartLine: int(first.StartPoint().Row) + 1,
		EndLine:   int(last.EndPoint().Row) + 1,
	}
}

// extractElixirDocComment extracts Elixir documentation for n from a @doc or
// @moduledoc module attribute, or failing that from "#" comments.
//
// Only the siblings directly attached to n are considered: the walk goes
// backwards over module attributes ("unary_operator" nodes whose text starts
// with "@", e.g. @spec or @impl) and stops at the first sibling of any other
// kind, so documentation belonging to an earlier definition is never
// attributed to n. A @doc/@moduledoc attribute without a string value (for
// example "@doc false") is skipped like any other attribute. When a comment
// is reached first, the comment block ending there is returned instead.
// It returns nil when no documentation is found.
func extractElixirDocComment(n *sitter.Node, content []byte) *DocComment {
	// anchor is the earliest sibling accepted so far; a comment block must
	// sit directly above it to count as documentation.
	anchor := n

	for prev := n.PrevSibling(); prev != nil; prev = prev.PrevSibling() {
		switch prev.Type() {
		case "unary_operator":
			text := GetNodeText(prev, content)
			trimmed := strings.TrimSpace(text)
			if !strings.HasPrefix(trimmed, "@") {
				// Some other unary expression, not a module attribute.
				return nil
			}

			if strings.HasPrefix(trimmed, "@doc") || strings.HasPrefix(trimmed, "@moduledoc") {
				if docText := extractElixirDocString(prev, content); docText != "" {
					return &DocComment{
						Text:      docText,
						Raw:       text,
						Style:     CommentStyleDocstring,
						Tags:      nil,
						StartLine: int(prev.StartPoint().Row) + 1,
						EndLine:   int(prev.EndPoint().Row) + 1,
					}
				}
			}
			anchor = prev

		case "comment":
			comments := collectPrecedingComments(anchor, content, []string{"comment"})

			var parts []string
			var raw []string
			for _, c := range comments {
				text := GetNodeText(c, content)
				raw = append(raw, text)

				// Clean # comment
				cleaned := strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(text), "#"))
				if cleaned != "" {
					parts = append(parts, cleaned)
				}
			}

			if len(parts) == 0 {
				return nil
			}
			return &DocComment{
				Text:      strings.Join(parts, "\n"),
				Raw:       strings.Join(raw, "\n"),
				Style:     CommentStyleHash,
				Tags:      nil,
				StartLine: int(comments[0].StartPoint().Row) + 1,
				EndLine:   int(comments[len(comments)-1].EndPoint().Row) + 1,
			}

		default:
			// Reached unrelated code: whatever lies above documents
			// something else.
			return nil
		}
	}

	return nil
}

// extractElixirDocString extracts the documentation string from an Elixir
// @doc/@moduledoc attribute node.
//
// Two forms are recognised in the node's source text:
//
//	@doc """
//	Documentation here
//	"""
//
// and
//
//	@doc "Single line doc"
//
// For the single-line form the string ends at the first quote that is not
// escaped with a backslash, and escaped quotes (\") in the result are
// replaced by plain quotes; other escape sequences are left as written.
// The result is trimmed. It returns "" when the attribute has no string
// value or the string is not terminated.
func extractElixirDocString(n *sitter.Node, content []byte) string {
	text := GetNodeText(n, content)

	// Heredoc style (triple quotes) takes precedence.
	if _, rest, ok := strings.Cut(text, `"""`); ok {
		body, _, closed := strings.Cut(rest, `"""`)
		if !closed {
			return ""
		}
		return strings.TrimSpace(body)
	}

	// Single-line string.
	_, rest, ok := strings.Cut(text, `"`)
	if !ok {
		return ""
	}
	end := elixirClosingQuote(rest)
	if end < 0 {
		return ""
	}
	return strings.TrimSpace(strings.ReplaceAll(rest[:end], `\"`, `"`))
}

// elixirClosingQuote returns the index of the first double quote in s that is
// not preceded by an escaping backslash, or -1 if there is none.
func elixirClosingQuote(s string) int {
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case '\\':
			i++ // the next byte is escaped; skip it
		case '"':
			return i
		}
	}
	return -1
}