Files
filepuff-mcp/internal/parser/docextract.go
T
2026-02-22 15:24:48 +00:00

719 lines
18 KiB
Go

// Package parser provides documentation extraction for multiple languages.
package parser
import (
	"regexp"
	"sort"
	"strings"

	"github.com/lukaszraczylo/mcp-filepuff/pkg/protocol"
	sitter "github.com/smacker/go-tree-sitter"
)
// DocComment represents an extracted documentation comment.
//
// The extractors in this file fill it from the comment nodes (or, for Python
// and Elixir, the docstring-like constructs) attached to a declaration.
type DocComment struct {
// Tags maps a tag name (without its leading "@" or "\") to its value.
// The extractors shown here only populate it for JSDoc and Doxygen
// comments and leave it nil otherwise; repeated tags are joined with "\n".
Tags map[string]string
// Text is the cleaned documentation text with comment markers removed.
Text string
// Raw is the original source text of the comment; when several comment
// nodes were merged their texts are joined with "\n".
Raw string
// Style records which comment syntax the documentation was written in.
Style CommentStyle
// StartLine is the 1-based line on which the documentation starts.
StartLine int
// EndLine is the 1-based line on which the documentation ends.
EndLine int
}
// CommentStyle indicates the type of comment.
// It is the value stored in DocComment.Style by the extractors in this file.
type CommentStyle string
// Comment styles reported by the extractors.
const (
CommentStyleLine CommentStyle = "line" // // comment
CommentStyleBlock CommentStyle = "block" // /* comment */
CommentStyleJSDoc CommentStyle = "jsdoc" // /** comment */
CommentStyleDoxygen CommentStyle = "doxygen" // /** comment */, /// comment or //! comment (C/C++); also reported for Rust /// comments
CommentStyleDocstring CommentStyle = "docstring" // """comment""" or '''comment''' (Python); also reported for Elixir @doc/@moduledoc
CommentStyleHash CommentStyle = "hash" // # comment (Python, Elixir)
)
// ExtractDocComment returns the documentation attached to node n, using the
// extraction rules of the given language.
//
// content is the source buffer the node was parsed from. The result is nil
// when n is nil, when lang has no extractor, or when the language-specific
// extractor finds no documentation.
func ExtractDocComment(n *sitter.Node, content []byte, lang protocol.Language) *DocComment {
	if n == nil {
		return nil
	}

	// Pick the language-specific extractor first, then invoke it once.
	var extract func(*sitter.Node, []byte) *DocComment
	switch lang {
	case protocol.LangGo:
		extract = extractGoDocComment
	case protocol.LangTypeScript, protocol.LangJavaScript:
		extract = extractJSDocComment
	case protocol.LangPython:
		extract = extractPythonDocComment
	case protocol.LangC, protocol.LangCpp:
		extract = extractCDocComment
	case protocol.LangElixir:
		extract = extractElixirDocComment
	case protocol.LangRust:
		extract = extractRustDocComment
	}
	if extract == nil {
		return nil
	}
	return extract(n, content)
}
// extractGoDocComment builds the documentation of a Go declaration from the
// run of comment nodes (// or /* */) that sits directly above it.
//
// Every comment is cleaned with cleanGoComment; comments that are empty after
// cleaning are skipped in Text but still contribute to Raw and to the line
// range. It returns nil when there are no preceding comments or when all of
// them are empty.
func extractGoDocComment(n *sitter.Node, content []byte) *DocComment {
	nodes := collectPrecedingComments(n, content, []string{"comment"})
	if len(nodes) == 0 {
		return nil
	}

	rawTexts := make([]string, 0, len(nodes))
	var cleanedTexts []string
	for _, node := range nodes {
		src := GetNodeText(node, content)
		rawTexts = append(rawTexts, src)
		if body := cleanGoComment(src); body != "" {
			cleanedTexts = append(cleanedTexts, body)
		}
	}
	if len(cleanedTexts) == 0 {
		return nil
	}

	// The line range spans from the first collected comment to the last one.
	first, last := nodes[0], nodes[len(nodes)-1]
	return &DocComment{
		Text:      strings.Join(cleanedTexts, "\n"),
		Raw:       strings.Join(rawTexts, "\n"),
		Style:     detectCommentStyle(rawTexts[0]),
		StartLine: int(first.StartPoint().Row) + 1,
		EndLine:   int(last.EndPoint().Row) + 1,
		// Tags stays nil: Go doc comments have no JSDoc-style tags.
	}
}
// extractJSDocComment extracts the documentation of a JavaScript/TypeScript
// declaration.
//
// Among the comments directly above the node, the JSDoc block (/** ... */)
// closest to the declaration wins; it is parsed with parseJSDoc and its tags
// are returned in Tags. When no JSDoc block is present, all preceding
// comments are cleaned and joined, and Tags is nil. It returns nil when there
// are no preceding comments or they are all empty after cleaning.
func extractJSDocComment(n *sitter.Node, content []byte) *DocComment {
	comments := collectPrecedingComments(n, content, []string{"comment"})
	if len(comments) == 0 {
		return nil
	}

	// Scan backwards so that the JSDoc block nearest to the declaration is used.
	for i := len(comments) - 1; i >= 0; i-- {
		c := comments[i]
		text := GetNodeText(c, content)
		if !strings.HasPrefix(strings.TrimSpace(text), "/**") {
			continue
		}
		cleaned, tags := parseJSDoc(text)
		return &DocComment{
			Text:      cleaned,
			Raw:       text,
			Style:     CommentStyleJSDoc,
			Tags:      tags,
			StartLine: int(c.StartPoint().Row) + 1,
			EndLine:   int(c.EndPoint().Row) + 1,
		}
	}

	// Fall back to regular comments.
	var parts []string
	var raw []string
	for _, c := range comments {
		text := GetNodeText(c, content)
		raw = append(raw, text)
		if cleaned := cleanJSComment(text); cleaned != "" {
			parts = append(parts, cleaned)
		}
	}
	if len(parts) == 0 {
		return nil
	}

	// Report the style of the first comment instead of always claiming "line":
	// a plain /* ... */ comment is a block comment. JSDoc blocks never reach
	// this point, so only the two plain styles are possible.
	style := CommentStyleLine
	if strings.HasPrefix(strings.TrimSpace(raw[0]), "/*") {
		style = CommentStyleBlock
	}

	return &DocComment{
		Text:      strings.Join(parts, "\n"),
		Raw:       strings.Join(raw, "\n"),
		Style:     style,
		Tags:      nil,
		StartLine: int(comments[0].StartPoint().Row) + 1,
		EndLine:   int(comments[len(comments)-1].EndPoint().Row) + 1,
	}
}
// extractPythonDocComment extracts the documentation of a Python definition.
//
// A docstring is looked up first: the node's "body" field must start with an
// expression statement whose first named child is a string. If that is not
// the case, the # comments directly above the definition are used instead.
// Nodes without a "body" field yield nil, as do definitions that have neither
// a docstring nor non-empty preceding comments.
func extractPythonDocComment(n *sitter.Node, content []byte) *DocComment {
	body := n.ChildByFieldName("body")
	if body == nil {
		return nil
	}

	// The docstring, if any, is the very first statement of the body.
	if body.NamedChildCount() > 0 {
		stmt := body.NamedChild(0)
		if stmt != nil && stmt.Type() == "expression_statement" && stmt.NamedChildCount() > 0 {
			if lit := stmt.NamedChild(0); lit != nil && lit.Type() == "string" {
				src := GetNodeText(lit, content)
				return &DocComment{
					Text:      cleanPythonDocstring(src),
					Raw:       src,
					Style:     CommentStyleDocstring,
					StartLine: int(lit.StartPoint().Row) + 1,
					EndLine:   int(lit.EndPoint().Row) + 1,
				}
			}
		}
	}

	// No docstring: fall back to # comments placed before the definition.
	nodes := collectPrecedingComments(n, content, []string{"comment"})
	if len(nodes) == 0 {
		return nil
	}

	rawTexts := make([]string, 0, len(nodes))
	var cleanedTexts []string
	for _, node := range nodes {
		src := GetNodeText(node, content)
		rawTexts = append(rawTexts, src)
		// Drop one leading "#" and the whitespace around the remaining text.
		withoutHash := strings.TrimPrefix(strings.TrimSpace(src), "#")
		if line := strings.TrimSpace(withoutHash); line != "" {
			cleanedTexts = append(cleanedTexts, line)
		}
	}
	if len(cleanedTexts) == 0 {
		return nil
	}

	return &DocComment{
		Text:      strings.Join(cleanedTexts, "\n"),
		Raw:       strings.Join(rawTexts, "\n"),
		Style:     CommentStyleHash,
		StartLine: int(nodes[0].StartPoint().Row) + 1,
		EndLine:   int(nodes[len(nodes)-1].EndPoint().Row) + 1,
	}
}
// extractCDocComment extracts C/C++ documentation comments (Doxygen style).
//
// Among the comments directly above the node, the Doxygen comment closest to
// the declaration is used. A /** ... */ block is parsed on its own. A /// or
// //! comment is extended backwards over the adjacent /// and //! comments so
// that a multi-line documentation block is returned in full rather than just
// its last line. Each comment is parsed with parseDoxygen; descriptions are
// joined with "\n" and repeated tags are joined with "\n" as well.
//
// Without any Doxygen comment, all preceding comments are cleaned and joined
// and Tags is nil. It returns nil when there are no preceding comments or
// they are all empty after cleaning.
func extractCDocComment(n *sitter.Node, content []byte) *DocComment {
	comments := collectPrecedingComments(n, content, []string{"comment"})
	if len(comments) == 0 {
		return nil
	}

	isLineDoc := func(text string) bool {
		t := strings.TrimSpace(text)
		return strings.HasPrefix(t, "///") || strings.HasPrefix(t, "//!")
	}
	isBlockDoc := func(text string) bool {
		return strings.HasPrefix(strings.TrimSpace(text), "/**")
	}

	// Find the Doxygen comment nearest to the declaration.
	last := -1
	for i := len(comments) - 1; i >= 0; i-- {
		text := GetNodeText(comments[i], content)
		if isBlockDoc(text) || isLineDoc(text) {
			last = i
			break
		}
	}

	if last >= 0 {
		// Every line of a line-style doc comment is collected as its own
		// comment, so widen the selection to the whole contiguous run.
		first := last
		if isLineDoc(GetNodeText(comments[last], content)) {
			for first > 0 && isLineDoc(GetNodeText(comments[first-1], content)) {
				first--
			}
		}

		var texts []string
		var raws []string
		tags := make(map[string]string)
		for _, c := range comments[first : last+1] {
			raw := GetNodeText(c, content)
			raws = append(raws, raw)
			desc, found := parseDoxygen(raw)
			if desc != "" {
				texts = append(texts, desc)
			}
			for name, val := range found {
				if existing, ok := tags[name]; ok {
					tags[name] = existing + "\n" + val
				} else {
					tags[name] = val
				}
			}
		}

		return &DocComment{
			Text:      strings.Join(texts, "\n"),
			Raw:       strings.Join(raws, "\n"),
			Style:     CommentStyleDoxygen,
			Tags:      tags,
			StartLine: int(comments[first].StartPoint().Row) + 1,
			EndLine:   int(comments[last].EndPoint().Row) + 1,
		}
	}

	// Fall back to regular comments.
	var parts []string
	var raw []string
	for _, c := range comments {
		text := GetNodeText(c, content)
		raw = append(raw, text)
		if cleaned := cleanCComment(text); cleaned != "" {
			parts = append(parts, cleaned)
		}
	}
	if len(parts) == 0 {
		return nil
	}

	return &DocComment{
		Text:      strings.Join(parts, "\n"),
		Raw:       strings.Join(raw, "\n"),
		Style:     detectCommentStyle(raw[0]),
		Tags:      nil,
		StartLine: int(comments[0].StartPoint().Row) + 1,
		EndLine:   int(comments[len(comments)-1].EndPoint().Row) + 1,
	}
}
// collectPrecedingComments collects all comment nodes immediately before a node.
//
// It walks the previous siblings of n for as long as their type is one of
// commentTypes and no blank line separates a comment from what follows it
// (the next comment, or n itself). The comments are returned in source order;
// the result is nil when n has no directly attached comment. The second
// parameter is unused and only kept for signature compatibility.
func collectPrecedingComments(n *sitter.Node, _ []byte, commentTypes []string) []*sitter.Node {
	var comments []*sitter.Node

	// Row on which the element following the current candidate starts.
	nextStartRow := int(n.StartPoint().Row)

	for prev := n.PrevSibling(); prev != nil; prev = prev.PrevSibling() {
		isComment := false
		nodeType := prev.Type()
		for _, ct := range commentTypes {
			if nodeType == ct {
				isComment = true
				break
			}
		}
		if !isComment {
			break
		}

		// A gap of more than one row means a blank line detaches this comment.
		if nextStartRow-int(prev.EndPoint().Row) > 1 {
			break
		}

		comments = append(comments, prev)
		nextStartRow = int(prev.StartPoint().Row)
	}

	// The walk gathered the comments nearest-first; reverse once instead of
	// prepending on every step, which reallocates and copies each time.
	for i, j := 0, len(comments)-1; i < j; i, j = i+1, j-1 {
		comments[i], comments[j] = comments[j], comments[i]
	}
	return comments
}
// detectCommentStyle classifies a comment by the marker it starts with,
// ignoring surrounding whitespace.
//
// More specific markers are tested before the shorter ones they contain
// ("/**" before "/*", "///" and "//!" before "//"). Text without a known
// marker is reported as CommentStyleLine.
func detectCommentStyle(comment string) CommentStyle {
	body := strings.TrimSpace(comment)
	switch {
	case strings.HasPrefix(body, "/**"):
		return CommentStyleJSDoc
	case strings.HasPrefix(body, "///"), strings.HasPrefix(body, "//!"):
		return CommentStyleDoxygen
	case strings.HasPrefix(body, "/*"):
		return CommentStyleBlock
	case strings.HasPrefix(body, "//"):
		return CommentStyleLine
	case strings.HasPrefix(body, "#"):
		return CommentStyleHash
	case strings.HasPrefix(body, `"""`), strings.HasPrefix(body, `'''`):
		return CommentStyleDocstring
	}
	return CommentStyleLine
}
// cleanGoComment strips the comment markers from a single Go comment.
//
// A "//" comment loses its marker and surrounding whitespace. A complete
// "/* ... */" comment loses both delimiters and is then normalised by
// cleanBlockComment. Anything else is returned with only the surrounding
// whitespace trimmed.
func cleanGoComment(comment string) string {
	trimmed := strings.TrimSpace(comment)
	switch {
	case strings.HasPrefix(trimmed, "//"):
		return strings.TrimSpace(trimmed[len("//"):])
	case strings.HasPrefix(trimmed, "/*") && strings.HasSuffix(trimmed, "*/"):
		inner := strings.TrimSuffix(strings.TrimPrefix(trimmed, "/*"), "*/")
		return cleanBlockComment(inner)
	default:
		return trimmed
	}
}
// cleanJSComment cleans a JavaScript/TypeScript comment.
// It delegates to cleanGoComment, so "//" and "/* ... */" markers are removed
// in exactly the same way as for Go comments.
func cleanJSComment(comment string) string {
return cleanGoComment(comment) // Same rules
}
// cleanCComment cleans a C/C++ comment.
// It delegates to cleanGoComment, so "//" and "/* ... */" markers are removed
// in exactly the same way as for Go comments.
func cleanCComment(comment string) string {
return cleanGoComment(comment) // Same rules
}
// cleanBlockComment normalises the inside of a block comment, i.e. the text
// between the opening and closing delimiters.
//
// Each line is trimmed, loses at most one leading "*" (the usual decoration
// of block comments) and is trimmed again. Empty lines at the start and at
// the end are dropped; empty lines in between are kept.
func cleanBlockComment(comment string) string {
	rows := strings.Split(comment, "\n")
	for i, row := range rows {
		row = strings.TrimPrefix(strings.TrimSpace(row), "*")
		rows[i] = strings.TrimSpace(row)
	}

	// Narrow [lo, hi) until it no longer starts or ends with an empty line.
	lo, hi := 0, len(rows)
	for lo < hi && rows[lo] == "" {
		lo++
	}
	for hi > lo && rows[hi-1] == "" {
		hi--
	}
	return strings.Join(rows[lo:hi], "\n")
}
// jsDocTagPattern matches a JSDoc tag line such as "@param name description".
// It is compiled once here rather than on every parseJSDoc call.
var jsDocTagPattern = regexp.MustCompile(`^\s*\*?\s*@(\w+)\s*(.*)$`)

// parseJSDoc parses a JSDoc comment and extracts tags.
//
// It returns the description (all non-empty lines that are not tag lines,
// joined with "\n") and a map from tag name to tag value. The map is never
// nil; a tag that occurs several times has its values joined with "\n" in
// order of appearance. Lines following a tag line are not treated as a
// continuation of that tag: unless they start with "@" themselves they are
// part of the description.
func parseJSDoc(comment string) (string, map[string]string) {
	comment = strings.TrimSpace(comment)

	// Remove the closing delimiter before the opening one so that the empty
	// comment "/**/" (where both overlap) is reduced to nothing, not to "/".
	comment = strings.TrimSuffix(comment, "*/")
	if after, ok := strings.CutPrefix(comment, "/**"); ok {
		comment = after
	} else {
		comment = strings.TrimPrefix(comment, "/*")
	}

	var descLines []string
	tags := make(map[string]string)

	for _, line := range strings.Split(comment, "\n") {
		// Strip the indentation and the decorative leading "*".
		line = strings.TrimSpace(line)
		line = strings.TrimPrefix(line, "*")
		line = strings.TrimSpace(line)

		matches := jsDocTagPattern.FindStringSubmatch(line)
		if matches == nil {
			if line != "" {
				descLines = append(descLines, line)
			}
			continue
		}

		tagName := matches[1]
		tagValue := strings.TrimSpace(matches[2])
		if existing, ok := tags[tagName]; ok {
			tags[tagName] = existing + "\n" + tagValue
		} else {
			tags[tagName] = tagValue
		}
	}

	return strings.Join(descLines, "\n"), tags
}
// doxygenTagPattern matches a Doxygen tag line introduced by "@" or "\"
// (@param, @return, \param, \return, ...). It is compiled once here rather
// than on every parseDoxygen call.
var doxygenTagPattern = regexp.MustCompile(`^\s*\*?\s*[@\\](\w+)\s*(.*)$`)

// parseDoxygen parses a Doxygen comment and extracts tags.
//
// Both the block form (/** ... */) and the line forms (/// and //!) are
// accepted. For the line forms the marker is removed from every line, so a
// multi-line comment whose lines were joined with "\n" is handled as well.
//
// It returns the description (all non-empty lines that are not tag lines,
// joined with "\n") and a map from tag name to tag value. The map is never
// nil; a tag that occurs several times has its values joined with "\n".
func parseDoxygen(comment string) (string, map[string]string) {
	comment = strings.TrimSpace(comment)

	lineStyle := strings.HasPrefix(comment, "///") || strings.HasPrefix(comment, "//!")
	if !lineStyle {
		// Remove the closing delimiter first so that the empty comment
		// "/**/" (where both delimiters overlap) is reduced to nothing.
		comment = strings.TrimSuffix(comment, "*/")
		if after, ok := strings.CutPrefix(comment, "/**"); ok {
			comment = after
		} else {
			comment = strings.TrimPrefix(comment, "/*")
		}
	}

	var descLines []string
	tags := make(map[string]string)

	for _, line := range strings.Split(comment, "\n") {
		line = strings.TrimSpace(line)
		if lineStyle {
			// Each line of a line-style comment carries its own marker.
			if after, ok := strings.CutPrefix(line, "///"); ok {
				line = after
			} else {
				line = strings.TrimPrefix(line, "//!")
			}
			line = strings.TrimSpace(line)
		}
		// Strip the decorative leading "*".
		line = strings.TrimPrefix(line, "*")
		line = strings.TrimSpace(line)

		matches := doxygenTagPattern.FindStringSubmatch(line)
		if matches == nil {
			if line != "" {
				descLines = append(descLines, line)
			}
			continue
		}

		tagName := matches[1]
		tagValue := strings.TrimSpace(matches[2])
		if existing, ok := tags[tagName]; ok {
			tags[tagName] = existing + "\n" + tagValue
		} else {
			tags[tagName] = tagValue
		}
	}

	return strings.Join(descLines, "\n"), tags
}
// FormatDocComment formats a DocComment for display.
//
// The result is the description followed, after a blank line, by one
// "@tag value" line per tag value. Parameter tags come first, then return
// tags, then all remaining tags in alphabetical order so that the output is
// deterministic (map iteration order is not). A tag that was given several
// times (its values are stored joined with "\n") is written as one line per
// value, each carrying the "@tag" prefix.
//
// It returns "" when doc is nil or has no description text.
func FormatDocComment(doc *DocComment) string {
	if doc == nil || doc.Text == "" {
		return ""
	}
	if len(doc.Tags) == 0 {
		return strings.TrimSpace(doc.Text)
	}

	var sb strings.Builder
	sb.WriteString(doc.Text)
	sb.WriteString("\n\n")

	writeTag := func(name string) {
		for _, line := range strings.Split(doc.Tags[name], "\n") {
			sb.WriteString("@" + name + " " + line + "\n")
		}
	}

	// Tags with a fixed position: parameters first, then return values.
	priority := []string{
		"param", "parameter", "arg", "argument",
		"return", "returns", "retval",
	}
	written := make(map[string]bool, len(priority))
	for _, name := range priority {
		written[name] = true
		if _, ok := doc.Tags[name]; ok {
			writeTag(name)
		}
	}

	// Everything else, sorted by name for a stable result.
	rest := make([]string, 0, len(doc.Tags))
	for name := range doc.Tags {
		if !written[name] {
			rest = append(rest, name)
		}
	}
	sort.Strings(rest)
	for _, name := range rest {
		writeTag(name)
	}

	return strings.TrimSpace(sb.String())
}
// cleanPythonDocstring cleans a Python docstring.
//
// It takes the source text of a string literal and returns its content with
// the quotes and surrounding whitespace removed. Triple-quoted ("""...""",
// '''...''') and single-quoted ("...", '...') literals are supported, with an
// optional string prefix of up to two letters out of r, u, b and f in either
// case (for example r"""...""" or Rb'...'). Escape sequences are not
// interpreted and inner indentation is kept. Text that is not a quoted
// literal is returned trimmed, minus a dangling triple-quote suffix if any.
func cleanPythonDocstring(doc string) string {
	doc = strings.TrimSpace(doc)

	// Skip a string prefix, but only when a quote really follows it, so that
	// ordinary text starting with one of these letters is left alone.
	prefixLen := 0
	for prefixLen < len(doc) && prefixLen < 2 && strings.IndexByte("rRuUbBfF", doc[prefixLen]) >= 0 {
		prefixLen++
	}
	if prefixLen < len(doc) && (doc[prefixLen] == '"' || doc[prefixLen] == '\'') {
		doc = doc[prefixLen:]
	}

	// Triple-quoted literal: strip the delimiter it opens with, and the same
	// delimiter at the end when present.
	for _, q := range []string{`"""`, `'''`} {
		if strings.HasPrefix(doc, q) {
			doc = strings.TrimSuffix(strings.TrimPrefix(doc, q), q)
			return strings.TrimSpace(doc)
		}
	}

	// Single-quoted literal: both ends must carry the same quote character.
	for _, q := range []string{`"`, `'`} {
		if len(doc) >= 2 && strings.HasPrefix(doc, q) && strings.HasSuffix(doc, q) {
			return strings.TrimSpace(doc[1 : len(doc)-1])
		}
	}

	// Not a recognisable literal: only drop a dangling closing delimiter.
	doc = strings.TrimSuffix(doc, `"""`)
	doc = strings.TrimSuffix(doc, `'''`)
	return strings.TrimSpace(doc)
}
// extractRustDocComment extracts the /// documentation lines written
// directly above a Rust item.
//
// Only "line_comment" nodes are considered, and of those only the ones that
// start with "///"; other line comments in the same run are ignored. The
// marker and at most one following space are removed from every line, and
// empty documentation lines are kept. The reported line range covers the
// first to the last /// comment. It returns nil when no such comment exists.
func extractRustDocComment(n *sitter.Node, content []byte) *DocComment {
	nodes := collectPrecedingComments(n, content, []string{"line_comment"})

	var (
		docLines []string
		rawLines []string
		first    *sitter.Node
		last     *sitter.Node
	)
	for _, node := range nodes {
		src := GetNodeText(node, content)
		body, isDoc := strings.CutPrefix(strings.TrimSpace(src), "///")
		if !isDoc {
			continue
		}
		if first == nil {
			first = node
		}
		last = node
		rawLines = append(rawLines, src)
		// Drop the single separator space after the marker, if present.
		docLines = append(docLines, strings.TrimPrefix(body, " "))
	}
	if first == nil {
		return nil
	}

	return &DocComment{
		Text:      strings.Join(docLines, "\n"),
		Raw:       strings.Join(rawLines, "\n"),
		Style:     CommentStyleDoxygen,
		StartLine: int(first.StartPoint().Row) + 1,
		EndLine:   int(last.EndPoint().Row) + 1,
	}
}
// extractElixirDocComment extracts Elixir documentation from @doc and @moduledoc attributes.
// Elixir uses module attributes like @doc and @moduledoc for documentation.
//
// The previous siblings of n are walked backwards. The first "unary_operator"
// sibling whose text starts with @doc or @moduledoc decides the result: its
// string content becomes the documentation, and when it carries no string
// (for example "@doc false") the node is treated as undocumented. The walk
// does not continue past such an attribute, because an earlier @doc would
// belong to a different definition. Other siblings are skipped.
//
// If a "comment" sibling is reached first, the # comments directly above n
// are used instead. It returns nil when no documentation is found.
func extractElixirDocComment(n *sitter.Node, content []byte) *DocComment {
	for prev := n.PrevSibling(); prev != nil; prev = prev.PrevSibling() {
		switch prev.Type() {
		case "unary_operator":
			text := GetNodeText(prev, content)
			trimmed := strings.TrimSpace(text)
			if !strings.HasPrefix(trimmed, "@doc") && !strings.HasPrefix(trimmed, "@moduledoc") {
				// Some other module attribute; keep looking.
				continue
			}

			docText := extractElixirDocString(prev, content)
			if docText == "" {
				// The nearest doc attribute has no text: stop here rather
				// than pick up the documentation of an earlier definition.
				return nil
			}
			return &DocComment{
				Text:      docText,
				Raw:       text,
				Style:     CommentStyleDocstring,
				Tags:      nil,
				StartLine: int(prev.StartPoint().Row) + 1,
				EndLine:   int(prev.EndPoint().Row) + 1,
			}

		case "comment":
			// Regular # comments. Only comments attached directly to n are
			// collected, so this yields nothing when other siblings were
			// skipped on the way here.
			comments := collectPrecedingComments(n, content, []string{"comment"})
			if len(comments) == 0 {
				return nil
			}

			var parts []string
			var raw []string
			for _, c := range comments {
				text := GetNodeText(c, content)
				raw = append(raw, text)
				// Clean # comment
				cleaned := strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(text), "#"))
				if cleaned != "" {
					parts = append(parts, cleaned)
				}
			}
			if len(parts) == 0 {
				return nil
			}
			return &DocComment{
				Text:      strings.Join(parts, "\n"),
				Raw:       strings.Join(raw, "\n"),
				Style:     CommentStyleHash,
				Tags:      nil,
				StartLine: int(comments[0].StartPoint().Row) + 1,
				EndLine:   int(comments[len(comments)-1].EndPoint().Row) + 1,
			}
		}
	}

	return nil
}
// extractElixirDocString extracts the documentation string from an Elixir @doc/@moduledoc attribute.
//
// The attribute typically looks like:
//
//	@doc """
//	Documentation here
//	"""
//
// or
//
//	@doc "Single line doc"
//
// For the heredoc form the text between the first two """ delimiters is
// returned. For the single-line form the text up to the closing quote is
// returned; a quote preceded by a backslash is an escaped quote and does not
// end the string. Escape sequences are left as written. The result is
// trimmed, and "" is returned when the attribute has no complete string.
func extractElixirDocString(n *sitter.Node, content []byte) string {
	text := GetNodeText(n, content)

	// Heredoc style (triple quotes).
	if open := strings.Index(text, `"""`); open != -1 {
		rest := text[open+3:]
		if end := strings.Index(rest, `"""`); end != -1 {
			return strings.TrimSpace(rest[:end])
		}
		return ""
	}

	// Single-line string.
	open := strings.IndexByte(text, '"')
	if open == -1 {
		return ""
	}
	rest := text[open+1:]
	for i := 0; i < len(rest); i++ {
		switch rest[i] {
		case '\\':
			i++ // skip the escaped character, which may be a quote
		case '"':
			return strings.TrimSpace(rest[:i])
		}
	}
	return ""
}