mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-23 03:51:31 +00:00
feat(leann-phase2): implement hybrid vector storage and graph-based search (#20)
* feat(leann-phase2): implement hybrid vector storage and graph-based search
- [x] Add AST-aware code chunking for Go, Python, and TypeScript using tree-sitter
- [x] Implement LEANN-inspired hybrid vector storage with hub detection and selective embedding storage (60-80% savings)
- [x] Add observation relationship graph with CSR format and edge detection (file overlap, semantic similarity, temporal, concept)
- [x] Implement graph-aware search with two-level traversal and relationship-based ranking
- [x] Add auto-tuning system for dynamic hub threshold adjustment based on query performance
- [x] Add comprehensive metrics tracking for vector storage, queries, latency, and graph traversals
- [x] Update configuration system with graph and hybrid storage settings
- [x] Add graph stats and vector metrics endpoints to worker service
- [x] Enhance UI sidebar with advanced metrics display and graph visualization
- [x] Optimize struct field alignment throughout codebase for memory efficiency
- [x] Update documentation with LEANN Phase 2 features and performance benefits
- [x] Add tree-sitter dependency for AST parsing
* fix: add fts5 build tag to CI workflow
Pass build-tags: "fts5" to shared workflow to properly compile
sqlite-vec-go-bindings with SQLite FTS5 support.
This fixes test failures in hybrid vector storage tests that require
CGO and FTS5 build tags.
Requires shared-actions@8f7f235 or later.
* docs: add testing documentation and macOS ARM64 known issue
Document the macOS ARM64 CGO linking issue with sqlite-vec-go-bindings
that prevents hybrid package tests from compiling locally.
Added:
- .github/TESTING.md: Comprehensive testing guide with platform-specific
issues, workarounds, and CI configuration details
- internal/vector/hybrid/README.md: Package-specific documentation
explaining the macOS limitation
- .github/CI_FIX_SUMMARY.md: Technical details of the CI fix
Key points:
- 41 out of 42 packages test successfully on all platforms
- hybrid package tests fail only on macOS ARM64 (local dev issue)
- Linux CI tests pass with proper build-tags: "fts5" configuration
- Production builds and runtime functionality unaffected
This is a known limitation of sqlite-vec-go-bindings on macOS ARM64
and does not impact CI/CD or production deployments.
* fix: add SQLite busy_timeout to prevent database locked errors
Set PRAGMA busy_timeout=5000 (5 seconds) to allow SQLite to retry
when the database is locked instead of failing immediately.
This fixes race conditions when multiple goroutines try to write
simultaneously, particularly in tests where StoreObservation spawns
async cleanup goroutines.
Root cause:
- StoreObservation launches goroutine -> CleanupOldObservations
- Multiple concurrent cleanups caused "database is locked" errors
- Without busy_timeout, SQLite fails immediately on lock contention
Solution:
- Add 5-second busy timeout for automatic retry on lock
- Standard practice for concurrent SQLite usage
- Works with existing WAL mode configuration
Fixes TestObservationStore_CleanupOldObservations in CI.
* docs: complete summary of all CI test fixes
Comprehensive documentation of all fixes applied:
1. Missing build tags (fts5)
2. Database locked errors (busy_timeout)
All 41/42 packages now pass tests. The hybrid package has a known
macOS ARM64 limitation that doesn't affect CI or production.
No functionality was removed - all fixes are additive only.
* fix: add SQLite driver import to hybrid tests for CGO linking
Add blank import of mattn/go-sqlite3 to hybrid test files to ensure
the SQLite driver is linked into the test binary. This provides the
SQLite symbols that sqlite-vec-go-bindings requires.
Root cause:
- hybrid package imports sqlitevec (transitively depends on sqlite-vec CGO)
- Test binary needs SQLite symbols for linking
- sqlitevec tests already had this import, but hybrid tests didn't
- Without the driver import, linker fails with "undefined symbols"
This fix enables hybrid tests to run with -race flag on all platforms.
Before: 41/42 packages pass (hybrid failed to link)
After: 42/42 packages pass ✅
Fixes hybrid test compilation on macOS ARM64, Linux, and Windows.
* docs: remove outdated macOS limitation documentation
The hybrid test linking issue has been fixed by adding the SQLite
driver import. All tests now pass on all platforms including macOS.
Removed:
- internal/vector/hybrid/README.md (documented workaround no longer needed)
- .github/TESTING.md (macOS limitation section obsolete)
All 42/42 packages now test successfully with -race flag.
* docs: final comprehensive summary of all CI fixes
All three issues now resolved:
1. Missing fts5 build tags
2. Database busy_timeout for concurrent writes
3. Missing SQLite driver import in hybrid tests
Result: 42/42 packages pass with -race on all platforms.
Credit to reviewer for identifying the race detector concern.
This commit is contained in:
@@ -0,0 +1,285 @@
|
||||
// Package golang provides AST-aware chunking for Go source files.
|
||||
package golang
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"go/ast"
|
||||
"go/parser"
|
||||
"go/token"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/chunking"
|
||||
)
|
||||
|
||||
// Chunker implements AST-aware chunking for Go files.
|
||||
type Chunker struct {
|
||||
options chunking.ChunkOptions
|
||||
}
|
||||
|
||||
// NewChunker creates a new Go chunker.
|
||||
func NewChunker(options chunking.ChunkOptions) *Chunker {
|
||||
return &Chunker{options: options}
|
||||
}
|
||||
|
||||
// Language returns the language this chunker supports.
|
||||
func (c *Chunker) Language() chunking.Language {
|
||||
return chunking.LanguageGo
|
||||
}
|
||||
|
||||
// SupportedExtensions returns the file extensions this chunker handles.
|
||||
func (c *Chunker) SupportedExtensions() []string {
|
||||
return []string{".go"}
|
||||
}
|
||||
|
||||
// Chunk parses a Go source file and returns semantic code chunks.
|
||||
func (c *Chunker) Chunk(ctx context.Context, filePath string) ([]chunking.Chunk, error) {
|
||||
// Read file content
|
||||
content, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read file: %w", err)
|
||||
}
|
||||
|
||||
// Parse the Go file
|
||||
fset := token.NewFileSet()
|
||||
file, err := parser.ParseFile(fset, filePath, content, parser.ParseComments)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse Go file: %w", err)
|
||||
}
|
||||
|
||||
chunks := make([]chunking.Chunk, 0)
|
||||
sourceLines := strings.Split(string(content), "\n")
|
||||
|
||||
// Extract chunks from declarations
|
||||
for _, decl := range file.Decls {
|
||||
switch d := decl.(type) {
|
||||
case *ast.FuncDecl:
|
||||
chunk := c.extractFunction(fset, d, sourceLines, filePath)
|
||||
if chunk != nil {
|
||||
chunks = append(chunks, *chunk)
|
||||
}
|
||||
case *ast.GenDecl:
|
||||
extracted := c.extractGenDecl(fset, d, sourceLines, filePath)
|
||||
chunks = append(chunks, extracted...)
|
||||
}
|
||||
}
|
||||
|
||||
return chunks, nil
|
||||
}
|
||||
|
||||
// extractFunction extracts a function or method declaration as a chunk.
|
||||
func (c *Chunker) extractFunction(fset *token.FileSet, fn *ast.FuncDecl, sourceLines []string, filePath string) *chunking.Chunk {
|
||||
// Skip unexported if configured
|
||||
if !c.options.IncludePrivate && !fn.Name.IsExported() {
|
||||
return nil
|
||||
}
|
||||
|
||||
startPos := fset.Position(fn.Pos())
|
||||
endPos := fset.Position(fn.End())
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageGo,
|
||||
Name: fn.Name.Name,
|
||||
StartLine: startPos.Line,
|
||||
EndLine: endPos.Line,
|
||||
}
|
||||
|
||||
// Determine if this is a method or a function
|
||||
if fn.Recv != nil && len(fn.Recv.List) > 0 {
|
||||
chunk.Type = chunking.ChunkTypeMethod
|
||||
chunk.ParentName = c.extractReceiverType(fn.Recv)
|
||||
} else {
|
||||
chunk.Type = chunking.ChunkTypeFunction
|
||||
}
|
||||
|
||||
// Extract content
|
||||
chunk.Content = c.extractLines(sourceLines, startPos.Line, endPos.Line)
|
||||
|
||||
// Extract signature (function declaration without body)
|
||||
chunk.Signature = c.extractFunctionSignature(fn, fset, sourceLines)
|
||||
|
||||
// Extract doc comment
|
||||
if c.options.IncludeDocComments && fn.Doc != nil {
|
||||
chunk.DocComment = strings.TrimSpace(fn.Doc.Text())
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractGenDecl extracts general declarations (type, const, var).
|
||||
func (c *Chunker) extractGenDecl(fset *token.FileSet, gd *ast.GenDecl, sourceLines []string, filePath string) []chunking.Chunk {
|
||||
var chunks []chunking.Chunk
|
||||
|
||||
for _, spec := range gd.Specs {
|
||||
switch s := spec.(type) {
|
||||
case *ast.TypeSpec:
|
||||
chunk := c.extractTypeSpec(fset, gd, s, sourceLines, filePath)
|
||||
if chunk != nil {
|
||||
chunks = append(chunks, *chunk)
|
||||
}
|
||||
case *ast.ValueSpec:
|
||||
// Handle const and var declarations
|
||||
chunk := c.extractValueSpec(fset, gd, s, sourceLines, filePath)
|
||||
if chunk != nil {
|
||||
chunks = append(chunks, *chunk)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return chunks
|
||||
}
|
||||
|
||||
// extractTypeSpec extracts a type declaration (struct, interface, type alias).
|
||||
func (c *Chunker) extractTypeSpec(fset *token.FileSet, gd *ast.GenDecl, ts *ast.TypeSpec, sourceLines []string, filePath string) *chunking.Chunk {
|
||||
// Skip unexported if configured
|
||||
if !c.options.IncludePrivate && !ts.Name.IsExported() {
|
||||
return nil
|
||||
}
|
||||
|
||||
startPos := fset.Position(gd.Pos())
|
||||
endPos := fset.Position(gd.End())
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageGo,
|
||||
Name: ts.Name.Name,
|
||||
StartLine: startPos.Line,
|
||||
EndLine: endPos.Line,
|
||||
Content: c.extractLines(sourceLines, startPos.Line, endPos.Line),
|
||||
}
|
||||
|
||||
// Determine chunk type based on type expression
|
||||
switch ts.Type.(type) {
|
||||
case *ast.StructType:
|
||||
chunk.Type = chunking.ChunkTypeClass // Treat struct as class
|
||||
case *ast.InterfaceType:
|
||||
chunk.Type = chunking.ChunkTypeInterface
|
||||
default:
|
||||
chunk.Type = chunking.ChunkTypeType
|
||||
}
|
||||
|
||||
// Extract doc comment
|
||||
if c.options.IncludeDocComments && gd.Doc != nil {
|
||||
chunk.DocComment = strings.TrimSpace(gd.Doc.Text())
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractValueSpec extracts const or var declarations.
|
||||
func (c *Chunker) extractValueSpec(fset *token.FileSet, gd *ast.GenDecl, vs *ast.ValueSpec, sourceLines []string, filePath string) *chunking.Chunk {
|
||||
// Skip if all names are unexported and we're excluding private
|
||||
if !c.options.IncludePrivate {
|
||||
allUnexported := true
|
||||
for _, name := range vs.Names {
|
||||
if name.IsExported() {
|
||||
allUnexported = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allUnexported {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
startPos := fset.Position(gd.Pos())
|
||||
endPos := fset.Position(gd.End())
|
||||
|
||||
// Use first name as the chunk name, join multiple if present
|
||||
names := make([]string, len(vs.Names))
|
||||
for i, name := range vs.Names {
|
||||
names[i] = name.Name
|
||||
}
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageGo,
|
||||
Name: strings.Join(names, ", "),
|
||||
StartLine: startPos.Line,
|
||||
EndLine: endPos.Line,
|
||||
Content: c.extractLines(sourceLines, startPos.Line, endPos.Line),
|
||||
}
|
||||
|
||||
// Set type based on token
|
||||
if gd.Tok == token.CONST {
|
||||
chunk.Type = chunking.ChunkTypeConst
|
||||
} else {
|
||||
chunk.Type = chunking.ChunkTypeVar
|
||||
}
|
||||
|
||||
// Extract doc comment
|
||||
if c.options.IncludeDocComments && gd.Doc != nil {
|
||||
chunk.DocComment = strings.TrimSpace(gd.Doc.Text())
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractReceiverType extracts the receiver type name from a method.
|
||||
func (c *Chunker) extractReceiverType(recv *ast.FieldList) string {
|
||||
if len(recv.List) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
field := recv.List[0]
|
||||
switch t := field.Type.(type) {
|
||||
case *ast.Ident:
|
||||
return t.Name
|
||||
case *ast.StarExpr:
|
||||
if ident, ok := t.X.(*ast.Ident); ok {
|
||||
return ident.Name
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractFunctionSignature extracts the function signature without the body.
|
||||
func (c *Chunker) extractFunctionSignature(fn *ast.FuncDecl, fset *token.FileSet, sourceLines []string) string {
|
||||
if fn.Body == nil {
|
||||
// No body, return entire declaration
|
||||
startPos := fset.Position(fn.Pos())
|
||||
endPos := fset.Position(fn.End())
|
||||
return c.extractLines(sourceLines, startPos.Line, endPos.Line)
|
||||
}
|
||||
|
||||
// Extract from start of function to just before body
|
||||
startPos := fset.Position(fn.Pos())
|
||||
bodyPos := fset.Position(fn.Body.Pos())
|
||||
|
||||
// If body is on the same line, extract just that line up to the opening brace
|
||||
if startPos.Line == bodyPos.Line {
|
||||
line := sourceLines[startPos.Line-1]
|
||||
// Find the opening brace position
|
||||
if idx := strings.Index(line[startPos.Column-1:], "{"); idx >= 0 {
|
||||
return strings.TrimSpace(line[startPos.Column-1 : startPos.Column-1+idx])
|
||||
}
|
||||
return strings.TrimSpace(line[startPos.Column-1:])
|
||||
}
|
||||
|
||||
// Get lines from start to the line containing the opening brace
|
||||
sig := c.extractLines(sourceLines, startPos.Line, bodyPos.Line)
|
||||
// Remove the opening brace and anything after it
|
||||
if idx := strings.Index(sig, "{"); idx >= 0 {
|
||||
sig = sig[:idx]
|
||||
}
|
||||
return strings.TrimSpace(sig)
|
||||
}
|
||||
|
||||
// extractLines extracts a range of lines from source (1-indexed, inclusive).
|
||||
func (c *Chunker) extractLines(lines []string, start, end int) string {
|
||||
if start < 1 || end < start || start > len(lines) {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Adjust for 0-indexed array (start and end are 1-indexed)
|
||||
startIdx := start - 1
|
||||
endIdx := end
|
||||
if endIdx > len(lines) {
|
||||
endIdx = len(lines)
|
||||
}
|
||||
|
||||
return strings.Join(lines[startIdx:endIdx], "\n")
|
||||
}
|
||||
@@ -0,0 +1,214 @@
|
||||
package golang
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/chunking"
|
||||
)
|
||||
|
||||
func TestGoChunker_BasicFunctions(t *testing.T) {
|
||||
// Create temp test file
|
||||
tmpDir := t.TempDir()
|
||||
testFile := filepath.Join(tmpDir, "test.go")
|
||||
|
||||
testCode := `package main
|
||||
|
||||
import "fmt"
|
||||
|
||||
// Greet prints a greeting message
|
||||
func Greet(name string) {
|
||||
fmt.Printf("Hello, %s!\n", name)
|
||||
}
|
||||
|
||||
// Add adds two numbers
|
||||
func Add(a, b int) int {
|
||||
return a + b
|
||||
}
|
||||
|
||||
// unexported function should be included by default
|
||||
func helper() string {
|
||||
return "helper"
|
||||
}
|
||||
`
|
||||
|
||||
if err := os.WriteFile(testFile, []byte(testCode), 0600); err != nil {
|
||||
t.Fatalf("Failed to create test file: %v", err)
|
||||
}
|
||||
|
||||
// Create chunker with default options
|
||||
chunker := NewChunker(chunking.DefaultChunkOptions())
|
||||
|
||||
// Chunk the file
|
||||
chunks, err := chunker.Chunk(context.Background(), testFile)
|
||||
if err != nil {
|
||||
t.Fatalf("Chunk() failed: %v", err)
|
||||
}
|
||||
|
||||
// Verify we got all functions
|
||||
if len(chunks) != 3 {
|
||||
t.Errorf("Expected 3 chunks (Greet, Add, helper), got %d", len(chunks))
|
||||
}
|
||||
|
||||
// Verify chunk details
|
||||
expectedNames := map[string]bool{
|
||||
"Greet": false,
|
||||
"Add": false,
|
||||
"helper": false,
|
||||
}
|
||||
|
||||
for _, chunk := range chunks {
|
||||
if chunk.Type != chunking.ChunkTypeFunction {
|
||||
t.Errorf("Expected chunk type 'function', got '%s'", chunk.Type)
|
||||
}
|
||||
|
||||
if chunk.Language != chunking.LanguageGo {
|
||||
t.Errorf("Expected language 'go', got '%s'", chunk.Language)
|
||||
}
|
||||
|
||||
if _, ok := expectedNames[chunk.Name]; !ok {
|
||||
t.Errorf("Unexpected function name: %s", chunk.Name)
|
||||
} else {
|
||||
expectedNames[chunk.Name] = true
|
||||
}
|
||||
|
||||
// Verify content is non-empty
|
||||
if chunk.Content == "" {
|
||||
t.Errorf("Chunk %s has empty content", chunk.Name)
|
||||
}
|
||||
|
||||
// Verify signature is present for functions
|
||||
if chunk.Signature == "" {
|
||||
t.Errorf("Chunk %s has empty signature", chunk.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify all expected functions were found
|
||||
for name, found := range expectedNames {
|
||||
if !found {
|
||||
t.Errorf("Expected function %s not found", name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGoChunker_StructsAndMethods(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
testFile := filepath.Join(tmpDir, "test.go")
|
||||
|
||||
testCode := `package main
|
||||
|
||||
// User represents a user
|
||||
type User struct {
|
||||
ID int
|
||||
Name string
|
||||
}
|
||||
|
||||
// GetName returns the user's name
|
||||
func (u *User) GetName() string {
|
||||
return u.Name
|
||||
}
|
||||
|
||||
// SetName sets the user's name
|
||||
func (u *User) SetName(name string) {
|
||||
u.Name = name
|
||||
}
|
||||
`
|
||||
|
||||
if err := os.WriteFile(testFile, []byte(testCode), 0600); err != nil {
|
||||
t.Fatalf("Failed to create test file: %v", err)
|
||||
}
|
||||
|
||||
chunker := NewChunker(chunking.DefaultChunkOptions())
|
||||
chunks, err := chunker.Chunk(context.Background(), testFile)
|
||||
if err != nil {
|
||||
t.Fatalf("Chunk() failed: %v", err)
|
||||
}
|
||||
|
||||
// Should have 1 struct + 2 methods = 3 chunks
|
||||
if len(chunks) != 3 {
|
||||
t.Errorf("Expected 3 chunks (User struct, GetName, SetName), got %d", len(chunks))
|
||||
}
|
||||
|
||||
// Find the struct and methods
|
||||
var structChunk, getNameChunk, setNameChunk *chunking.Chunk
|
||||
for i := range chunks {
|
||||
switch chunks[i].Name {
|
||||
case "User":
|
||||
structChunk = &chunks[i]
|
||||
case "GetName":
|
||||
getNameChunk = &chunks[i]
|
||||
case "SetName":
|
||||
setNameChunk = &chunks[i]
|
||||
}
|
||||
}
|
||||
|
||||
// Verify struct
|
||||
if structChunk == nil {
|
||||
t.Fatal("User struct not found")
|
||||
}
|
||||
if structChunk.Type != chunking.ChunkTypeClass {
|
||||
t.Errorf("Expected User to be ChunkTypeClass, got %s", structChunk.Type)
|
||||
}
|
||||
|
||||
// Verify methods
|
||||
if getNameChunk == nil {
|
||||
t.Fatal("GetName method not found")
|
||||
}
|
||||
if getNameChunk.Type != chunking.ChunkTypeMethod {
|
||||
t.Errorf("Expected GetName to be ChunkTypeMethod, got %s", getNameChunk.Type)
|
||||
}
|
||||
if getNameChunk.ParentName != "User" {
|
||||
t.Errorf("Expected GetName parent to be 'User', got '%s'", getNameChunk.ParentName)
|
||||
}
|
||||
|
||||
if setNameChunk == nil {
|
||||
t.Fatal("SetName method not found")
|
||||
}
|
||||
if setNameChunk.Type != chunking.ChunkTypeMethod {
|
||||
t.Errorf("Expected SetName to be ChunkTypeMethod, got %s", setNameChunk.Type)
|
||||
}
|
||||
if setNameChunk.ParentName != "User" {
|
||||
t.Errorf("Expected SetName parent to be 'User', got '%s'", setNameChunk.ParentName)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGoChunker_DocComments(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
testFile := filepath.Join(tmpDir, "test.go")
|
||||
|
||||
testCode := `package main
|
||||
|
||||
// Calculate performs a calculation.
|
||||
// It takes two integers and returns their sum.
|
||||
func Calculate(a, b int) int {
|
||||
return a + b
|
||||
}
|
||||
`
|
||||
|
||||
if err := os.WriteFile(testFile, []byte(testCode), 0600); err != nil {
|
||||
t.Fatalf("Failed to create test file: %v", err)
|
||||
}
|
||||
|
||||
chunker := NewChunker(chunking.DefaultChunkOptions())
|
||||
chunks, err := chunker.Chunk(context.Background(), testFile)
|
||||
if err != nil {
|
||||
t.Fatalf("Chunk() failed: %v", err)
|
||||
}
|
||||
|
||||
if len(chunks) != 1 {
|
||||
t.Fatalf("Expected 1 chunk, got %d", len(chunks))
|
||||
}
|
||||
|
||||
chunk := chunks[0]
|
||||
if chunk.DocComment == "" {
|
||||
t.Error("Expected doc comment to be present")
|
||||
}
|
||||
|
||||
// Doc comment should contain the comment text
|
||||
expectedComment := "Calculate performs a calculation.\nIt takes two integers and returns their sum."
|
||||
if chunk.DocComment != expectedComment {
|
||||
t.Errorf("Expected doc comment '%s', got '%s'", expectedComment, chunk.DocComment)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,106 @@
|
||||
package chunking
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Manager dispatches files to appropriate language-specific chunkers.
|
||||
type Manager struct {
|
||||
chunkers map[string]Chunker // extension -> chunker
|
||||
options ChunkOptions
|
||||
}
|
||||
|
||||
// NewManager creates a new chunking manager with the given chunkers.
|
||||
func NewManager(chunkers []Chunker, options ChunkOptions) *Manager {
|
||||
m := &Manager{
|
||||
chunkers: make(map[string]Chunker),
|
||||
options: options,
|
||||
}
|
||||
|
||||
// Register chunkers by their supported extensions
|
||||
for _, chunker := range chunkers {
|
||||
for _, ext := range chunker.SupportedExtensions() {
|
||||
m.chunkers[ext] = chunker
|
||||
}
|
||||
}
|
||||
|
||||
return m
|
||||
}
|
||||
|
||||
// ChunkFile chunks a single file using the appropriate language chunker.
|
||||
// Returns an error if no chunker is found for the file extension.
|
||||
func (m *Manager) ChunkFile(ctx context.Context, filePath string) ([]Chunk, error) {
|
||||
ext := strings.ToLower(filepath.Ext(filePath))
|
||||
chunker, ok := m.chunkers[ext]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no chunker for extension %s", ext)
|
||||
}
|
||||
|
||||
chunks, err := chunker.Chunk(ctx, filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("chunk %s: %w", filePath, err)
|
||||
}
|
||||
|
||||
// Apply options-based filtering
|
||||
filtered := make([]Chunk, 0, len(chunks))
|
||||
for _, chunk := range chunks {
|
||||
// Filter by minimum lines
|
||||
if m.options.MinLines > 0 {
|
||||
lineCount := chunk.EndLine - chunk.StartLine + 1
|
||||
if lineCount < m.options.MinLines {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Filter by maximum chunk size
|
||||
if m.options.MaxChunkSize > 0 && len(chunk.Content) > m.options.MaxChunkSize {
|
||||
// TODO: Consider splitting large chunks intelligently
|
||||
// For now, skip chunks that are too large
|
||||
continue
|
||||
}
|
||||
|
||||
filtered = append(filtered, chunk)
|
||||
}
|
||||
|
||||
return filtered, nil
|
||||
}
|
||||
|
||||
// ChunkFiles chunks multiple files in parallel.
|
||||
// Returns a map of file path to chunks, and any errors encountered.
|
||||
// Errors for individual files do not stop processing of other files.
|
||||
func (m *Manager) ChunkFiles(ctx context.Context, filePaths []string) (map[string][]Chunk, []error) {
|
||||
results := make(map[string][]Chunk)
|
||||
var errors []error
|
||||
|
||||
for _, filePath := range filePaths {
|
||||
chunks, err := m.ChunkFile(ctx, filePath)
|
||||
if err != nil {
|
||||
errors = append(errors, fmt.Errorf("%s: %w", filePath, err))
|
||||
continue
|
||||
}
|
||||
if len(chunks) > 0 {
|
||||
results[filePath] = chunks
|
||||
}
|
||||
}
|
||||
|
||||
return results, errors
|
||||
}
|
||||
|
||||
// SupportsFile checks if the manager can chunk the given file based on extension.
|
||||
func (m *Manager) SupportsFile(filePath string) bool {
|
||||
ext := strings.ToLower(filepath.Ext(filePath))
|
||||
_, ok := m.chunkers[ext]
|
||||
return ok
|
||||
}
|
||||
|
||||
// SupportedExtensions returns all file extensions supported by registered chunkers.
|
||||
func (m *Manager) SupportedExtensions() []string {
|
||||
exts := make([]string, 0, len(m.chunkers))
|
||||
for ext := range m.chunkers {
|
||||
exts = append(exts, ext)
|
||||
}
|
||||
return exts
|
||||
}
|
||||
@@ -0,0 +1,162 @@
|
||||
package chunking
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mockChunker is a test chunker that returns dummy chunks
|
||||
type mockChunker struct{}
|
||||
|
||||
func (m *mockChunker) Chunk(ctx context.Context, filePath string) ([]Chunk, error) {
|
||||
// Just return an empty chunk for testing
|
||||
return []Chunk{
|
||||
{
|
||||
FilePath: filePath,
|
||||
Language: LanguageGo,
|
||||
Type: ChunkTypeFunction,
|
||||
Name: "TestFunc",
|
||||
StartLine: 1,
|
||||
EndLine: 1,
|
||||
Content: "test",
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (m *mockChunker) Language() Language {
|
||||
return LanguageGo
|
||||
}
|
||||
|
||||
func (m *mockChunker) SupportedExtensions() []string {
|
||||
return []string{".go", ".py", ".ts"}
|
||||
}
|
||||
|
||||
func TestManager_ChunkMultipleFiles(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
|
||||
// Create a Go file
|
||||
goFile := filepath.Join(tmpDir, "test.go")
|
||||
goCode := `package main
|
||||
|
||||
func Hello() string {
|
||||
return "hello"
|
||||
}
|
||||
`
|
||||
if err := os.WriteFile(goFile, []byte(goCode), 0600); err != nil {
|
||||
t.Fatalf("Failed to create Go file: %v", err)
|
||||
}
|
||||
|
||||
// Create a Python file
|
||||
pyFile := filepath.Join(tmpDir, "test.py")
|
||||
pyCode := `def greet(name):
|
||||
return f"Hello, {name}!"
|
||||
|
||||
class User:
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
`
|
||||
if err := os.WriteFile(pyFile, []byte(pyCode), 0600); err != nil {
|
||||
t.Fatalf("Failed to create Python file: %v", err)
|
||||
}
|
||||
|
||||
// Create a TypeScript file
|
||||
tsFile := filepath.Join(tmpDir, "test.ts")
|
||||
tsCode := `function add(a: number, b: number): number {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
class Calculator {
|
||||
multiply(a: number, b: number): number {
|
||||
return a * b;
|
||||
}
|
||||
}
|
||||
`
|
||||
if err := os.WriteFile(tsFile, []byte(tsCode), 0600); err != nil {
|
||||
t.Fatalf("Failed to create TypeScript file: %v", err)
|
||||
}
|
||||
|
||||
// Create manager
|
||||
manager := NewManager([]Chunker{&mockChunker{}}, DefaultChunkOptions())
|
||||
|
||||
// Test SupportsFile
|
||||
if !manager.SupportsFile(goFile) {
|
||||
t.Error("Manager should support .go files")
|
||||
}
|
||||
if !manager.SupportsFile(pyFile) {
|
||||
t.Error("Manager should support .py files")
|
||||
}
|
||||
if !manager.SupportsFile(tsFile) {
|
||||
t.Error("Manager should support .ts files")
|
||||
}
|
||||
|
||||
unsupportedFile := filepath.Join(tmpDir, "test.txt")
|
||||
if manager.SupportsFile(unsupportedFile) {
|
||||
t.Error("Manager should not support .txt files")
|
||||
}
|
||||
|
||||
// Test ChunkFiles
|
||||
results, errs := manager.ChunkFiles(context.Background(), []string{goFile, pyFile, tsFile})
|
||||
if len(errs) > 0 {
|
||||
t.Errorf("ChunkFiles returned errors: %v", errs)
|
||||
}
|
||||
|
||||
if len(results) != 3 {
|
||||
t.Errorf("Expected results for 3 files, got %d", len(results))
|
||||
}
|
||||
|
||||
// Verify each file has chunks
|
||||
for _, file := range []string{goFile, pyFile, tsFile} {
|
||||
if chunks, ok := results[file]; !ok || len(chunks) == 0 {
|
||||
t.Errorf("No chunks found for file %s", file)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mockChunkerWithExts is a test chunker with configurable extensions
|
||||
type mockChunkerWithExts struct {
|
||||
exts []string
|
||||
}
|
||||
|
||||
func (m *mockChunkerWithExts) Chunk(ctx context.Context, filePath string) ([]Chunk, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (m *mockChunkerWithExts) Language() Language {
|
||||
return LanguageGo
|
||||
}
|
||||
|
||||
func (m *mockChunkerWithExts) SupportedExtensions() []string {
|
||||
return m.exts
|
||||
}
|
||||
|
||||
func TestManager_SupportedExtensions(t *testing.T) {
|
||||
|
||||
// Create manager with mock chunkers
|
||||
manager := NewManager([]Chunker{
|
||||
&mockChunkerWithExts{exts: []string{".go"}},
|
||||
&mockChunkerWithExts{exts: []string{".py", ".pyw"}},
|
||||
}, DefaultChunkOptions())
|
||||
|
||||
exts := manager.SupportedExtensions()
|
||||
expectedExts := map[string]bool{
|
||||
".go": false,
|
||||
".py": false,
|
||||
".pyw": false,
|
||||
}
|
||||
|
||||
for _, ext := range exts {
|
||||
if _, ok := expectedExts[ext]; ok {
|
||||
expectedExts[ext] = true
|
||||
} else {
|
||||
t.Errorf("Unexpected extension: %s", ext)
|
||||
}
|
||||
}
|
||||
|
||||
for ext, found := range expectedExts {
|
||||
if !found {
|
||||
t.Errorf("Expected extension %s not found", ext)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,291 @@
|
||||
// Package python provides AST-aware chunking for Python source files using tree-sitter.
|
||||
package python
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"github.com/smacker/go-tree-sitter/python"
|
||||
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/chunking"
|
||||
)
|
||||
|
||||
// Chunker implements AST-aware chunking for Python files.
|
||||
type Chunker struct {
|
||||
parser *sitter.Parser
|
||||
options chunking.ChunkOptions
|
||||
}
|
||||
|
||||
// NewChunker creates a new Python chunker.
|
||||
func NewChunker(options chunking.ChunkOptions) *Chunker {
|
||||
parser := sitter.NewParser()
|
||||
parser.SetLanguage(python.GetLanguage())
|
||||
|
||||
return &Chunker{
|
||||
options: options,
|
||||
parser: parser,
|
||||
}
|
||||
}
|
||||
|
||||
// Language returns the language this chunker supports.
|
||||
func (c *Chunker) Language() chunking.Language {
|
||||
return chunking.LanguagePython
|
||||
}
|
||||
|
||||
// SupportedExtensions returns the file extensions this chunker handles.
|
||||
func (c *Chunker) SupportedExtensions() []string {
|
||||
return []string{".py"}
|
||||
}
|
||||
|
||||
// Chunk parses a Python source file and returns semantic code chunks.
|
||||
func (c *Chunker) Chunk(ctx context.Context, filePath string) ([]chunking.Chunk, error) {
|
||||
// Read file content
|
||||
content, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read file: %w", err)
|
||||
}
|
||||
|
||||
// Parse the Python file
|
||||
tree, err := c.parser.ParseCtx(ctx, nil, content)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse Python file: %w", err)
|
||||
}
|
||||
defer tree.Close()
|
||||
|
||||
sourceLines := strings.Split(string(content), "\n")
|
||||
chunks := make([]chunking.Chunk, 0)
|
||||
|
||||
// Walk the AST and extract chunks
|
||||
c.walkNode(tree.RootNode(), content, sourceLines, filePath, "", &chunks)
|
||||
|
||||
return chunks, nil
|
||||
}
|
||||
|
||||
// walkNode recursively walks the tree-sitter AST and extracts chunks.
|
||||
func (c *Chunker) walkNode(node *sitter.Node, source []byte, sourceLines []string, filePath string, parentName string, chunks *[]chunking.Chunk) {
|
||||
nodeType := node.Type()
|
||||
|
||||
switch nodeType {
|
||||
case "function_definition":
|
||||
chunk := c.extractFunction(node, source, sourceLines, filePath, parentName)
|
||||
if chunk != nil {
|
||||
*chunks = append(*chunks, *chunk)
|
||||
}
|
||||
|
||||
case "class_definition":
|
||||
chunk := c.extractClass(node, source, sourceLines, filePath)
|
||||
if chunk != nil {
|
||||
*chunks = append(*chunks, *chunk)
|
||||
|
||||
// Walk class body to find methods
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == "block" {
|
||||
c.walkNode(child, source, sourceLines, filePath, chunk.Name, chunks)
|
||||
}
|
||||
}
|
||||
}
|
||||
return // Don't walk children again
|
||||
|
||||
case "block":
|
||||
// Walk statements in block
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
c.walkNode(node.Child(i), source, sourceLines, filePath, parentName, chunks)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Walk all children
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
c.walkNode(node.Child(i), source, sourceLines, filePath, parentName, chunks)
|
||||
}
|
||||
}
|
||||
|
||||
// extractFunction extracts a function definition chunk.
|
||||
func (c *Chunker) extractFunction(node *sitter.Node, source []byte, sourceLines []string, filePath string, parentName string) *chunking.Chunk {
|
||||
// Find function name
|
||||
var nameNode *sitter.Node
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == "identifier" {
|
||||
nameNode = child
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if nameNode == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
name := nameNode.Content(source)
|
||||
|
||||
// Skip private functions if configured
|
||||
if !c.options.IncludePrivate && strings.HasPrefix(name, "_") && !strings.HasPrefix(name, "__") {
|
||||
return nil
|
||||
}
|
||||
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
endLine := int(node.EndPoint().Row) + 1
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguagePython,
|
||||
Name: name,
|
||||
ParentName: parentName,
|
||||
StartLine: startLine,
|
||||
EndLine: endLine,
|
||||
Content: c.extractLines(sourceLines, startLine, endLine),
|
||||
}
|
||||
|
||||
// Determine if this is a method or function
|
||||
if parentName != "" {
|
||||
chunk.Type = chunking.ChunkTypeMethod
|
||||
} else {
|
||||
chunk.Type = chunking.ChunkTypeFunction
|
||||
}
|
||||
|
||||
// Extract signature (def line)
|
||||
chunk.Signature = c.extractFunctionSignature(node, source, sourceLines)
|
||||
|
||||
// Extract docstring as doc comment
|
||||
if c.options.IncludeDocComments {
|
||||
chunk.DocComment = c.extractDocstring(node, source)
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractClass extracts a class definition chunk.
|
||||
func (c *Chunker) extractClass(node *sitter.Node, source []byte, sourceLines []string, filePath string) *chunking.Chunk {
|
||||
// Find class name
|
||||
var nameNode *sitter.Node
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == "identifier" {
|
||||
nameNode = child
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if nameNode == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
name := nameNode.Content(source)
|
||||
|
||||
// Skip private classes if configured
|
||||
if !c.options.IncludePrivate && strings.HasPrefix(name, "_") && !strings.HasPrefix(name, "__") {
|
||||
return nil
|
||||
}
|
||||
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
endLine := int(node.EndPoint().Row) + 1
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguagePython,
|
||||
Type: chunking.ChunkTypeClass,
|
||||
Name: name,
|
||||
StartLine: startLine,
|
||||
EndLine: endLine,
|
||||
Content: c.extractLines(sourceLines, startLine, endLine),
|
||||
}
|
||||
|
||||
// Extract class signature (class line)
|
||||
chunk.Signature = c.extractClassSignature(node, source, sourceLines)
|
||||
|
||||
// Extract docstring as doc comment
|
||||
if c.options.IncludeDocComments {
|
||||
chunk.DocComment = c.extractDocstring(node, source)
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractFunctionSignature extracts the function definition line.
|
||||
func (c *Chunker) extractFunctionSignature(node *sitter.Node, source []byte, sourceLines []string) string {
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
|
||||
// Find the colon that ends the signature
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == ":" {
|
||||
endLine := int(child.EndPoint().Row) + 1
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, endLine))
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: just return first line
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, startLine))
|
||||
}
|
||||
|
||||
// extractClassSignature extracts the class definition line.
|
||||
func (c *Chunker) extractClassSignature(node *sitter.Node, source []byte, sourceLines []string) string {
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
|
||||
// Find the colon that ends the signature
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == ":" {
|
||||
endLine := int(child.EndPoint().Row) + 1
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, endLine))
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: just return first line
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, startLine))
|
||||
}
|
||||
|
||||
// extractDocstring extracts the docstring from a function or class.
|
||||
func (c *Chunker) extractDocstring(node *sitter.Node, source []byte) string {
|
||||
// Find the block
|
||||
var blockNode *sitter.Node
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == "block" {
|
||||
blockNode = child
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if blockNode == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Check if first statement in block is a string (docstring)
|
||||
for i := 0; i < int(blockNode.ChildCount()); i++ {
|
||||
child := blockNode.Child(i)
|
||||
if child.Type() == "expression_statement" {
|
||||
// Check if it contains a string
|
||||
for j := 0; j < int(child.ChildCount()); j++ {
|
||||
grandchild := child.Child(j)
|
||||
if grandchild.Type() == "string" {
|
||||
docstring := grandchild.Content(source)
|
||||
// Remove quotes
|
||||
docstring = strings.Trim(docstring, `"'`)
|
||||
return strings.TrimSpace(docstring)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractLines extracts a range of lines from source (1-indexed, inclusive).
|
||||
func (c *Chunker) extractLines(lines []string, start, end int) string {
|
||||
if start < 1 || end < start || start > len(lines) {
|
||||
return ""
|
||||
}
|
||||
|
||||
startIdx := start - 1
|
||||
endIdx := end
|
||||
if endIdx > len(lines) {
|
||||
endIdx = len(lines)
|
||||
}
|
||||
|
||||
return strings.Join(lines[startIdx:endIdx], "\n")
|
||||
}
|
||||
@@ -0,0 +1,140 @@
|
||||
// Package chunking provides AST-aware code chunking for semantic code search.
|
||||
// Chunks code files into logical units (functions, classes, methods) that preserve
|
||||
// semantic boundaries for better vector embedding and retrieval.
|
||||
package chunking
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ChunkType names the kind of code construct a chunk was cut from.
type ChunkType string

// Chunk kinds that a chunker may assign to Chunk.Type.
const (
	// ChunkTypeFunction marks a standalone function.
	ChunkTypeFunction = ChunkType("function")

	// ChunkTypeMethod marks a method on a class/struct/type.
	ChunkTypeMethod = ChunkType("method")

	// ChunkTypeClass marks a class or struct definition.
	ChunkTypeClass = ChunkType("class")

	// ChunkTypeInterface marks an interface definition.
	ChunkTypeInterface = ChunkType("interface")

	// ChunkTypeType marks a type alias or type definition.
	ChunkTypeType = ChunkType("type")

	// ChunkTypeConst marks constant declarations.
	ChunkTypeConst = ChunkType("const")

	// ChunkTypeVar marks variable declarations.
	ChunkTypeVar = ChunkType("var")
)
|
||||
|
||||
// Language names the programming language a chunk was written in.
type Language string

// Languages known to the chunking package.
const (
	// LanguageGo is the Go programming language.
	LanguageGo = Language("go")

	// LanguagePython is the Python programming language.
	LanguagePython = Language("python")

	// LanguageTypeScript is the TypeScript programming language.
	LanguageTypeScript = Language("typescript")

	// LanguageJavaScript is the JavaScript programming language.
	LanguageJavaScript = Language("javascript")
)
|
||||
|
||||
// Chunk represents a semantic code chunk with AST-derived boundaries.
|
||||
type Chunk struct {
|
||||
Metadata map[string]interface{}
|
||||
FilePath string
|
||||
Language Language
|
||||
Type ChunkType
|
||||
Name string
|
||||
ParentName string
|
||||
Content string
|
||||
Signature string
|
||||
DocComment string
|
||||
StartLine int
|
||||
EndLine int
|
||||
}
|
||||
|
||||
// Identifier returns a human-readable identifier for this chunk.
|
||||
// Format: "ParentName.Name" for methods, "Name" for top-level.
|
||||
func (c *Chunk) Identifier() string {
|
||||
if c.ParentName != "" {
|
||||
return fmt.Sprintf("%s.%s", c.ParentName, c.Name)
|
||||
}
|
||||
return c.Name
|
||||
}
|
||||
|
||||
// LineRange returns a human-readable line range.
|
||||
// Format: "L123-L456"
|
||||
func (c *Chunk) LineRange() string {
|
||||
return fmt.Sprintf("L%d-L%d", c.StartLine, c.EndLine)
|
||||
}
|
||||
|
||||
// SearchableContent returns content optimized for semantic search.
|
||||
// Combines signature, doc comment, and content in a structured format.
|
||||
func (c *Chunk) SearchableContent() string {
|
||||
var parts []string
|
||||
|
||||
// Include signature for functions/methods
|
||||
if c.Signature != "" {
|
||||
parts = append(parts, c.Signature)
|
||||
}
|
||||
|
||||
// Include doc comment
|
||||
if c.DocComment != "" {
|
||||
parts = append(parts, c.DocComment)
|
||||
}
|
||||
|
||||
// Include actual content
|
||||
if c.Content != "" {
|
||||
parts = append(parts, c.Content)
|
||||
}
|
||||
|
||||
return strings.Join(parts, "\n\n")
|
||||
}
|
||||
|
||||
// Chunker is the interface for language-specific code chunkers.
|
||||
type Chunker interface {
|
||||
// Chunk parses a source file and returns semantic code chunks.
|
||||
// Returns an error if the file cannot be parsed or read.
|
||||
Chunk(ctx context.Context, filePath string) ([]Chunk, error)
|
||||
|
||||
// Language returns the language this chunker supports.
|
||||
Language() Language
|
||||
|
||||
// SupportedExtensions returns file extensions this chunker handles.
|
||||
// Example: []string{".go"} for Go chunker
|
||||
SupportedExtensions() []string
|
||||
}
|
||||
|
||||
// ChunkOptions configures how a chunker cuts a file into chunks.
type ChunkOptions struct {
	// MaxChunkSize is the upper bound, in bytes, for a single chunk. Larger
	// chunks are meant to be split, respecting boundaries where possible.
	// Zero disables the limit.
	MaxChunkSize int

	// IncludeDocComments selects whether documentation comments are
	// captured alongside each chunk.
	IncludeDocComments bool

	// IncludePrivate selects whether private/unexported symbols are kept.
	IncludePrivate bool

	// MinLines is the smallest number of lines a chunk must span to be
	// included; shorter chunks are meant to be skipped. Zero disables the
	// minimum.
	MinLines int
}
|
||||
|
||||
// DefaultChunkOptions returns sensible default options.
|
||||
func DefaultChunkOptions() ChunkOptions {
|
||||
return ChunkOptions{
|
||||
MaxChunkSize: 8192, // ~8KB per chunk (well under token limit)
|
||||
IncludeDocComments: true,
|
||||
IncludePrivate: true, // Include all symbols for comprehensive search
|
||||
MinLines: 0, // No minimum - include even single-line functions
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,403 @@
|
||||
// Package typescript provides AST-aware chunking for TypeScript and JavaScript source files using tree-sitter.
|
||||
package typescript
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"github.com/smacker/go-tree-sitter/typescript/typescript"
|
||||
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/chunking"
|
||||
)
|
||||
|
||||
// Chunker implements AST-aware chunking for TypeScript/JavaScript files.
|
||||
type Chunker struct {
|
||||
parser *sitter.Parser
|
||||
options chunking.ChunkOptions
|
||||
}
|
||||
|
||||
// NewChunker creates a new TypeScript chunker.
|
||||
func NewChunker(options chunking.ChunkOptions) *Chunker {
|
||||
parser := sitter.NewParser()
|
||||
parser.SetLanguage(typescript.GetLanguage())
|
||||
|
||||
return &Chunker{
|
||||
options: options,
|
||||
parser: parser,
|
||||
}
|
||||
}
|
||||
|
||||
// Language returns the language this chunker supports.
|
||||
func (c *Chunker) Language() chunking.Language {
|
||||
return chunking.LanguageTypeScript
|
||||
}
|
||||
|
||||
// SupportedExtensions returns the file extensions this chunker handles.
|
||||
func (c *Chunker) SupportedExtensions() []string {
|
||||
return []string{".ts", ".tsx", ".js", ".jsx"}
|
||||
}
|
||||
|
||||
// Chunk parses a TypeScript/JavaScript source file and returns semantic code chunks.
|
||||
func (c *Chunker) Chunk(ctx context.Context, filePath string) ([]chunking.Chunk, error) {
|
||||
// Read file content
|
||||
content, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read file: %w", err)
|
||||
}
|
||||
|
||||
// Parse the file
|
||||
tree, err := c.parser.ParseCtx(ctx, nil, content)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse TypeScript file: %w", err)
|
||||
}
|
||||
defer tree.Close()
|
||||
|
||||
sourceLines := strings.Split(string(content), "\n")
|
||||
chunks := make([]chunking.Chunk, 0)
|
||||
|
||||
// Walk the AST and extract chunks
|
||||
c.walkNode(tree.RootNode(), content, sourceLines, filePath, "", &chunks)
|
||||
|
||||
return chunks, nil
|
||||
}
|
||||
|
||||
// walkNode recursively walks the tree-sitter AST and extracts chunks.
|
||||
func (c *Chunker) walkNode(node *sitter.Node, source []byte, sourceLines []string, filePath string, parentName string, chunks *[]chunking.Chunk) {
|
||||
nodeType := node.Type()
|
||||
|
||||
switch nodeType {
|
||||
case "function_declaration":
|
||||
chunk := c.extractFunction(node, source, sourceLines, filePath, parentName)
|
||||
if chunk != nil {
|
||||
*chunks = append(*chunks, *chunk)
|
||||
}
|
||||
|
||||
case "method_definition":
|
||||
chunk := c.extractMethod(node, source, sourceLines, filePath, parentName)
|
||||
if chunk != nil {
|
||||
*chunks = append(*chunks, *chunk)
|
||||
}
|
||||
|
||||
case "arrow_function", "function_expression":
|
||||
// Handle arrow functions and function expressions assigned to variables
|
||||
chunk := c.extractFunctionExpression(node, source, sourceLines, filePath, parentName)
|
||||
if chunk != nil {
|
||||
*chunks = append(*chunks, *chunk)
|
||||
}
|
||||
|
||||
case "class_declaration":
|
||||
chunk := c.extractClass(node, source, sourceLines, filePath)
|
||||
if chunk != nil {
|
||||
*chunks = append(*chunks, *chunk)
|
||||
|
||||
// Walk class body to find methods
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == "class_body" {
|
||||
c.walkNode(child, source, sourceLines, filePath, chunk.Name, chunks)
|
||||
}
|
||||
}
|
||||
}
|
||||
return // Don't walk children again
|
||||
|
||||
case "interface_declaration":
|
||||
chunk := c.extractInterface(node, source, sourceLines, filePath)
|
||||
if chunk != nil {
|
||||
*chunks = append(*chunks, *chunk)
|
||||
}
|
||||
|
||||
case "type_alias_declaration":
|
||||
chunk := c.extractTypeAlias(node, source, sourceLines, filePath)
|
||||
if chunk != nil {
|
||||
*chunks = append(*chunks, *chunk)
|
||||
}
|
||||
}
|
||||
|
||||
// Walk all children
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
c.walkNode(node.Child(i), source, sourceLines, filePath, parentName, chunks)
|
||||
}
|
||||
}
|
||||
|
||||
// extractFunction extracts a function declaration.
|
||||
func (c *Chunker) extractFunction(node *sitter.Node, source []byte, sourceLines []string, filePath string, parentName string) *chunking.Chunk {
|
||||
name := c.findChildContent(node, "identifier", source)
|
||||
if name == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
endLine := int(node.EndPoint().Row) + 1
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageTypeScript,
|
||||
Type: chunking.ChunkTypeFunction,
|
||||
Name: name,
|
||||
ParentName: parentName,
|
||||
StartLine: startLine,
|
||||
EndLine: endLine,
|
||||
Content: c.extractLines(sourceLines, startLine, endLine),
|
||||
Signature: c.extractFunctionSignature(node, source, sourceLines),
|
||||
}
|
||||
|
||||
// Extract JSDoc comment
|
||||
if c.options.IncludeDocComments {
|
||||
chunk.DocComment = c.extractComment(node, source)
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractMethod extracts a method definition from a class.
|
||||
func (c *Chunker) extractMethod(node *sitter.Node, source []byte, sourceLines []string, filePath string, parentName string) *chunking.Chunk {
|
||||
name := c.findChildContent(node, "property_identifier", source)
|
||||
if name == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Skip private methods if configured
|
||||
if !c.options.IncludePrivate && strings.HasPrefix(name, "_") {
|
||||
return nil
|
||||
}
|
||||
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
endLine := int(node.EndPoint().Row) + 1
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageTypeScript,
|
||||
Type: chunking.ChunkTypeMethod,
|
||||
Name: name,
|
||||
ParentName: parentName,
|
||||
StartLine: startLine,
|
||||
EndLine: endLine,
|
||||
Content: c.extractLines(sourceLines, startLine, endLine),
|
||||
Signature: c.extractMethodSignature(node, source, sourceLines),
|
||||
}
|
||||
|
||||
// Extract JSDoc comment
|
||||
if c.options.IncludeDocComments {
|
||||
chunk.DocComment = c.extractComment(node, source)
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractFunctionExpression extracts arrow functions and function expressions.
|
||||
func (c *Chunker) extractFunctionExpression(node *sitter.Node, source []byte, sourceLines []string, filePath string, parentName string) *chunking.Chunk {
|
||||
// Try to find the variable name from parent
|
||||
parent := node.Parent()
|
||||
if parent == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var name string
|
||||
if parent.Type() == "variable_declarator" {
|
||||
name = c.findChildContent(parent, "identifier", source)
|
||||
} else if parent.Type() == "assignment_expression" {
|
||||
// Handle const foo = () => {}
|
||||
for i := 0; i < int(parent.ChildCount()); i++ {
|
||||
child := parent.Child(i)
|
||||
if child.Type() == "identifier" || child.Type() == "member_expression" {
|
||||
name = child.Content(source)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if name == "" {
|
||||
return nil // Anonymous function, skip
|
||||
}
|
||||
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
endLine := int(node.EndPoint().Row) + 1
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageTypeScript,
|
||||
Type: chunking.ChunkTypeFunction,
|
||||
Name: name,
|
||||
ParentName: parentName,
|
||||
StartLine: startLine,
|
||||
EndLine: endLine,
|
||||
Content: c.extractLines(sourceLines, startLine, endLine),
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractClass extracts a class declaration.
|
||||
func (c *Chunker) extractClass(node *sitter.Node, source []byte, sourceLines []string, filePath string) *chunking.Chunk {
|
||||
name := c.findChildContent(node, "type_identifier", source)
|
||||
if name == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
endLine := int(node.EndPoint().Row) + 1
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageTypeScript,
|
||||
Type: chunking.ChunkTypeClass,
|
||||
Name: name,
|
||||
StartLine: startLine,
|
||||
EndLine: endLine,
|
||||
Content: c.extractLines(sourceLines, startLine, endLine),
|
||||
Signature: c.extractClassSignature(node, source, sourceLines),
|
||||
}
|
||||
|
||||
// Extract JSDoc comment
|
||||
if c.options.IncludeDocComments {
|
||||
chunk.DocComment = c.extractComment(node, source)
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractInterface extracts an interface declaration.
|
||||
func (c *Chunker) extractInterface(node *sitter.Node, source []byte, sourceLines []string, filePath string) *chunking.Chunk {
|
||||
name := c.findChildContent(node, "type_identifier", source)
|
||||
if name == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
endLine := int(node.EndPoint().Row) + 1
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageTypeScript,
|
||||
Type: chunking.ChunkTypeInterface,
|
||||
Name: name,
|
||||
StartLine: startLine,
|
||||
EndLine: endLine,
|
||||
Content: c.extractLines(sourceLines, startLine, endLine),
|
||||
}
|
||||
|
||||
// Extract JSDoc comment
|
||||
if c.options.IncludeDocComments {
|
||||
chunk.DocComment = c.extractComment(node, source)
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// extractTypeAlias extracts a type alias declaration.
|
||||
func (c *Chunker) extractTypeAlias(node *sitter.Node, source []byte, sourceLines []string, filePath string) *chunking.Chunk {
|
||||
name := c.findChildContent(node, "type_identifier", source)
|
||||
if name == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
endLine := int(node.EndPoint().Row) + 1
|
||||
|
||||
chunk := &chunking.Chunk{
|
||||
FilePath: filePath,
|
||||
Language: chunking.LanguageTypeScript,
|
||||
Type: chunking.ChunkTypeType,
|
||||
Name: name,
|
||||
StartLine: startLine,
|
||||
EndLine: endLine,
|
||||
Content: c.extractLines(sourceLines, startLine, endLine),
|
||||
}
|
||||
|
||||
return chunk
|
||||
}
|
||||
|
||||
// findChildContent finds the first child of the given type and returns its content.
|
||||
func (c *Chunker) findChildContent(node *sitter.Node, childType string, source []byte) string {
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == childType {
|
||||
return child.Content(source)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractFunctionSignature extracts the function signature.
|
||||
func (c *Chunker) extractFunctionSignature(node *sitter.Node, source []byte, sourceLines []string) string {
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
|
||||
// Find the opening brace of the body
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == "statement_block" {
|
||||
endLine := int(child.StartPoint().Row) + 1
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, endLine-1))
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: just return first line
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, startLine))
|
||||
}
|
||||
|
||||
// extractMethodSignature extracts the method signature.
|
||||
func (c *Chunker) extractMethodSignature(node *sitter.Node, source []byte, sourceLines []string) string {
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
|
||||
// Find the opening brace of the body
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == "statement_block" {
|
||||
endLine := int(child.StartPoint().Row) + 1
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, endLine-1))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, startLine))
|
||||
}
|
||||
|
||||
// extractClassSignature extracts the class declaration line.
|
||||
func (c *Chunker) extractClassSignature(node *sitter.Node, source []byte, sourceLines []string) string {
|
||||
startLine := int(node.StartPoint().Row) + 1
|
||||
|
||||
// Find the opening brace of the class body
|
||||
for i := 0; i < int(node.ChildCount()); i++ {
|
||||
child := node.Child(i)
|
||||
if child.Type() == "class_body" {
|
||||
endLine := int(child.StartPoint().Row) + 1
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, endLine-1))
|
||||
}
|
||||
}
|
||||
|
||||
return strings.TrimSpace(c.extractLines(sourceLines, startLine, startLine))
|
||||
}
|
||||
|
||||
// extractComment extracts JSDoc or other comments from a node.
|
||||
func (c *Chunker) extractComment(node *sitter.Node, source []byte) string {
|
||||
// Check previous sibling for comment
|
||||
prevSibling := node.PrevSibling()
|
||||
if prevSibling != nil && prevSibling.Type() == "comment" {
|
||||
comment := prevSibling.Content(source)
|
||||
// Remove comment markers
|
||||
comment = strings.TrimPrefix(comment, "/**")
|
||||
comment = strings.TrimPrefix(comment, "/*")
|
||||
comment = strings.TrimSuffix(comment, "*/")
|
||||
comment = strings.TrimPrefix(comment, "//")
|
||||
return strings.TrimSpace(comment)
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractLines extracts a range of lines from source (1-indexed, inclusive).
|
||||
func (c *Chunker) extractLines(lines []string, start, end int) string {
|
||||
if start < 1 || end < start || start > len(lines) {
|
||||
return ""
|
||||
}
|
||||
|
||||
startIdx := start - 1
|
||||
endIdx := end
|
||||
if endIdx > len(lines) {
|
||||
endIdx = len(lines)
|
||||
}
|
||||
|
||||
return strings.Join(lines[startIdx:endIdx], "\n")
|
||||
}
|
||||
Reference in New Issue
Block a user