feat(chunking): add AST-aware code chunking for Go, Python, TypeScript

- [x] Add language-specific chunkers with AST parsing (Go, Python, TypeScript)
- [x] Implement chunking manager to dispatch files to appropriate chunkers
- [x] Integrate code chunks into vector sync for semantic search
- [x] Add tree-sitter dependency for Python/TypeScript parsing
- [x] Reorder struct fields for consistency across codebase
- [x] Rename error variables to follow Go conventions (err → unmarshalErr, etc.)
- [x] Add code chunk metadata to vector documents (language, symbol name, line ranges)
- [x] Update worker service to initialize chunking pipeline with all three languages
This commit is contained in:
2026-01-07 13:19:58 +00:00
parent 40a44a71eb
commit 4f4b4ac70f
78 changed files with 2313 additions and 652 deletions
+8 -8
View File
@@ -87,9 +87,9 @@ func (c *Client) AddDocuments(ctx context.Context, docs []Document) error {
for i, doc := range docs {
// Serialize embedding to blob format
embBlob, err := sqlite_vec.SerializeFloat32(embeddings[i])
if err != nil {
return fmt.Errorf("serialize embedding for %s: %w", doc.ID, err)
embBlob, serErr := sqlite_vec.SerializeFloat32(embeddings[i])
if serErr != nil {
return fmt.Errorf("serialize embedding for %s: %w", doc.ID, serErr)
}
// Extract metadata
@@ -212,8 +212,8 @@ func (c *Client) Query(ctx context.Context, query string, limit int, where map[s
var sqliteID int64
var docType, fieldType, project, scope sql.NullString
if err := rows.Scan(&r.ID, &r.Distance, &sqliteID, &docType, &fieldType, &project, &scope); err != nil {
return nil, fmt.Errorf("scan row: %w", err)
if scanErr := rows.Scan(&r.ID, &r.Distance, &sqliteID, &docType, &fieldType, &project, &scope); scanErr != nil {
return nil, fmt.Errorf("scan row: %w", scanErr)
}
r.Similarity = DistanceToSimilarity(r.Distance)
@@ -319,11 +319,11 @@ func (c *Client) NeedsRebuild(ctx context.Context) (bool, string) {
// StaleVectorInfo contains information about a vector that needs rebuilding.
type StaleVectorInfo struct {
DocID string
SQLiteID int64
DocType string
FieldType string
Project string
Scope string
SQLiteID int64
}
// GetStaleVectors returns doc_ids of vectors with mismatched or null model versions.
@@ -352,8 +352,8 @@ func (c *Client) GetStaleVectors(ctx context.Context) ([]StaleVectorInfo, error)
var sqliteID sql.NullInt64
var docType, fieldType, project, scope sql.NullString
if err := rows.Scan(&info.DocID, &sqliteID, &docType, &fieldType, &project, &scope); err != nil {
return nil, fmt.Errorf("scan row: %w", err)
if scanErr := rows.Scan(&info.DocID, &sqliteID, &docType, &fieldType, &project, &scope); scanErr != nil {
return nil, fmt.Errorf("scan row: %w", scanErr)
}
info.SQLiteID = sqliteID.Int64
+4 -3
View File
@@ -8,21 +8,22 @@ const (
DocTypeObservation DocType = "observation"
DocTypeSessionSummary DocType = "session_summary"
DocTypeUserPrompt DocType = "user_prompt"
DocTypeCodeChunk DocType = "code_chunk"
)
// Document represents a document to store with vector embedding.
type Document struct {
Metadata map[string]any
ID string
Content string
Metadata map[string]any
}
// QueryResult represents a search result from vector search.
type QueryResult struct {
Metadata map[string]any
ID string
Distance float64
Similarity float64 // 1.0 = identical, 0.0 = opposite (derived from distance)
Metadata map[string]any
Similarity float64
}
// DistanceToSimilarity converts sqlite-vec cosine distance to similarity score.
+3 -3
View File
@@ -42,10 +42,10 @@ func TestQueryResult_Fields(t *testing.T) {
func TestBuildWhereFilter(t *testing.T) {
tests := []struct {
expected map[string]interface{}
name string
docType DocType
project string
expected map[string]interface{}
}{
{
name: "empty_filters",
@@ -474,9 +474,9 @@ func TestCopyMetadataMulti(t *testing.T) {
func TestJoinStrings(t *testing.T) {
tests := []struct {
name string
strs []string
sep string
expected string
strs []string
}{
{
name: "empty_slice",
@@ -522,8 +522,8 @@ func TestTruncateString(t *testing.T) {
tests := []struct {
name string
input string
maxLen int
expected string
maxLen int
}{
{
name: "shorter_than_max",
+117 -3
View File
@@ -5,13 +5,15 @@ import (
"context"
"fmt"
"sort"
"github.com/lukaszraczylo/claude-mnemonic/internal/chunking"
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
"github.com/rs/zerolog/log"
)
// Sync provides synchronization between SQLite data and vector embeddings.
type Sync struct {
client *Client
client *Client
chunkingManager *chunking.Manager
}
// NewSync creates a new sync service.
@@ -19,9 +21,23 @@ func NewSync(client *Client) *Sync {
return &Sync{client: client}
}
// SetChunkingManager attaches the code chunking manager used by this Sync.
// Configuring one is optional: SyncObservation only adds code chunk documents
// for an observation's tracked files when a non-nil manager has been set.
func (s *Sync) SetChunkingManager(manager *chunking.Manager) {
	s.chunkingManager = manager
}
// SyncObservation syncs a single observation to the vector store.
// If a chunking manager is configured, also chunks tracked code files.
func (s *Sync) SyncObservation(ctx context.Context, obs *models.Observation) error {
docs := s.formatObservationDocs(obs)
// Add code chunks from tracked files if chunking manager is available
if s.chunkingManager != nil {
codeChunkDocs := s.formatCodeChunkDocs(ctx, obs)
docs = append(docs, codeChunkDocs...)
}
if len(docs) == 0 {
return nil
}
@@ -99,6 +115,98 @@ func (s *Sync) formatObservationDocs(obs *models.Observation) []Document {
return docs
}
// formatCodeChunkDocs builds one vector document per code chunk extracted from
// the files the observation read or modified.
//
// Tracked paths are de-duplicated (a path may be listed in both FilesRead and
// FilesModified) and filtered to those the chunking manager supports before
// being chunked. Chunking errors are logged and otherwise ignored so that one
// unparseable file does not fail the whole sync.
//
// Document IDs have the form obs_{id}_chunk_{n}. Files are visited in sorted
// path order so that n is assigned deterministically; ranging over the result
// map directly would hand out different IDs to the same chunk on every sync.
//
// Returns nil when no chunking manager is configured or when none of the
// tracked files is supported.
//
// NOTE(review): DeleteObservations enumerates chunk IDs only up to its
// maxChunksPerObs bound (100). An observation that yields more chunks than
// that produces documents it cannot delete by ID — confirm whether a cap here
// or a metadata-based delete is wanted.
func (s *Sync) formatCodeChunkDocs(ctx context.Context, obs *models.Observation) []Document {
	if s.chunkingManager == nil {
		return nil
	}

	// Determine scope for metadata; an unset scope is recorded as "project".
	scope := string(obs.Scope)
	if scope == "" {
		scope = "project"
	}

	// Collect tracked files (read + modified), keeping the first occurrence of
	// each path and only those the chunking manager supports.
	tracked := len(obs.FilesRead) + len(obs.FilesModified)
	seen := make(map[string]struct{}, tracked)
	supportedFiles := make([]string, 0, tracked)
	for _, files := range [][]string{obs.FilesRead, obs.FilesModified} {
		for _, file := range files {
			if _, dup := seen[file]; dup {
				continue
			}
			seen[file] = struct{}{}
			if s.chunkingManager.SupportsFile(file) {
				supportedFiles = append(supportedFiles, file)
			}
		}
	}
	if len(supportedFiles) == 0 {
		return nil
	}

	// Chunk all supported files. Errors are logged but do not fail the sync.
	results, errs := s.chunkingManager.ChunkFiles(ctx, supportedFiles)
	for _, err := range errs {
		log.Warn().Err(err).Msg("Failed to chunk file")
	}

	// Map iteration order is unspecified, so fix the order by sorting the keys.
	filePaths := make([]string, 0, len(results))
	totalChunks := 0
	for filePath, chunks := range results {
		filePaths = append(filePaths, filePath)
		totalChunks += len(chunks)
	}
	sort.Strings(filePaths)

	// Convert chunks to vector documents.
	docs := make([]Document, 0, totalChunks)
	chunkIndex := 0
	for _, filePath := range filePaths {
		for _, chunk := range results[filePath] {
			doc := Document{
				ID:      fmt.Sprintf("obs_%d_chunk_%d", obs.ID, chunkIndex),
				Content: chunk.SearchableContent(),
				Metadata: map[string]any{
					"sqlite_id":        obs.ID,
					"doc_type":         string(DocTypeCodeChunk),
					"field_type":       "code_chunk",
					"sdk_session_id":   obs.SDKSessionID,
					"project":          obs.Project,
					"scope":            scope,
					"created_at_epoch": obs.CreatedAtEpoch,
					// Code chunk specific metadata
					"file_path":   filePath,
					"language":    string(chunk.Language),
					"chunk_type":  string(chunk.Type),
					"symbol_name": chunk.Name,
					"start_line":  chunk.StartLine,
					"end_line":    chunk.EndLine,
				},
			}

			// Optional fields are only stored when the chunk provides them.
			if chunk.ParentName != "" {
				doc.Metadata["parent_name"] = chunk.ParentName
			}
			if chunk.Signature != "" {
				doc.Metadata["signature"] = chunk.Signature
			}

			docs = append(docs, doc)
			chunkIndex++
		}
	}

	if len(docs) > 0 {
		log.Debug().
			Int64("observationId", obs.ID).
			Int("codeChunks", len(docs)).
			Int("files", len(results)).
			Msg("Generated code chunk documents")
	}

	return docs
}
// SyncSummary syncs a single session summary to the vector store.
func (s *Sync) SyncSummary(ctx context.Context, summary *models.SessionSummary) error {
docs := s.formatSummaryDocs(summary)
@@ -191,21 +299,27 @@ func (s *Sync) SyncUserPrompt(ctx context.Context, prompt *models.UserPromptWith
}
// DeleteObservations removes observation documents from the vector store.
// Includes both observation fields (narrative, facts) and code chunks.
func (s *Sync) DeleteObservations(ctx context.Context, observationIDs []int64) error {
if len(observationIDs) == 0 {
return nil
}
// Generate all possible document IDs for these observations
// Pattern: obs_{id}_narrative, obs_{id}_fact_{0..n}
// Pattern: obs_{id}_narrative, obs_{id}_fact_{0..n}, obs_{id}_chunk_{0..n}
const maxFactsPerObs = 20
ids := make([]string, 0, len(observationIDs)*(maxFactsPerObs+1))
const maxChunksPerObs = 100 // Reasonable upper bound for code chunks
ids := make([]string, 0, len(observationIDs)*(maxFactsPerObs+maxChunksPerObs+1))
for _, obsID := range observationIDs {
ids = append(ids, fmt.Sprintf("obs_%d_narrative", obsID))
for i := 0; i < maxFactsPerObs; i++ {
ids = append(ids, fmt.Sprintf("obs_%d_fact_%d", obsID, i))
}
// Include code chunk IDs
for i := 0; i < maxChunksPerObs; i++ {
ids = append(ids, fmt.Sprintf("obs_%d_chunk_%d", obsID, i))
}
}
if err := s.client.DeleteDocuments(ctx, ids); err != nil {