Files
lukaszraczylo 4f4b4ac70f feat(chunking): add AST-aware code chunking for Go, Python, TypeScript
- [x] Add language-specific chunkers with AST parsing (Go, Python, TypeScript)
- [x] Implement chunking manager to dispatch files to appropriate chunkers
- [x] Integrate code chunks into vector sync for semantic search
- [x] Add tree-sitter dependency for Python/TypeScript parsing
- [x] Reorder struct fields for consistency across codebase
- [x] Rename error variables to follow Go conventions (err → unmarshalErr, etc.)
- [x] Add code chunk metadata to vector documents (language, symbol name, line ranges)
- [x] Update worker service to initialize chunking pipeline with all three languages
2026-01-07 13:19:58 +00:00

449 lines
12 KiB
Go

// Package expansion provides context-aware query expansion for improved search recall.
package expansion
import (
"context"
"regexp"
"strings"
"sync"
"github.com/lukaszraczylo/claude-mnemonic/internal/embedding"
"github.com/rs/zerolog/log"
)
// QueryIntent represents the detected intent of a query.
type QueryIntent string
const (
// IntentQuestion indicates a question-type query (how, why, what, etc.)
IntentQuestion QueryIntent = "question"
// IntentError indicates an error/debugging query
IntentError QueryIntent = "error"
// IntentImplementation indicates an implementation/coding query
IntentImplementation QueryIntent = "implementation"
// IntentArchitecture indicates an architecture/design query
IntentArchitecture QueryIntent = "architecture"
// IntentGeneral indicates a general lookup query
IntentGeneral QueryIntent = "general"
)
// ExpandedQuery represents a query variant with metadata.
type ExpandedQuery struct {
Query string `json:"query"`
Source string `json:"source"`
Intent QueryIntent `json:"intent"`
Weight float64 `json:"weight"`
}
// Expander provides context-aware query expansion.
type Expander struct {
embedSvc *embedding.Service
intentPatterns map[QueryIntent][]*regexp.Regexp
vocabulary []VocabEntry
vocabVectors [][]float32
vocabMu sync.RWMutex
}
// VocabEntry represents a vocabulary term from observations.
type VocabEntry struct {
Term string
Source string
Weight float64
}
// Config holds expander configuration.
type Config struct {
// MaxExpansions limits the number of expanded queries returned
MaxExpansions int
// MinSimilarity is the minimum similarity score for vocabulary expansion
MinSimilarity float64
// EnableVocabularyExpansion enables finding related terms from observations
EnableVocabularyExpansion bool
}
// DefaultConfig returns sensible default configuration.
func DefaultConfig() Config {
return Config{
MaxExpansions: 4,
MinSimilarity: 0.5,
EnableVocabularyExpansion: true,
}
}
// NewExpander creates a new query expander.
func NewExpander(embedSvc *embedding.Service) *Expander {
e := &Expander{
embedSvc: embedSvc,
intentPatterns: buildIntentPatterns(),
}
return e
}
// buildIntentPatterns creates regex patterns for intent detection.
func buildIntentPatterns() map[QueryIntent][]*regexp.Regexp {
patterns := make(map[QueryIntent][]*regexp.Regexp)
// Question patterns
patterns[IntentQuestion] = []*regexp.Regexp{
regexp.MustCompile(`(?i)^(how|why|what|when|where|which|who)\b`),
regexp.MustCompile(`(?i)\?$`),
regexp.MustCompile(`(?i)\b(explain|describe|understand)\b`),
}
// Error/debugging patterns
patterns[IntentError] = []*regexp.Regexp{
regexp.MustCompile(`(?i)\b(error|bug|issue|problem|fail|crash|exception|panic)\b`),
regexp.MustCompile(`(?i)\b(fix|debug|troubleshoot|resolve)\b`),
regexp.MustCompile(`(?i)\b(doesn't work|not working|broken)\b`),
}
// Implementation patterns
patterns[IntentImplementation] = []*regexp.Regexp{
regexp.MustCompile(`(?i)\b(implement|add|create|build|write|code)\b`),
regexp.MustCompile(`(?i)\b(function|method|handler|endpoint|api)\b`),
regexp.MustCompile(`(?i)\b(feature|functionality)\b`),
}
// Architecture patterns
patterns[IntentArchitecture] = []*regexp.Regexp{
regexp.MustCompile(`(?i)\b(architecture|design|pattern|structure)\b`),
regexp.MustCompile(`(?i)\b(component|module|layer|service)\b`),
regexp.MustCompile(`(?i)\b(flow|pipeline|workflow)\b`),
}
return patterns
}
// DetectIntent analyzes a query to determine its intent.
func (e *Expander) DetectIntent(query string) QueryIntent {
query = strings.TrimSpace(query)
if query == "" {
return IntentGeneral
}
// Check patterns in priority order
intentOrder := []QueryIntent{IntentError, IntentQuestion, IntentImplementation, IntentArchitecture}
for _, intent := range intentOrder {
patterns := e.intentPatterns[intent]
for _, pattern := range patterns {
if pattern.MatchString(query) {
return intent
}
}
}
return IntentGeneral
}
// Expand generates expanded query variants based on the original query.
func (e *Expander) Expand(ctx context.Context, query string, cfg Config) []ExpandedQuery {
query = strings.TrimSpace(query)
if query == "" {
return nil
}
intent := e.DetectIntent(query)
expansions := make([]ExpandedQuery, 0, cfg.MaxExpansions)
// Always include the original query with highest weight
expansions = append(expansions, ExpandedQuery{
Query: query,
Weight: 1.0,
Source: "original",
Intent: intent,
})
// Generate intent-based expansions
intentExpansions := e.expandByIntent(query, intent)
expansions = append(expansions, intentExpansions...)
// Generate vocabulary-based expansions if enabled and we have vocabulary
if cfg.EnableVocabularyExpansion && e.embedSvc != nil && len(e.vocabulary) > 0 {
vocabExpansions := e.expandByVocabulary(ctx, query, cfg.MinSimilarity)
expansions = append(expansions, vocabExpansions...)
}
// Deduplicate and limit
expansions = deduplicateExpansions(expansions)
if len(expansions) > cfg.MaxExpansions {
expansions = expansions[:cfg.MaxExpansions]
}
log.Debug().
Str("query", truncate(query, 50)).
Str("intent", string(intent)).
Int("expansions", len(expansions)).
Msg("Query expanded")
return expansions
}
// expandByIntent generates expansions based on detected query intent.
func (e *Expander) expandByIntent(query string, intent QueryIntent) []ExpandedQuery {
var expansions []ExpandedQuery
// Extract key terms from query for context-aware expansion
keyTerms := extractKeyTerms(query)
switch intent {
case IntentQuestion:
// For questions, create a declarative variant
declarative := makeDeclarative(query)
if declarative != query {
expansions = append(expansions, ExpandedQuery{
Query: declarative,
Weight: 0.85,
Source: "intent:declarative",
Intent: intent,
})
}
case IntentError:
// For errors, expand with solution-oriented terms
if len(keyTerms) > 0 {
solutionQuery := strings.Join(keyTerms, " ") + " solution fix"
expansions = append(expansions, ExpandedQuery{
Query: solutionQuery,
Weight: 0.8,
Source: "intent:solution",
Intent: intent,
})
}
case IntentImplementation:
// For implementation queries, focus on the what/how
if len(keyTerms) > 0 {
howQuery := "how " + strings.Join(keyTerms, " ")
expansions = append(expansions, ExpandedQuery{
Query: howQuery,
Weight: 0.75,
Source: "intent:how",
Intent: intent,
})
}
case IntentArchitecture:
// For architecture queries, expand with design context
if len(keyTerms) > 0 {
designQuery := strings.Join(keyTerms, " ") + " design structure"
expansions = append(expansions, ExpandedQuery{
Query: designQuery,
Weight: 0.75,
Source: "intent:design",
Intent: intent,
})
}
case IntentGeneral:
// For general queries, try noun phrase extraction
// No additional expansion - rely on vocabulary expansion
}
return expansions
}
// expandByVocabulary finds similar terms from the observation vocabulary.
func (e *Expander) expandByVocabulary(ctx context.Context, query string, minSimilarity float64) []ExpandedQuery {
e.vocabMu.RLock()
defer e.vocabMu.RUnlock()
if len(e.vocabulary) == 0 || e.embedSvc == nil {
return nil
}
// Embed the query
queryEmb, err := e.embedSvc.Embed(query)
if err != nil {
log.Warn().Err(err).Msg("Failed to embed query for vocabulary expansion")
return nil
}
// Find similar vocabulary terms
type scoredTerm struct {
entry VocabEntry
score float64
}
var similar []scoredTerm
for i, entry := range e.vocabulary {
if i >= len(e.vocabVectors) {
break
}
score := cosineSimilarity(queryEmb, e.vocabVectors[i])
if score >= minSimilarity {
similar = append(similar, scoredTerm{entry: entry, score: score})
}
}
if len(similar) == 0 {
return nil
}
// Sort by score (descending) using bubble sort
for i := 0; i < len(similar)-1; i++ {
for j := i + 1; j < len(similar); j++ {
if similar[j].score > similar[i].score {
similar[i], similar[j] = similar[j], similar[i]
}
}
}
// Create expansion by combining top similar terms with query
var expansions []ExpandedQuery
if len(similar) > 0 {
// Take top 2 similar terms and combine with original key terms
keyTerms := extractKeyTerms(query)
for i := 0; i < min(2, len(similar)); i++ {
term := similar[i].entry.Term
// Don't add if term is already in query
if strings.Contains(strings.ToLower(query), strings.ToLower(term)) {
continue
}
combinedQuery := strings.Join(keyTerms, " ") + " " + term
expansions = append(expansions, ExpandedQuery{
Query: combinedQuery,
Weight: 0.7 * similar[i].score * similar[i].entry.Weight,
Source: "vocabulary:" + term,
Intent: IntentGeneral,
})
}
}
return expansions
}
// Helper functions
// extractKeyTerms extracts meaningful terms from a query.
func extractKeyTerms(query string) []string {
// Common stop words to filter out
stopWords := map[string]bool{
"a": true, "an": true, "the": true, "is": true, "are": true,
"was": true, "were": true, "be": true, "been": true, "being": true,
"have": true, "has": true, "had": true, "do": true, "does": true,
"did": true, "will": true, "would": true, "could": true, "should": true,
"may": true, "might": true, "must": true, "can": true,
"i": true, "me": true, "my": true, "we": true, "our": true,
"you": true, "your": true, "it": true, "its": true,
"this": true, "that": true, "these": true, "those": true,
"what": true, "which": true, "who": true, "whom": true,
"how": true, "why": true, "when": true, "where": true,
"to": true, "for": true, "with": true, "about": true, "from": true,
"in": true, "on": true, "at": true, "by": true, "of": true,
"and": true, "or": true, "but": true, "if": true, "then": true,
}
// Split and filter
words := strings.Fields(strings.ToLower(query))
var terms []string
for _, word := range words {
// Remove punctuation
word = strings.Trim(word, ".,?!;:'\"()[]{}")
if len(word) < 2 {
continue
}
if stopWords[word] {
continue
}
terms = append(terms, word)
}
return terms
}
// makeDeclarative converts a question to a declarative statement.
func makeDeclarative(query string) string {
query = strings.TrimSpace(query)
// Remove question mark
query = strings.TrimSuffix(query, "?")
// Handle common question patterns
patterns := []struct {
prefix string
replacement string
}{
{"how do i ", ""},
{"how to ", ""},
{"how does ", ""},
{"how is ", ""},
{"what is ", ""},
{"what are ", ""},
{"why does ", ""},
{"why is ", ""},
{"where is ", ""},
{"where are ", ""},
{"when does ", ""},
{"when is ", ""},
}
lower := strings.ToLower(query)
for _, p := range patterns {
if strings.HasPrefix(lower, p.prefix) {
return strings.TrimSpace(query[len(p.prefix):])
}
}
return query
}
// deduplicateExpansions removes duplicate queries while preserving order.
func deduplicateExpansions(expansions []ExpandedQuery) []ExpandedQuery {
seen := make(map[string]bool)
result := make([]ExpandedQuery, 0, len(expansions))
for _, exp := range expansions {
normalized := strings.ToLower(strings.TrimSpace(exp.Query))
if !seen[normalized] {
seen[normalized] = true
result = append(result, exp)
}
}
return result
}
// cosineSimilarity computes cosine similarity between two vectors.
func cosineSimilarity(a, b []float32) float64 {
if len(a) != len(b) || len(a) == 0 {
return 0
}
var dot, normA, normB float64
for i := range a {
dot += float64(a[i]) * float64(b[i])
normA += float64(a[i]) * float64(a[i])
normB += float64(b[i]) * float64(b[i])
}
if normA == 0 || normB == 0 {
return 0
}
return dot / (sqrt(normA) * sqrt(normB))
}
// sqrt is a simple square root implementation.
func sqrt(x float64) float64 {
if x <= 0 {
return 0
}
z := x
for i := 0; i < 10; i++ {
z = (z + x/z) / 2
}
return z
}
// truncate truncates a string to maxLen characters.
func truncate(s string, maxLen int) string {
if len(s) <= maxLen {
return s
}
return s[:maxLen] + "..."
}