mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-09 23:59:40 +00:00
4f4b4ac70f
- [x] Add language-specific chunkers with AST parsing (Go, Python, TypeScript) - [x] Implement chunking manager to dispatch files to appropriate chunkers - [x] Integrate code chunks into vector sync for semantic search - [x] Add tree-sitter dependency for Python/TypeScript parsing - [x] Reorder struct fields for consistency across codebase - [x] Rename error variables to follow Go conventions (err → unmarshalErr, etc.) - [x] Add code chunk metadata to vector documents (language, symbol name, line ranges) - [x] Update worker service to initialize chunking pipeline with all three languages
449 lines
12 KiB
Go
449 lines
12 KiB
Go
// Package expansion provides context-aware query expansion for improved search recall.
|
|
package expansion
|
|
|
|
import (
|
|
"context"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
|
|
"github.com/lukaszraczylo/claude-mnemonic/internal/embedding"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// QueryIntent represents the detected intent of a query.
|
|
type QueryIntent string
|
|
|
|
const (
|
|
// IntentQuestion indicates a question-type query (how, why, what, etc.)
|
|
IntentQuestion QueryIntent = "question"
|
|
// IntentError indicates an error/debugging query
|
|
IntentError QueryIntent = "error"
|
|
// IntentImplementation indicates an implementation/coding query
|
|
IntentImplementation QueryIntent = "implementation"
|
|
// IntentArchitecture indicates an architecture/design query
|
|
IntentArchitecture QueryIntent = "architecture"
|
|
// IntentGeneral indicates a general lookup query
|
|
IntentGeneral QueryIntent = "general"
|
|
)
|
|
|
|
// ExpandedQuery represents a query variant with metadata.
|
|
type ExpandedQuery struct {
|
|
Query string `json:"query"`
|
|
Source string `json:"source"`
|
|
Intent QueryIntent `json:"intent"`
|
|
Weight float64 `json:"weight"`
|
|
}
|
|
|
|
// Expander provides context-aware query expansion.
|
|
type Expander struct {
|
|
embedSvc *embedding.Service
|
|
intentPatterns map[QueryIntent][]*regexp.Regexp
|
|
vocabulary []VocabEntry
|
|
vocabVectors [][]float32
|
|
vocabMu sync.RWMutex
|
|
}
|
|
|
|
// VocabEntry represents a vocabulary term from observations.
|
|
type VocabEntry struct {
|
|
Term string
|
|
Source string
|
|
Weight float64
|
|
}
|
|
|
|
// Config holds expander configuration.
|
|
type Config struct {
|
|
// MaxExpansions limits the number of expanded queries returned
|
|
MaxExpansions int
|
|
// MinSimilarity is the minimum similarity score for vocabulary expansion
|
|
MinSimilarity float64
|
|
// EnableVocabularyExpansion enables finding related terms from observations
|
|
EnableVocabularyExpansion bool
|
|
}
|
|
|
|
// DefaultConfig returns sensible default configuration.
|
|
func DefaultConfig() Config {
|
|
return Config{
|
|
MaxExpansions: 4,
|
|
MinSimilarity: 0.5,
|
|
EnableVocabularyExpansion: true,
|
|
}
|
|
}
|
|
|
|
// NewExpander creates a new query expander.
|
|
func NewExpander(embedSvc *embedding.Service) *Expander {
|
|
e := &Expander{
|
|
embedSvc: embedSvc,
|
|
intentPatterns: buildIntentPatterns(),
|
|
}
|
|
return e
|
|
}
|
|
|
|
// buildIntentPatterns creates regex patterns for intent detection.
|
|
func buildIntentPatterns() map[QueryIntent][]*regexp.Regexp {
|
|
patterns := make(map[QueryIntent][]*regexp.Regexp)
|
|
|
|
// Question patterns
|
|
patterns[IntentQuestion] = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)^(how|why|what|when|where|which|who)\b`),
|
|
regexp.MustCompile(`(?i)\?$`),
|
|
regexp.MustCompile(`(?i)\b(explain|describe|understand)\b`),
|
|
}
|
|
|
|
// Error/debugging patterns
|
|
patterns[IntentError] = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)\b(error|bug|issue|problem|fail|crash|exception|panic)\b`),
|
|
regexp.MustCompile(`(?i)\b(fix|debug|troubleshoot|resolve)\b`),
|
|
regexp.MustCompile(`(?i)\b(doesn't work|not working|broken)\b`),
|
|
}
|
|
|
|
// Implementation patterns
|
|
patterns[IntentImplementation] = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)\b(implement|add|create|build|write|code)\b`),
|
|
regexp.MustCompile(`(?i)\b(function|method|handler|endpoint|api)\b`),
|
|
regexp.MustCompile(`(?i)\b(feature|functionality)\b`),
|
|
}
|
|
|
|
// Architecture patterns
|
|
patterns[IntentArchitecture] = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)\b(architecture|design|pattern|structure)\b`),
|
|
regexp.MustCompile(`(?i)\b(component|module|layer|service)\b`),
|
|
regexp.MustCompile(`(?i)\b(flow|pipeline|workflow)\b`),
|
|
}
|
|
|
|
return patterns
|
|
}
|
|
|
|
// DetectIntent analyzes a query to determine its intent.
|
|
func (e *Expander) DetectIntent(query string) QueryIntent {
|
|
query = strings.TrimSpace(query)
|
|
if query == "" {
|
|
return IntentGeneral
|
|
}
|
|
|
|
// Check patterns in priority order
|
|
intentOrder := []QueryIntent{IntentError, IntentQuestion, IntentImplementation, IntentArchitecture}
|
|
|
|
for _, intent := range intentOrder {
|
|
patterns := e.intentPatterns[intent]
|
|
for _, pattern := range patterns {
|
|
if pattern.MatchString(query) {
|
|
return intent
|
|
}
|
|
}
|
|
}
|
|
|
|
return IntentGeneral
|
|
}
|
|
|
|
// Expand generates expanded query variants based on the original query.
|
|
func (e *Expander) Expand(ctx context.Context, query string, cfg Config) []ExpandedQuery {
|
|
query = strings.TrimSpace(query)
|
|
if query == "" {
|
|
return nil
|
|
}
|
|
|
|
intent := e.DetectIntent(query)
|
|
expansions := make([]ExpandedQuery, 0, cfg.MaxExpansions)
|
|
|
|
// Always include the original query with highest weight
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: query,
|
|
Weight: 1.0,
|
|
Source: "original",
|
|
Intent: intent,
|
|
})
|
|
|
|
// Generate intent-based expansions
|
|
intentExpansions := e.expandByIntent(query, intent)
|
|
expansions = append(expansions, intentExpansions...)
|
|
|
|
// Generate vocabulary-based expansions if enabled and we have vocabulary
|
|
if cfg.EnableVocabularyExpansion && e.embedSvc != nil && len(e.vocabulary) > 0 {
|
|
vocabExpansions := e.expandByVocabulary(ctx, query, cfg.MinSimilarity)
|
|
expansions = append(expansions, vocabExpansions...)
|
|
}
|
|
|
|
// Deduplicate and limit
|
|
expansions = deduplicateExpansions(expansions)
|
|
if len(expansions) > cfg.MaxExpansions {
|
|
expansions = expansions[:cfg.MaxExpansions]
|
|
}
|
|
|
|
log.Debug().
|
|
Str("query", truncate(query, 50)).
|
|
Str("intent", string(intent)).
|
|
Int("expansions", len(expansions)).
|
|
Msg("Query expanded")
|
|
|
|
return expansions
|
|
}
|
|
|
|
// expandByIntent generates expansions based on detected query intent.
|
|
func (e *Expander) expandByIntent(query string, intent QueryIntent) []ExpandedQuery {
|
|
var expansions []ExpandedQuery
|
|
|
|
// Extract key terms from query for context-aware expansion
|
|
keyTerms := extractKeyTerms(query)
|
|
|
|
switch intent {
|
|
case IntentQuestion:
|
|
// For questions, create a declarative variant
|
|
declarative := makeDeclarative(query)
|
|
if declarative != query {
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: declarative,
|
|
Weight: 0.85,
|
|
Source: "intent:declarative",
|
|
Intent: intent,
|
|
})
|
|
}
|
|
|
|
case IntentError:
|
|
// For errors, expand with solution-oriented terms
|
|
if len(keyTerms) > 0 {
|
|
solutionQuery := strings.Join(keyTerms, " ") + " solution fix"
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: solutionQuery,
|
|
Weight: 0.8,
|
|
Source: "intent:solution",
|
|
Intent: intent,
|
|
})
|
|
}
|
|
|
|
case IntentImplementation:
|
|
// For implementation queries, focus on the what/how
|
|
if len(keyTerms) > 0 {
|
|
howQuery := "how " + strings.Join(keyTerms, " ")
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: howQuery,
|
|
Weight: 0.75,
|
|
Source: "intent:how",
|
|
Intent: intent,
|
|
})
|
|
}
|
|
|
|
case IntentArchitecture:
|
|
// For architecture queries, expand with design context
|
|
if len(keyTerms) > 0 {
|
|
designQuery := strings.Join(keyTerms, " ") + " design structure"
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: designQuery,
|
|
Weight: 0.75,
|
|
Source: "intent:design",
|
|
Intent: intent,
|
|
})
|
|
}
|
|
|
|
case IntentGeneral:
|
|
// For general queries, try noun phrase extraction
|
|
// No additional expansion - rely on vocabulary expansion
|
|
}
|
|
|
|
return expansions
|
|
}
|
|
|
|
// expandByVocabulary finds similar terms from the observation vocabulary.
|
|
func (e *Expander) expandByVocabulary(ctx context.Context, query string, minSimilarity float64) []ExpandedQuery {
|
|
e.vocabMu.RLock()
|
|
defer e.vocabMu.RUnlock()
|
|
|
|
if len(e.vocabulary) == 0 || e.embedSvc == nil {
|
|
return nil
|
|
}
|
|
|
|
// Embed the query
|
|
queryEmb, err := e.embedSvc.Embed(query)
|
|
if err != nil {
|
|
log.Warn().Err(err).Msg("Failed to embed query for vocabulary expansion")
|
|
return nil
|
|
}
|
|
|
|
// Find similar vocabulary terms
|
|
type scoredTerm struct {
|
|
entry VocabEntry
|
|
score float64
|
|
}
|
|
|
|
var similar []scoredTerm
|
|
for i, entry := range e.vocabulary {
|
|
if i >= len(e.vocabVectors) {
|
|
break
|
|
}
|
|
|
|
score := cosineSimilarity(queryEmb, e.vocabVectors[i])
|
|
if score >= minSimilarity {
|
|
similar = append(similar, scoredTerm{entry: entry, score: score})
|
|
}
|
|
}
|
|
|
|
if len(similar) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// Sort by score (descending) using bubble sort
|
|
for i := 0; i < len(similar)-1; i++ {
|
|
for j := i + 1; j < len(similar); j++ {
|
|
if similar[j].score > similar[i].score {
|
|
similar[i], similar[j] = similar[j], similar[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create expansion by combining top similar terms with query
|
|
var expansions []ExpandedQuery
|
|
if len(similar) > 0 {
|
|
// Take top 2 similar terms and combine with original key terms
|
|
keyTerms := extractKeyTerms(query)
|
|
for i := 0; i < min(2, len(similar)); i++ {
|
|
term := similar[i].entry.Term
|
|
// Don't add if term is already in query
|
|
if strings.Contains(strings.ToLower(query), strings.ToLower(term)) {
|
|
continue
|
|
}
|
|
|
|
combinedQuery := strings.Join(keyTerms, " ") + " " + term
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: combinedQuery,
|
|
Weight: 0.7 * similar[i].score * similar[i].entry.Weight,
|
|
Source: "vocabulary:" + term,
|
|
Intent: IntentGeneral,
|
|
})
|
|
}
|
|
}
|
|
|
|
return expansions
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
// extractKeyTerms extracts meaningful terms from a query.
|
|
func extractKeyTerms(query string) []string {
|
|
// Common stop words to filter out
|
|
stopWords := map[string]bool{
|
|
"a": true, "an": true, "the": true, "is": true, "are": true,
|
|
"was": true, "were": true, "be": true, "been": true, "being": true,
|
|
"have": true, "has": true, "had": true, "do": true, "does": true,
|
|
"did": true, "will": true, "would": true, "could": true, "should": true,
|
|
"may": true, "might": true, "must": true, "can": true,
|
|
"i": true, "me": true, "my": true, "we": true, "our": true,
|
|
"you": true, "your": true, "it": true, "its": true,
|
|
"this": true, "that": true, "these": true, "those": true,
|
|
"what": true, "which": true, "who": true, "whom": true,
|
|
"how": true, "why": true, "when": true, "where": true,
|
|
"to": true, "for": true, "with": true, "about": true, "from": true,
|
|
"in": true, "on": true, "at": true, "by": true, "of": true,
|
|
"and": true, "or": true, "but": true, "if": true, "then": true,
|
|
}
|
|
|
|
// Split and filter
|
|
words := strings.Fields(strings.ToLower(query))
|
|
var terms []string
|
|
|
|
for _, word := range words {
|
|
// Remove punctuation
|
|
word = strings.Trim(word, ".,?!;:'\"()[]{}")
|
|
if len(word) < 2 {
|
|
continue
|
|
}
|
|
if stopWords[word] {
|
|
continue
|
|
}
|
|
terms = append(terms, word)
|
|
}
|
|
|
|
return terms
|
|
}
|
|
|
|
// makeDeclarative converts a question to a declarative statement.
|
|
func makeDeclarative(query string) string {
|
|
query = strings.TrimSpace(query)
|
|
|
|
// Remove question mark
|
|
query = strings.TrimSuffix(query, "?")
|
|
|
|
// Handle common question patterns
|
|
patterns := []struct {
|
|
prefix string
|
|
replacement string
|
|
}{
|
|
{"how do i ", ""},
|
|
{"how to ", ""},
|
|
{"how does ", ""},
|
|
{"how is ", ""},
|
|
{"what is ", ""},
|
|
{"what are ", ""},
|
|
{"why does ", ""},
|
|
{"why is ", ""},
|
|
{"where is ", ""},
|
|
{"where are ", ""},
|
|
{"when does ", ""},
|
|
{"when is ", ""},
|
|
}
|
|
|
|
lower := strings.ToLower(query)
|
|
for _, p := range patterns {
|
|
if strings.HasPrefix(lower, p.prefix) {
|
|
return strings.TrimSpace(query[len(p.prefix):])
|
|
}
|
|
}
|
|
|
|
return query
|
|
}
|
|
|
|
// deduplicateExpansions removes duplicate queries while preserving order.
|
|
func deduplicateExpansions(expansions []ExpandedQuery) []ExpandedQuery {
|
|
seen := make(map[string]bool)
|
|
result := make([]ExpandedQuery, 0, len(expansions))
|
|
|
|
for _, exp := range expansions {
|
|
normalized := strings.ToLower(strings.TrimSpace(exp.Query))
|
|
if !seen[normalized] {
|
|
seen[normalized] = true
|
|
result = append(result, exp)
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// cosineSimilarity computes cosine similarity between two vectors.
|
|
func cosineSimilarity(a, b []float32) float64 {
|
|
if len(a) != len(b) || len(a) == 0 {
|
|
return 0
|
|
}
|
|
|
|
var dot, normA, normB float64
|
|
for i := range a {
|
|
dot += float64(a[i]) * float64(b[i])
|
|
normA += float64(a[i]) * float64(a[i])
|
|
normB += float64(b[i]) * float64(b[i])
|
|
}
|
|
|
|
if normA == 0 || normB == 0 {
|
|
return 0
|
|
}
|
|
|
|
return dot / (sqrt(normA) * sqrt(normB))
|
|
}
|
|
|
|
// sqrt is a simple square root implementation.
|
|
func sqrt(x float64) float64 {
|
|
if x <= 0 {
|
|
return 0
|
|
}
|
|
z := x
|
|
for i := 0; i < 10; i++ {
|
|
z = (z + x/z) / 2
|
|
}
|
|
return z
|
|
}
|
|
|
|
// truncate truncates a string to maxLen characters.
|
|
func truncate(s string, maxLen int) string {
|
|
if len(s) <= maxLen {
|
|
return s
|
|
}
|
|
return s[:maxLen] + "..."
|
|
}
|