mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-13 02:06:24 +00:00
5c2685c7b6
* feat(leann-phase2): implement hybrid vector storage and graph-based search
- [x] Add AST-aware code chunking for Go, Python, and TypeScript using tree-sitter
- [x] Implement LEANN-inspired hybrid vector storage with hub detection and selective embedding storage (60-80% savings)
- [x] Add observation relationship graph with CSR format and edge detection (file overlap, semantic similarity, temporal, concept)
- [x] Implement graph-aware search with two-level traversal and relationship-based ranking
- [x] Add auto-tuning system for dynamic hub threshold adjustment based on query performance
- [x] Add comprehensive metrics tracking for vector storage, queries, latency, and graph traversals
- [x] Update configuration system with graph and hybrid storage settings
- [x] Add graph stats and vector metrics endpoints to worker service
- [x] Enhance UI sidebar with advanced metrics display and graph visualization
- [x] Optimize struct field alignment throughout codebase for memory efficiency
- [x] Update documentation with LEANN Phase 2 features and performance benefits
- [x] Add tree-sitter dependency for AST parsing
* fix: add fts5 build tag to CI workflow
Pass build-tags: "fts5" to the shared workflow so that
sqlite-vec-go-bindings is compiled with SQLite FTS5 support.
This fixes test failures in hybrid vector storage tests that require
CGO and FTS5 build tags.
Requires shared-actions@8f7f235 or later.
* docs: add testing documentation and macOS ARM64 known issue
Document the macOS ARM64 CGO linking issue with sqlite-vec-go-bindings
that prevents hybrid package tests from compiling locally.
Added:
- .github/TESTING.md: Comprehensive testing guide with platform-specific
issues, workarounds, and CI configuration details
- internal/vector/hybrid/README.md: Package-specific documentation
explaining the macOS limitation
- .github/CI_FIX_SUMMARY.md: Technical details of the CI fix
Key points:
- 41 out of 42 packages test successfully on all platforms
- hybrid package tests fail only on macOS ARM64 (local dev issue)
- Linux CI tests pass with proper build-tags: "fts5" configuration
- Production builds and runtime functionality unaffected
This is a known limitation of sqlite-vec-go-bindings on macOS ARM64
and does not impact CI/CD or production deployments.
* fix: add SQLite busy_timeout to prevent database locked errors
Set PRAGMA busy_timeout=5000 (5 seconds) to allow SQLite to retry
when the database is locked instead of failing immediately.
This fixes race conditions when multiple goroutines try to write
simultaneously, particularly in tests where StoreObservation spawns
async cleanup goroutines.
Root cause:
- StoreObservation launches goroutine -> CleanupOldObservations
- Multiple concurrent cleanups caused "database is locked" errors
- Without busy_timeout, SQLite fails immediately on lock contention
Solution:
- Add 5-second busy timeout for automatic retry on lock
- Standard practice for concurrent SQLite usage
- Works with existing WAL mode configuration
Fixes TestObservationStore_CleanupOldObservations in CI.
* docs: complete summary of all CI test fixes
Comprehensive documentation of all fixes applied:
1. Missing build tags (fts5)
2. Database locked errors (busy_timeout)
All 41/42 packages now pass tests. The hybrid package has a known
macOS ARM64 limitation that doesn't affect CI or production.
No functionality was removed - all fixes are additive only.
* fix: add SQLite driver import to hybrid tests for CGO linking
Add blank import of mattn/go-sqlite3 to hybrid test files to ensure
the SQLite driver is linked into the test binary. This provides the
SQLite symbols that sqlite-vec-go-bindings requires.
Root cause:
- hybrid package imports sqlitevec (transitively depends on sqlite-vec CGO)
- Test binary needs SQLite symbols for linking
- sqlitevec tests already had this import, but hybrid tests didn't
- Without the driver import, linker fails with "undefined symbols"
This fix enables hybrid tests to run with -race flag on all platforms.
Before: 41/42 packages pass (hybrid failed to link)
After: 42/42 packages pass ✅
Fixes hybrid test compilation on macOS ARM64, Linux, and Windows.
* docs: remove outdated macOS limitation documentation
The hybrid test linking issue has been fixed by adding the SQLite
driver import. All tests now pass on all platforms including macOS.
Removed:
- internal/vector/hybrid/README.md (documented workaround no longer needed)
- .github/TESTING.md (macOS limitation section obsolete)
All 42/42 packages now test successfully with -race flag.
* docs: final comprehensive summary of all CI fixes
All three issues now resolved:
1. Missing fts5 build tags
2. Database busy_timeout for concurrent writes
3. Missing SQLite driver import in hybrid tests
Result: 42/42 packages pass with -race on all platforms.
Credit to reviewer for identifying the race detector concern.
449 lines
12 KiB
Go
449 lines
12 KiB
Go
// Package expansion provides context-aware query expansion for improved search recall.
|
|
package expansion
|
|
|
|
import (
	"context"
	"math"
	"regexp"
	"strings"
	"sync"

	"github.com/lukaszraczylo/claude-mnemonic/internal/embedding"
	"github.com/rs/zerolog/log"
)
|
|
|
|
// QueryIntent classifies what a search query is trying to achieve.
// The underlying string is the value used when the intent is serialized.
type QueryIntent string

// Intents that DetectIntent can report.
const (
	// IntentQuestion marks question-style queries (how, why, what, ...).
	IntentQuestion QueryIntent = "question"
	// IntentError marks queries about errors or debugging.
	IntentError QueryIntent = "error"
	// IntentImplementation marks queries about implementing or writing code.
	IntentImplementation QueryIntent = "implementation"
	// IntentArchitecture marks queries about architecture or design.
	IntentArchitecture QueryIntent = "architecture"
	// IntentGeneral is the fallback for queries that match no other intent.
	IntentGeneral QueryIntent = "general"
)
|
|
|
|
// ExpandedQuery represents a query variant with metadata.
|
|
type ExpandedQuery struct {
|
|
Query string `json:"query"`
|
|
Source string `json:"source"`
|
|
Intent QueryIntent `json:"intent"`
|
|
Weight float64 `json:"weight"`
|
|
}
|
|
|
|
// Expander provides context-aware query expansion.
|
|
type Expander struct {
|
|
embedSvc *embedding.Service
|
|
intentPatterns map[QueryIntent][]*regexp.Regexp
|
|
vocabulary []VocabEntry
|
|
vocabVectors [][]float32
|
|
vocabMu sync.RWMutex
|
|
}
|
|
|
|
// VocabEntry is a single vocabulary term available for query expansion.
type VocabEntry struct {
	// Term is the vocabulary text appended to expanded queries.
	Term string
	// Source records where the term came from.
	Source string
	// Weight scales the weight of any expansion built from this term.
	Weight float64
}
|
|
|
|
// Config controls how Expand generates and limits query variants.
type Config struct {
	// MaxExpansions caps the number of expanded queries returned.
	MaxExpansions int
	// MinSimilarity is the lowest similarity score a vocabulary term may have
	// and still be used for expansion.
	MinSimilarity float64
	// EnableVocabularyExpansion turns on expansion with related vocabulary terms.
	EnableVocabularyExpansion bool
}
|
|
|
|
// DefaultConfig returns sensible default configuration.
|
|
func DefaultConfig() Config {
|
|
return Config{
|
|
MaxExpansions: 4,
|
|
MinSimilarity: 0.5,
|
|
EnableVocabularyExpansion: true,
|
|
}
|
|
}
|
|
|
|
// NewExpander creates a new query expander.
|
|
func NewExpander(embedSvc *embedding.Service) *Expander {
|
|
e := &Expander{
|
|
embedSvc: embedSvc,
|
|
intentPatterns: buildIntentPatterns(),
|
|
}
|
|
return e
|
|
}
|
|
|
|
// buildIntentPatterns creates regex patterns for intent detection.
|
|
func buildIntentPatterns() map[QueryIntent][]*regexp.Regexp {
|
|
patterns := make(map[QueryIntent][]*regexp.Regexp)
|
|
|
|
// Question patterns
|
|
patterns[IntentQuestion] = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)^(how|why|what|when|where|which|who)\b`),
|
|
regexp.MustCompile(`(?i)\?$`),
|
|
regexp.MustCompile(`(?i)\b(explain|describe|understand)\b`),
|
|
}
|
|
|
|
// Error/debugging patterns
|
|
patterns[IntentError] = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)\b(error|bug|issue|problem|fail|crash|exception|panic)\b`),
|
|
regexp.MustCompile(`(?i)\b(fix|debug|troubleshoot|resolve)\b`),
|
|
regexp.MustCompile(`(?i)\b(doesn't work|not working|broken)\b`),
|
|
}
|
|
|
|
// Implementation patterns
|
|
patterns[IntentImplementation] = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)\b(implement|add|create|build|write|code)\b`),
|
|
regexp.MustCompile(`(?i)\b(function|method|handler|endpoint|api)\b`),
|
|
regexp.MustCompile(`(?i)\b(feature|functionality)\b`),
|
|
}
|
|
|
|
// Architecture patterns
|
|
patterns[IntentArchitecture] = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)\b(architecture|design|pattern|structure)\b`),
|
|
regexp.MustCompile(`(?i)\b(component|module|layer|service)\b`),
|
|
regexp.MustCompile(`(?i)\b(flow|pipeline|workflow)\b`),
|
|
}
|
|
|
|
return patterns
|
|
}
|
|
|
|
// DetectIntent analyzes a query to determine its intent.
|
|
func (e *Expander) DetectIntent(query string) QueryIntent {
|
|
query = strings.TrimSpace(query)
|
|
if query == "" {
|
|
return IntentGeneral
|
|
}
|
|
|
|
// Check patterns in priority order
|
|
intentOrder := []QueryIntent{IntentError, IntentQuestion, IntentImplementation, IntentArchitecture}
|
|
|
|
for _, intent := range intentOrder {
|
|
patterns := e.intentPatterns[intent]
|
|
for _, pattern := range patterns {
|
|
if pattern.MatchString(query) {
|
|
return intent
|
|
}
|
|
}
|
|
}
|
|
|
|
return IntentGeneral
|
|
}
|
|
|
|
// Expand generates expanded query variants based on the original query.
|
|
func (e *Expander) Expand(ctx context.Context, query string, cfg Config) []ExpandedQuery {
|
|
query = strings.TrimSpace(query)
|
|
if query == "" {
|
|
return nil
|
|
}
|
|
|
|
intent := e.DetectIntent(query)
|
|
expansions := make([]ExpandedQuery, 0, cfg.MaxExpansions)
|
|
|
|
// Always include the original query with highest weight
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: query,
|
|
Weight: 1.0,
|
|
Source: "original",
|
|
Intent: intent,
|
|
})
|
|
|
|
// Generate intent-based expansions
|
|
intentExpansions := e.expandByIntent(query, intent)
|
|
expansions = append(expansions, intentExpansions...)
|
|
|
|
// Generate vocabulary-based expansions if enabled and we have vocabulary
|
|
if cfg.EnableVocabularyExpansion && e.embedSvc != nil && len(e.vocabulary) > 0 {
|
|
vocabExpansions := e.expandByVocabulary(ctx, query, cfg.MinSimilarity)
|
|
expansions = append(expansions, vocabExpansions...)
|
|
}
|
|
|
|
// Deduplicate and limit
|
|
expansions = deduplicateExpansions(expansions)
|
|
if len(expansions) > cfg.MaxExpansions {
|
|
expansions = expansions[:cfg.MaxExpansions]
|
|
}
|
|
|
|
log.Debug().
|
|
Str("query", truncate(query, 50)).
|
|
Str("intent", string(intent)).
|
|
Int("expansions", len(expansions)).
|
|
Msg("Query expanded")
|
|
|
|
return expansions
|
|
}
|
|
|
|
// expandByIntent generates expansions based on detected query intent.
|
|
func (e *Expander) expandByIntent(query string, intent QueryIntent) []ExpandedQuery {
|
|
var expansions []ExpandedQuery
|
|
|
|
// Extract key terms from query for context-aware expansion
|
|
keyTerms := extractKeyTerms(query)
|
|
|
|
switch intent {
|
|
case IntentQuestion:
|
|
// For questions, create a declarative variant
|
|
declarative := makeDeclarative(query)
|
|
if declarative != query {
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: declarative,
|
|
Weight: 0.85,
|
|
Source: "intent:declarative",
|
|
Intent: intent,
|
|
})
|
|
}
|
|
|
|
case IntentError:
|
|
// For errors, expand with solution-oriented terms
|
|
if len(keyTerms) > 0 {
|
|
solutionQuery := strings.Join(keyTerms, " ") + " solution fix"
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: solutionQuery,
|
|
Weight: 0.8,
|
|
Source: "intent:solution",
|
|
Intent: intent,
|
|
})
|
|
}
|
|
|
|
case IntentImplementation:
|
|
// For implementation queries, focus on the what/how
|
|
if len(keyTerms) > 0 {
|
|
howQuery := "how " + strings.Join(keyTerms, " ")
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: howQuery,
|
|
Weight: 0.75,
|
|
Source: "intent:how",
|
|
Intent: intent,
|
|
})
|
|
}
|
|
|
|
case IntentArchitecture:
|
|
// For architecture queries, expand with design context
|
|
if len(keyTerms) > 0 {
|
|
designQuery := strings.Join(keyTerms, " ") + " design structure"
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: designQuery,
|
|
Weight: 0.75,
|
|
Source: "intent:design",
|
|
Intent: intent,
|
|
})
|
|
}
|
|
|
|
case IntentGeneral:
|
|
// For general queries, try noun phrase extraction
|
|
// No additional expansion - rely on vocabulary expansion
|
|
}
|
|
|
|
return expansions
|
|
}
|
|
|
|
// expandByVocabulary finds similar terms from the observation vocabulary.
|
|
func (e *Expander) expandByVocabulary(ctx context.Context, query string, minSimilarity float64) []ExpandedQuery {
|
|
e.vocabMu.RLock()
|
|
defer e.vocabMu.RUnlock()
|
|
|
|
if len(e.vocabulary) == 0 || e.embedSvc == nil {
|
|
return nil
|
|
}
|
|
|
|
// Embed the query
|
|
queryEmb, err := e.embedSvc.Embed(query)
|
|
if err != nil {
|
|
log.Warn().Err(err).Msg("Failed to embed query for vocabulary expansion")
|
|
return nil
|
|
}
|
|
|
|
// Find similar vocabulary terms
|
|
type scoredTerm struct {
|
|
entry VocabEntry
|
|
score float64
|
|
}
|
|
|
|
var similar []scoredTerm
|
|
for i, entry := range e.vocabulary {
|
|
if i >= len(e.vocabVectors) {
|
|
break
|
|
}
|
|
|
|
score := cosineSimilarity(queryEmb, e.vocabVectors[i])
|
|
if score >= minSimilarity {
|
|
similar = append(similar, scoredTerm{entry: entry, score: score})
|
|
}
|
|
}
|
|
|
|
if len(similar) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// Sort by score (descending) using bubble sort
|
|
for i := 0; i < len(similar)-1; i++ {
|
|
for j := i + 1; j < len(similar); j++ {
|
|
if similar[j].score > similar[i].score {
|
|
similar[i], similar[j] = similar[j], similar[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create expansion by combining top similar terms with query
|
|
var expansions []ExpandedQuery
|
|
if len(similar) > 0 {
|
|
// Take top 2 similar terms and combine with original key terms
|
|
keyTerms := extractKeyTerms(query)
|
|
for i := 0; i < min(2, len(similar)); i++ {
|
|
term := similar[i].entry.Term
|
|
// Don't add if term is already in query
|
|
if strings.Contains(strings.ToLower(query), strings.ToLower(term)) {
|
|
continue
|
|
}
|
|
|
|
combinedQuery := strings.Join(keyTerms, " ") + " " + term
|
|
expansions = append(expansions, ExpandedQuery{
|
|
Query: combinedQuery,
|
|
Weight: 0.7 * similar[i].score * similar[i].entry.Weight,
|
|
Source: "vocabulary:" + term,
|
|
Intent: IntentGeneral,
|
|
})
|
|
}
|
|
}
|
|
|
|
return expansions
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
// keyTermStopWords is the set of common English function words that
// extractKeyTerms discards. It is built once and only ever read.
var keyTermStopWords = map[string]struct{}{
	"a": {}, "an": {}, "the": {}, "is": {}, "are": {},
	"was": {}, "were": {}, "be": {}, "been": {}, "being": {},
	"have": {}, "has": {}, "had": {}, "do": {}, "does": {},
	"did": {}, "will": {}, "would": {}, "could": {}, "should": {},
	"may": {}, "might": {}, "must": {}, "can": {},
	"i": {}, "me": {}, "my": {}, "we": {}, "our": {},
	"you": {}, "your": {}, "it": {}, "its": {},
	"this": {}, "that": {}, "these": {}, "those": {},
	"what": {}, "which": {}, "who": {}, "whom": {},
	"how": {}, "why": {}, "when": {}, "where": {},
	"to": {}, "for": {}, "with": {}, "about": {}, "from": {},
	"in": {}, "on": {}, "at": {}, "by": {}, "of": {},
	"and": {}, "or": {}, "but": {}, "if": {}, "then": {},
}

// extractKeyTerms returns the meaningful words of query in their original
// order. The query is lower-cased and split on whitespace; each word has
// surrounding punctuation stripped and is dropped if it is shorter than two
// bytes or is a stop word. The result is nil when no word survives.
func extractKeyTerms(query string) []string {
	var terms []string
	for _, word := range strings.Fields(strings.ToLower(query)) {
		// Strip punctuation at the edges only; inner characters such as the
		// apostrophe in "doesn't" are kept.
		word = strings.Trim(word, ".,?!;:'\"()[]{}")
		if len(word) < 2 {
			continue
		}
		if _, isStop := keyTermStopWords[word]; isStop {
			continue
		}
		terms = append(terms, word)
	}
	return terms
}
|
|
|
|
// makeDeclarative converts a question to a declarative statement.
|
|
func makeDeclarative(query string) string {
|
|
query = strings.TrimSpace(query)
|
|
|
|
// Remove question mark
|
|
query = strings.TrimSuffix(query, "?")
|
|
|
|
// Handle common question patterns
|
|
patterns := []struct {
|
|
prefix string
|
|
replacement string
|
|
}{
|
|
{"how do i ", ""},
|
|
{"how to ", ""},
|
|
{"how does ", ""},
|
|
{"how is ", ""},
|
|
{"what is ", ""},
|
|
{"what are ", ""},
|
|
{"why does ", ""},
|
|
{"why is ", ""},
|
|
{"where is ", ""},
|
|
{"where are ", ""},
|
|
{"when does ", ""},
|
|
{"when is ", ""},
|
|
}
|
|
|
|
lower := strings.ToLower(query)
|
|
for _, p := range patterns {
|
|
if strings.HasPrefix(lower, p.prefix) {
|
|
return strings.TrimSpace(query[len(p.prefix):])
|
|
}
|
|
}
|
|
|
|
return query
|
|
}
|
|
|
|
// deduplicateExpansions removes duplicate queries while preserving order.
|
|
func deduplicateExpansions(expansions []ExpandedQuery) []ExpandedQuery {
|
|
seen := make(map[string]bool)
|
|
result := make([]ExpandedQuery, 0, len(expansions))
|
|
|
|
for _, exp := range expansions {
|
|
normalized := strings.ToLower(strings.TrimSpace(exp.Query))
|
|
if !seen[normalized] {
|
|
seen[normalized] = true
|
|
result = append(result, exp)
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// cosineSimilarity returns the cosine of the angle between a and b, computed
// in float64. It returns 0 when the vectors are empty, differ in length, or
// either has zero magnitude.
func cosineSimilarity(a, b []float32) float64 {
	if len(a) == 0 || len(a) != len(b) {
		return 0
	}

	var dot, normA, normB float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		normA += x * x
		normB += y * y
	}

	if normA == 0 || normB == 0 {
		return 0
	}

	// Use the standard library square root: a fixed-iteration approximation
	// is inaccurate for very small or very large norms, which skews scores
	// for vectors that are not unit length.
	return dot / (math.Sqrt(normA) * math.Sqrt(normB))
}
|
|
|
|
// sqrt returns the square root of x, or 0 when x is zero or negative.
//
// It delegates to math.Sqrt; a fixed number of Newton iterations started at x
// does not converge for large or very small inputs.
func sqrt(x float64) float64 {
	if x <= 0 {
		return 0
	}
	return math.Sqrt(x)
}
|
|
|
|
// truncate shortens s to at most maxLen characters (runes), appending "..."
// when anything was cut off. Strings that already fit are returned unchanged.
// The cut always falls on a rune boundary, so multi-byte characters are never
// split. A negative maxLen is treated as zero.
func truncate(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	// The byte length is an upper bound on the rune count, so a string this
	// short cannot need truncation.
	if len(s) <= maxLen {
		return s
	}

	runes := 0
	for i := range s {
		// i is the byte offset of the next rune; stop once maxLen runes
		// have been kept.
		if runes == maxLen {
			return s[:i] + "..."
		}
		runes++
	}
	return s
}
|