feat(leann-phase2): implement hybrid vector storage and graph-based search

- [x] Add AST-aware code chunking for Go, Python, and TypeScript using tree-sitter
- [x] Implement LEANN-inspired hybrid vector storage with hub detection and selective embedding storage (60-80% savings)
- [x] Add observation relationship graph with CSR format and edge detection (file overlap, semantic similarity, temporal, concept)
- [x] Implement graph-aware search with two-level traversal and relationship-based ranking
- [x] Add auto-tuning system for dynamic hub threshold adjustment based on query performance
- [x] Add comprehensive metrics tracking for vector storage, queries, latency, and graph traversals
- [x] Update configuration system with graph and hybrid storage settings
- [x] Add graph stats and vector metrics endpoints to worker service
- [x] Enhance UI sidebar with advanced metrics display and graph visualization
- [x] Optimize struct field alignment throughout codebase for memory efficiency
- [x] Update documentation with LEANN Phase 2 features and performance benefits
- [x] Add tree-sitter dependency for AST parsing
This commit is contained in:
2026-01-07 20:43:10 +00:00
parent 7ab4b07cf2
commit 74ae8ed4c1
83 changed files with 5190 additions and 603 deletions
+417
View File
@@ -0,0 +1,417 @@
package graph
import (
"context"
"fmt"
"math"
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
"github.com/rs/zerolog/log"
)
const (
// SemanticSimilarityThreshold is the minimum cosine similarity two
// embeddings must reach for DetectSemanticEdges to link their observations.
SemanticSimilarityThreshold = 0.85
// MinFileOverlapForEdge is the minimum file-set overlap ratio (as computed
// by calculateFileOverlap) required before a file-overlap edge is created.
MinFileOverlapForEdge = 0.3
// MaxEdgesPerNode is the per-node cap that DetectEdges passes to pruneEdges.
MaxEdgesPerNode = 20
)
// DetectEdges identifies relationships between observations.
//
// Three detectors run in sequence: temporal (same SDK session), concept
// (shared tags) and file overlap. Their combined output is passed through
// pruneEdges with MaxEdgesPerNode. Semantic edges are not produced here; they
// require embeddings and come from DetectSemanticEdges.
//
// Fewer than two observations yields (nil, nil). If ctx is cancelled before or
// between detector passes, ctx's error is returned and no edges are produced.
func DetectEdges(ctx context.Context, observations []*models.Observation) ([]Edge, error) {
	if len(observations) < 2 {
		return nil, nil
	}

	// cancelled reports ctx's error; a nil ctx is tolerated so existing callers
	// that passed nil keep working.
	cancelled := func() error {
		if ctx == nil {
			return nil
		}
		return ctx.Err()
	}
	if err := cancelled(); err != nil {
		return nil, err
	}

	// Build lookup maps for efficient detection.
	sessionMap := buildSessionMap(observations)
	conceptMap := buildConceptMap(observations)
	fileMap := buildFileMap(observations)

	log.Info().
		Int("observations", len(observations)).
		Int("sessions", len(sessionMap)).
		Int("concepts", len(conceptMap)).
		Msg("Starting edge detection")

	// Temporal edges: consecutive observations of the same session.
	temporalEdges := detectTemporalEdges(sessionMap)
	if err := cancelled(); err != nil {
		return nil, err
	}

	// Concept edges: observations sharing a tag. This pass is quadratic in the
	// size of each concept group, so check for cancellation around it.
	conceptEdges := detectConceptEdges(conceptMap)
	if err := cancelled(); err != nil {
		return nil, err
	}

	// File overlap edges: observations touching a sufficiently similar file set.
	fileEdges := detectFileOverlapEdges(fileMap, observations)
	if err := cancelled(); err != nil {
		return nil, err
	}

	// The final size is known, so allocate once instead of growing three times.
	edges := make([]Edge, 0, len(temporalEdges)+len(conceptEdges)+len(fileEdges))
	edges = append(edges, temporalEdges...)
	edges = append(edges, conceptEdges...)
	edges = append(edges, fileEdges...)

	// Prune excessive edges per node.
	edges = pruneEdges(edges, MaxEdgesPerNode)

	log.Info().
		Int("temporal_edges", len(temporalEdges)).
		Int("concept_edges", len(conceptEdges)).
		Int("file_edges", len(fileEdges)).
		Int("total_edges", len(edges)).
		Msg("Edge detection complete")

	return edges, nil
}
// buildSessionMap indexes observation IDs by SDK session ID, keeping the IDs
// in the order the observations appear in the input. Observations whose
// SDKSessionID is empty are left out.
func buildSessionMap(observations []*models.Observation) map[string][]int64 {
	bySession := map[string][]int64{}
	for _, o := range observations {
		sid := o.SDKSessionID
		if sid == "" {
			continue
		}
		bySession[sid] = append(bySession[sid], o.ID)
	}
	return bySession
}
// buildConceptMap groups observation IDs by concept tag, in input order.
//
// Each observation is listed at most once per concept, even if its Concepts
// slice repeats a tag. A duplicated ID would otherwise be paired with itself
// downstream and produce a self-loop edge.
func buildConceptMap(observations []*models.Observation) map[string][]int64 {
	conceptMap := make(map[string][]int64)
	for _, obs := range observations {
		for _, concept := range obs.Concepts {
			ids := conceptMap[concept]
			// All appends for one observation happen back to back, so a repeat
			// can only show up as the last element of the list.
			if n := len(ids); n > 0 && ids[n-1] == obs.ID {
				continue
			}
			conceptMap[concept] = append(ids, obs.ID)
		}
	}
	return conceptMap
}
// buildFileMap maps each file path to the IDs of the observations that
// reference it through FilesRead or FilesModified, in input order.
//
// An observation is listed at most once per file. A file that is both read and
// modified by the same observation (or listed twice) would otherwise put the
// same ID in the list twice, which downstream pairing turns into a self-loop
// edge with overlap 1.0.
func buildFileMap(observations []*models.Observation) map[string][]int64 {
	fileMap := make(map[string][]int64)
	for _, obs := range observations {
		id := obs.ID
		// add records id under file unless this observation already did so.
		// All appends for one observation are consecutive, so checking the
		// last element is enough.
		add := func(file string) {
			ids := fileMap[file]
			if n := len(ids); n > 0 && ids[n-1] == id {
				return
			}
			fileMap[file] = append(ids, id)
		}
		for _, file := range obs.FilesRead {
			add(file)
		}
		for _, file := range obs.FilesModified {
			add(file)
		}
	}
	return fileMap
}
// detectTemporalEdges links each observation of a session to the one that
// follows it in that session's ID list. Every such edge carries
// RelationTemporal and a fixed weight of 0.8. Sessions with fewer than two
// observations contribute nothing. The result is never nil.
func detectTemporalEdges(sessionMap map[string][]int64) []Edge {
	links := []Edge{}
	for _, ids := range sessionMap {
		// Walk adjacent pairs (previous, current); an empty or single-element
		// list never enters the loop.
		for k := 1; k < len(ids); k++ {
			links = append(links, Edge{
				FromID:   ids[k-1],
				ToID:     ids[k],
				Relation: RelationTemporal,
				Weight:   0.8, // fixed weight for session adjacency
			})
		}
	}
	return links
}
// detectConceptEdges creates one RelationConcept edge per pair of observations
// that share at least one concept tag.
//
// The weight is 0.5 plus up to 0.3 depending on the length of the concept name
// (capped at 20 bytes), on the assumption that longer names are more specific.
// When a pair shares several concepts, the highest of those weights is kept,
// so the result does not depend on map iteration order. An ID that appears
// twice in one concept's list is never paired with itself.
func detectConceptEdges(conceptMap map[string][]int64) []Edge {
	edges := make([]Edge, 0)
	// edgeAt maps a pair key to the position of its edge in edges, so a later
	// concept can raise the weight of an edge that already exists.
	edgeAt := make(map[string]int)

	for concept, obsIDs := range conceptMap {
		if len(obsIDs) < 2 {
			continue
		}

		// The weight depends only on the concept, so compute it once per group.
		weight := float32(0.5 + 0.3*math.Min(1.0, float64(len(concept))/20.0))

		for i := 0; i < len(obsIDs); i++ {
			for j := i + 1; j < len(obsIDs); j++ {
				if obsIDs[i] == obsIDs[j] {
					continue // no self-loops
				}

				// Sorted pair key, so (a,b) and (b,a) are the same edge.
				pairKey := edgeKey(obsIDs[i], obsIDs[j])
				if idx, ok := edgeAt[pairKey]; ok {
					if weight > edges[idx].Weight {
						edges[idx].Weight = weight
					}
					continue
				}

				edgeAt[pairKey] = len(edges)
				edges = append(edges, Edge{
					FromID:   obsIDs[i],
					ToID:     obsIDs[j],
					Relation: RelationConcept,
					Weight:   weight,
				})
			}
		}
	}
	return edges
}
// detectFileOverlapEdges creates RelationFileOverlap edges between
// observations that reference at least one common file and whose combined
// FilesRead+FilesModified sets overlap by at least MinFileOverlapForEdge.
//
// fileMap supplies the candidate pairs (observations listed under the same
// file); observations supplies the file lists used to score each pair. Each
// unordered pair is scored once and the edge weight is the overlap ratio
// returned by calculateFileOverlap. An ID is never paired with itself, and IDs
// absent from observations are skipped.
func detectFileOverlapEdges(fileMap map[string][]int64, observations []*models.Observation) []Edge {
	edges := make([]Edge, 0)
	seen := make(map[string]bool)

	// Merge FilesRead and FilesModified once per observation instead of once
	// per candidate pair.
	filesByID := make(map[int64][]string, len(observations))
	for _, obs := range observations {
		merged := make([]string, 0, len(obs.FilesRead)+len(obs.FilesModified))
		merged = append(merged, obs.FilesRead...)
		merged = append(merged, obs.FilesModified...)
		filesByID[obs.ID] = merged
	}

	for _, obsIDs := range fileMap {
		if len(obsIDs) < 2 {
			continue
		}

		for i := 0; i < len(obsIDs); i++ {
			for j := i + 1; j < len(obsIDs); j++ {
				if obsIDs[i] == obsIDs[j] {
					continue // same observation listed twice: no self-loops
				}

				pairKey := edgeKey(obsIDs[i], obsIDs[j])
				if seen[pairKey] {
					continue
				}
				// Mark before scoring: the score depends only on the pair, so
				// a rejected pair need not be re-evaluated for another file.
				seen[pairKey] = true

				files1, ok1 := filesByID[obsIDs[i]]
				files2, ok2 := filesByID[obsIDs[j]]
				if !ok1 || !ok2 {
					continue
				}

				overlap := calculateFileOverlap(files1, files2)
				if overlap < MinFileOverlapForEdge {
					continue
				}

				edges = append(edges, Edge{
					FromID:   obsIDs[i],
					ToID:     obsIDs[j],
					Relation: RelationFileOverlap,
					Weight:   overlap,
				})
			}
		}
	}
	return edges
}
// calculateFileOverlap returns the Jaccard similarity of two file lists:
// the number of distinct paths present in both, divided by the number of
// distinct paths present in either. Duplicates within a list count once.
// If either list is empty the result is 0.
func calculateFileOverlap(files1, files2 []string) float32 {
	if len(files1) == 0 || len(files2) == 0 {
		return 0.0
	}

	first := make(map[string]struct{}, len(files1))
	for _, name := range files1 {
		first[name] = struct{}{}
	}
	second := make(map[string]struct{}, len(files2))
	for _, name := range files2 {
		second[name] = struct{}{}
	}

	// Paths present in both sets.
	shared := 0
	for name := range second {
		if _, ok := first[name]; ok {
			shared++
		}
	}

	// |A ∪ B| = |A| + |B| - |A ∩ B|
	total := len(first) + len(second) - shared
	if total == 0 {
		return 0.0
	}
	return float32(shared) / float32(total)
}
// pruneEdges limits edges per node to prevent graph explosion.
//
// Only the first edge seen for each unordered node pair is considered; later
// edges for the same pair are dropped regardless of relation type. A
// considered edge survives when both of the following hold:
//   - its source node has at most maxPerNode outgoing edges, or the edge ranks
//     among that node's top maxPerNode outgoing edges by weight;
//   - its target node has at most maxPerNode incoming edges, or the edge ranks
//     among that node's top maxPerNode incoming edges by weight.
//
// Previously only the source side was ranked, so a node's incoming edges were
// never capped. A maxPerNode <= 0 disables pruning and returns edges
// unchanged. Input order is preserved among surviving edges.
func pruneEdges(edges []Edge, maxPerNode int) []Edge {
	if maxPerNode <= 0 {
		return edges
	}

	// Group edges by endpoint so each side can be ranked independently.
	outEdges := make(map[int64][]Edge)
	inEdges := make(map[int64][]Edge)
	for _, edge := range edges {
		outEdges[edge.FromID] = append(outEdges[edge.FromID], edge)
		inEdges[edge.ToID] = append(inEdges[edge.ToID], edge)
	}

	pruned := make([]Edge, 0, len(edges))
	processed := make(map[string]bool, len(edges))

	for _, edge := range edges {
		pairKey := edgeKey(edge.FromID, edge.ToID)
		if processed[pairKey] {
			continue
		}
		processed[pairKey] = true

		// Ranking is only needed for a side that is over the cap.
		if out := outEdges[edge.FromID]; len(out) > maxPerNode && !shouldKeepEdge(edge, out, maxPerNode) {
			continue
		}
		if in := inEdges[edge.ToID]; len(in) > maxPerNode && !shouldKeepEdge(edge, in, maxPerNode) {
			continue
		}
		pruned = append(pruned, edge)
	}

	if len(pruned) < len(edges) {
		log.Debug().
			Int("original", len(edges)).
			Int("pruned", len(pruned)).
			Int("removed", len(edges)-len(pruned)).
			Msg("Pruned excessive edges")
	}

	return pruned
}
// shouldKeepEdge reports whether edge is among the maxPerNode heaviest entries
// of nodeEdges. Matching is by FromID and ToID only. nodeEdges is not
// modified; ranking is done on a copy.
func shouldKeepEdge(edge Edge, nodeEdges []Edge, maxPerNode int) bool {
	// Rank a private copy, heaviest first.
	ranked := append([]Edge(nil), nodeEdges...)
	sortEdgesByWeight(ranked)

	// Number of leading entries to inspect, clamped to [0, len(ranked)].
	limit := len(ranked)
	if maxPerNode < limit {
		limit = maxPerNode
	}
	if limit < 0 {
		limit = 0
	}

	for _, candidate := range ranked[:limit] {
		if candidate.FromID == edge.FromID && candidate.ToID == edge.ToID {
			return true
		}
	}
	return false
}
// sortEdgesByWeight sorts edges in place by Weight, heaviest first. The sort
// is stable: edges with equal weight keep their relative order.
//
// It is a bubble sort that stops as soon as a pass makes no swap, so input
// that is already ordered (for example a group of equal-weight edges) costs a
// single linear pass instead of a full quadratic run. The resulting order is
// identical to running every pass.
func sortEdgesByWeight(edges []Edge) {
	// After each pass the lightest remaining edge has sunk to position end.
	for end := len(edges) - 1; end > 0; end-- {
		swapped := false
		for j := 0; j < end; j++ {
			if edges[j].Weight < edges[j+1].Weight {
				edges[j], edges[j+1] = edges[j+1], edges[j]
				swapped = true
			}
		}
		if !swapped {
			return // nothing moved: later passes would not move anything either
		}
	}
}
// edgeKey returns an order-independent key for a pair of IDs, formatted as
// "<smaller>-<larger>", so (a, b) and (b, a) map to the same string.
func edgeKey(id1, id2 int64) string {
	lo, hi := id1, id2
	if hi < lo {
		lo, hi = hi, lo
	}
	return fmt.Sprintf("%d-%d", lo, hi)
}
// DetectSemanticEdges creates RelationSemantic edges between observations
// whose embeddings have a cosine similarity of at least
// SemanticSimilarityThreshold. The edge weight is the similarity.
//
// It requires embeddings and is called separately when they are available.
// Observations without an entry in embeddings are skipped. All remaining pairs
// are compared, so the cost is quadratic in the number of observations.
// A pair whose similarity is NaN is rejected, and an observation is never
// linked to itself even if its ID appears more than once in observations.
//
// ctx is accepted for API symmetry but is not consulted by this function.
func DetectSemanticEdges(ctx context.Context, observations []*models.Observation, embeddings map[int64][]float32) []Edge {
	edges := make([]Edge, 0)
	seen := make(map[string]bool)

	// Compare all pairs (expensive, but necessary for semantic similarity).
	for i := 0; i < len(observations); i++ {
		idA := observations[i].ID
		embA, okA := embeddings[idA]
		if !okA {
			continue
		}

		for j := i + 1; j < len(observations); j++ {
			idB := observations[j].ID
			if idA == idB {
				continue // duplicate entry for the same observation: no self-loop
			}
			embB, okB := embeddings[idB]
			if !okB {
				continue
			}

			similarity := cosineSimilarity(embA, embB)
			// NaN compares false against everything, so it must be rejected
			// explicitly or it would slip past the threshold test.
			if math.IsNaN(float64(similarity)) || similarity < SemanticSimilarityThreshold {
				continue
			}

			pairKey := edgeKey(idA, idB)
			if seen[pairKey] {
				continue
			}
			seen[pairKey] = true

			edges = append(edges, Edge{
				FromID:   idA,
				ToID:     idB,
				Relation: RelationSemantic,
				Weight:   similarity,
			})
		}
	}

	log.Info().
		Int("semantic_edges", len(edges)).
		Float32("threshold", SemanticSimilarityThreshold).
		Msg("Detected semantic edges")

	return edges
}
// cosineSimilarity computes the cosine similarity between two vectors.
//
// The result lies in [-1, 1]. It is 0 when the vectors differ in length, when
// either has zero magnitude (including empty vectors), or when the inputs
// contain NaN/Inf so that no meaningful similarity exists.
//
// Sums are accumulated in float64: squaring float32 components overflows to
// +Inf for magnitudes around 1e19 and underflows to 0 around 1e-23, which
// previously produced NaN or a spurious 0 for perfectly parallel vectors.
func cosineSimilarity(a, b []float32) float32 {
	if len(a) != len(b) {
		return 0.0
	}

	var dotProduct, normA, normB float64
	for i := range a {
		x, y := float64(a[i]), float64(b[i])
		dotProduct += x * y
		normA += x * x
		normB += y * y
	}

	if normA == 0 || normB == 0 {
		return 0.0
	}

	// Multiply the square roots rather than taking the root of the product, so
	// the denominator cannot overflow.
	similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))
	switch {
	case math.IsNaN(similarity):
		return 0.0
	case similarity > 1:
		similarity = 1 // rounding can push parallel vectors just past 1
	case similarity < -1:
		similarity = -1
	}
	return float32(similarity)
}
+423
View File
@@ -0,0 +1,423 @@
// Package graph provides observation relationship graphs for LEANN Phase 2.
//
// This package implements graph-based selective recomputation where observation
// relationships (file overlap, semantic similarity, temporal proximity, shared
// concepts) form a graph structure. Hub nodes (high-degree observations) store
// embeddings, while leaf nodes recompute on demand.
package graph
import (
"context"
"fmt"
"math"
"sort"
"sync"
"time"
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
"github.com/rs/zerolog/log"
)
// RelationType defines the type of relationship between observations.
// Values are assigned with iota in declaration order, starting at 0.
type RelationType int
const (
// RelationFileOverlap indicates observations reference overlapping files
// (overlap ratio of at least MinFileOverlapForEdge).
RelationFileOverlap RelationType = iota
// RelationSemantic indicates high semantic similarity (cosine similarity of
// at least SemanticSimilarityThreshold).
RelationSemantic
// RelationTemporal indicates consecutive observations from the same session.
RelationTemporal
// RelationConcept indicates shared concept tags.
RelationConcept
)
// Edge represents a relationship between two observations, identified by
// their observation IDs. BuildCSR files each edge under its FromID row only,
// while AddEdge counts it toward the degree of both endpoints.
type Edge struct {
// FromID is the ID of the observation the edge starts at.
FromID int64
// ToID is the ID of the observation the edge points to.
ToID int64
// Relation records which kind of relationship the edge expresses.
Relation RelationType
Weight float32 // 0.0-1.0, higher = stronger relationship
}
// Node represents an observation in the graph.
type Node struct {
// Metadata is descriptive information copied from the observation.
Metadata NodeMetadata
// LastAccess is set to the construction time by BuildFromObservations.
LastAccess time.Time
StoredEmb []float32 // Nil if recomputed on-demand
// ID is the observation ID; it is the key under which the node is stored.
ID int64
Degree int // Number of edge endpoints on this node (incremented by AddEdge, ranked by FindHubs)
// AccessCount starts at 0 in BuildFromObservations.
AccessCount int
}
// NodeMetadata contains observation metadata. BuildFromObservations fills it
// from the corresponding fields of the source observation.
type NodeMetadata struct {
// CreatedAt is derived from the observation's CreatedAtEpoch (milliseconds).
CreatedAt time.Time
// Project is the observation's project.
Project string
// Type is the observation's type, converted to a string.
Type string
// Title is the observation's title, or "" when the title is not valid (null).
Title string
// IsSuperseded mirrors the observation's IsSuperseded flag.
IsSuperseded bool
}
// CSRGraph represents a graph in Compressed Sparse Row format for memory efficiency.
// Nodes are addressed by index, where the index is the node's position in the
// ascending-sorted list of node IDs at the time BuildCSR ran.
type CSRGraph struct {
RowPtr []int32 // Row offsets, one per node plus one: node i's edges are entries RowPtr[i] up to (not including) RowPtr[i+1]
ColIdx []int32 // Destination node index (not ID) of each edge
Weights []float32 // Edge weights, parallel to ColIdx
mu sync.RWMutex // Guards RowPtr, ColIdx and Weights
}
// ObservationGraph manages the observation relationship graph.
// It keeps the nodes, the raw edge list, and a CSR index that BuildCSR derives
// from them. AddEdge and BuildCSR acquire edgesMu before nodesMu.
type ObservationGraph struct {
// nodes maps observation ID to its node; guarded by nodesMu.
nodes map[int64]*Node
// csr is the traversal index rebuilt by BuildCSR; it has its own lock.
csr *CSRGraph
// edges is the append-only edge list; guarded by edgesMu.
edges []Edge
nodesMu sync.RWMutex
edgesMu sync.RWMutex
}
// NewObservationGraph returns an empty graph that is ready for use: the node
// map and edge list are initialised and an empty CSR index is attached.
func NewObservationGraph() *ObservationGraph {
	g := new(ObservationGraph)
	g.nodes = map[int64]*Node{}
	g.edges = []Edge{}
	g.csr = new(CSRGraph)
	return g
}
// AddNode stores node under its ID, replacing any node previously stored
// under the same ID. It takes the node write lock for the duration.
func (g *ObservationGraph) AddNode(node *Node) {
	key := node.ID

	g.nodesMu.Lock()
	defer g.nodesMu.Unlock()

	g.nodes[key] = node
}
// AddEdge appends edge to the edge list and increments the Degree of each
// endpoint that is present in the graph. Endpoints that are not known nodes
// are ignored for degree purposes but the edge is still recorded.
//
// Locks are taken in the order edgesMu, then nodesMu.
func (g *ObservationGraph) AddEdge(edge Edge) {
	g.edgesMu.Lock()
	defer g.edgesMu.Unlock()
	g.edges = append(g.edges, edge)

	g.nodesMu.Lock()
	defer g.nodesMu.Unlock()
	// Source first, then target; a self-loop therefore counts twice.
	for _, endpoint := range [2]int64{edge.FromID, edge.ToID} {
		if n, known := g.nodes[endpoint]; known {
			n.Degree++
		}
	}
}
// BuildCSR converts the edge list to CSR format for efficient traversal.
//
// Nodes are indexed by their position in the ascending-sorted list of node
// IDs. Each edge is stored in the row of its FromID node; edges with an
// endpoint that is not a node in the graph are left out entirely. Within a
// row, edges keep the order in which they were added.
//
// Row sizes are counted with the same both-endpoints-known rule that is used
// when filling the rows. Counting on the source endpoint alone left unfilled
// slots that read back as a zero-weight edge to node index 0.
//
// It returns an error when the graph has no nodes. Locks are taken in the
// order edgesMu, nodesMu, then the CSR lock for the final swap.
func (g *ObservationGraph) BuildCSR() error {
	g.edgesMu.RLock()
	g.nodesMu.RLock()
	defer g.edgesMu.RUnlock()
	defer g.nodesMu.RUnlock()

	if len(g.nodes) == 0 {
		return fmt.Errorf("no nodes in graph")
	}

	// Create node ID to index mapping.
	nodeIDs := make([]int64, 0, len(g.nodes))
	for id := range g.nodes {
		nodeIDs = append(nodeIDs, id)
	}
	sort.Slice(nodeIDs, func(i, j int) bool {
		return nodeIDs[i] < nodeIDs[j]
	})

	idToIdx := make(map[int64]int32, len(nodeIDs))
	for idx, id := range nodeIDs {
		// #nosec G115 - observation count will never exceed int32 max (2.1B) in practice
		idToIdx[id] = int32(idx)
	}

	// Count storable edges per source node directly into rowPtr[idx+1], then
	// turn the counts into offsets with a running sum.
	rowPtr := make([]int32, len(nodeIDs)+1)
	for _, edge := range g.edges {
		fromIdx, fromOk := idToIdx[edge.FromID]
		_, toOk := idToIdx[edge.ToID]
		if fromOk && toOk {
			rowPtr[fromIdx+1]++
		}
	}
	for i := 1; i < len(rowPtr); i++ {
		rowPtr[i] += rowPtr[i-1]
	}

	// Build column indices and weights.
	totalEdges := rowPtr[len(nodeIDs)]
	colIdx := make([]int32, totalEdges)
	weights := make([]float32, totalEdges)

	// next[i] is the next free slot in row i.
	next := make([]int32, len(nodeIDs))
	copy(next, rowPtr[:len(nodeIDs)])

	for _, edge := range g.edges {
		fromIdx, fromOk := idToIdx[edge.FromID]
		toIdx, toOk := idToIdx[edge.ToID]
		if !fromOk || !toOk {
			continue
		}
		pos := next[fromIdx]
		colIdx[pos] = toIdx
		weights[pos] = edge.Weight
		next[fromIdx]++
	}

	g.csr.mu.Lock()
	g.csr.RowPtr = rowPtr
	g.csr.ColIdx = colIdx
	g.csr.Weights = weights
	g.csr.mu.Unlock()

	log.Info().
		Int("nodes", len(nodeIDs)).
		Int("edges", int(totalEdges)).
		Msg("Built CSR graph representation")

	return nil
}
// GetNeighbors returns the IDs of the nodes stored in nodeID's CSR row,
// together with the matching edge weights (the two slices are parallel).
// BuildCSR files edges under their FromID, so these are the targets of
// nodeID's outgoing edges.
//
// An error is returned when nodeID is not a node in the graph, or when the CSR
// index does not match the current node set (BuildCSR was never called, or
// nodes were added since). In the latter case, rebuild with BuildCSR and retry.
// Previously that situation indexed past the end of RowPtr and panicked.
//
// The node-ID snapshot is taken and nodesMu released before the CSR lock is
// acquired, so this method never holds both locks at once. BuildCSR takes them
// in the order nodesMu then CSR lock; nesting them the other way round here
// could deadlock against it.
func (g *ObservationGraph) GetNeighbors(nodeID int64) ([]int64, []float32, error) {
	// Rebuild the same ID -> index mapping BuildCSR uses (ascending ID order).
	g.nodesMu.RLock()
	nodeIDs := make([]int64, 0, len(g.nodes))
	for id := range g.nodes {
		nodeIDs = append(nodeIDs, id)
	}
	g.nodesMu.RUnlock()

	sort.Slice(nodeIDs, func(i, j int) bool {
		return nodeIDs[i] < nodeIDs[j]
	})

	nodeIdx := sort.Search(len(nodeIDs), func(i int) bool {
		return nodeIDs[i] >= nodeID
	})
	if nodeIdx >= len(nodeIDs) || nodeIDs[nodeIdx] != nodeID {
		return nil, nil, fmt.Errorf("node %d not found", nodeID)
	}

	g.csr.mu.RLock()
	defer g.csr.mu.RUnlock()

	// The index has one row per node plus a terminator. Nodes are only ever
	// added, so a differing count means the index predates the node set and
	// its positions no longer line up with nodeIDs.
	if len(g.csr.RowPtr) != len(nodeIDs)+1 {
		return nil, nil, fmt.Errorf("CSR index not built for current node set (looking up node %d): call BuildCSR", nodeID)
	}

	// Extract neighbors from CSR.
	startIdx := g.csr.RowPtr[nodeIdx]
	endIdx := g.csr.RowPtr[nodeIdx+1]
	neighborCount := endIdx - startIdx

	neighbors := make([]int64, neighborCount)
	weights := make([]float32, neighborCount)
	for i := int32(0); i < neighborCount; i++ {
		neighborIdx := g.csr.ColIdx[startIdx+i]
		neighbors[i] = nodeIDs[neighborIdx]
		weights[i] = g.csr.Weights[startIdx+i]
	}

	return neighbors, weights, nil
}
// GetNode looks up the node stored under nodeID. It returns an error if no
// node with that ID has been added.
func (g *ObservationGraph) GetNode(nodeID int64) (*Node, error) {
	g.nodesMu.RLock()
	found, exists := g.nodes[nodeID]
	g.nodesMu.RUnlock()

	if exists {
		return found, nil
	}
	return nil, fmt.Errorf("node %d not found", nodeID)
}
// FindHubs identifies hub nodes (high degree) in the graph.
//
// percentile is a fraction in [0, 1]: the nodes at or above that degree
// percentile are returned, i.e. the top ceil(N * (1 - percentile)) nodes by
// Degree, highest first. For example 0.9 returns the top 10%. Ties in degree
// are broken by ascending node ID so the result is deterministic.
//
// Out-of-range input is clamped rather than rejected: a percentile above 1 (or
// NaN) yields no hubs and a percentile below 0 yields every node. Previously a
// value above 1 produced a negative length and panicked. An empty graph
// returns nil.
func (g *ObservationGraph) FindHubs(percentile float64) []int64 {
	g.nodesMu.RLock()
	defer g.nodesMu.RUnlock()

	if len(g.nodes) == 0 {
		return nil
	}

	type nodeDegree struct {
		ID     int64
		Degree int
	}
	ranked := make([]nodeDegree, 0, len(g.nodes))
	for id, node := range g.nodes {
		ranked = append(ranked, nodeDegree{ID: id, Degree: node.Degree})
	}
	sort.Slice(ranked, func(i, j int) bool {
		if ranked[i].Degree != ranked[j].Degree {
			return ranked[i].Degree > ranked[j].Degree
		}
		return ranked[i].ID < ranked[j].ID
	})

	// Fraction of nodes to return, clamped to [0, 1].
	fraction := 1.0 - percentile
	switch {
	case math.IsNaN(fraction) || fraction < 0:
		fraction = 0
	case fraction > 1:
		fraction = 1
	}

	// 1.0 - percentile is rarely exact in binary floating point (1.0 - 0.95 is
	// slightly more than 0.05), which would make Ceil round a whole number up
	// to the next one: 100 nodes at 0.95 would yield 6 hubs instead of 5. The
	// small epsilon absorbs that error without affecting genuine fractions.
	const epsilon = 1e-9
	cutoff := int(math.Ceil(float64(len(ranked))*fraction - epsilon))
	if cutoff < 0 {
		cutoff = 0
	}
	if cutoff > len(ranked) {
		cutoff = len(ranked)
	}

	hubs := make([]int64, cutoff)
	for i := 0; i < cutoff; i++ {
		hubs[i] = ranked[i].ID
	}

	log.Info().
		Int("total_nodes", len(g.nodes)).
		Int("hubs", len(hubs)).
		Float64("percentile", percentile).
		Msg("Identified hub nodes")

	return hubs
}
// Stats returns graph statistics: node and edge counts, the average, median,
// minimum and maximum node degree, and the number of edges per relation type.
// Degree figures are left at zero for a graph without nodes; EdgeTypes is
// always a non-nil map.
//
// Locks are taken in the order edgesMu, then nodesMu, the same order AddEdge
// and BuildCSR use. Taking them the other way round can deadlock: this method
// would hold nodesMu waiting for edgesMu while AddEdge holds edgesMu waiting
// for nodesMu.
func (g *ObservationGraph) Stats() GraphStats {
	g.edgesMu.RLock()
	defer g.edgesMu.RUnlock()
	g.nodesMu.RLock()
	defer g.nodesMu.RUnlock()

	stats := GraphStats{
		NodeCount: len(g.nodes),
		EdgeCount: len(g.edges),
	}

	if len(g.nodes) > 0 {
		degrees := make([]int, 0, len(g.nodes))
		for _, node := range g.nodes {
			degrees = append(degrees, node.Degree)
		}
		sort.Ints(degrees)

		stats.AvgDegree = float64(sum(degrees)) / float64(len(degrees))
		stats.MaxDegree = degrees[len(degrees)-1]
		stats.MinDegree = degrees[0]

		// Median: middle element, or the mean of the two middle elements when
		// the count is even.
		mid := len(degrees) / 2
		if len(degrees)%2 == 0 {
			stats.MedianDegree = float64(degrees[mid-1]+degrees[mid]) / 2.0
		} else {
			stats.MedianDegree = float64(degrees[mid])
		}
	}

	// Count edge types.
	stats.EdgeTypes = make(map[RelationType]int)
	for _, edge := range g.edges {
		stats.EdgeTypes[edge.Relation]++
	}

	return stats
}
// GraphStats contains graph statistics as computed by ObservationGraph.Stats.
type GraphStats struct {
// EdgeTypes counts edges per relation type.
EdgeTypes map[RelationType]int
// AvgDegree is the mean of all node degrees.
AvgDegree float64
// MedianDegree is the median of all node degrees.
MedianDegree float64
// NodeCount is the number of nodes in the graph.
NodeCount int
// EdgeCount is the number of edges in the edge list.
EdgeCount int
// MaxDegree is the largest node degree.
MaxDegree int
// MinDegree is the smallest node degree.
MinDegree int
}
// BuildFromObservations builds a complete graph for the given observations:
// one node per observation, edges from DetectEdges, and a CSR index from
// BuildCSR. Any failure in edge detection or CSR construction is returned
// wrapped, with a nil graph.
func BuildFromObservations(ctx context.Context, observations []*models.Observation) (*ObservationGraph, error) {
	g := NewObservationGraph()

	// One node per observation, starting with zero degree and no accesses.
	for _, obs := range observations {
		// Title is nullable; an invalid (null) title becomes "".
		var title string
		if obs.Title.Valid {
			title = obs.Title.String
		}

		meta := NodeMetadata{
			Project:      obs.Project,
			Type:         string(obs.Type),
			Title:        title,
			CreatedAt:    time.UnixMilli(obs.CreatedAtEpoch),
			IsSuperseded: obs.IsSuperseded,
		}
		g.AddNode(&Node{
			ID:          obs.ID,
			Degree:      0,
			Metadata:    meta,
			LastAccess:  time.Now(),
			AccessCount: 0,
		})
	}

	// Relationships between the observations.
	detected, err := DetectEdges(ctx, observations)
	if err != nil {
		return nil, fmt.Errorf("detect edges: %w", err)
	}
	for i := range detected {
		g.AddEdge(detected[i])
	}

	// Traversal index over the nodes and edges just added.
	if err = g.BuildCSR(); err != nil {
		return nil, fmt.Errorf("build CSR: %w", err)
	}

	return g, nil
}
// sum returns the total of all values; it is 0 for an empty or nil slice.
func sum(values []int) int {
	acc := 0
	for i := 0; i < len(values); i++ {
		acc += values[i]
	}
	return acc
}
// String returns the snake_case name of the relation type, or "unknown" for a
// value that is not one of the declared constants.
func (r RelationType) String() string {
	// Names indexed by constant value.
	names := [...]string{
		RelationFileOverlap: "file_overlap",
		RelationSemantic:    "semantic",
		RelationTemporal:    "temporal",
		RelationConcept:     "concept",
	}
	if r < 0 || int(r) >= len(names) {
		return "unknown"
	}
	return names[r]
}