mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-25 04:03:08 +00:00
feat(leann-phase2): implement hybrid vector storage and graph-based search
- [x] Add AST-aware code chunking for Go, Python, and TypeScript using tree-sitter
- [x] Implement LEANN-inspired hybrid vector storage with hub detection and selective embedding storage (60-80% savings)
- [x] Add observation relationship graph with CSR format and edge detection (file overlap, semantic similarity, temporal, concept)
- [x] Implement graph-aware search with two-level traversal and relationship-based ranking
- [x] Add auto-tuning system for dynamic hub threshold adjustment based on query performance
- [x] Add comprehensive metrics tracking for vector storage, queries, latency, and graph traversals
- [x] Update configuration system with graph and hybrid storage settings
- [x] Add graph stats and vector metrics endpoints to worker service
- [x] Enhance UI sidebar with advanced metrics display and graph visualization
- [x] Optimize struct field alignment throughout codebase for memory efficiency
- [x] Update documentation with LEANN Phase 2 features and performance benefits
- [x] Add tree-sitter dependency for AST parsing
This commit is contained in:
@@ -0,0 +1,417 @@
|
||||
package graph
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
const (
	// SemanticSimilarityThreshold is the minimum cosine similarity two
	// embeddings must reach before DetectSemanticEdges links their observations.
	SemanticSimilarityThreshold = 0.85

	// MinFileOverlapForEdge is the minimum file-set overlap ratio (as returned
	// by calculateFileOverlap) required to create a file-overlap edge.
	MinFileOverlapForEdge = 0.3

	// MaxEdgesPerNode is the per-node edge budget that DetectEdges hands to
	// pruneEdges to keep the graph from becoming too dense.
	MaxEdgesPerNode = 20
)
|
||||
|
||||
// DetectEdges identifies relationships between observations
|
||||
func DetectEdges(ctx context.Context, observations []*models.Observation) ([]Edge, error) {
|
||||
if len(observations) < 2 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
edges := make([]Edge, 0)
|
||||
|
||||
// Build lookup maps for efficient detection
|
||||
sessionMap := buildSessionMap(observations)
|
||||
conceptMap := buildConceptMap(observations)
|
||||
fileMap := buildFileMap(observations)
|
||||
|
||||
log.Info().
|
||||
Int("observations", len(observations)).
|
||||
Int("sessions", len(sessionMap)).
|
||||
Int("concepts", len(conceptMap)).
|
||||
Msg("Starting edge detection")
|
||||
|
||||
// Detect temporal edges (same session)
|
||||
temporalEdges := detectTemporalEdges(sessionMap)
|
||||
edges = append(edges, temporalEdges...)
|
||||
|
||||
// Detect concept edges (shared tags)
|
||||
conceptEdges := detectConceptEdges(conceptMap)
|
||||
edges = append(edges, conceptEdges...)
|
||||
|
||||
// Detect file overlap edges
|
||||
fileEdges := detectFileOverlapEdges(fileMap, observations)
|
||||
edges = append(edges, fileEdges...)
|
||||
|
||||
// Prune excessive edges per node
|
||||
edges = pruneEdges(edges, MaxEdgesPerNode)
|
||||
|
||||
log.Info().
|
||||
Int("temporal_edges", len(temporalEdges)).
|
||||
Int("concept_edges", len(conceptEdges)).
|
||||
Int("file_edges", len(fileEdges)).
|
||||
Int("total_edges", len(edges)).
|
||||
Msg("Edge detection complete")
|
||||
|
||||
return edges, nil
|
||||
}
|
||||
|
||||
// buildSessionMap groups observations by SDK session
|
||||
func buildSessionMap(observations []*models.Observation) map[string][]int64 {
|
||||
sessionMap := make(map[string][]int64)
|
||||
|
||||
for _, obs := range observations {
|
||||
if obs.SDKSessionID != "" {
|
||||
sessionMap[obs.SDKSessionID] = append(sessionMap[obs.SDKSessionID], obs.ID)
|
||||
}
|
||||
}
|
||||
|
||||
return sessionMap
|
||||
}
|
||||
|
||||
// buildConceptMap groups observations by concept tags
|
||||
func buildConceptMap(observations []*models.Observation) map[string][]int64 {
|
||||
conceptMap := make(map[string][]int64)
|
||||
|
||||
for _, obs := range observations {
|
||||
for _, concept := range obs.Concepts {
|
||||
conceptMap[concept] = append(conceptMap[concept], obs.ID)
|
||||
}
|
||||
}
|
||||
|
||||
return conceptMap
|
||||
}
|
||||
|
||||
// buildFileMap maps files to observations (from both FilesRead and FilesModified)
|
||||
func buildFileMap(observations []*models.Observation) map[string][]int64 {
|
||||
fileMap := make(map[string][]int64)
|
||||
|
||||
for _, obs := range observations {
|
||||
// Add files from FilesRead
|
||||
for _, file := range obs.FilesRead {
|
||||
fileMap[file] = append(fileMap[file], obs.ID)
|
||||
}
|
||||
// Add files from FilesModified
|
||||
for _, file := range obs.FilesModified {
|
||||
fileMap[file] = append(fileMap[file], obs.ID)
|
||||
}
|
||||
}
|
||||
|
||||
return fileMap
|
||||
}
|
||||
|
||||
// detectTemporalEdges creates edges between observations in the same session
|
||||
func detectTemporalEdges(sessionMap map[string][]int64) []Edge {
|
||||
edges := make([]Edge, 0)
|
||||
|
||||
for _, obsIDs := range sessionMap {
|
||||
if len(obsIDs) < 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Create edges between consecutive observations in session
|
||||
for i := 0; i < len(obsIDs)-1; i++ {
|
||||
edges = append(edges, Edge{
|
||||
FromID: obsIDs[i],
|
||||
ToID: obsIDs[i+1],
|
||||
Relation: RelationTemporal,
|
||||
Weight: 0.8, // High weight for temporal proximity
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return edges
|
||||
}
|
||||
|
||||
// detectConceptEdges creates edges between observations sharing concepts
|
||||
func detectConceptEdges(conceptMap map[string][]int64) []Edge {
|
||||
edges := make([]Edge, 0)
|
||||
seen := make(map[string]bool)
|
||||
|
||||
for concept, obsIDs := range conceptMap {
|
||||
if len(obsIDs) < 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Create edges between all observations sharing this concept
|
||||
for i := 0; i < len(obsIDs); i++ {
|
||||
for j := i + 1; j < len(obsIDs); j++ {
|
||||
// Use sorted pair as key to avoid duplicates
|
||||
pairKey := edgeKey(obsIDs[i], obsIDs[j])
|
||||
if seen[pairKey] {
|
||||
continue
|
||||
}
|
||||
seen[pairKey] = true
|
||||
|
||||
// Weight based on concept specificity (longer = more specific)
|
||||
weight := float32(0.5 + 0.3*math.Min(1.0, float64(len(concept))/20.0))
|
||||
|
||||
edges = append(edges, Edge{
|
||||
FromID: obsIDs[i],
|
||||
ToID: obsIDs[j],
|
||||
Relation: RelationConcept,
|
||||
Weight: weight,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return edges
|
||||
}
|
||||
|
||||
// detectFileOverlapEdges creates edges based on file references
|
||||
func detectFileOverlapEdges(fileMap map[string][]int64, observations []*models.Observation) []Edge {
|
||||
edges := make([]Edge, 0)
|
||||
seen := make(map[string]bool)
|
||||
|
||||
// Build observation ID to observation map for quick lookup
|
||||
obsMap := make(map[int64]*models.Observation)
|
||||
for _, obs := range observations {
|
||||
obsMap[obs.ID] = obs
|
||||
}
|
||||
|
||||
for _, obsIDs := range fileMap {
|
||||
if len(obsIDs) < 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Create edges between observations referencing same files
|
||||
for i := 0; i < len(obsIDs); i++ {
|
||||
for j := i + 1; j < len(obsIDs); j++ {
|
||||
pairKey := edgeKey(obsIDs[i], obsIDs[j])
|
||||
if seen[pairKey] {
|
||||
continue
|
||||
}
|
||||
seen[pairKey] = true
|
||||
|
||||
// Calculate file overlap ratio
|
||||
obs1, ok1 := obsMap[obsIDs[i]]
|
||||
obs2, ok2 := obsMap[obsIDs[j]]
|
||||
|
||||
if !ok1 || !ok2 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Merge FilesRead and FilesModified for both observations
|
||||
files1 := append([]string{}, obs1.FilesRead...)
|
||||
files1 = append(files1, obs1.FilesModified...)
|
||||
files2 := append([]string{}, obs2.FilesRead...)
|
||||
files2 = append(files2, obs2.FilesModified...)
|
||||
|
||||
overlap := calculateFileOverlap(files1, files2)
|
||||
if overlap < MinFileOverlapForEdge {
|
||||
continue
|
||||
}
|
||||
|
||||
edges = append(edges, Edge{
|
||||
FromID: obsIDs[i],
|
||||
ToID: obsIDs[j],
|
||||
Relation: RelationFileOverlap,
|
||||
Weight: overlap,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return edges
|
||||
}
|
||||
|
||||
// calculateFileOverlap returns the Jaccard similarity of two file lists,
// |A ∩ B| / |A ∪ B|, treating each list as a set so repeated paths count
// once. The result is 0 when either list is empty.
func calculateFileOverlap(files1, files2 []string) float32 {
	if len(files1) == 0 || len(files2) == 0 {
		return 0.0
	}

	left := make(map[string]struct{}, len(files1))
	for _, name := range files1 {
		left[name] = struct{}{}
	}

	right := make(map[string]struct{}, len(files2))
	for _, name := range files2 {
		right[name] = struct{}{}
	}

	// Count the paths present on both sides.
	shared := 0
	for name := range right {
		if _, ok := left[name]; ok {
			shared++
		}
	}

	// |A ∪ B| = |A| + |B| - |A ∩ B|
	total := len(left) + len(right) - shared
	if total == 0 {
		return 0.0
	}

	return float32(shared) / float32(total)
}
|
||||
|
||||
// pruneEdges limits edges per node to prevent graph explosion
|
||||
func pruneEdges(edges []Edge, maxPerNode int) []Edge {
|
||||
if maxPerNode <= 0 {
|
||||
return edges
|
||||
}
|
||||
|
||||
// Count edges per node
|
||||
outEdges := make(map[int64][]Edge)
|
||||
inEdges := make(map[int64][]Edge)
|
||||
|
||||
for _, edge := range edges {
|
||||
outEdges[edge.FromID] = append(outEdges[edge.FromID], edge)
|
||||
inEdges[edge.ToID] = append(inEdges[edge.ToID], edge)
|
||||
}
|
||||
|
||||
// Prune low-weight edges if node has too many
|
||||
pruned := make([]Edge, 0, len(edges))
|
||||
processed := make(map[string]bool)
|
||||
|
||||
for _, edge := range edges {
|
||||
pairKey := edgeKey(edge.FromID, edge.ToID)
|
||||
if processed[pairKey] {
|
||||
continue
|
||||
}
|
||||
processed[pairKey] = true
|
||||
|
||||
// Check if either node has too many edges
|
||||
fromCount := len(outEdges[edge.FromID])
|
||||
toCount := len(inEdges[edge.ToID])
|
||||
|
||||
if fromCount <= maxPerNode && toCount <= maxPerNode {
|
||||
pruned = append(pruned, edge)
|
||||
continue
|
||||
}
|
||||
|
||||
// Keep edge if it's high-weight (top edges for this node)
|
||||
if shouldKeepEdge(edge, outEdges[edge.FromID], maxPerNode) {
|
||||
pruned = append(pruned, edge)
|
||||
}
|
||||
}
|
||||
|
||||
if len(pruned) < len(edges) {
|
||||
log.Debug().
|
||||
Int("original", len(edges)).
|
||||
Int("pruned", len(pruned)).
|
||||
Int("removed", len(edges)-len(pruned)).
|
||||
Msg("Pruned excessive edges")
|
||||
}
|
||||
|
||||
return pruned
|
||||
}
|
||||
|
||||
// shouldKeepEdge determines if edge should be kept during pruning
|
||||
func shouldKeepEdge(edge Edge, nodeEdges []Edge, maxPerNode int) bool {
|
||||
// Sort node's edges by weight descending
|
||||
sortedEdges := make([]Edge, len(nodeEdges))
|
||||
copy(sortedEdges, nodeEdges)
|
||||
|
||||
sortEdgesByWeight(sortedEdges)
|
||||
|
||||
// Keep edge if it's in top maxPerNode
|
||||
for i := 0; i < maxPerNode && i < len(sortedEdges); i++ {
|
||||
if sortedEdges[i].FromID == edge.FromID && sortedEdges[i].ToID == edge.ToID {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// sortEdgesByWeight sorts edges by weight descending
|
||||
func sortEdgesByWeight(edges []Edge) {
|
||||
// Simple bubble sort (edges are typically small per node)
|
||||
n := len(edges)
|
||||
for i := 0; i < n-1; i++ {
|
||||
for j := 0; j < n-i-1; j++ {
|
||||
if edges[j].Weight < edges[j+1].Weight {
|
||||
edges[j], edges[j+1] = edges[j+1], edges[j]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// edgeKey builds a direction-independent key for a pair of IDs, formatted as
// "<smaller>-<larger>", so (a, b) and (b, a) map to the same string.
func edgeKey(id1, id2 int64) string {
	lo, hi := id1, id2
	if hi < lo {
		lo, hi = hi, lo
	}
	return fmt.Sprintf("%d-%d", lo, hi)
}
|
||||
|
||||
// DetectSemanticEdges creates edges based on semantic similarity
|
||||
// This requires embeddings and is called separately when available
|
||||
func DetectSemanticEdges(ctx context.Context, observations []*models.Observation, embeddings map[int64][]float32) []Edge {
|
||||
edges := make([]Edge, 0)
|
||||
seen := make(map[string]bool)
|
||||
|
||||
// Compare all pairs (expensive, but necessary for semantic similarity)
|
||||
for i := 0; i < len(observations); i++ {
|
||||
emb1, ok1 := embeddings[observations[i].ID]
|
||||
if !ok1 {
|
||||
continue
|
||||
}
|
||||
|
||||
for j := i + 1; j < len(observations); j++ {
|
||||
emb2, ok2 := embeddings[observations[j].ID]
|
||||
if !ok2 {
|
||||
continue
|
||||
}
|
||||
|
||||
similarity := cosineSimilarity(emb1, emb2)
|
||||
if similarity < SemanticSimilarityThreshold {
|
||||
continue
|
||||
}
|
||||
|
||||
pairKey := edgeKey(observations[i].ID, observations[j].ID)
|
||||
if seen[pairKey] {
|
||||
continue
|
||||
}
|
||||
seen[pairKey] = true
|
||||
|
||||
edges = append(edges, Edge{
|
||||
FromID: observations[i].ID,
|
||||
ToID: observations[j].ID,
|
||||
Relation: RelationSemantic,
|
||||
Weight: similarity,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Int("semantic_edges", len(edges)).
|
||||
Float32("threshold", SemanticSimilarityThreshold).
|
||||
Msg("Detected semantic edges")
|
||||
|
||||
return edges
|
||||
}
|
||||
|
||||
// cosineSimilarity returns the cosine of the angle between a and b:
// dot(a, b) / (|a| * |b|).
//
// It returns 0 when the vectors differ in length or when either has zero
// magnitude (including two empty vectors). Sums are accumulated in float64:
// squaring float32 components in float32 overflows to +Inf for large values
// (turning the result into NaN) and underflows to 0 for tiny ones (turning
// a perfect match into 0).
func cosineSimilarity(a, b []float32) float32 {
	if len(a) != len(b) {
		return 0.0
	}

	var dotProduct, normA, normB float64
	for i := range a {
		x, y := float64(a[i]), float64(b[i])
		dotProduct += x * y
		normA += x * x
		normB += y * y
	}

	if normA == 0 || normB == 0 {
		return 0.0
	}

	return float32(dotProduct / (math.Sqrt(normA) * math.Sqrt(normB)))
}
|
||||
@@ -0,0 +1,423 @@
|
||||
// Package graph provides observation relationship graphs for LEANN Phase 2.
|
||||
//
|
||||
// This package implements graph-based selective recomputation where observation
|
||||
// relationships (file overlap, semantic similarity, temporal proximity) form a
|
||||
// graph structure. Hub nodes (high-degree observations) store embeddings, while
|
||||
// leaf nodes recompute on-demand.
|
||||
package graph
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// RelationType identifies the kind of relationship an Edge represents.
type RelationType int

const (
	// RelationFileOverlap marks observations that reference overlapping files.
	RelationFileOverlap RelationType = iota
	// RelationSemantic marks observations with high semantic similarity
	// (cosine similarity of their embeddings at or above the semantic threshold).
	RelationSemantic
	// RelationTemporal marks observations that belong to the same session.
	RelationTemporal
	// RelationConcept marks observations that share concept tags.
	RelationConcept
)
|
||||
|
||||
// Edge represents a relationship between two observations.
type Edge struct {
	FromID   int64        // ID of the observation the edge starts at
	ToID     int64        // ID of the observation the edge points to
	Relation RelationType // kind of relationship
	Weight   float32      // 0.0-1.0, higher = stronger relationship
}
|
||||
|
||||
// Node represents an observation in the graph.
type Node struct {
	Metadata    NodeMetadata // descriptive fields copied from the observation
	LastAccess  time.Time    // set to the construction time by BuildFromObservations
	StoredEmb   []float32    // Nil if recomputed on-demand
	ID          int64        // observation ID; also the key in ObservationGraph.nodes
	Degree      int          // Number of edges (hub detection); AddEdge increments it for both endpoints
	AccessCount int          // NOTE(review): never updated in this file -- presumably maintained by callers
}
|
||||
|
||||
// NodeMetadata contains the descriptive observation fields carried by a Node.
// BuildFromObservations fills it from the corresponding observation.
type NodeMetadata struct {
	CreatedAt    time.Time // derived from the observation's CreatedAtEpoch (milliseconds)
	Project      string    // observation's Project
	Type         string    // observation's Type, converted to a plain string
	Title        string    // observation's Title, or "" when the title is not valid (null)
	IsSuperseded bool      // observation's IsSuperseded flag
}
|
||||
|
||||
// CSRGraph represents a graph in Compressed Sparse Row format for memory efficiency.
//
// Nodes are addressed by their index in the ascending list of node IDs (see
// BuildCSR). The outgoing edges of node index i occupy positions
// RowPtr[i] up to, but not including, RowPtr[i+1] in ColIdx and Weights.
type CSRGraph struct {
	RowPtr  []int32   // Node adjacency list pointers; length is node count + 1
	ColIdx  []int32   // Edge destination node indices (not observation IDs)
	Weights []float32 // Edge weights, parallel to ColIdx
	mu      sync.RWMutex // guards the three slices above
}
|
||||
|
||||
// ObservationGraph manages the observation relationship graph.
//
// Nodes and the raw edge list are guarded by separate locks. AddEdge and
// BuildCSR acquire edgesMu before nodesMu; code that needs both locks should
// use that same order to avoid deadlock.
type ObservationGraph struct {
	nodes   map[int64]*Node // nodes keyed by observation ID
	csr     *CSRGraph       // adjacency index, rebuilt by BuildCSR
	edges   []Edge          // raw edge list in insertion order
	nodesMu sync.RWMutex    // guards nodes (and the Degree field of each Node)
	edgesMu sync.RWMutex    // guards edges
}
|
||||
|
||||
// NewObservationGraph creates a new empty observation graph
|
||||
func NewObservationGraph() *ObservationGraph {
|
||||
return &ObservationGraph{
|
||||
nodes: make(map[int64]*Node),
|
||||
edges: make([]Edge, 0),
|
||||
csr: &CSRGraph{},
|
||||
}
|
||||
}
|
||||
|
||||
// AddNode adds or updates a node in the graph
|
||||
func (g *ObservationGraph) AddNode(node *Node) {
|
||||
g.nodesMu.Lock()
|
||||
defer g.nodesMu.Unlock()
|
||||
|
||||
g.nodes[node.ID] = node
|
||||
}
|
||||
|
||||
// AddEdge adds an edge to the graph
|
||||
func (g *ObservationGraph) AddEdge(edge Edge) {
|
||||
g.edgesMu.Lock()
|
||||
defer g.edgesMu.Unlock()
|
||||
|
||||
g.edges = append(g.edges, edge)
|
||||
|
||||
// Update degree counts
|
||||
g.nodesMu.Lock()
|
||||
if fromNode, ok := g.nodes[edge.FromID]; ok {
|
||||
fromNode.Degree++
|
||||
}
|
||||
if toNode, ok := g.nodes[edge.ToID]; ok {
|
||||
toNode.Degree++
|
||||
}
|
||||
g.nodesMu.Unlock()
|
||||
}
|
||||
|
||||
// BuildCSR converts edge list to CSR format for efficient traversal
|
||||
func (g *ObservationGraph) BuildCSR() error {
|
||||
g.edgesMu.RLock()
|
||||
g.nodesMu.RLock()
|
||||
defer g.edgesMu.RUnlock()
|
||||
defer g.nodesMu.RUnlock()
|
||||
|
||||
if len(g.nodes) == 0 {
|
||||
return fmt.Errorf("no nodes in graph")
|
||||
}
|
||||
|
||||
// Create node ID to index mapping
|
||||
nodeIDs := make([]int64, 0, len(g.nodes))
|
||||
for id := range g.nodes {
|
||||
nodeIDs = append(nodeIDs, id)
|
||||
}
|
||||
sort.Slice(nodeIDs, func(i, j int) bool {
|
||||
return nodeIDs[i] < nodeIDs[j]
|
||||
})
|
||||
|
||||
idToIdx := make(map[int64]int32)
|
||||
for idx, id := range nodeIDs {
|
||||
// #nosec G115 - observation count will never exceed int32 max (2.1B) in practice
|
||||
idToIdx[id] = int32(idx)
|
||||
}
|
||||
|
||||
// Count edges per node
|
||||
edgeCounts := make([]int, len(nodeIDs))
|
||||
for _, edge := range g.edges {
|
||||
if fromIdx, ok := idToIdx[edge.FromID]; ok {
|
||||
edgeCounts[fromIdx]++
|
||||
}
|
||||
}
|
||||
|
||||
// Build row pointers
|
||||
rowPtr := make([]int32, len(nodeIDs)+1)
|
||||
rowPtr[0] = 0
|
||||
for i := 0; i < len(nodeIDs); i++ {
|
||||
// #nosec G115 - edge counts per node will not exceed int32 max
|
||||
rowPtr[i+1] = rowPtr[i] + int32(edgeCounts[i])
|
||||
}
|
||||
|
||||
// Build column indices and weights
|
||||
totalEdges := rowPtr[len(nodeIDs)]
|
||||
colIdx := make([]int32, totalEdges)
|
||||
weights := make([]float32, totalEdges)
|
||||
|
||||
// Temporary counter for filling CSR
|
||||
currentPos := make([]int32, len(nodeIDs))
|
||||
copy(currentPos, rowPtr[:len(nodeIDs)])
|
||||
|
||||
for _, edge := range g.edges {
|
||||
fromIdx, fromOk := idToIdx[edge.FromID]
|
||||
toIdx, toOk := idToIdx[edge.ToID]
|
||||
|
||||
if fromOk && toOk {
|
||||
pos := currentPos[fromIdx]
|
||||
colIdx[pos] = toIdx
|
||||
weights[pos] = edge.Weight
|
||||
currentPos[fromIdx]++
|
||||
}
|
||||
}
|
||||
|
||||
g.csr.mu.Lock()
|
||||
g.csr.RowPtr = rowPtr
|
||||
g.csr.ColIdx = colIdx
|
||||
g.csr.Weights = weights
|
||||
g.csr.mu.Unlock()
|
||||
|
||||
log.Info().
|
||||
Int("nodes", len(nodeIDs)).
|
||||
Int("edges", int(totalEdges)).
|
||||
Msg("Built CSR graph representation")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetNeighbors returns neighboring nodes and their edge weights
|
||||
func (g *ObservationGraph) GetNeighbors(nodeID int64) ([]int64, []float32, error) {
|
||||
g.csr.mu.RLock()
|
||||
defer g.csr.mu.RUnlock()
|
||||
|
||||
// Find node index in CSR
|
||||
g.nodesMu.RLock()
|
||||
nodeIDs := make([]int64, 0, len(g.nodes))
|
||||
for id := range g.nodes {
|
||||
nodeIDs = append(nodeIDs, id)
|
||||
}
|
||||
g.nodesMu.RUnlock()
|
||||
|
||||
sort.Slice(nodeIDs, func(i, j int) bool {
|
||||
return nodeIDs[i] < nodeIDs[j]
|
||||
})
|
||||
|
||||
nodeIdx := sort.Search(len(nodeIDs), func(i int) bool {
|
||||
return nodeIDs[i] >= nodeID
|
||||
})
|
||||
|
||||
if nodeIdx >= len(nodeIDs) || nodeIDs[nodeIdx] != nodeID {
|
||||
return nil, nil, fmt.Errorf("node %d not found", nodeID)
|
||||
}
|
||||
|
||||
// Extract neighbors from CSR
|
||||
startIdx := g.csr.RowPtr[nodeIdx]
|
||||
endIdx := g.csr.RowPtr[nodeIdx+1]
|
||||
|
||||
neighborCount := endIdx - startIdx
|
||||
neighbors := make([]int64, neighborCount)
|
||||
weights := make([]float32, neighborCount)
|
||||
|
||||
for i := int32(0); i < neighborCount; i++ {
|
||||
neighborIdx := g.csr.ColIdx[startIdx+i]
|
||||
neighbors[i] = nodeIDs[neighborIdx]
|
||||
weights[i] = g.csr.Weights[startIdx+i]
|
||||
}
|
||||
|
||||
return neighbors, weights, nil
|
||||
}
|
||||
|
||||
// GetNode retrieves a node by ID
|
||||
func (g *ObservationGraph) GetNode(nodeID int64) (*Node, error) {
|
||||
g.nodesMu.RLock()
|
||||
defer g.nodesMu.RUnlock()
|
||||
|
||||
node, ok := g.nodes[nodeID]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("node %d not found", nodeID)
|
||||
}
|
||||
|
||||
return node, nil
|
||||
}
|
||||
|
||||
// FindHubs identifies hub nodes (high degree) in the graph
|
||||
func (g *ObservationGraph) FindHubs(percentile float64) []int64 {
|
||||
g.nodesMu.RLock()
|
||||
defer g.nodesMu.RUnlock()
|
||||
|
||||
if len(g.nodes) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Collect all degrees
|
||||
degrees := make([]int, 0, len(g.nodes))
|
||||
nodeIDs := make([]int64, 0, len(g.nodes))
|
||||
|
||||
for id, node := range g.nodes {
|
||||
degrees = append(degrees, node.Degree)
|
||||
nodeIDs = append(nodeIDs, id)
|
||||
}
|
||||
|
||||
// Sort by degree
|
||||
type nodeDegree struct {
|
||||
ID int64
|
||||
Degree int
|
||||
}
|
||||
|
||||
nodeDegrees := make([]nodeDegree, len(nodeIDs))
|
||||
for i := range nodeIDs {
|
||||
nodeDegrees[i] = nodeDegree{
|
||||
ID: nodeIDs[i],
|
||||
Degree: degrees[i],
|
||||
}
|
||||
}
|
||||
|
||||
sort.Slice(nodeDegrees, func(i, j int) bool {
|
||||
return nodeDegrees[i].Degree > nodeDegrees[j].Degree
|
||||
})
|
||||
|
||||
// Return top percentile
|
||||
cutoff := int(math.Ceil(float64(len(nodeDegrees)) * (1.0 - percentile)))
|
||||
if cutoff > len(nodeDegrees) {
|
||||
cutoff = len(nodeDegrees)
|
||||
}
|
||||
|
||||
hubs := make([]int64, cutoff)
|
||||
for i := 0; i < cutoff; i++ {
|
||||
hubs[i] = nodeDegrees[i].ID
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Int("total_nodes", len(g.nodes)).
|
||||
Int("hubs", len(hubs)).
|
||||
Float64("percentile", percentile).
|
||||
Msg("Identified hub nodes")
|
||||
|
||||
return hubs
|
||||
}
|
||||
|
||||
// Stats returns graph statistics
|
||||
func (g *ObservationGraph) Stats() GraphStats {
|
||||
g.nodesMu.RLock()
|
||||
g.edgesMu.RLock()
|
||||
defer g.nodesMu.RUnlock()
|
||||
defer g.edgesMu.RUnlock()
|
||||
|
||||
stats := GraphStats{
|
||||
NodeCount: len(g.nodes),
|
||||
EdgeCount: len(g.edges),
|
||||
}
|
||||
|
||||
if len(g.nodes) > 0 {
|
||||
degrees := make([]int, 0, len(g.nodes))
|
||||
for _, node := range g.nodes {
|
||||
degrees = append(degrees, node.Degree)
|
||||
}
|
||||
|
||||
sort.Ints(degrees)
|
||||
stats.AvgDegree = float64(sum(degrees)) / float64(len(degrees))
|
||||
stats.MaxDegree = degrees[len(degrees)-1]
|
||||
stats.MinDegree = degrees[0]
|
||||
|
||||
// Median
|
||||
mid := len(degrees) / 2
|
||||
if len(degrees)%2 == 0 {
|
||||
stats.MedianDegree = float64(degrees[mid-1]+degrees[mid]) / 2.0
|
||||
} else {
|
||||
stats.MedianDegree = float64(degrees[mid])
|
||||
}
|
||||
}
|
||||
|
||||
// Count edge types
|
||||
stats.EdgeTypes = make(map[RelationType]int)
|
||||
for _, edge := range g.edges {
|
||||
stats.EdgeTypes[edge.Relation]++
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// GraphStats contains the graph statistics produced by ObservationGraph.Stats.
type GraphStats struct {
	EdgeTypes    map[RelationType]int // number of edges per relation type
	AvgDegree    float64              // mean node degree
	MedianDegree float64              // median node degree
	NodeCount    int                  // number of nodes
	EdgeCount    int                  // number of edges in the edge list
	MaxDegree    int                  // highest node degree
	MinDegree    int                  // lowest node degree
}
|
||||
|
||||
// BuildFromObservations constructs a graph from a list of observations
|
||||
func BuildFromObservations(ctx context.Context, observations []*models.Observation) (*ObservationGraph, error) {
|
||||
graph := NewObservationGraph()
|
||||
|
||||
// Add nodes
|
||||
for _, obs := range observations {
|
||||
// Extract title from sql.NullString
|
||||
title := ""
|
||||
if obs.Title.Valid {
|
||||
title = obs.Title.String
|
||||
}
|
||||
|
||||
node := &Node{
|
||||
ID: obs.ID,
|
||||
Degree: 0,
|
||||
Metadata: NodeMetadata{
|
||||
Project: obs.Project,
|
||||
Type: string(obs.Type),
|
||||
Title: title,
|
||||
CreatedAt: time.UnixMilli(obs.CreatedAtEpoch),
|
||||
IsSuperseded: obs.IsSuperseded,
|
||||
},
|
||||
LastAccess: time.Now(),
|
||||
AccessCount: 0,
|
||||
}
|
||||
graph.AddNode(node)
|
||||
}
|
||||
|
||||
// Detect edges (will be implemented in edge_detector.go)
|
||||
edges, err := DetectEdges(ctx, observations)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("detect edges: %w", err)
|
||||
}
|
||||
|
||||
for _, edge := range edges {
|
||||
graph.AddEdge(edge)
|
||||
}
|
||||
|
||||
// Build CSR representation
|
||||
if err := graph.BuildCSR(); err != nil {
|
||||
return nil, fmt.Errorf("build CSR: %w", err)
|
||||
}
|
||||
|
||||
return graph, nil
|
||||
}
|
||||
|
||||
// sum returns the total of all values; it is 0 for an empty or nil slice.
func sum(values []int) int {
	acc := 0
	for i := 0; i < len(values); i++ {
		acc += values[i]
	}
	return acc
}
|
||||
|
||||
// String returns a human-readable representation of RelationType
|
||||
func (r RelationType) String() string {
|
||||
switch r {
|
||||
case RelationFileOverlap:
|
||||
return "file_overlap"
|
||||
case RelationSemantic:
|
||||
return "semantic"
|
||||
case RelationTemporal:
|
||||
return "temporal"
|
||||
case RelationConcept:
|
||||
return "concept"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user