feat(leann-phase2): implement hybrid vector storage and graph-based search

- [x] Add AST-aware code chunking for Go, Python, and TypeScript using tree-sitter
- [x] Implement LEANN-inspired hybrid vector storage with hub detection and selective embedding storage (60-80% savings)
- [x] Add observation relationship graph with CSR format and edge detection (file overlap, semantic similarity, temporal, concept)
- [x] Implement graph-aware search with two-level traversal and relationship-based ranking
- [x] Add auto-tuning system for dynamic hub threshold adjustment based on query performance
- [x] Add comprehensive metrics tracking for vector storage, queries, latency, and graph traversals
- [x] Update configuration system with graph and hybrid storage settings
- [x] Add graph stats and vector metrics endpoints to worker service
- [x] Enhance UI sidebar with advanced metrics display and graph visualization
- [x] Optimize struct field alignment throughout codebase for memory efficiency
- [x] Update documentation with LEANN Phase 2 features and performance benefits
- [x] Add tree-sitter dependency for AST parsing
2026-06-25 04:03:08 +00:00 · 2026-01-07 20:43:10 +00:00
parent 7ab4b07cf2
commit 74ae8ed4c1
83 changed files with 5190 additions and 603 deletions
@@ -0,0 +1,417 @@
+package graph
+
+import (
+	"context"
+	"fmt"
+	"math"
+
+	"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
+	"github.com/rs/zerolog/log"
+)
+
+const (
+	// SemanticSimilarityThreshold is the minimum cosine similarity two
+	// embeddings must reach for DetectSemanticEdges to link their observations.
+	SemanticSimilarityThreshold = 0.85
+
+	// MinFileOverlapForEdge is the minimum Jaccard file-overlap ratio required
+	// for detectFileOverlapEdges to create an edge.
+	MinFileOverlapForEdge = 0.3
+
+	// MaxEdgesPerNode is the per-node edge limit that DetectEdges passes to
+	// pruneEdges.
+	MaxEdgesPerNode = 20
+)
+
+// DetectEdges derives relationship edges for a batch of observations.
+//
+// Temporal, concept and file-overlap edges are collected in that order and
+// then passed through pruneEdges with MaxEdgesPerNode. Fewer than two
+// observations yield (nil, nil). The returned error is always nil, and ctx is
+// accepted but not consulted.
+func DetectEdges(ctx context.Context, observations []*models.Observation) ([]Edge, error) {
+	if len(observations) < 2 {
+		return nil, nil
+	}
+
+	// Index the observations once so each detector works from a lookup table.
+	bySession := buildSessionMap(observations)
+	byConcept := buildConceptMap(observations)
+	byFile := buildFileMap(observations)
+
+	log.Info().
+		Int("observations", len(observations)).
+		Int("sessions", len(bySession)).
+		Int("concepts", len(byConcept)).
+		Msg("Starting edge detection")
+
+	temporal := detectTemporalEdges(bySession)
+	concept := detectConceptEdges(byConcept)
+	fileOverlap := detectFileOverlapEdges(byFile, observations)
+
+	// Concatenate in detector order; pruneEdges keeps the first edge per pair.
+	all := make([]Edge, 0)
+	all = append(all, temporal...)
+	all = append(all, concept...)
+	all = append(all, fileOverlap...)
+
+	all = pruneEdges(all, MaxEdgesPerNode)
+
+	log.Info().
+		Int("temporal_edges", len(temporal)).
+		Int("concept_edges", len(concept)).
+		Int("file_edges", len(fileOverlap)).
+		Int("total_edges", len(all)).
+		Msg("Edge detection complete")
+
+	return all, nil
+}
+
+// buildSessionMap indexes observation IDs by their SDK session ID.
+// Observations with an empty SDKSessionID are left out; IDs keep input order.
+func buildSessionMap(observations []*models.Observation) map[string][]int64 {
+	bySession := make(map[string][]int64)
+
+	for _, o := range observations {
+		sessionID := o.SDKSessionID
+		if sessionID == "" {
+			continue
+		}
+		bySession[sessionID] = append(bySession[sessionID], o.ID)
+	}
+
+	return bySession
+}
+
+// buildConceptMap indexes observation IDs by each concept tag they carry.
+// An observation is listed once per entry in its Concepts slice, in input order.
+func buildConceptMap(observations []*models.Observation) map[string][]int64 {
+	byConcept := make(map[string][]int64)
+
+	for _, o := range observations {
+		id := o.ID
+		for _, tag := range o.Concepts {
+			byConcept[tag] = append(byConcept[tag], id)
+		}
+	}
+
+	return byConcept
+}
+
+// buildFileMap maps each file path to the IDs of the observations that
+// reference it through FilesRead or FilesModified.
+//
+// Every observation is listed at most once per file. A file that the same
+// observation both read and modified would otherwise put the ID under that
+// file twice, and pair enumeration over the list would then link the
+// observation to itself.
+func buildFileMap(observations []*models.Observation) map[string][]int64 {
+	fileMap := make(map[string][]int64)
+
+	for _, obs := range observations {
+		// Files already recorded for this observation.
+		recorded := make(map[string]struct{}, len(obs.FilesRead)+len(obs.FilesModified))
+
+		for _, files := range [][]string{obs.FilesRead, obs.FilesModified} {
+			for _, file := range files {
+				if _, dup := recorded[file]; dup {
+					continue
+				}
+				recorded[file] = struct{}{}
+				fileMap[file] = append(fileMap[file], obs.ID)
+			}
+		}
+	}
+
+	return fileMap
+}
+
+// detectTemporalEdges links each observation in a session to the one that
+// follows it in that session's ID list, using a fixed weight of 0.8.
+// Sessions with fewer than two observations contribute nothing.
+func detectTemporalEdges(sessionMap map[string][]int64) []Edge {
+	result := make([]Edge, 0)
+
+	for _, ids := range sessionMap {
+		// Pair every ID with its predecessor; a single-ID list has no pairs.
+		for next := 1; next < len(ids); next++ {
+			result = append(result, Edge{
+				FromID:   ids[next-1],
+				ToID:     ids[next],
+				Relation: RelationTemporal,
+				Weight:   0.8, // High weight for temporal proximity
+			})
+		}
+	}
+
+	return result
+}
+
+// detectConceptEdges creates one edge per pair of distinct observations that
+// share at least one concept tag.
+//
+// The weight grows with the concept's length (0.5 up to 0.8, saturating at 20
+// bytes) on the assumption that longer tags are more specific. When a pair
+// shares several concepts the highest weight is kept, so the result does not
+// depend on map iteration order. Pairs made of the same ID twice are skipped
+// so no self-loops are produced.
+func detectConceptEdges(conceptMap map[string][]int64) []Edge {
+	edges := make([]Edge, 0)
+	// position maps a pair key to the index of that pair's edge in edges.
+	position := make(map[string]int)
+
+	for concept, obsIDs := range conceptMap {
+		if len(obsIDs) < 2 {
+			continue
+		}
+
+		// The weight depends only on the concept, so compute it once.
+		weight := float32(0.5 + 0.3*math.Min(1.0, float64(len(concept))/20.0))
+
+		for i := 0; i < len(obsIDs); i++ {
+			for j := i + 1; j < len(obsIDs); j++ {
+				// A duplicated ID in the list must not yield a self-loop.
+				if obsIDs[i] == obsIDs[j] {
+					continue
+				}
+
+				// Use sorted pair as key to avoid duplicates.
+				pairKey := edgeKey(obsIDs[i], obsIDs[j])
+				if idx, ok := position[pairKey]; ok {
+					// Pair already linked: keep the strongest shared concept.
+					if weight > edges[idx].Weight {
+						edges[idx].Weight = weight
+					}
+					continue
+				}
+				position[pairKey] = len(edges)
+
+				edges = append(edges, Edge{
+					FromID:   obsIDs[i],
+					ToID:     obsIDs[j],
+					Relation: RelationConcept,
+					Weight:   weight,
+				})
+			}
+		}
+	}
+
+	return edges
+}
+
+// detectFileOverlapEdges creates an edge for every pair of distinct
+// observations that reference a common file and whose combined file sets
+// (FilesRead plus FilesModified) have a Jaccard overlap of at least
+// MinFileOverlapForEdge. The overlap ratio becomes the edge weight.
+//
+// Each unordered pair is evaluated once. Pairs made of the same ID twice are
+// skipped: an observation always overlaps itself completely and would
+// otherwise become a self-loop with weight 1.0.
+func detectFileOverlapEdges(fileMap map[string][]int64, observations []*models.Observation) []Edge {
+	edges := make([]Edge, 0)
+	seen := make(map[string]bool)
+
+	// Build observation ID to observation map for quick lookup.
+	obsMap := make(map[int64]*models.Observation, len(observations))
+	for _, obs := range observations {
+		obsMap[obs.ID] = obs
+	}
+
+	for _, obsIDs := range fileMap {
+		if len(obsIDs) < 2 {
+			continue
+		}
+
+		// Consider every pair of observations referencing this file.
+		for i := 0; i < len(obsIDs); i++ {
+			for j := i + 1; j < len(obsIDs); j++ {
+				// Never link an observation to itself.
+				if obsIDs[i] == obsIDs[j] {
+					continue
+				}
+
+				pairKey := edgeKey(obsIDs[i], obsIDs[j])
+				if seen[pairKey] {
+					continue
+				}
+				seen[pairKey] = true
+
+				obs1, ok1 := obsMap[obsIDs[i]]
+				obs2, ok2 := obsMap[obsIDs[j]]
+
+				if !ok1 || !ok2 {
+					continue
+				}
+
+				// Merge FilesRead and FilesModified for both observations.
+				files1 := append([]string{}, obs1.FilesRead...)
+				files1 = append(files1, obs1.FilesModified...)
+				files2 := append([]string{}, obs2.FilesRead...)
+				files2 = append(files2, obs2.FilesModified...)
+
+				overlap := calculateFileOverlap(files1, files2)
+				if overlap < MinFileOverlapForEdge {
+					continue
+				}
+
+				edges = append(edges, Edge{
+					FromID:   obsIDs[i],
+					ToID:     obsIDs[j],
+					Relation: RelationFileOverlap,
+					Weight:   overlap,
+				})
+			}
+		}
+	}
+
+	return edges
+}
+
+// calculateFileOverlap returns the Jaccard similarity (|A∩B| / |A∪B|) of the
+// two file lists treated as sets. Duplicate entries within a list count once.
+// If either list is empty the result is 0.
+func calculateFileOverlap(files1, files2 []string) float32 {
+	if len(files1) == 0 || len(files2) == 0 {
+		return 0.0
+	}
+
+	left := make(map[string]struct{}, len(files1))
+	for _, name := range files1 {
+		left[name] = struct{}{}
+	}
+
+	right := make(map[string]struct{}, len(files2))
+	for _, name := range files2 {
+		right[name] = struct{}{}
+	}
+
+	// Count the names present in both sets.
+	shared := 0
+	for name := range left {
+		if _, ok := right[name]; ok {
+			shared++
+		}
+	}
+
+	// |A∪B| = |A| + |B| - |A∩B|
+	total := len(left) + len(right) - shared
+	if total == 0 {
+		return 0.0
+	}
+
+	return float32(shared) / float32(total)
+}
+
+// pruneEdges de-duplicates edges and enforces a per-node edge limit.
+//
+// Only the first edge seen for each unordered node pair is considered; later
+// edges for the same pair (for example with a different relation) are dropped.
+// A surviving edge is kept when it ranks among the maxPerNode heaviest
+// outgoing edges of its source and among the maxPerNode heaviest incoming
+// edges of its target. Each side is only ranked when that node actually
+// exceeds the limit. A non-positive maxPerNode disables pruning and returns
+// edges unchanged.
+func pruneEdges(edges []Edge, maxPerNode int) []Edge {
+	if maxPerNode <= 0 {
+		return edges
+	}
+
+	// Group edges by source and by target so each side can be ranked.
+	outEdges := make(map[int64][]Edge)
+	inEdges := make(map[int64][]Edge)
+
+	for _, edge := range edges {
+		outEdges[edge.FromID] = append(outEdges[edge.FromID], edge)
+		inEdges[edge.ToID] = append(inEdges[edge.ToID], edge)
+	}
+
+	pruned := make([]Edge, 0, len(edges))
+	processed := make(map[string]bool)
+
+	for _, edge := range edges {
+		pairKey := edgeKey(edge.FromID, edge.ToID)
+		if processed[pairKey] {
+			continue
+		}
+		processed[pairKey] = true
+
+		outgoing := outEdges[edge.FromID]
+		incoming := inEdges[edge.ToID]
+
+		// Source has too many outgoing edges: keep only its heaviest ones.
+		if len(outgoing) > maxPerNode && !shouldKeepEdge(edge, outgoing, maxPerNode) {
+			continue
+		}
+		// Target has too many incoming edges: the same rule applies on its side.
+		// Ranking only the source list would never trim an overloaded target.
+		if len(incoming) > maxPerNode && !shouldKeepEdge(edge, incoming, maxPerNode) {
+			continue
+		}
+
+		pruned = append(pruned, edge)
+	}
+
+	if len(pruned) < len(edges) {
+		log.Debug().
+			Int("original", len(edges)).
+			Int("pruned", len(pruned)).
+			Int("removed", len(edges)-len(pruned)).
+			Msg("Pruned excessive edges")
+	}
+
+	return pruned
+}
+
+// shouldKeepEdge reports whether edge's endpoint pair (FromID, ToID) occurs
+// among the first maxPerNode entries of nodeEdges once those are ordered by
+// descending weight. nodeEdges itself is not modified.
+func shouldKeepEdge(edge Edge, nodeEdges []Edge, maxPerNode int) bool {
+	// Rank a private copy so the caller's slice keeps its order.
+	ranked := make([]Edge, len(nodeEdges))
+	copy(ranked, nodeEdges)
+	sortEdgesByWeight(ranked)
+
+	limit := maxPerNode
+	if len(ranked) < limit {
+		limit = len(ranked)
+	}
+
+	for idx := 0; idx < limit; idx++ {
+		candidate := ranked[idx]
+		if candidate.FromID == edge.FromID && candidate.ToID == edge.ToID {
+			return true
+		}
+	}
+
+	return false
+}
+
+// sortEdgesByWeight orders edges in place by descending weight.
+// Edges of equal weight keep their relative order.
+func sortEdgesByWeight(edges []Edge) {
+	// Bubble sort: each pass moves the lightest remaining edge to position end.
+	for end := len(edges) - 1; end > 0; end-- {
+		swapped := false
+		for k := 0; k < end; k++ {
+			if edges[k].Weight < edges[k+1].Weight {
+				edges[k], edges[k+1] = edges[k+1], edges[k]
+				swapped = true
+			}
+		}
+		// A pass without swaps means later passes would not swap either.
+		if !swapped {
+			return
+		}
+	}
+}
+
+// edgeKey returns an order-independent key for a pair of IDs, formatted as
+// "<smaller>-<larger>", so (a, b) and (b, a) map to the same string.
+func edgeKey(id1, id2 int64) string {
+	lo, hi := id1, id2
+	if lo > hi {
+		lo, hi = hi, lo
+	}
+	return fmt.Sprintf("%d-%d", lo, hi)
+}
+
+// DetectSemanticEdges creates edges between observations whose embeddings have
+// a cosine similarity of at least SemanticSimilarityThreshold. The similarity
+// becomes the edge weight. Observations without an entry in embeddings are
+// ignored.
+//
+// The scan compares every pair and is therefore quadratic in the number of
+// observations. ctx is checked once per outer iteration; when it is cancelled
+// the edges found so far are returned and a warning is logged. Pairs made of
+// the same ID twice are skipped so no self-loops are produced.
+func DetectSemanticEdges(ctx context.Context, observations []*models.Observation, embeddings map[int64][]float32) []Edge {
+	edges := make([]Edge, 0)
+	seen := make(map[string]bool)
+
+	for i := 0; i < len(observations); i++ {
+		// Stop early on cancellation. The nil guard keeps callers that pass
+		// a nil context working as before.
+		if ctx != nil {
+			if err := ctx.Err(); err != nil {
+				log.Warn().
+					Err(err).
+					Int("semantic_edges", len(edges)).
+					Msg("Semantic edge detection cancelled")
+				return edges
+			}
+		}
+
+		id1 := observations[i].ID
+		emb1, ok1 := embeddings[id1]
+		if !ok1 {
+			continue
+		}
+
+		for j := i + 1; j < len(observations); j++ {
+			id2 := observations[j].ID
+			// The same observation listed twice must not link to itself.
+			if id1 == id2 {
+				continue
+			}
+
+			emb2, ok2 := embeddings[id2]
+			if !ok2 {
+				continue
+			}
+
+			similarity := cosineSimilarity(emb1, emb2)
+			if similarity < SemanticSimilarityThreshold {
+				continue
+			}
+
+			pairKey := edgeKey(id1, id2)
+			if seen[pairKey] {
+				continue
+			}
+			seen[pairKey] = true
+
+			edges = append(edges, Edge{
+				FromID:   id1,
+				ToID:     id2,
+				Relation: RelationSemantic,
+				Weight:   similarity,
+			})
+		}
+	}
+
+	log.Info().
+		Int("semantic_edges", len(edges)).
+		Float32("threshold", SemanticSimilarityThreshold).
+		Msg("Detected semantic edges")
+
+	return edges
+}
+
+// cosineSimilarity returns the cosine of the angle between a and b.
+//
+// It returns 0 when the vectors differ in length or when either has zero
+// magnitude (which includes empty vectors). The sums are accumulated in
+// float64 so long embeddings do not lose precision, and the result is clamped
+// to [-1, 1] so rounding can never report a similarity outside that range.
+func cosineSimilarity(a, b []float32) float32 {
+	if len(a) != len(b) {
+		return 0.0
+	}
+
+	var dotProduct, normA, normB float64
+	for i := range a {
+		x, y := float64(a[i]), float64(b[i])
+		dotProduct += x * y
+		normA += x * x
+		normB += y * y
+	}
+
+	if normA == 0 || normB == 0 {
+		return 0.0
+	}
+
+	similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))
+	if similarity > 1 {
+		similarity = 1
+	} else if similarity < -1 {
+		similarity = -1
+	}
+
+	return float32(similarity)
+}
@@ -0,0 +1,423 @@
+// Package graph provides observation relationship graphs for LEANN Phase 2.
+//
+// This package implements graph-based selective recomputation where observation
+// relationships (file overlap, semantic similarity, temporal proximity) form a
+// graph structure. Hub nodes (high-degree observations) store embeddings, while
+// leaf nodes recompute on-demand.
+package graph
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
+	"github.com/rs/zerolog/log"
+)
+
+// RelationType defines the type of relationship between observations.
+// The zero value is RelationFileOverlap.
+type RelationType int
+
+const (
+	// RelationFileOverlap indicates observations reference overlapping files
+	RelationFileOverlap RelationType = iota
+	// RelationSemantic indicates high semantic similarity (cosine similarity
+	// at or above SemanticSimilarityThreshold)
+	RelationSemantic
+	// RelationTemporal indicates observations from same session
+	RelationTemporal
+	// RelationConcept indicates shared concept tags
+	RelationConcept
+)
+
+// Edge represents a relationship between two observations.
+// FromID and ToID are observation IDs; BuildCSR stores the edge only in the
+// adjacency row of FromID.
+type Edge struct {
+	FromID   int64
+	ToID     int64
+	Relation RelationType
+	Weight   float32 // 0.0-1.0, higher = stronger relationship
+}
+
+// Node represents an observation in the graph.
+type Node struct {
+	Metadata    NodeMetadata
+	LastAccess  time.Time
+	StoredEmb   []float32 // Nil if recomputed on-demand
+	ID          int64     // Observation ID; key of the node in ObservationGraph
+	Degree      int       // Number of edges (hub detection); AddEdge increments it for both endpoints
+	AccessCount int
+}
+
+// NodeMetadata contains observation metadata.
+// BuildFromObservations fills it from the corresponding models.Observation.
+type NodeMetadata struct {
+	CreatedAt    time.Time // Derived from the observation's CreatedAtEpoch (milliseconds)
+	Project      string
+	Type         string
+	Title        string // Empty when the observation has no valid title
+	IsSuperseded bool
+}
+
+// CSRGraph represents a graph in Compressed Sparse Row format for memory efficiency.
+//
+// Nodes are addressed by index, where the index is a node's position in the
+// ascending list of node IDs at the time BuildCSR ran. The outgoing edges of
+// node index i occupy positions RowPtr[i] up to (not including) RowPtr[i+1]
+// of ColIdx and Weights.
+type CSRGraph struct {
+	RowPtr  []int32   // Node adjacency list pointers (one per node plus a final end marker)
+	ColIdx  []int32   // Edge destination node indices (not observation IDs)
+	Weights []float32 // Edge weights, parallel to ColIdx
+	mu      sync.RWMutex
+}
+
+// ObservationGraph manages the observation relationship graph.
+//
+// nodes is guarded by nodesMu and edges by edgesMu. csr is a derived view
+// produced by BuildCSR; it is not updated by AddNode or AddEdge.
+type ObservationGraph struct {
+	nodes   map[int64]*Node // Keyed by observation ID
+	csr     *CSRGraph
+	edges   []Edge
+	nodesMu sync.RWMutex
+	edgesMu sync.RWMutex
+}
+
+// NewObservationGraph returns a graph with no nodes, no edges and an empty
+// CSR index, ready for AddNode and AddEdge.
+func NewObservationGraph() *ObservationGraph {
+	g := new(ObservationGraph)
+	g.nodes = map[int64]*Node{}
+	g.edges = []Edge{}
+	g.csr = new(CSRGraph)
+	return g
+}
+
+// AddNode stores node under its ID, replacing any node already registered
+// with that ID.
+func (g *ObservationGraph) AddNode(node *Node) {
+	id := node.ID
+
+	g.nodesMu.Lock()
+	defer g.nodesMu.Unlock()
+
+	g.nodes[id] = node
+}
+
+// AddEdge appends edge to the edge list and increments Degree on each endpoint
+// that is present in the graph. Endpoints without a node are ignored for the
+// degree count, but the edge is still recorded.
+//
+// Locks are taken in the order edgesMu, then nodesMu.
+func (g *ObservationGraph) AddEdge(edge Edge) {
+	g.edgesMu.Lock()
+	defer g.edgesMu.Unlock()
+
+	g.edges = append(g.edges, edge)
+
+	// Update degree counts for both ends of the edge.
+	g.nodesMu.Lock()
+	for _, endpoint := range [2]int64{edge.FromID, edge.ToID} {
+		if n, found := g.nodes[endpoint]; found {
+			n.Degree++
+		}
+	}
+	g.nodesMu.Unlock()
+}
+
+// BuildCSR converts the edge list to CSR format for efficient traversal.
+//
+// Node indices are assigned by ascending node ID. Each edge is stored once, in
+// the row of its FromID. Edges with an endpoint that has no node are left out
+// entirely: they are excluded from the row sizes as well as from the fill
+// pass, so no row ever contains an unfilled (zero-valued) slot.
+//
+// It returns an error when the graph has no nodes. The CSR index is a
+// snapshot; nodes or edges added later require another BuildCSR call.
+func (g *ObservationGraph) BuildCSR() error {
+	g.edgesMu.RLock()
+	g.nodesMu.RLock()
+	defer g.edgesMu.RUnlock()
+	defer g.nodesMu.RUnlock()
+
+	if len(g.nodes) == 0 {
+		return fmt.Errorf("no nodes in graph")
+	}
+
+	// Create node ID to index mapping.
+	nodeIDs := make([]int64, 0, len(g.nodes))
+	for id := range g.nodes {
+		nodeIDs = append(nodeIDs, id)
+	}
+	sort.Slice(nodeIDs, func(i, j int) bool {
+		return nodeIDs[i] < nodeIDs[j]
+	})
+
+	idToIdx := make(map[int64]int32, len(nodeIDs))
+	for idx, id := range nodeIDs {
+		// #nosec G115 - observation count will never exceed int32 max (2.1B) in practice
+		idToIdx[id] = int32(idx)
+	}
+
+	// Count edges per source node. Only edges that the fill pass below will
+	// actually store are counted, which keeps row sizes and contents in step.
+	edgeCounts := make([]int, len(nodeIDs))
+	for _, edge := range g.edges {
+		fromIdx, fromOk := idToIdx[edge.FromID]
+		_, toOk := idToIdx[edge.ToID]
+		if fromOk && toOk {
+			edgeCounts[fromIdx]++
+		}
+	}
+
+	// Build row pointers as a running total of the per-node counts.
+	rowPtr := make([]int32, len(nodeIDs)+1)
+	rowPtr[0] = 0
+	for i := 0; i < len(nodeIDs); i++ {
+		// #nosec G115 - edge counts per node will not exceed int32 max
+		rowPtr[i+1] = rowPtr[i] + int32(edgeCounts[i])
+	}
+
+	// Build column indices and weights.
+	totalEdges := rowPtr[len(nodeIDs)]
+	colIdx := make([]int32, totalEdges)
+	weights := make([]float32, totalEdges)
+
+	// currentPos[i] is the next free slot in row i.
+	currentPos := make([]int32, len(nodeIDs))
+	copy(currentPos, rowPtr[:len(nodeIDs)])
+
+	for _, edge := range g.edges {
+		fromIdx, fromOk := idToIdx[edge.FromID]
+		toIdx, toOk := idToIdx[edge.ToID]
+
+		if fromOk && toOk {
+			pos := currentPos[fromIdx]
+			colIdx[pos] = toIdx
+			weights[pos] = edge.Weight
+			currentPos[fromIdx]++
+		}
+	}
+
+	g.csr.mu.Lock()
+	g.csr.RowPtr = rowPtr
+	g.csr.ColIdx = colIdx
+	g.csr.Weights = weights
+	g.csr.mu.Unlock()
+
+	log.Info().
+		Int("nodes", len(nodeIDs)).
+		Int("edges", int(totalEdges)).
+		Msg("Built CSR graph representation")
+
+	return nil
+}
+
+// GetNeighbors returns the IDs of the nodes that nodeID has outgoing CSR edges
+// to, together with the matching edge weights.
+//
+// It returns an error when nodeID is unknown, or when the CSR index has not
+// been built or no longer matches the node set (for example because nodes were
+// added after the last BuildCSR call). Reading such an index would otherwise
+// fail with an out-of-range panic or yield neighbors of the wrong node.
+//
+// The node IDs are snapshotted and nodesMu is released before csr.mu is taken,
+// so the two locks are never held at the same time.
+func (g *ObservationGraph) GetNeighbors(nodeID int64) ([]int64, []float32, error) {
+	// Rebuild the ID -> index mapping used by BuildCSR (ascending node IDs).
+	g.nodesMu.RLock()
+	nodeIDs := make([]int64, 0, len(g.nodes))
+	for id := range g.nodes {
+		nodeIDs = append(nodeIDs, id)
+	}
+	g.nodesMu.RUnlock()
+
+	sort.Slice(nodeIDs, func(i, j int) bool {
+		return nodeIDs[i] < nodeIDs[j]
+	})
+
+	nodeIdx := sort.Search(len(nodeIDs), func(i int) bool {
+		return nodeIDs[i] >= nodeID
+	})
+
+	if nodeIdx >= len(nodeIDs) || nodeIDs[nodeIdx] != nodeID {
+		return nil, nil, fmt.Errorf("node %d not found", nodeID)
+	}
+
+	g.csr.mu.RLock()
+	defer g.csr.mu.RUnlock()
+
+	// BuildCSR writes one row pointer per node plus an end marker. Any other
+	// length means the index is missing or was built for a different node set.
+	if len(g.csr.RowPtr) != len(nodeIDs)+1 {
+		return nil, nil, fmt.Errorf("CSR index not built or stale for node %d", nodeID)
+	}
+
+	// Extract neighbors from CSR.
+	startIdx := g.csr.RowPtr[nodeIdx]
+	endIdx := g.csr.RowPtr[nodeIdx+1]
+
+	if startIdx < 0 || endIdx < startIdx ||
+		int(endIdx) > len(g.csr.ColIdx) || int(endIdx) > len(g.csr.Weights) {
+		return nil, nil, fmt.Errorf("CSR index inconsistent for node %d", nodeID)
+	}
+
+	neighborCount := endIdx - startIdx
+	neighbors := make([]int64, neighborCount)
+	weights := make([]float32, neighborCount)
+
+	for i := int32(0); i < neighborCount; i++ {
+		neighborIdx := g.csr.ColIdx[startIdx+i]
+		if neighborIdx < 0 || int(neighborIdx) >= len(nodeIDs) {
+			return nil, nil, fmt.Errorf("CSR index inconsistent for node %d", nodeID)
+		}
+		neighbors[i] = nodeIDs[neighborIdx]
+		weights[i] = g.csr.Weights[startIdx+i]
+	}
+
+	return neighbors, weights, nil
+}
+
+// GetNode looks up the node registered under nodeID.
+// It returns an error when no such node exists.
+func (g *ObservationGraph) GetNode(nodeID int64) (*Node, error) {
+	g.nodesMu.RLock()
+	defer g.nodesMu.RUnlock()
+
+	if found, ok := g.nodes[nodeID]; ok {
+		return found, nil
+	}
+
+	return nil, fmt.Errorf("node %d not found", nodeID)
+}
+
+// FindHubs identifies hub nodes (high degree) in the graph.
+//
+// Nodes are ranked by descending Degree, with ties broken by ascending ID so
+// the result is deterministic. The first ceil(n * (1 - percentile)) IDs are
+// returned, where n is the node count. For example, percentile 0.9 selects the
+// top 10% of nodes. The count is clamped to [0, n], so a percentile above 1
+// yields an empty slice and a negative percentile yields every node. It
+// returns nil when the graph has no nodes.
+func (g *ObservationGraph) FindHubs(percentile float64) []int64 {
+	g.nodesMu.RLock()
+	defer g.nodesMu.RUnlock()
+
+	if len(g.nodes) == 0 {
+		return nil
+	}
+
+	type nodeDegree struct {
+		ID     int64
+		Degree int
+	}
+
+	nodeDegrees := make([]nodeDegree, 0, len(g.nodes))
+	for id, node := range g.nodes {
+		nodeDegrees = append(nodeDegrees, nodeDegree{ID: id, Degree: node.Degree})
+	}
+
+	// Map iteration order is random, so equal degrees need a stable tie-break.
+	sort.Slice(nodeDegrees, func(i, j int) bool {
+		if nodeDegrees[i].Degree != nodeDegrees[j].Degree {
+			return nodeDegrees[i].Degree > nodeDegrees[j].Degree
+		}
+		return nodeDegrees[i].ID < nodeDegrees[j].ID
+	})
+
+	// Return top percentile. A negative cutoff would make the allocation
+	// below panic, so clamp on both sides.
+	cutoff := int(math.Ceil(float64(len(nodeDegrees)) * (1.0 - percentile)))
+	if cutoff > len(nodeDegrees) {
+		cutoff = len(nodeDegrees)
+	}
+	if cutoff < 0 {
+		cutoff = 0
+	}
+
+	hubs := make([]int64, cutoff)
+	for i := 0; i < cutoff; i++ {
+		hubs[i] = nodeDegrees[i].ID
+	}
+
+	log.Info().
+		Int("total_nodes", len(g.nodes)).
+		Int("hubs", len(hubs)).
+		Float64("percentile", percentile).
+		Msg("Identified hub nodes")
+
+	return hubs
+}
+
+// Stats returns graph statistics: node and edge counts, the average, median,
+// minimum and maximum node degree, and the number of edges per relation type.
+// The degree figures stay zero when the graph has no nodes.
+//
+// Locks are taken in the order edgesMu, then nodesMu, matching AddEdge and
+// BuildCSR. Taking them the other way round could deadlock against a
+// concurrent AddEdge, which holds edgesMu while waiting for nodesMu.
+func (g *ObservationGraph) Stats() GraphStats {
+	g.edgesMu.RLock()
+	g.nodesMu.RLock()
+	defer g.edgesMu.RUnlock()
+	defer g.nodesMu.RUnlock()
+
+	stats := GraphStats{
+		NodeCount: len(g.nodes),
+		EdgeCount: len(g.edges),
+	}
+
+	if len(g.nodes) > 0 {
+		degrees := make([]int, 0, len(g.nodes))
+		for _, node := range g.nodes {
+			degrees = append(degrees, node.Degree)
+		}
+
+		// Sorted ascending, so min and max sit at the two ends.
+		sort.Ints(degrees)
+		stats.AvgDegree = float64(sum(degrees)) / float64(len(degrees))
+		stats.MaxDegree = degrees[len(degrees)-1]
+		stats.MinDegree = degrees[0]
+
+		// Median: mean of the two middle values for an even count.
+		mid := len(degrees) / 2
+		if len(degrees)%2 == 0 {
+			stats.MedianDegree = float64(degrees[mid-1]+degrees[mid]) / 2.0
+		} else {
+			stats.MedianDegree = float64(degrees[mid])
+		}
+	}
+
+	// Count edge types.
+	stats.EdgeTypes = make(map[RelationType]int)
+	for _, edge := range g.edges {
+		stats.EdgeTypes[edge.Relation]++
+	}
+
+	return stats
+}
+
+// GraphStats contains graph statistics as computed by ObservationGraph.Stats.
+type GraphStats struct {
+	EdgeTypes    map[RelationType]int // Number of edges per relation type
+	AvgDegree    float64              // Mean node degree
+	MedianDegree float64              // Median node degree
+	NodeCount    int
+	EdgeCount    int
+	MaxDegree    int
+	MinDegree    int
+}
+
+// BuildFromObservations constructs a graph from a list of observations
+func BuildFromObservations(ctx context.Context, observations []*models.Observation) (*ObservationGraph, error) {
+	graph := NewObservationGraph()
+
+	// Add nodes
+	for _, obs := range observations {
+		// Extract title from sql.NullString
+		title := ""
+		if obs.Title.Valid {
+			title = obs.Title.String
+		}
+
+		node := &Node{
+			ID:     obs.ID,
+			Degree: 0,
+			Metadata: NodeMetadata{
+				Project:      obs.Project,
+				Type:         string(obs.Type),
+				Title:        title,
+				CreatedAt:    time.UnixMilli(obs.CreatedAtEpoch),
+				IsSuperseded: obs.IsSuperseded,
+			},
+			LastAccess:  time.Now(),
+			AccessCount: 0,
+		}
+		graph.AddNode(node)
+	}
+
+	// Detect edges (will be implemented in edge_detector.go)
+	edges, err := DetectEdges(ctx, observations)
+	if err != nil {
+		return nil, fmt.Errorf("detect edges: %w", err)
+	}
+
+	for _, edge := range edges {
+		graph.AddEdge(edge)
+	}
+
+	// Build CSR representation
+	if err := graph.BuildCSR(); err != nil {
+		return nil, fmt.Errorf("build CSR: %w", err)
+	}
+
+	return graph, nil
+}
+
+// Helper function to sum integers
+func sum(values []int) int {
+	total := 0
+	for _, v := range values {
+		total += v
+	}
+	return total
+}
+
+// String returns the snake_case name of the relation type, or "unknown" for
+// values outside the declared constants.
+func (r RelationType) String() string {
+	// Indexed by the constant's value.
+	names := [...]string{
+		RelationFileOverlap: "file_overlap",
+		RelationSemantic:    "semantic",
+		RelationTemporal:    "temporal",
+		RelationConcept:     "concept",
+	}
+
+	if r < 0 || int(r) >= len(names) {
+		return "unknown"
+	}
+
+	return names[r]
+}