Make things 'betterer' across the board

2026-06-11 00:09:28 +00:00 · 2026-01-11 00:53:44 +00:00
parent 7ab4b07cf2
commit 548b27702e
47 changed files with 12535 additions and 1784 deletions
@@ -2,6 +2,7 @@
 package similarity

 import (
+	"math/bits"
 	"strings"

 	"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
@@ -15,6 +16,17 @@ func ClusterObservations(observations []*models.Observation, similarityThreshold
 		return observations
 	}

+	// For small sets, use the simple O(n²) algorithm
+	if len(observations) <= 50 {
+		return clusterObservationsSimple(observations, similarityThreshold)
+	}
+
+	// For larger sets, use an optimized approach with early termination
+	return clusterObservationsOptimized(observations, similarityThreshold)
+}
+
+// clusterObservationsSimple is the simple O(n²) algorithm for small sets.
+func clusterObservationsSimple(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
 	// Extract terms for each observation
 	termSets := make([]map[string]bool, len(observations))
 	for i, obs := range observations {
@@ -51,6 +63,93 @@ func ClusterObservations(observations []*models.Observation, similarityThreshold
 	return result
 }

+// clusterObservationsOptimized clusters large sets using an exact size-based pre-filter.
+// It keeps the first observation of every cluster and skips the full Jaccard computation
+// only for pairs whose term-set sizes already rule out reaching similarityThreshold:
+// J(A,B) = |A∩B|/|A∪B| <= min(|A|,|B|)/max(|A|,|B|). Comparing XOR-combined hash
+// signatures instead is not locality sensitive and randomly drops true matches.
+// NOTE(review): assumes JaccardSimilarity is the standard set ratio over map keys.
+// Worst case is still O(n²) comparisons; the filter only makes rejected pairs cheap.
+func clusterObservationsOptimized(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
+	n := len(observations)
+
+	// Extract the terms of each observation once and cache the set size for the filter
+	type sizedTermSet struct {
+		terms map[string]bool
+		size  int // number of terms, i.e. len(terms)
+	}
+
+	termSets := make([]sizedTermSet, n)
+	for i, obs := range observations {
+		terms := ExtractObservationTerms(obs)
+		termSets[i] = sizedTermSet{terms: terms, size: len(terms)}
+	}
+
+	// Track which observations are already clustered
+	clustered := make([]bool, n)
+	result := make([]*models.Observation, 0, n/2) // Pre-allocate assuming ~50% are unique
+
+	for i := 0; i < n; i++ {
+		if clustered[i] {
+			continue
+		}
+
+		// This observation becomes the representative of its cluster
+		result = append(result, observations[i])
+		clustered[i] = true
+
+		termsI := termSets[i].terms
+		sizeI := termSets[i].size
+
+		// Find all similar observations and mark them as clustered
+		for j := i + 1; j < n; j++ {
+			if clustered[j] {
+				continue
+			}
+
+			// Size bound: the intersection is at most the smaller set and the union at
+			// least the larger one, so small/large caps the similarity. Pairs involving
+			// an empty set go to JaccardSimilarity so its convention for them is kept.
+			small, large := sizeI, termSets[j].size
+			if small > large {
+				small, large = large, small
+			}
+			if small > 0 && float64(small)/float64(large) < similarityThreshold {
+				continue
+			}
+
+			// Full Jaccard comparison for candidates
+			if JaccardSimilarity(termsI, termSets[j].terms) >= similarityThreshold {
+				clustered[j] = true
+			}
+		}
+	}
+
+	return result
+}
+
+// computeTermSignature folds a term set into one order-independent 64-bit fingerprint:
+// each key is hashed with 64-bit FNV-1a and the hashes are XOR-ed (empty set yields 0).
+func computeTermSignature(terms map[string]bool) uint64 {
+	const offset64, prime64 uint64 = 14695981039346656037, 1099511628211
+
+	var combined uint64
+	for key := range terms {
+		hash := offset64
+		for _, b := range []byte(key) {
+			hash = (hash ^ uint64(b)) * prime64
+		}
+		combined ^= hash
+	}
+	return combined
+}
+
+// popCount64 returns the Hamming weight of x, i.e. how many of its 64 bits are set.
+func popCount64(x uint64) int {
+	setBits := bits.OnesCount64(x)
+	return setBits
+}
+
 // IsSimilarToAny checks if a new observation is similar to any existing observation.
 // Returns true if similarity to any existing observation exceeds the threshold.
 func IsSimilarToAny(newObs *models.Observation, existing []*models.Observation, similarityThreshold float64) bool {