Make things 'betterer' across the board

This commit is contained in:
2026-01-11 00:53:44 +00:00
parent 7ab4b07cf2
commit 548b27702e
47 changed files with 12535 additions and 1784 deletions
+99
View File
@@ -2,6 +2,7 @@
package similarity
import (
"math/bits"
"strings"
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
@@ -15,6 +16,17 @@ func ClusterObservations(observations []*models.Observation, similarityThreshold
return observations
}
// For small sets, use the simple O(n²) algorithm
if len(observations) <= 50 {
return clusterObservationsSimple(observations, similarityThreshold)
}
// For larger sets, use an optimized approach with early termination
return clusterObservationsOptimized(observations, similarityThreshold)
}
// clusterObservationsSimple is the simple O(n²) algorithm for small sets.
func clusterObservationsSimple(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
// Extract terms for each observation
termSets := make([]map[string]bool, len(observations))
for i, obs := range observations {
@@ -51,6 +63,93 @@ func ClusterObservations(observations []*models.Observation, similarityThreshold
return result
}
// clusterObservationsOptimized clusters a larger set of observations and
// returns one representative per cluster, in input order.
//
// Each observation that has not yet been absorbed becomes the representative
// of a new cluster. Every later observation whose JaccardSimilarity to that
// representative is >= similarityThreshold is marked as clustered and left
// out of the result.
//
// Term sets are extracted once up front and already-absorbed observations are
// skipped, so the amount of work shrinks as clusters form. The worst case
// (no two observations similar) is still O(n²) Jaccard comparisons.
//
// No hash-signature pre-filter is applied on purpose. XOR-ing per-term hashes
// (as computeTermSignature does) is not similarity-preserving: two sets that
// differ by a single term produce signatures whose XOR is that term's hash,
// which has roughly half of its bits set. Skipping pairs based on the bit
// difference of such signatures therefore discards genuinely similar pairs at
// random, so every candidate pair is compared exactly instead.
func clusterObservationsOptimized(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
	n := len(observations)

	// Extract each observation's term set exactly once.
	termSets := make([]map[string]bool, n)
	for i, obs := range observations {
		termSets[i] = ExtractObservationTerms(obs)
	}

	// clustered[i] is true once observation i is either a representative or
	// has been absorbed into an earlier representative's cluster.
	clustered := make([]bool, n)
	result := make([]*models.Observation, 0, n/2) // capacity hint only; append grows as needed

	for i := 0; i < n; i++ {
		if clustered[i] {
			continue
		}

		// This observation becomes the representative of its cluster.
		result = append(result, observations[i])
		clustered[i] = true

		termsI := termSets[i]

		// Absorb every later, still-unclustered observation that is similar
		// enough to the representative.
		for j := i + 1; j < n; j++ {
			if clustered[j] {
				continue
			}
			if JaccardSimilarity(termsI, termSets[j]) >= similarityThreshold {
				clustered[j] = true
			}
		}
	}

	return result
}
// computeTermSignature folds a term set into a single 64-bit value.
//
// Every term is hashed byte-by-byte with the 64-bit FNV-1a scheme (XOR the
// byte in, then multiply by the FNV prime), and the per-term hashes are
// XOR-ed together. Because XOR is commutative the result does not depend on
// map iteration order. An empty set yields 0.
func computeTermSignature(terms map[string]bool) uint64 {
	const (
		fnvOffsetBasis uint64 = 14695981039346656037
		fnvPrime       uint64 = 1099511628211
	)

	var combined uint64
	for word := range terms {
		hash := fnvOffsetBasis
		for _, b := range []byte(word) {
			hash = (hash ^ uint64(b)) * fnvPrime
		}
		combined ^= hash
	}
	return combined
}
// popCount64 reports how many bits of x are set to 1.
// It delegates to bits.OnesCount64 from the standard library.
func popCount64(x uint64) int {
	setBits := bits.OnesCount64(x)
	return setBits
}
// IsSimilarToAny checks if a new observation is similar to any existing observation.
// Returns true if similarity to any existing observation exceeds the threshold.
func IsSimilarToAny(newObs *models.Observation, existing []*models.Observation, similarityThreshold float64) bool {