// claude-mnemonic/pkg/similarity/clustering.go

// Package similarity provides text similarity and clustering utilities.
package similarity

import (
	"math/bits"
	"strings"

	"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
)

// ClusterObservations collapses near-duplicate observations, returning a single
// representative for every group of similar entries.
//
// The input order is significant: callers are expected to pass observations
// sorted by preference (for example newest first), because the first member
// encountered in each group is the one that is kept.
//
// Inputs with zero or one element are returned unchanged. Inputs of up to 50
// elements are handled by clusterObservationsSimple; anything larger is handed
// to clusterObservationsOptimized.
func ClusterObservations(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
	switch total := len(observations); {
	case total <= 1:
		// Nothing to compare against.
		return observations
	case total <= 50:
		return clusterObservationsSimple(observations, similarityThreshold)
	default:
		return clusterObservationsOptimized(observations, similarityThreshold)
	}
}

// clusterObservationsSimple performs greedy pairwise clustering, comparing each
// not-yet-absorbed observation against all later ones (quadratic in the input
// length).
//
// Walking the slice in order, every observation that has not been absorbed by
// an earlier one becomes a representative; each later observation whose
// Jaccard similarity to that representative is at least similarityThreshold is
// absorbed and dropped from the result.
func clusterObservationsSimple(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
	count := len(observations)

	// Tokenize every observation once so the pairwise loop only compares sets.
	termsByIndex := make([]map[string]bool, count)
	for idx := range observations {
		termsByIndex[idx] = ExtractObservationTerms(observations[idx])
	}

	// absorbed[k] is true once observation k is a representative or has been
	// merged into an earlier representative's cluster.
	absorbed := make([]bool, count)
	representatives := []*models.Observation{}

	for lead := range observations {
		if absorbed[lead] {
			continue
		}

		// First unabsorbed entry wins: it represents its cluster.
		absorbed[lead] = true
		representatives = append(representatives, observations[lead])

		// Only compare against the representative, never against other members.
		for other := lead + 1; other < count; other++ {
			if !absorbed[other] && JaccardSimilarity(termsByIndex[lead], termsByIndex[other]) >= similarityThreshold {
				absorbed[other] = true
			}
		}
	}

	return representatives
}

// clusterObservationsOptimized clusters larger inputs with the same greedy
// first-wins strategy as clusterObservationsSimple, but avoids the full Jaccard
// computation for pairs whose term-set sizes alone rule out a match.
//
// The pre-filter is exact, not approximate: because |A∩B| <= min(|A|,|B|) and
// |A∪B| >= max(|A|,|B|), the Jaccard similarity of two sets can never exceed
// min(|A|,|B|) / max(|A|,|B|). A pair is skipped only when that upper bound is
// already below similarityThreshold, so the result is identical to the simple
// algorithm. The worst case is still quadratic in len(observations).
//
// A Hamming-distance test on computeTermSignature values is deliberately not
// used as a filter: XOR-combined hashes are not locality-sensitive, so two sets
// that differ in a single term get unrelated signatures and would be rejected
// roughly half of the time.
func clusterObservationsOptimized(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
	n := len(observations)

	// Extract the term set of every observation once up front.
	termSets := make([]map[string]bool, n)
	for i, obs := range observations {
		termSets[i] = ExtractObservationTerms(obs)
	}

	// Track which observations are already clustered.
	clustered := make([]bool, n)
	result := make([]*models.Observation, 0, n/2) // Pre-allocate assuming ~50% are unique

	for i := 0; i < n; i++ {
		if clustered[i] {
			continue
		}

		// This observation becomes the representative of its cluster.
		result = append(result, observations[i])
		clustered[i] = true

		termsI := termSets[i]
		sizeI := len(termsI)

		// Find all similar observations and mark them as clustered.
		for j := i + 1; j < n; j++ {
			if clustered[j] {
				continue
			}

			termsJ := termSets[j]
			small, large := sizeI, len(termsJ)
			if small > large {
				small, large = large, small
			}

			// Size-ratio upper bound on the Jaccard similarity. large == 0 means
			// both sets are empty, which JaccardSimilarity scores as 1.0, so that
			// case is left for the full comparison to decide.
			if large > 0 && float64(small)/float64(large) < similarityThreshold {
				continue
			}

			// Full Jaccard comparison for the remaining candidates.
			if JaccardSimilarity(termsI, termsJ) >= similarityThreshold {
				clustered[j] = true
			}
		}
	}

	return result
}

// computeTermSignature folds a term set into a single 64-bit value.
//
// Each term (map key) is hashed byte by byte with the 64-bit FNV-1a scheme
// (XOR the byte in, then multiply by the FNV prime) and the per-term hashes are
// XOR-ed together. XOR is commutative, so the result does not depend on map
// iteration order; an empty or nil set yields 0. The bool values stored in the
// map are ignored.
func computeTermSignature(terms map[string]bool) uint64 {
	const (
		fnvOffsetBasis uint64 = 14695981039346656037
		fnvPrime       uint64 = 1099511628211
	)

	var combined uint64
	for term := range terms {
		digest := fnvOffsetBasis
		for _, b := range []byte(term) {
			digest = (digest ^ uint64(b)) * fnvPrime
		}
		combined ^= digest
	}
	return combined
}

// popCount64 reports how many bits of x are set to 1.
// It is a thin wrapper around bits.OnesCount64 from the standard library.
func popCount64(x uint64) int {
	ones := bits.OnesCount64(x)
	return ones
}

// IsSimilarToAny reports whether newObs is a near-duplicate of at least one
// observation in existing.
//
// It returns true as soon as the Jaccard similarity between the terms of newObs
// and the terms of some existing observation meets or exceeds
// similarityThreshold. It returns false when existing is empty or when newObs
// yields no terms at all. Terms of the existing observations are extracted
// anew on every call.
func IsSimilarToAny(newObs *models.Observation, existing []*models.Observation, similarityThreshold float64) bool {
	// Checked before touching newObs: with nothing to compare against the
	// answer is known without extracting any terms.
	if len(existing) == 0 {
		return false
	}

	candidate := ExtractObservationTerms(newObs)
	if len(candidate) == 0 {
		return false
	}

	for idx := range existing {
		if JaccardSimilarity(candidate, ExtractObservationTerms(existing[idx])) >= similarityThreshold {
			return true
		}
	}

	return false
}

// ExtractObservationTerms builds the set of terms used to compare an
// observation with others.
//
// The set contains:
//   - tokens produced by addTerms from the title, the narrative and every fact;
//   - the lower-cased base name (text after the last "/") of every path in
//     FilesRead and FilesModified.
//
// Paths with an empty base name (an empty string or a path ending in "/") add
// nothing, so the empty string is never a term and cannot inflate the overlap
// between otherwise unrelated observations. obs must not be nil.
func ExtractObservationTerms(obs *models.Observation) map[string]bool {
	terms := make(map[string]bool)

	// Free-text fields.
	addTerms(terms, obs.Title.String)
	addTerms(terms, obs.Narrative.String)
	for _, fact := range obs.Facts {
		addTerms(terms, fact)
	}

	// File paths are matched on file name only, so the same file referenced
	// through different directories still counts as shared.
	addFileName := func(path string) {
		// LastIndex returns -1 when there is no "/", which makes the slice
		// below cover the whole path.
		name := path[strings.LastIndex(path, "/")+1:]
		if name == "" {
			return
		}
		terms[strings.ToLower(name)] = true
	}

	for _, file := range obs.FilesRead {
		addFileName(file)
	}
	for _, file := range obs.FilesModified {
		addFileName(file)
	}

	return terms
}

// addTerms tokenizes text and inserts every meaningful token into terms.
//
// The text is lower-cased and split on every rune that is not an ASCII letter,
// ASCII digit or underscore, so non-ASCII letters act as separators. A token is
// kept when it is at least three bytes long and is not one of the English stop
// words listed below. Existing entries in terms are left untouched; terms must
// be non-nil.
//
// The stop-word list is a constant switch rather than a map so that no lookup
// table is allocated and populated on each call.
func addTerms(terms map[string]bool, text string) {
	words := strings.FieldsFunc(strings.ToLower(text), func(r rune) bool {
		return !((r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_')
	})

	for _, word := range words {
		if len(word) < 3 {
			continue
		}

		// Entries shorter than three bytes are already rejected above; they are
		// kept in the list so it stays complete if the minimum length changes.
		switch word {
		case "the", "a", "an", "is", "are",
			"was", "were", "be", "been", "being",
			"have", "has", "had", "do", "does",
			"did", "will", "would", "could", "should",
			"may", "might", "must", "shall",
			"this", "that", "these", "those",
			"and", "or", "but", "if", "then",
			"for", "from", "with", "about", "into",
			"to", "of", "in", "on", "at", "by",
			"it", "its", "which", "who", "what",
			"when", "where", "how", "why":
			continue
		}

		terms[word] = true
	}
}

// JaccardSimilarity returns |set1 ∩ set2| / |set1 ∪ set2| for two term sets.
//
// The result lies in [0, 1]: 0 means no shared terms, 1 means identical sets.
// Two empty (or nil) sets are treated as identical and score 1.0; an empty set
// compared with a non-empty one scores 0.0.
//
// Set sizes are taken from len, i.e. every key counts. A key of set1 is counted
// as shared only when set2 maps it to true.
func JaccardSimilarity(set1, set2 map[string]bool) float64 {
	switch {
	case len(set1) == 0 && len(set2) == 0:
		return 1.0
	case len(set1) == 0 || len(set2) == 0:
		return 0.0
	}

	intersection := 0
	for term := range set1 {
		if set2[term] {
			intersection++
		}
	}

	// Both sets are non-empty here and the intersection cannot be larger than
	// either of them, so the union is at least 1 and the division is safe.
	union := len(set1) + len(set2) - intersection

	return float64(intersection) / float64(union)
}