Files
claude-mnemonic/pkg/similarity/clustering.go
2025-12-19 02:17:02 +00:00

163 lines
4.6 KiB
Go

// Package similarity provides text similarity and clustering utilities.
package similarity
import (
"strings"
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
)
// ClusterObservations groups similar observations and returns only one representative per cluster.
// Uses Jaccard similarity on extracted terms from title, narrative, and facts.
// Observations should be sorted by preference (e.g., recency) - first one in each cluster is kept.
func ClusterObservations(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
if len(observations) <= 1 {
return observations
}
// Extract terms for each observation
termSets := make([]map[string]bool, len(observations))
for i, obs := range observations {
termSets[i] = ExtractObservationTerms(obs)
}
// Track which observations are already clustered
clustered := make([]bool, len(observations))
result := make([]*models.Observation, 0)
for i := 0; i < len(observations); i++ {
if clustered[i] {
continue
}
// This observation becomes the representative of its cluster
// (observations are already sorted by recency, so first one is newest)
result = append(result, observations[i])
clustered[i] = true
// Find all similar observations and mark them as clustered
for j := i + 1; j < len(observations); j++ {
if clustered[j] {
continue
}
similarity := JaccardSimilarity(termSets[i], termSets[j])
if similarity >= similarityThreshold {
clustered[j] = true
}
}
}
return result
}
// IsSimilarToAny checks if a new observation is similar to any existing observation.
// Returns true if similarity to any existing observation exceeds the threshold.
func IsSimilarToAny(newObs *models.Observation, existing []*models.Observation, similarityThreshold float64) bool {
if len(existing) == 0 {
return false
}
newTerms := ExtractObservationTerms(newObs)
if len(newTerms) == 0 {
return false
}
for _, obs := range existing {
existingTerms := ExtractObservationTerms(obs)
similarity := JaccardSimilarity(newTerms, existingTerms)
if similarity >= similarityThreshold {
return true
}
}
return false
}
// ExtractObservationTerms extracts meaningful terms from an observation for similarity comparison.
func ExtractObservationTerms(obs *models.Observation) map[string]bool {
terms := make(map[string]bool)
// Add terms from title
addTerms(terms, obs.Title.String)
// Add terms from narrative
addTerms(terms, obs.Narrative.String)
// Add terms from facts
for _, fact := range obs.Facts {
addTerms(terms, fact)
}
// Add file paths as terms (normalized)
for _, file := range obs.FilesRead {
// Use just the filename without path for matching
parts := strings.Split(file, "/")
if len(parts) > 0 {
terms[strings.ToLower(parts[len(parts)-1])] = true
}
}
for _, file := range obs.FilesModified {
parts := strings.Split(file, "/")
if len(parts) > 0 {
terms[strings.ToLower(parts[len(parts)-1])] = true
}
}
return terms
}
// addTerms tokenizes text and adds meaningful terms to the set.
func addTerms(terms map[string]bool, text string) {
// Simple tokenization: split on non-alphanumeric, filter short words
words := strings.FieldsFunc(strings.ToLower(text), func(r rune) bool {
return !((r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_')
})
stopWords := map[string]bool{
"the": true, "a": true, "an": true, "is": true, "are": true,
"was": true, "were": true, "be": true, "been": true, "being": true,
"have": true, "has": true, "had": true, "do": true, "does": true,
"did": true, "will": true, "would": true, "could": true, "should": true,
"may": true, "might": true, "must": true, "shall": true,
"this": true, "that": true, "these": true, "those": true,
"and": true, "or": true, "but": true, "if": true, "then": true,
"for": true, "from": true, "with": true, "about": true, "into": true,
"to": true, "of": true, "in": true, "on": true, "at": true, "by": true,
"it": true, "its": true, "which": true, "who": true, "what": true,
"when": true, "where": true, "how": true, "why": true,
}
for _, word := range words {
if len(word) >= 3 && !stopWords[word] {
terms[word] = true
}
}
}
// JaccardSimilarity calculates the Jaccard similarity between two term sets.
// Returns a value between 0 (no overlap) and 1 (identical).
func JaccardSimilarity(set1, set2 map[string]bool) float64 {
if len(set1) == 0 && len(set2) == 0 {
return 1.0
}
if len(set1) == 0 || len(set2) == 0 {
return 0.0
}
intersection := 0
for term := range set1 {
if set2[term] {
intersection++
}
}
union := len(set1) + len(set2) - intersection
if union == 0 {
return 0.0
}
return float64(intersection) / float64(union)
}