mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-09 23:59:40 +00:00
d04b60517a
* Make things 'betterer' across the board * fix: reorganize struct fields and config parameters for consistency - [x] Reorder Config struct fields alphabetically and by related functionality - [x] Reorganize Observation model fields with archival fields grouped together - [x] Reorder ObservationStore fields to group related members - [x] Reorder Store struct fields with health check caching grouped - [x] Reorganize HealthInfo and PoolMetrics struct field order - [x] Reorder maintenance Service struct fields logically - [x] Reorganize MCP server handler parameter structs alphabetically - [x] Reorder pattern detector candidate tracking fields - [x] Reorganize search Manager struct fields by functionality - [x] Reorder vector Client struct fields with mutex protections grouped - [x] Reorganize handler request/response struct fields - [x] Update handlers_test.go to expect wrapped response format - [x] Reorder middleware TokenAuth and rate limiter fields - [x] Reorganize Service struct fields with grouped functionality - [x] Fix RateLimiter field ordering for clarity - [x] Reorder CircuitBreaker metrics fields * fix(security): improve JSON output safety and path traversal protection - [x] Replace unsafe JSON string formatting with proper json.Marshal in export handler - [x] Remove escapeJSONString helper function in favor of standard JSON marshaling - [x] Add safeResolvePath function to validate paths and prevent directory traversal - [x] Apply path traversal validation in captureFileMtimes operations - [x] Cap result slice capacity in getRecentSearchQueries to prevent DoS via excessive allocation * fix(sdk): improve path traversal protection and allocation safety - [x] Enhance safeResolvePath with stricter validation using filepath.Rel - [x] Reject paths containing ".." 
after cleaning to prevent traversal - [x] Validate absolute paths are within cwd when cwd is specified - [x] Apply safeResolvePath validation to GetFileContent for consistency - [x] Add comprehensive test coverage for path traversal protection - [x] Fix allocation safety in getRecentSearchQueries by using constant capacity
262 lines
7.7 KiB
Go
262 lines
7.7 KiB
Go
// Package similarity provides text similarity and clustering utilities.
|
|
package similarity
|
|
|
|
import (
|
|
"math/bits"
|
|
"strings"
|
|
|
|
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
|
|
)
|
|
|
|
// ClusterObservations groups similar observations and returns only one representative per cluster.
|
|
// Uses Jaccard similarity on extracted terms from title, narrative, and facts.
|
|
// Observations should be sorted by preference (e.g., recency) - first one in each cluster is kept.
|
|
func ClusterObservations(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
|
|
if len(observations) <= 1 {
|
|
return observations
|
|
}
|
|
|
|
// For small sets, use the simple O(n²) algorithm
|
|
if len(observations) <= 50 {
|
|
return clusterObservationsSimple(observations, similarityThreshold)
|
|
}
|
|
|
|
// For larger sets, use an optimized approach with early termination
|
|
return clusterObservationsOptimized(observations, similarityThreshold)
|
|
}
|
|
|
|
// clusterObservationsSimple is the simple O(n²) algorithm for small sets.
|
|
func clusterObservationsSimple(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
|
|
// Extract terms for each observation
|
|
termSets := make([]map[string]bool, len(observations))
|
|
for i, obs := range observations {
|
|
termSets[i] = ExtractObservationTerms(obs)
|
|
}
|
|
|
|
// Track which observations are already clustered
|
|
clustered := make([]bool, len(observations))
|
|
result := make([]*models.Observation, 0)
|
|
|
|
for i := 0; i < len(observations); i++ {
|
|
if clustered[i] {
|
|
continue
|
|
}
|
|
|
|
// This observation becomes the representative of its cluster
|
|
// (observations are already sorted by recency, so first one is newest)
|
|
result = append(result, observations[i])
|
|
clustered[i] = true
|
|
|
|
// Find all similar observations and mark them as clustered
|
|
for j := i + 1; j < len(observations); j++ {
|
|
if clustered[j] {
|
|
continue
|
|
}
|
|
|
|
similarity := JaccardSimilarity(termSets[i], termSets[j])
|
|
if similarity >= similarityThreshold {
|
|
clustered[j] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// clusterObservationsOptimized uses MinHash-based approximation for large sets.
|
|
// This reduces complexity from O(n²) to approximately O(n*k) where k is the number of hash functions.
|
|
func clusterObservationsOptimized(observations []*models.Observation, similarityThreshold float64) []*models.Observation {
|
|
n := len(observations)
|
|
|
|
// Extract terms for each observation and compute a signature
|
|
type termSetWithSig struct {
|
|
terms map[string]bool
|
|
signature uint64 // Simple hash signature for fast comparison
|
|
}
|
|
|
|
termSets := make([]termSetWithSig, n)
|
|
for i, obs := range observations {
|
|
terms := ExtractObservationTerms(obs)
|
|
termSets[i] = termSetWithSig{
|
|
terms: terms,
|
|
signature: computeTermSignature(terms),
|
|
}
|
|
}
|
|
|
|
// Track which observations are already clustered
|
|
clustered := make([]bool, n)
|
|
result := make([]*models.Observation, 0, n/2) // Pre-allocate assuming ~50% are unique
|
|
|
|
for i := 0; i < n; i++ {
|
|
if clustered[i] {
|
|
continue
|
|
}
|
|
|
|
// This observation becomes the representative of its cluster
|
|
result = append(result, observations[i])
|
|
clustered[i] = true
|
|
|
|
// Use signature for fast pre-filtering
|
|
sigI := termSets[i].signature
|
|
termsI := termSets[i].terms
|
|
|
|
// Find all similar observations and mark them as clustered
|
|
for j := i + 1; j < n; j++ {
|
|
if clustered[j] {
|
|
continue
|
|
}
|
|
|
|
// Quick signature comparison - if signatures are very different, skip detailed comparison
|
|
sigJ := termSets[j].signature
|
|
sigDiff := sigI ^ sigJ
|
|
popCount := popCount64(sigDiff)
|
|
|
|
// If signatures differ significantly, similarity is likely low
|
|
// Skip detailed comparison for very different signatures
|
|
if popCount > 32 { // More than half of bits differ
|
|
continue
|
|
}
|
|
|
|
// Full Jaccard comparison for candidates
|
|
similarity := JaccardSimilarity(termsI, termSets[j].terms)
|
|
if similarity >= similarityThreshold {
|
|
clustered[j] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// computeTermSignature folds a term set into a single 64-bit value.
//
// Each key of terms is hashed byte by byte with the 64-bit FNV-1a recurrence
// (XOR the byte in, then multiply by the prime), and the per-term hashes are
// XOR-ed together. Because XOR is commutative the result does not depend on map
// iteration order. Only the keys are read; the bool values are ignored. An
// empty or nil map yields 0.
func computeTermSignature(terms map[string]bool) uint64 {
	const (
		offsetBasis uint64 = 14695981039346656037
		prime       uint64 = 1099511628211
	)

	var signature uint64
	for term := range terms {
		hash := offsetBasis
		for _, b := range []byte(term) {
			hash = (hash ^ uint64(b)) * prime
		}
		signature ^= hash
	}
	return signature
}
|
|
|
|
// popCount64 reports how many bits of x are set to 1.
// It delegates to bits.OnesCount64 from the standard library.
func popCount64(x uint64) int {
	ones := bits.OnesCount64(x)
	return ones
}
|
|
|
|
// IsSimilarToAny checks if a new observation is similar to any existing observation.
|
|
// Returns true if similarity to any existing observation exceeds the threshold.
|
|
func IsSimilarToAny(newObs *models.Observation, existing []*models.Observation, similarityThreshold float64) bool {
|
|
if len(existing) == 0 {
|
|
return false
|
|
}
|
|
|
|
newTerms := ExtractObservationTerms(newObs)
|
|
if len(newTerms) == 0 {
|
|
return false
|
|
}
|
|
|
|
for _, obs := range existing {
|
|
existingTerms := ExtractObservationTerms(obs)
|
|
similarity := JaccardSimilarity(newTerms, existingTerms)
|
|
if similarity >= similarityThreshold {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// ExtractObservationTerms extracts meaningful terms from an observation for similarity comparison.
|
|
func ExtractObservationTerms(obs *models.Observation) map[string]bool {
|
|
terms := make(map[string]bool)
|
|
|
|
// Add terms from title
|
|
addTerms(terms, obs.Title.String)
|
|
|
|
// Add terms from narrative
|
|
addTerms(terms, obs.Narrative.String)
|
|
|
|
// Add terms from facts
|
|
for _, fact := range obs.Facts {
|
|
addTerms(terms, fact)
|
|
}
|
|
|
|
// Add file paths as terms (normalized)
|
|
for _, file := range obs.FilesRead {
|
|
// Use just the filename without path for matching
|
|
parts := strings.Split(file, "/")
|
|
if len(parts) > 0 {
|
|
terms[strings.ToLower(parts[len(parts)-1])] = true
|
|
}
|
|
}
|
|
|
|
for _, file := range obs.FilesModified {
|
|
parts := strings.Split(file, "/")
|
|
if len(parts) > 0 {
|
|
terms[strings.ToLower(parts[len(parts)-1])] = true
|
|
}
|
|
}
|
|
|
|
return terms
|
|
}
|
|
|
|
// addTerms tokenizes text and inserts the meaningful tokens into terms.
//
// The text is lower-cased and split on every rune that is not an ASCII letter,
// an ASCII digit or an underscore (so non-ASCII letters act as separators).
// Tokens shorter than three bytes and common English stop words are discarded;
// every remaining token is stored in terms with the value true. terms must be
// non-nil.
func addTerms(terms map[string]bool, text string) {
	words := strings.FieldsFunc(strings.ToLower(text), func(r rune) bool {
		return !((r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '_')
	})

	for _, word := range words {
		if len(word) < 3 {
			continue
		}

		// Stop-word check as a switch rather than a map literal, so nothing is
		// allocated per call. One- and two-letter stop words need no entry
		// here because the length filter above already removed them.
		switch word {
		case "the", "are", "was", "were", "been", "being",
			"have", "has", "had", "does", "did",
			"will", "would", "could", "should",
			"may", "might", "must", "shall",
			"this", "that", "these", "those",
			"and", "but", "then",
			"for", "from", "with", "about", "into",
			"its", "which", "who", "what",
			"when", "where", "how", "why":
			continue
		}

		terms[word] = true
	}
}
|
|
|
|
// JaccardSimilarity returns the Jaccard index of two term sets: the number of
// shared terms divided by the number of distinct terms across both sets.
//
// The result lies between 0 (no overlap) and 1 (identical). Two empty sets are
// considered identical and score 1; an empty set compared with a non-empty one
// scores 0.
func JaccardSimilarity(set1, set2 map[string]bool) float64 {
	size1, size2 := len(set1), len(set2)

	switch {
	case size1 == 0 && size2 == 0:
		return 1.0
	case size1 == 0 || size2 == 0:
		return 0.0
	}

	shared := 0
	for term := range set1 {
		if present := set2[term]; present {
			shared++
		}
	}

	// shared never exceeds the smaller set, so with both sets non-empty the
	// denominator is always at least 1 and needs no zero guard.
	distinct := size1 + size2 - shared
	return float64(shared) / float64(distinct)
}
|