mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-09 23:59:40 +00:00
5335a8a7a6
* Make things 'betterer' across the board * fix: reorganize struct fields and config parameters for consistency - [x] Reorder Config struct fields alphabetically and by related functionality - [x] Reorganize Observation model fields with archival fields grouped together - [x] Reorder ObservationStore fields to group related members - [x] Reorder Store struct fields with health check caching grouped - [x] Reorganize HealthInfo and PoolMetrics struct field order - [x] Reorder maintenance Service struct fields logically - [x] Reorganize MCP server handler parameter structs alphabetically - [x] Reorder pattern detector candidate tracking fields - [x] Reorganize search Manager struct fields by functionality - [x] Reorder vector Client struct fields with mutex protections grouped - [x] Reorganize handler request/response struct fields - [x] Update handlers_test.go to expect wrapped response format - [x] Reorder middleware TokenAuth and rate limiter fields - [x] Reorganize Service struct fields with grouped functionality - [x] Fix RateLimiter field ordering for clarity - [x] Reorder CircuitBreaker metrics fields * fix(security): improve JSON output safety and path traversal protection - [x] Replace unsafe JSON string formatting with proper json.Marshal in export handler - [x] Remove escapeJSONString helper function in favor of standard JSON marshaling - [x] Add safeResolvePath function to validate paths and prevent directory traversal - [x] Apply path traversal validation in captureFileMtimes operations - [x] Cap result slice capacity in getRecentSearchQueries to prevent DoS via excessive allocation * fix(sdk): improve path traversal protection and allocation safety - [x] Enhance safeResolvePath with stricter validation using filepath.Rel - [x] Reject paths containing ".." after cleaning to prevent traversal - [x] Validate absolute paths are within cwd when cwd is specified - [x] Apply safeResolvePath validation to GetFileContent for consistency - [x] Add comprehensive test coverage for path traversal protection - [x] Fix allocation safety in getRecentSearchQueries by using constant capacity * feat(dashboard): add graph stats and vector metrics endpoints - [x] Add handleGraphStats endpoint for knowledge graph visualization - [x] Add handleVectorMetrics endpoint for vector database dashboard - [x] Improve update check error handling with JSON response - [x] Register new API routes for graph and vector metrics - [x] Migrate Font Awesome to npm package from CDN - [x] Fix observations API response type handling - [x] Update package version to v0.10.5-15-g385d05a * fixup! feat(dashboard): add graph stats and vector metrics endpoints * test: add comprehensive test coverage across multiple packages - [x] Add 298 tests for Python chunker functionality - [x] Add 213 tests for chunking types and constants - [x] Add 398 tests for TypeScript/JavaScript chunker - [x] Add 954 tests for MCP server handlers and validation - [x] Add 563 tests for pattern detector and analysis - [x] Add 1149 tests for vector client cache and operations - [x] Add 663 tests for SDK processor, circuit breaker, and deduplication - [x] Add 731 tests for session manager lifecycle and concurrency - [x] Add 331 tests for similarity clustering and term extraction * fix(pattern): add nil check and fmt import for GetPatternInsight - [x] Add `fmt` import for error formatting - [x] Add nil check for pattern before using it - [x] Remove duplicate comment line
624 lines
21 KiB
Go
624 lines
21 KiB
Go
// Package similarity provides text similarity and clustering utilities.
|
|
package similarity
|
|
|
|
import (
|
|
"database/sql"
|
|
"testing"
|
|
|
|
"github.com/lukaszraczylo/claude-mnemonic/pkg/models"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
func TestJaccardSimilarity(t *testing.T) {
|
|
tests := []struct {
|
|
set1 map[string]bool
|
|
set2 map[string]bool
|
|
name string
|
|
expected float64
|
|
}{
|
|
{
|
|
name: "identical sets",
|
|
set1: map[string]bool{"a": true, "b": true, "c": true},
|
|
set2: map[string]bool{"a": true, "b": true, "c": true},
|
|
expected: 1.0,
|
|
},
|
|
{
|
|
name: "no overlap",
|
|
set1: map[string]bool{"a": true, "b": true},
|
|
set2: map[string]bool{"c": true, "d": true},
|
|
expected: 0.0,
|
|
},
|
|
{
|
|
name: "partial overlap",
|
|
set1: map[string]bool{"a": true, "b": true, "c": true},
|
|
set2: map[string]bool{"b": true, "c": true, "d": true},
|
|
expected: 0.5, // intersection=2, union=4
|
|
},
|
|
{
|
|
name: "empty sets",
|
|
set1: map[string]bool{},
|
|
set2: map[string]bool{},
|
|
expected: 1.0,
|
|
},
|
|
{
|
|
name: "one empty set",
|
|
set1: map[string]bool{"a": true},
|
|
set2: map[string]bool{},
|
|
expected: 0.0,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := JaccardSimilarity(tt.set1, tt.set2)
|
|
assert.InDelta(t, tt.expected, result, 0.001)
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractObservationTerms(t *testing.T) {
|
|
obs := &models.Observation{
|
|
Title: sql.NullString{String: "Authentication flow implementation", Valid: true},
|
|
Narrative: sql.NullString{String: "We implemented JWT-based authentication", Valid: true},
|
|
Facts: models.JSONStringArray{"Users authenticate via API", "Tokens expire after 24 hours"},
|
|
FilesRead: models.JSONStringArray{"/src/auth/handler.go", "/src/auth/jwt.go"},
|
|
}
|
|
|
|
terms := ExtractObservationTerms(obs)
|
|
|
|
// Should contain terms from title
|
|
assert.Contains(t, terms, "authentication")
|
|
assert.Contains(t, terms, "flow")
|
|
assert.Contains(t, terms, "implementation")
|
|
|
|
// Should contain terms from narrative
|
|
assert.Contains(t, terms, "implemented")
|
|
|
|
// Should contain terms from facts
|
|
assert.Contains(t, terms, "tokens")
|
|
assert.Contains(t, terms, "expire")
|
|
assert.Contains(t, terms, "hours")
|
|
|
|
// Should contain filenames (without path)
|
|
assert.Contains(t, terms, "handler.go")
|
|
assert.Contains(t, terms, "jwt.go")
|
|
|
|
// Should NOT contain stop words
|
|
assert.NotContains(t, terms, "the")
|
|
assert.NotContains(t, terms, "and")
|
|
assert.NotContains(t, terms, "we")
|
|
}
|
|
|
|
func TestClusterObservations(t *testing.T) {
|
|
// Create similar observations
|
|
obs1 := &models.Observation{
|
|
ID: 1,
|
|
Title: sql.NullString{String: "Authentication flow implementation", Valid: true},
|
|
Narrative: sql.NullString{String: "JWT-based authentication for API", Valid: true},
|
|
}
|
|
obs2 := &models.Observation{
|
|
ID: 2,
|
|
Title: sql.NullString{String: "Authentication flow update", Valid: true},
|
|
Narrative: sql.NullString{String: "Updated JWT authentication logic", Valid: true},
|
|
}
|
|
obs3 := &models.Observation{
|
|
ID: 3,
|
|
Title: sql.NullString{String: "Database migration guide", Valid: true},
|
|
Narrative: sql.NullString{String: "How to run database migrations", Valid: true},
|
|
}
|
|
obs4 := &models.Observation{
|
|
ID: 4,
|
|
Title: sql.NullString{String: "Database schema changes", Valid: true},
|
|
Narrative: sql.NullString{String: "Updated database schema for users", Valid: true},
|
|
}
|
|
|
|
observations := []*models.Observation{obs1, obs2, obs3, obs4}
|
|
|
|
// Cluster with 0.4 threshold
|
|
clustered := ClusterObservations(observations, 0.4)
|
|
|
|
// obs1 and obs2 should be clustered (similar authentication content)
|
|
// obs3 and obs4 should be clustered (similar database content)
|
|
t.Logf("Clustered %d observations down to %d", len(observations), len(clustered))
|
|
assert.LessOrEqual(t, len(clustered), 4)
|
|
assert.GreaterOrEqual(t, len(clustered), 1)
|
|
|
|
// First observation in each cluster should be kept (obs1 for auth, obs3 for db)
|
|
ids := make(map[int64]bool)
|
|
for _, obs := range clustered {
|
|
ids[obs.ID] = true
|
|
}
|
|
|
|
// Depending on threshold, obs1 should be kept (first in auth cluster)
|
|
if len(clustered) <= 3 {
|
|
assert.True(t, ids[1], "First observation (ID=1) should be kept as cluster representative")
|
|
}
|
|
}
|
|
|
|
func TestClusterObservations_SingleObservation(t *testing.T) {
|
|
obs := &models.Observation{
|
|
ID: 1,
|
|
Title: sql.NullString{String: "Single observation", Valid: true},
|
|
}
|
|
|
|
clustered := ClusterObservations([]*models.Observation{obs}, 0.4)
|
|
|
|
assert.Len(t, clustered, 1)
|
|
assert.Equal(t, int64(1), clustered[0].ID)
|
|
}
|
|
|
|
func TestClusterObservations_EmptyList(t *testing.T) {
|
|
clustered := ClusterObservations([]*models.Observation{}, 0.4)
|
|
assert.Len(t, clustered, 0)
|
|
}
|
|
|
|
func TestClusterObservations_NoDuplicates(t *testing.T) {
|
|
// Create observations with completely different content
|
|
observations := []*models.Observation{
|
|
{
|
|
ID: 1,
|
|
Title: sql.NullString{String: "Authentication system", Valid: true},
|
|
Narrative: sql.NullString{String: "JWT tokens for user auth", Valid: true},
|
|
},
|
|
{
|
|
ID: 2,
|
|
Title: sql.NullString{String: "Database configuration", Valid: true},
|
|
Narrative: sql.NullString{String: "PostgreSQL setup and migrations", Valid: true},
|
|
},
|
|
{
|
|
ID: 3,
|
|
Title: sql.NullString{String: "Caching layer", Valid: true},
|
|
Narrative: sql.NullString{String: "Redis caching implementation", Valid: true},
|
|
},
|
|
{
|
|
ID: 4,
|
|
Title: sql.NullString{String: "Logging setup", Valid: true},
|
|
Narrative: sql.NullString{String: "Structured logging with zerolog", Valid: true},
|
|
},
|
|
{
|
|
ID: 5,
|
|
Title: sql.NullString{String: "API endpoints", Valid: true},
|
|
Narrative: sql.NullString{String: "REST API implementation", Valid: true},
|
|
},
|
|
}
|
|
|
|
clustered := ClusterObservations(observations, 0.4)
|
|
|
|
// With completely different content, all should be kept
|
|
assert.Len(t, clustered, 5, "All unique observations should be kept")
|
|
}
|
|
|
|
func TestIsSimilarToAny(t *testing.T) {
|
|
existing := []*models.Observation{
|
|
{
|
|
ID: 1,
|
|
Title: sql.NullString{String: "Authentication implementation", Valid: true},
|
|
Narrative: sql.NullString{String: "JWT authentication flow", Valid: true},
|
|
},
|
|
{
|
|
ID: 2,
|
|
Title: sql.NullString{String: "Database setup", Valid: true},
|
|
Narrative: sql.NullString{String: "PostgreSQL configuration", Valid: true},
|
|
},
|
|
}
|
|
|
|
// New observation similar to existing
|
|
similar := &models.Observation{
|
|
ID: 3,
|
|
Title: sql.NullString{String: "Authentication update", Valid: true},
|
|
Narrative: sql.NullString{String: "JWT authentication changes", Valid: true},
|
|
}
|
|
|
|
// New observation not similar to any existing
|
|
different := &models.Observation{
|
|
ID: 4,
|
|
Title: sql.NullString{String: "Caching layer", Valid: true},
|
|
Narrative: sql.NullString{String: "Redis caching implementation", Valid: true},
|
|
}
|
|
|
|
assert.True(t, IsSimilarToAny(similar, existing, 0.3), "Similar observation should be detected")
|
|
assert.False(t, IsSimilarToAny(different, existing, 0.3), "Different observation should not match")
|
|
}
|
|
|
|
func TestIsSimilarToAny_EmptyExisting(t *testing.T) {
|
|
newObs := &models.Observation{
|
|
ID: 1,
|
|
Title: sql.NullString{String: "New observation", Valid: true},
|
|
}
|
|
|
|
assert.False(t, IsSimilarToAny(newObs, []*models.Observation{}, 0.4))
|
|
assert.False(t, IsSimilarToAny(newObs, nil, 0.4))
|
|
}
|
|
|
|
func TestAddTerms(t *testing.T) {
|
|
terms := make(map[string]bool)
|
|
|
|
addTerms(terms, "The quick brown fox jumps over the lazy dog")
|
|
|
|
// Should contain words >= 3 chars that aren't stop words
|
|
assert.Contains(t, terms, "quick")
|
|
assert.Contains(t, terms, "brown")
|
|
assert.Contains(t, terms, "fox")
|
|
assert.Contains(t, terms, "jumps")
|
|
assert.Contains(t, terms, "over")
|
|
assert.Contains(t, terms, "lazy")
|
|
assert.Contains(t, terms, "dog")
|
|
|
|
// Should NOT contain stop words
|
|
assert.NotContains(t, terms, "the")
|
|
|
|
// Should NOT contain short words
|
|
// (all words in the sentence are >= 3 chars after stop word removal)
|
|
}
|
|
|
|
func TestClusterObservations_MoreThanOldLimit(t *testing.T) {
|
|
// This test verifies that we can now return more than 5 observations
|
|
// after removing the hardcoded limit
|
|
|
|
// Create 10 completely unique observations with very different content
|
|
observations := []*models.Observation{
|
|
{ID: 1, Title: sql.NullString{String: "JWT tokens expire daily", Valid: true}},
|
|
{ID: 2, Title: sql.NullString{String: "PostgreSQL indexes optimize", Valid: true}},
|
|
{ID: 3, Title: sql.NullString{String: "Redis caching TTL values", Valid: true}},
|
|
{ID: 4, Title: sql.NullString{String: "Zerolog structured logging", Valid: true}},
|
|
{ID: 5, Title: sql.NullString{String: "Pytest fixtures setup", Valid: true}},
|
|
{ID: 6, Title: sql.NullString{String: "Docker containers orchestration", Valid: true}},
|
|
{ID: 7, Title: sql.NullString{String: "Prometheus metrics collection", Valid: true}},
|
|
{ID: 8, Title: sql.NullString{String: "OWASP vulnerability scanning", Valid: true}},
|
|
{ID: 9, Title: sql.NullString{String: "Goroutines parallel execution", Valid: true}},
|
|
{ID: 10, Title: sql.NullString{String: "Kubernetes horizontal scaling", Valid: true}},
|
|
}
|
|
|
|
clustered := ClusterObservations(observations, 0.4)
|
|
|
|
// With unique content, all 10 should be kept (previously would have been capped at 5)
|
|
assert.Len(t, clustered, 10, "Should return all 10 unique observations, not limited to 5")
|
|
}
|
|
|
|
func TestClusterObservations_PreservesOrder(t *testing.T) {
|
|
// The first observation in each cluster should be kept
|
|
observations := []*models.Observation{
|
|
{ID: 1, Title: sql.NullString{String: "First auth observation", Valid: true}},
|
|
{ID: 2, Title: sql.NullString{String: "Second auth observation", Valid: true}},
|
|
{ID: 3, Title: sql.NullString{String: "Database observation", Valid: true}},
|
|
}
|
|
|
|
clustered := ClusterObservations(observations, 0.4)
|
|
|
|
// First observation should always be first in result
|
|
require.NotEmpty(t, clustered)
|
|
assert.Equal(t, int64(1), clustered[0].ID, "First observation should be kept as first result")
|
|
}
|
|
|
|
// =============================================================================
|
|
// TESTS FOR OPTIMIZED CLUSTERING (triggered when len(observations) > 50)
|
|
// =============================================================================
|
|
|
|
func TestClusterObservationsOptimized_LargeSet(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Create 60 observations to trigger optimized path (threshold is 50)
|
|
observations := make([]*models.Observation, 60)
|
|
|
|
// Create 30 pairs of similar observations
|
|
topics := []string{
|
|
"authentication", "authorization", "database", "caching", "logging",
|
|
"monitoring", "testing", "deployment", "scaling", "security",
|
|
"networking", "storage", "messaging", "scheduling", "configuration",
|
|
"validation", "serialization", "encryption", "compression", "indexing",
|
|
"backup", "recovery", "migration", "versioning", "documentation",
|
|
"profiling", "debugging", "tracing", "alerting", "reporting",
|
|
}
|
|
|
|
for i := 0; i < 30; i++ {
|
|
// First observation of pair
|
|
observations[i*2] = &models.Observation{
|
|
ID: int64(i*2 + 1),
|
|
Title: sql.NullString{String: topics[i] + " implementation", Valid: true},
|
|
Narrative: sql.NullString{String: "Detailed " + topics[i] + " system design", Valid: true},
|
|
}
|
|
// Second observation of pair (similar to first)
|
|
observations[i*2+1] = &models.Observation{
|
|
ID: int64(i*2 + 2),
|
|
Title: sql.NullString{String: topics[i] + " update", Valid: true},
|
|
Narrative: sql.NullString{String: "Updated " + topics[i] + " logic", Valid: true},
|
|
}
|
|
}
|
|
|
|
clustered := ClusterObservations(observations, 0.4)
|
|
|
|
// With similar pairs, we should get roughly 30 clusters (one per topic)
|
|
t.Logf("Clustered %d observations down to %d", len(observations), len(clustered))
|
|
assert.Less(t, len(clustered), 60, "Similar observations should be clustered together")
|
|
assert.GreaterOrEqual(t, len(clustered), 1, "Should have at least one cluster")
|
|
}
|
|
|
|
func TestClusterObservationsOptimized_AllUnique(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Create 55 completely unique observations with NO shared terms
|
|
// Each observation has only its unique term (no common words like "topic" or "content")
|
|
uniqueTerms := []string{
|
|
"aardvark", "butterfly", "caterpillar", "dragonfly", "elephant",
|
|
"flamingo", "giraffe", "hippopotamus", "iguana", "jellyfish",
|
|
"kangaroo", "leopard", "mongoose", "nightingale", "octopus",
|
|
"penguin", "quail", "rhinoceros", "salamander", "toucan",
|
|
"umbrella", "vulture", "walrus", "xylophone", "yakking",
|
|
"zebra123", "astronomy99", "biology88", "chemistry77", "dynamics66",
|
|
"economics55", "forensics44", "genetics33", "hydraulics22", "immunology11",
|
|
"jurisprudence", "kinetics", "linguistics", "metallurgy", "neurology",
|
|
"oceanography", "pharmacology", "quantumphysics", "robotics", "sociology",
|
|
"thermodynamics", "ultrasound", "virology", "wavelength", "xenobiology",
|
|
"yeastculture", "zoology123", "algebra456", "botany789", "calculus012",
|
|
}
|
|
|
|
observations := make([]*models.Observation, 55)
|
|
for i := 0; i < 55; i++ {
|
|
// Each observation has ONLY its unique term - no shared words
|
|
observations[i] = &models.Observation{
|
|
ID: int64(i + 1),
|
|
Title: sql.NullString{String: uniqueTerms[i], Valid: true},
|
|
Narrative: sql.NullString{String: uniqueTerms[i], Valid: true},
|
|
}
|
|
}
|
|
|
|
clustered := ClusterObservations(observations, 0.4)
|
|
|
|
// All unique content should remain unclustered
|
|
assert.Len(t, clustered, 55, "All unique observations should be kept")
|
|
}
|
|
|
|
func TestClusterObservationsOptimized_SignaturePrefiltering(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Test that signature prefiltering works correctly
|
|
// Create observations where some have very different signatures
|
|
observations := make([]*models.Observation, 60)
|
|
|
|
// First half: all identical (about "authentication") - should cluster to 1
|
|
for i := 0; i < 30; i++ {
|
|
observations[i] = &models.Observation{
|
|
ID: int64(i + 1),
|
|
Title: sql.NullString{String: "authentication security login", Valid: true},
|
|
Narrative: sql.NullString{String: "JWT tokens OAuth authentication", Valid: true},
|
|
}
|
|
}
|
|
|
|
// Second half: each completely unique with NO shared terms
|
|
diffTerms := []string{
|
|
"quantumphysics", "photosynthesis", "archaeologydig", "linguisticstudy", "astronomystar",
|
|
"paleontologyfossil", "oceanographywave", "entomologybug", "mycologyfungi", "herpetologysnake",
|
|
"ornithologybird", "ichthyologyfish", "seismologyquake", "volcanologylava", "meteorologyrain",
|
|
"cartographymap", "ethnographyculture", "philologyword", "numismaticscoin", "heraldryshield",
|
|
"genealogytree", "chronologytime", "typographyfont", "calligraphyink", "epigraphystone",
|
|
"papyrologytext", "codicologybook", "diplomaticseal", "sigillographywax", "sphragisticsring",
|
|
}
|
|
for i := 30; i < 60; i++ {
|
|
term := diffTerms[i-30]
|
|
// Each has ONLY its unique term - no shared words
|
|
observations[i] = &models.Observation{
|
|
ID: int64(i + 1),
|
|
Title: sql.NullString{String: term, Valid: true},
|
|
Narrative: sql.NullString{String: term, Valid: true},
|
|
}
|
|
}
|
|
|
|
clustered := ClusterObservations(observations, 0.5)
|
|
|
|
// Should have 31 clusters: 1 for all auth topics + 30 unique topics
|
|
t.Logf("Clustered %d observations down to %d", len(observations), len(clustered))
|
|
assert.Equal(t, 31, len(clustered), "Should have 31 clusters (1 auth + 30 unique)")
|
|
}
|
|
|
|
// =============================================================================
|
|
// TESTS FOR HELPER FUNCTIONS
|
|
// =============================================================================
|
|
|
|
func TestComputeTermSignature(t *testing.T) {
|
|
tests := []struct {
|
|
terms map[string]bool
|
|
compareTo map[string]bool
|
|
name string
|
|
expectZero bool
|
|
expectSame bool
|
|
}{
|
|
// ===== GOOD CASES =====
|
|
{
|
|
name: "single term",
|
|
terms: map[string]bool{"hello": true},
|
|
expectZero: false,
|
|
},
|
|
{
|
|
name: "multiple terms",
|
|
terms: map[string]bool{"hello": true, "world": true},
|
|
expectZero: false,
|
|
},
|
|
{
|
|
name: "identical terms produce same signature",
|
|
terms: map[string]bool{"alpha": true, "beta": true},
|
|
expectSame: true,
|
|
compareTo: map[string]bool{"alpha": true, "beta": true},
|
|
},
|
|
|
|
// ===== EDGE CASES =====
|
|
{
|
|
name: "empty set",
|
|
terms: map[string]bool{},
|
|
expectZero: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
sig := computeTermSignature(tt.terms)
|
|
|
|
if tt.expectZero {
|
|
assert.Equal(t, uint64(0), sig, "Empty set should produce zero signature")
|
|
} else {
|
|
assert.NotEqual(t, uint64(0), sig, "Non-empty set should produce non-zero signature")
|
|
}
|
|
|
|
if tt.expectSame && tt.compareTo != nil {
|
|
sig2 := computeTermSignature(tt.compareTo)
|
|
assert.Equal(t, sig, sig2, "Identical term sets should produce identical signatures")
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestComputeTermSignature_DifferentSets(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Different term sets should usually produce different signatures
|
|
set1 := map[string]bool{"authentication": true, "security": true}
|
|
set2 := map[string]bool{"database": true, "migration": true}
|
|
|
|
sig1 := computeTermSignature(set1)
|
|
sig2 := computeTermSignature(set2)
|
|
|
|
// While hash collisions are possible, they should be rare
|
|
assert.NotEqual(t, sig1, sig2, "Different term sets should usually produce different signatures")
|
|
}
|
|
|
|
func TestPopCount64(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
input uint64
|
|
expected int
|
|
}{
|
|
// ===== GOOD CASES =====
|
|
{name: "zero", input: 0, expected: 0},
|
|
{name: "one", input: 1, expected: 1},
|
|
{name: "powers of two", input: 8, expected: 1},
|
|
{name: "all ones in byte", input: 0xFF, expected: 8},
|
|
{name: "alternating bits", input: 0xAAAAAAAAAAAAAAAA, expected: 32},
|
|
{name: "max uint64", input: 0xFFFFFFFFFFFFFFFF, expected: 64},
|
|
|
|
// ===== EDGE CASES =====
|
|
{name: "single high bit", input: 1 << 63, expected: 1},
|
|
{name: "sparse bits", input: 0x8000000000000001, expected: 2},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := popCount64(tt.input)
|
|
assert.Equal(t, tt.expected, result)
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestIsSimilarToAny_EmptyTerms(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Observation with no extractable terms
|
|
emptyObs := &models.Observation{
|
|
ID: 1,
|
|
Title: sql.NullString{String: "", Valid: false},
|
|
Narrative: sql.NullString{String: "", Valid: false},
|
|
}
|
|
|
|
existing := []*models.Observation{
|
|
{
|
|
ID: 2,
|
|
Title: sql.NullString{String: "Some content here", Valid: true},
|
|
Narrative: sql.NullString{String: "More content", Valid: true},
|
|
},
|
|
}
|
|
|
|
// Should return false when new observation has no terms
|
|
assert.False(t, IsSimilarToAny(emptyObs, existing, 0.3))
|
|
}
|
|
|
|
func TestExtractObservationTerms_FilesModified(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
obs := &models.Observation{
|
|
ID: 1,
|
|
Title: sql.NullString{String: "Code changes", Valid: true},
|
|
FilesModified: models.JSONStringArray{"/src/handler.go", "/pkg/models/user.go"},
|
|
}
|
|
|
|
terms := ExtractObservationTerms(obs)
|
|
|
|
// Should contain filenames from FilesModified
|
|
assert.Contains(t, terms, "handler.go")
|
|
assert.Contains(t, terms, "user.go")
|
|
}
|
|
|
|
func TestAddTerms_ShortWords(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
terms := make(map[string]bool)
|
|
|
|
addTerms(terms, "I am a go developer")
|
|
|
|
// Short words (< 3 chars) should be excluded
|
|
assert.NotContains(t, terms, "i")
|
|
assert.NotContains(t, terms, "am")
|
|
assert.NotContains(t, terms, "a")
|
|
assert.NotContains(t, terms, "go") // Only 2 chars
|
|
|
|
// "developer" should be included
|
|
assert.Contains(t, terms, "developer")
|
|
}
|
|
|
|
func TestAddTerms_SpecialCharacters(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
terms := make(map[string]bool)
|
|
|
|
addTerms(terms, "user_id authentication-flow JWT_token")
|
|
|
|
// Hyphens split words, but underscores are kept as part of the word
|
|
// (underscore is included in the tokenization regex)
|
|
assert.Contains(t, terms, "user_id")
|
|
assert.Contains(t, terms, "authentication")
|
|
assert.Contains(t, terms, "flow")
|
|
assert.Contains(t, terms, "jwt_token")
|
|
}
|
|
|
|
func TestJaccardSimilarity_SubsetSuperset(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
subset := map[string]bool{"a": true, "b": true}
|
|
superset := map[string]bool{"a": true, "b": true, "c": true, "d": true}
|
|
|
|
// Subset similarity should be intersection/union = 2/4 = 0.5
|
|
result := JaccardSimilarity(subset, superset)
|
|
assert.InDelta(t, 0.5, result, 0.001)
|
|
}
|
|
|
|
func TestClusterObservations_HighThreshold(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// With a very high threshold, almost nothing should be clustered
|
|
observations := []*models.Observation{
|
|
{ID: 1, Title: sql.NullString{String: "authentication implementation", Valid: true}},
|
|
{ID: 2, Title: sql.NullString{String: "authentication update", Valid: true}},
|
|
{ID: 3, Title: sql.NullString{String: "authentication refactor", Valid: true}},
|
|
}
|
|
|
|
// With threshold of 0.9, even similar observations shouldn't cluster
|
|
clustered := ClusterObservations(observations, 0.9)
|
|
|
|
assert.Len(t, clustered, 3, "High threshold should prevent clustering")
|
|
}
|
|
|
|
func TestClusterObservations_LowThreshold(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// With a very low threshold, more things should be clustered
|
|
observations := []*models.Observation{
|
|
{ID: 1, Title: sql.NullString{String: "authentication implementation details", Valid: true}},
|
|
{ID: 2, Title: sql.NullString{String: "authentication security update", Valid: true}},
|
|
{ID: 3, Title: sql.NullString{String: "something completely different topic", Valid: true}},
|
|
}
|
|
|
|
// With threshold of 0.1, partial overlap should cluster
|
|
clustered := ClusterObservations(observations, 0.1)
|
|
|
|
// First two share "authentication", should likely cluster
|
|
assert.LessOrEqual(t, len(clustered), 3)
|
|
}
|