Files
claude-mnemonic/internal/embedding/service_test.go
T
lukaszraczylo f79782a008 Release dec 2025 (#15)
* Resolves issue #13

- Switched model to bge-small-en-v1.5
- Added lazy re-embedding
- Added model version tracking per vector
- Added conversion of vectors to the new model

* Add lfs support to the workflow.

* Implements importance scoring with decay + voting #6

* Resolves issue #5 by marking observations as superseded and scheduling them for deletion

* Implement pattern detection #7

* Improve injections and observations accuracy

- Session start: Recent observations for project context (recency-based)
- User prompt: Semantically relevant observations (similarity-based with threshold)

* Added two stage retrieval with bi and cross encoder #8

* Implement query expansion and reformulation #9

* Knowledge graph and relationships (resolves #4)

- File Overlap Detection: Detects relationships when observations modify/read the same files
- Concept Overlap Detection: Detects relationships based on shared semantic concepts
- Type Progression Detection: Infers relationships from natural observation type progressions (e.g., discovery → bugfix = "fixes")
- Temporal Proximity Detection: Detects relationships between observations in the same session within 5 minutes
- Narrative Mention Detection: Detects explicit relationship language in narratives (e.g., "fixes", "depends on", "supersedes")

* Add visualisation of the relations to the dashboard.

* fixup! Add visualisation of the relations to the dashboard.

* Update documentation with new settings and screenshots.
2025-12-19 17:57:11 +00:00

335 lines
8.0 KiB
Go

package embedding
import (
"math"
"sync"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestEmbeddingDim checks that the EmbeddingDim constant equals 384.
func TestEmbeddingDim(t *testing.T) {
	const wantDim = 384
	assert.Equal(t, wantDim, EmbeddingDim)
}
// TestNewService checks that NewService returns a non-nil service without
// error, and that its accessors report a name, a version, and a dimension
// equal to EmbeddingDim.
func TestNewService(t *testing.T) {
	service, err := NewService()
	require.NoError(t, err)
	require.NotNil(t, service)
	defer service.Close()

	// Inspect the service only through its exported accessors.
	name, version, dims := service.Name(), service.Version(), service.Dimensions()
	assert.NotEmpty(t, name)
	assert.NotEmpty(t, version)
	assert.Equal(t, EmbeddingDim, dims)
}
// TestEmbed_SingleText embeds one short sentence and checks that the result
// has EmbeddingDim entries and is not the zero vector.
func TestEmbed_SingleText(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	vec, err := svc.Embed("Hello, world!")
	require.NoError(t, err)
	assert.Len(t, vec, EmbeddingDim)

	// A positive sum of squares means at least one component is non-zero.
	var sumSquares float32
	for i := range vec {
		sumSquares += vec[i] * vec[i]
	}
	assert.Greater(t, sumSquares, float32(0), "Embedding should not be all zeros")
}
// TestEmbed_EmptyText checks that embedding the empty string succeeds and
// yields an EmbeddingDim-length vector whose components are all zero.
func TestEmbed_EmptyText(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	vec, err := svc.Embed("")
	require.NoError(t, err)
	assert.Len(t, vec, EmbeddingDim)

	// Every component must be exactly zero for empty input.
	const zero = float32(0)
	for i := 0; i < len(vec); i++ {
		assert.Equal(t, zero, vec[i])
	}
}
// TestEmbed_SimilarTexts checks that two paraphrased sentences score a higher
// cosine similarity than a sentence on an unrelated topic, and that the
// paraphrase similarity exceeds 0.7.
func TestEmbed_SimilarTexts(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	// Index 0 is the reference sentence, 1 its paraphrase, 2 an unrelated topic.
	sentences := [...]string{
		"The quick brown fox jumps over the lazy dog.",
		"A fast brown fox leaps over a sleepy dog.",
		"Go programming language concurrency patterns.",
	}
	var vecs [len(sentences)][]float32
	for i, sentence := range sentences {
		vec, embedErr := svc.Embed(sentence)
		require.NoError(t, embedErr)
		vecs[i] = vec
	}

	simParaphrase := cosineSimilarity(vecs[0], vecs[1])
	simUnrelated := cosineSimilarity(vecs[0], vecs[2])

	const minSimilarity = 0.7
	assert.Greater(t, simParaphrase, simUnrelated, "Similar sentences should have higher similarity than dissimilar ones")
	assert.Greater(t, simParaphrase, minSimilarity, "Similar sentences should have high similarity")
}
// TestEmbedBatch_MultipleTexts checks that EmbedBatch returns one vector per
// input text and that each vector has EmbeddingDim entries.
func TestEmbedBatch_MultipleTexts(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	inputs := []string{
		"First text about programming.",
		"Second text about databases.",
		"Third text about machine learning.",
	}

	vecs, err := svc.EmbedBatch(inputs)
	require.NoError(t, err)
	assert.Len(t, vecs, len(inputs))

	for idx := range vecs {
		assert.Len(t, vecs[idx], EmbeddingDim, "Embedding %d should have correct dimension", idx)
	}
}
// TestEmbedBatch_EmptySlice checks that EmbedBatch on an empty (non-nil)
// slice succeeds and returns a nil result.
func TestEmbedBatch_EmptySlice(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	noTexts := []string{}
	vecs, err := svc.EmbedBatch(noTexts)
	require.NoError(t, err)
	assert.Nil(t, vecs)
}
// TestEmbedBatch_WithEmptyTexts checks a batch that mixes real and empty
// texts: every non-empty input must produce a non-zero vector, and every
// empty input must produce a vector whose components are all zero.
func TestEmbedBatch_WithEmptyTexts(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	texts := []string{
		"Valid text one.",
		"",
		"Valid text two.",
		"",
	}
	embeddings, err := svc.EmbedBatch(texts)
	require.NoError(t, err)
	// Stop on a length mismatch: the loop below indexes embeddings by
	// position and would panic instead of reporting a clean failure.
	require.Len(t, embeddings, len(texts))

	for i, text := range texts {
		if text == "" {
			// Empty texts should have zero embeddings.
			for j, v := range embeddings[i] {
				assert.Equal(t, float32(0), v, "Embedding %d[%d] should be zero", i, j)
			}
			continue
		}

		// Non-empty texts should have non-zero embeddings; a positive sum of
		// squares means at least one component is non-zero.
		var sumSquares float32
		for _, v := range embeddings[i] {
			sumSquares += v * v
		}
		assert.Greater(t, sumSquares, float32(0), "Embedding %d should not be all zeros", i)
	}
}
// TestEmbedBatch_AllEmpty checks that a batch of three empty strings yields
// three EmbeddingDim-length vectors whose components are all zero.
func TestEmbedBatch_AllEmpty(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	blanks := []string{"", "", ""}
	vecs, err := svc.EmbedBatch(blanks)
	require.NoError(t, err)
	assert.Len(t, vecs, 3)

	for row := range vecs {
		vec := vecs[row]
		assert.Len(t, vec, EmbeddingDim, "Embedding %d should have correct dimension", row)
		for col := range vec {
			assert.Equal(t, float32(0), vec[col], "Embedding %d[%d] should be zero", row, col)
		}
	}
}
// TestEmbed_Concurrent calls Embed from ten goroutines at once on a shared
// service, reports any error returned, and checks the dimension of every
// vector that was produced.
func TestEmbed_Concurrent(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	const (
		workers = 10
		input   = "Test text for concurrent embedding test"
	)

	// Buffered to the worker count so a failing goroutine never blocks on send.
	failures := make(chan error, workers)
	// Each goroutine writes only to its own slot of this slice.
	results := make([][]float32, workers)

	var wg sync.WaitGroup
	wg.Add(workers)
	for w := 0; w < workers; w++ {
		go func(slot int) {
			defer wg.Done()
			vec, embedErr := svc.Embed(input)
			if embedErr != nil {
				failures <- embedErr
				return
			}
			results[slot] = vec
		}(w)
	}
	wg.Wait()
	close(failures)

	for failure := range failures {
		t.Errorf("Concurrent embedding error: %v", failure)
	}

	// Nil slots are skipped; a goroutine that hit an error leaves its slot nil.
	for slot, vec := range results {
		if vec == nil {
			continue
		}
		assert.Len(t, vec, EmbeddingDim, "Embedding %d should have correct dimension", slot)
	}
}
// TestEmbed_SpecialCharacters checks that texts containing non-ASCII
// characters, control characters, quotes, and source code embed without
// error and produce EmbeddingDim-length vectors.
func TestEmbed_SpecialCharacters(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	// Each case carries an explicit subtest name. Deriving the name by
	// byte-slicing the text can cut a multi-byte rune in half (yielding
	// invalid UTF-8 in the name) and panics for texts shorter than the
	// slice bound.
	cases := []struct {
		name string
		text string
	}{
		{name: "unicode", text: "Text with unicode: 你好世界 🎉"},
		{name: "newlines", text: "Text with newlines:\nLine 1\nLine 2"},
		{name: "tabs", text: "Text with tabs:\tColumn1\tColumn2"},
		{name: "quotes", text: "Text with quotes: \"quoted\" and 'single'"},
		{name: "code", text: "Text with code: func main() { fmt.Println(\"hello\") }"},
	}

	for _, tc := range cases {
		// t.Run executes the closure synchronously (no t.Parallel), so using
		// tc directly inside it is safe on every Go version.
		t.Run(tc.name, func(t *testing.T) {
			emb, err := svc.Embed(tc.text)
			require.NoError(t, err)
			assert.Len(t, emb, EmbeddingDim)
		})
	}
}
// TestEmbed_LongText checks that a long input (one sentence repeated 100
// times) embeds without error and yields an EmbeddingDim-length vector.
func TestEmbed_LongText(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	// Build the long input; the service is expected to truncate over-long
	// text itself (truncation is not verified here).
	const (
		sentence = "This is a sentence to make the text very long. "
		copies   = 100
	)
	buf := make([]byte, 0, len(sentence)*copies)
	for n := 0; n < copies; n++ {
		buf = append(buf, sentence...)
	}

	vec, err := svc.Embed(string(buf))
	require.NoError(t, err)
	assert.Len(t, vec, EmbeddingDim)
}
// TestClose checks that closing a freshly created service returns no error.
func TestClose(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)

	require.NoError(t, svc.Close())

	// Behaviour of Embed after Close is intentionally left unasserted: it is
	// model-specific, and some models may still work once closed.
}
// TestEmbedBatch_SingleItem checks that a one-element batch yields exactly
// one vector with EmbeddingDim entries.
func TestEmbedBatch_SingleItem(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	texts := []string{"Single text for batch embedding."}
	embeddings, err := svc.EmbedBatch(texts)
	require.NoError(t, err)
	// require (not assert) so an empty result fails here rather than
	// panicking on embeddings[0] below.
	require.Len(t, embeddings, 1)
	assert.Len(t, embeddings[0], EmbeddingDim)
}
// TestEmbed_Deterministic checks that embedding the same text twice on the
// same service yields component-wise identical vectors.
func TestEmbed_Deterministic(t *testing.T) {
	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()

	text := "Test text for deterministic embedding."
	emb1, err := svc.Embed(text)
	require.NoError(t, err)
	emb2, err := svc.Embed(text)
	require.NoError(t, err)

	// Verify both lengths before indexing: a short vector would otherwise
	// cause an index-out-of-range panic instead of a test failure.
	require.Len(t, emb1, EmbeddingDim)
	require.Len(t, emb2, EmbeddingDim)

	// Same text should produce the same embedding.
	for i := range emb1 {
		assert.Equal(t, emb1[i], emb2[i], "Embedding should be deterministic at index %d", i)
	}
}
// cosineSimilarity returns the cosine of the angle between a and b, with all
// arithmetic done in float64. It returns 0 when the slices differ in length
// or when either one has zero magnitude (which includes empty slices).
func cosineSimilarity(a, b []float32) float64 {
	if len(a) != len(b) {
		return 0
	}

	var dot, sqA, sqB float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		sqA += x * x
		sqB += y * y
	}

	// A zero-magnitude vector has no direction; report 0 instead of dividing by zero.
	if sqA == 0 || sqB == 0 {
		return 0
	}
	return dot / (math.Sqrt(sqA) * math.Sqrt(sqB))
}