test: add regression tests for #45 hang fixes

- MCP server: 4 tests verifying concurrent dispatch, slow-request
  isolation, semaphore limiting, and graceful drain on cancel
- Embedding: 4 tests verifying context-aware mutex cancellation,
  uncontended success, batch cancellation, and cleanup after cancel
- Vector client: 3 tests for acquireRLockWithContext cancel, success,
  and cleanup goroutine correctness
- Worker handlers: 1 test verifying handleSearchByPrompt inherits
  request context cancellation (skips without FTS5)

12 regression tests total covering the four fix areas.
This commit is contained in:
2026-05-26 12:45:12 +01:00
parent 29d57857ff
commit de5796bbe6
4 changed files with 668 additions and 0 deletions
+143
View File
@@ -1,9 +1,12 @@
package embedding
import (
"context"
"errors"
"math"
"sync"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -310,6 +313,146 @@ func TestEmbed_Deterministic(t *testing.T) {
}
}
// --- Regression tests: context-aware mutex (Fix #45) ---
// asBGE returns the service's model as a *bgeModel so tests can reach its
// mutex directly. The tests live in the same package, which is what makes the
// unexported field access legal. The test is aborted if the model has any
// other concrete type.
func asBGE(t *testing.T, svc *Service) *bgeModel {
	t.Helper()

	bge, isBGE := svc.model.(*bgeModel)
	require.True(t, isBGE, "model is not *bgeModel — test invariant broken")

	return bge
}
// holdMutex locks m.mu in a background goroutine and returns a release func.
// The returned ready channel is closed once the lock is held.
//
// release tells the goroutine to unlock and then blocks until the goroutine
// has called m.mu.Unlock and is exiting, so once release returns this helper
// no longer holds the lock. release is idempotent: repeated (or concurrent)
// calls never double-close the signal channel; they simply wait for the same
// unlock.
//
// Callers should receive from ready before calling release. If release is
// called earlier, it waits until the goroutine has first obtained and then
// dropped the lock.
func holdMutex(m *bgeModel) (ready <-chan struct{}, release func()) {
	locked := make(chan struct{})   // closed once m.mu is held
	done := make(chan struct{})     // closed to request the unlock
	unlocked := make(chan struct{}) // closed after m.mu has been unlocked

	go func() {
		defer close(unlocked)
		m.mu.Lock()
		close(locked) // signal: lock acquired
		<-done        // wait for release signal
		m.mu.Unlock()
	}()

	var once sync.Once
	return locked, func() {
		// Guard the close so a second release call cannot panic.
		once.Do(func() { close(done) })
		// Wait for the unlock so the caller observes a free mutex (with
		// respect to this helper) as soon as release returns.
		<-unlocked
	}
}
// TestEmbedWithContext_CancelWhileWaitingForMutex is the core regression test.
// If the mutex is held and the context times out, EmbedWithContext must return
// promptly with a context error — not block until the mutex is released.
func TestEmbedWithContext_CancelWhileWaitingForMutex(t *testing.T) {
	const (
		// ctxTimeout is how long the call may wait for the mutex before its
		// context expires.
		ctxTimeout = 100 * time.Millisecond
		// maxElapsed is the upper bound on the whole call. It is still well
		// under the 30 s default that the regression would run into, but
		// leaves generous scheduling slack so a loaded CI machine (or a
		// -race build) does not produce false failures.
		maxElapsed = 2 * time.Second
	)

	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()
	m := asBGE(t, svc)

	// Hold the mutex to simulate a stuck ONNX call.
	ready, release := holdMutex(m)
	<-ready // ensure lock is held before proceeding
	// Deferred after svc.Close, so the lock is released before Close runs.
	defer release()

	ctx, cancel := context.WithTimeout(context.Background(), ctxTimeout)
	defer cancel()

	start := time.Now()
	_, err = svc.EmbedWithContext(ctx, "test text")
	elapsed := time.Since(start)

	// Must return a context error.
	require.Error(t, err)
	assert.True(t,
		errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled),
		"expected context error, got: %v", err)

	// Must return shortly after the context deadline rather than staying
	// blocked on the mutex, which is held until the deferred release.
	assert.Less(t, elapsed, maxElapsed,
		"EmbedWithContext blocked too long: %v", elapsed)
}
// TestEmbedWithContext_SuccessWhenUncontended covers the happy path: with
// nobody else holding the mutex, EmbedWithContext returns a vector of
// EmbeddingDim components that is not entirely zero.
func TestEmbedWithContext_SuccessWhenUncontended(t *testing.T) {
	service, err := NewService()
	require.NoError(t, err)
	defer service.Close()

	vec, err := service.EmbedWithContext(context.Background(), "hello world")
	require.NoError(t, err)
	assert.Len(t, vec, EmbeddingDim)

	// A positive sum of squares means at least one component is non-zero.
	var sqNorm float32
	for i := range vec {
		sqNorm += vec[i] * vec[i]
	}
	assert.Greater(t, sqNorm, float32(0), "embedding should not be all zeros")
}
// TestEmbedBatchWithContext_CancelDuringBatch verifies batch embedding respects
// context cancellation while blocked on mutex acquisition: with the mutex held
// by another goroutine, the call must come back with a context error shortly
// after its deadline instead of waiting for the lock.
func TestEmbedBatchWithContext_CancelDuringBatch(t *testing.T) {
	const (
		// ctxTimeout is how long the call may wait for the mutex before its
		// context expires.
		ctxTimeout = 100 * time.Millisecond
		// maxElapsed bounds the whole call. It is deliberately much larger
		// than ctxTimeout so scheduler jitter on a busy CI machine (or a
		// -race build) cannot fail the test, while a call that ignores the
		// context is still caught.
		maxElapsed = 2 * time.Second
	)

	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()
	m := asBGE(t, svc)

	// Keep the mutex locked for the duration of the batch call.
	ready, release := holdMutex(m)
	<-ready // ensure lock is held before proceeding
	// Deferred after svc.Close, so the lock is released before Close runs.
	defer release()

	ctx, cancel := context.WithTimeout(context.Background(), ctxTimeout)
	defer cancel()

	start := time.Now()
	_, err = svc.EmbedBatchWithContext(ctx, []string{"a", "b", "c"})
	elapsed := time.Since(start)

	require.Error(t, err)
	assert.True(t,
		errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled),
		"expected context error, got: %v", err)
	assert.Less(t, elapsed, maxElapsed,
		"EmbedBatchWithContext blocked too long: %v", elapsed)
}
// TestEmbedWithContext_CleanupAfterCancel verifies the cleanup goroutine in
// acquireMutex properly unlocks the mutex after context cancellation,
// so subsequent calls do not deadlock.
//
// The second call runs under its own deadline so that a regression shows up as
// a failed assertion rather than a test that hangs until the global go test
// timeout.
func TestEmbedWithContext_CleanupAfterCancel(t *testing.T) {
	// retryTimeout bounds the second call; it only needs to be comfortably
	// longer than one uncontended embedding.
	const retryTimeout = 10 * time.Second

	svc, err := NewService()
	require.NoError(t, err)
	defer svc.Close()
	m := asBGE(t, svc)

	// --- first call: context expires while mutex is held ---
	ready, release := holdMutex(m)
	<-ready
	// Make sure the held mutex is released exactly once even when an
	// assertion below aborts the test early. This defer is registered after
	// svc.Close, so it runs first and Close never sees the lock still held by
	// the helper goroutine.
	released := false
	defer func() {
		if !released {
			release()
		}
	}()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
	defer cancel()

	_, firstErr := svc.EmbedWithContext(ctx, "should fail")
	require.Error(t, firstErr)
	assert.True(t,
		errors.Is(firstErr, context.DeadlineExceeded) || errors.Is(firstErr, context.Canceled),
		"expected context error on first call, got: %v", firstErr)

	// Release the held mutex so the cleanup goroutine inside acquireMutex can finish.
	released = true
	release()

	// --- second call: mutex should become free, no deadlock ---
	// No fixed sleep is needed: the call itself waits for the mutex, and the
	// deadline turns a lock that is never released into a reported error.
	retryCtx, retryCancel := context.WithTimeout(context.Background(), retryTimeout)
	defer retryCancel()

	emb, secondErr := svc.EmbedWithContext(retryCtx, "should work")
	require.NoError(t, secondErr, "second call should succeed after cleanup goroutine released mutex")
	assert.Len(t, emb, EmbeddingDim)
}
// Helper function to calculate cosine similarity
func cosineSimilarity(a, b []float32) float64 {
if len(a) != len(b) {