fix: address 15 additional hang vectors found during deep audit (#45)

MCP server (5 fixes):
- Move semaphore acquisition inside goroutine so main loop stays
  responsive when all slots are taken
- Add 10s write timeout to sendResponse to prevent pipe deadlock
  when Claude Code pauses reading stdout
- Send fallback JSON-RPC error when json.Marshal fails instead of
  silently swallowing the error and leaving caller waiting forever
- Silence unknown notification methods (req.ID == nil) instead of
  sending unsolicited error responses that may desync the host
- Return MCP isError content for tool failures instead of top-level
  JSON-RPC error, matching the MCP specification

Vector/embedding (3 fixes):
- Move EmbedBatchWithContext call before writeMu.Lock in AddDocuments
  so ONNX inference runs outside the write lock
- Replace singleflight.Do with DoChan + ctx select in both
  getOrComputeEmbedding and UnifiedSearch so callers can bail out
  independently when their context expires
- Add activeQueries atomic counter; skip cache warming when user
  queries are in-flight; reduce warming timeout from 5s to 2s

Hooks (4 fixes):
- Cap EnsureWorkerRunning to 15s hard deadline with context; reduce
  StartupTimeout from 30s to 10s; reduce port-in-use retries
- Fix nil dereference panic in user-prompt hook when initResult is
  nil (non-JSON worker response); use comma-ok assertions
- Use package-level hookClient/healthClient with DisableKeepAlives
  to prevent FD leaks in short-lived hook processes
- Set SysProcAttr{Setpgid: true} to detach worker from hook process
  group, preventing kill-cascade from Claude Code

Worker/DB (3 fixes):
- Replace os.Exit(0) in MCP config watcher with context cancellation
  for clean protocol shutdown
- Add 60s context.WithTimeout around ProcessObservation calls in
  processAllSessions to prevent hung CLI subprocesses from blocking
  the queue processor forever
- Set explicit PRAGMA wal_autocheckpoint=1000 and add PASSIVE WAL
  checkpoint to Optimize() to prevent checkpoint stalls

Adds 20+ regression tests across all fix areas.
This commit is contained in:
2026-05-26 13:52:09 +01:00
parent de5796bbe6
commit a81482d06a
15 changed files with 952 additions and 92 deletions
+25 -12
View File
@@ -145,20 +145,24 @@ func (c *Client) AddDocuments(ctx context.Context, docs []Document) error {
return nil
}
c.writeMu.Lock()
defer c.writeMu.Unlock()
// Generate embeddings for all documents
// Prepare texts for embedding
texts := make([]string, len(docs))
for i, doc := range docs {
texts[i] = doc.Content
}
embeddings, err := c.embedSvc.EmbedBatch(texts)
// Compute embeddings OUTSIDE the write lock for better concurrency.
// Embedding is ONNX inference (slow, mutex-protected internally) — holding
// writeMu during inference blocks all concurrent writes and reads.
embeddings, err := c.embedSvc.EmbedBatchWithContext(ctx, texts)
if err != nil {
return fmt.Errorf("generate embeddings: %w", err)
}
// Acquire write lock for DB operations only
c.writeMu.Lock()
defer c.writeMu.Unlock()
// Insert into vectors table with model version tracking
const insertQuery = `
INSERT OR REPLACE INTO vectors (doc_id, embedding, sqlite_id, doc_type, field_type, project, scope, model_version)
@@ -906,8 +910,10 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
}
c.queryCacheMu.RUnlock()
// Cache miss - use singleflight to deduplicate concurrent embedding requests
result, err, _ := c.embeddingGroup.Do(query, func() (any, error) {
// Cache miss - use singleflight DoChan to deduplicate concurrent embedding requests.
// DoChan + select allows per-caller context cancellation: if a caller's context
// expires it can bail out without waiting for the shared computation to finish.
ch := c.embeddingGroup.DoChan(query, func() (any, error) {
// Double-check cache inside singleflight (another goroutine may have just cached it)
c.queryCacheMu.RLock()
if entry, ok := c.queryCache[query]; ok {
@@ -921,8 +927,10 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
// Record cache miss
c.stats.embeddingMisses.Add(1)
// Compute embedding with context-aware lock acquisition
emb, err := c.embedSvc.EmbedWithContext(ctx, query)
// Compute embedding — use non-context Embed here because the singleflight
// result is shared across callers with different contexts. Per-caller
// cancellation is handled by the select below.
emb, err := c.embedSvc.Embed(query)
if err != nil {
return nil, err
}
@@ -969,10 +977,15 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
return emb, nil
})
if err != nil {
return nil, err
select {
case res := <-ch:
if res.Err != nil {
return nil, res.Err
}
return res.Val.([]float32), nil
case <-ctx.Done():
return nil, ctx.Err()
}
return result.([]float32), nil
}
// ClearCache clears the embedding cache.
+139
View File
@@ -3,6 +3,7 @@ package sqlitevec
import (
"context"
"database/sql"
"fmt"
"os"
"path/filepath"
"sync"
@@ -2013,3 +2014,141 @@ func TestAcquireRLockWithContext_CleanupOnCancel(t *testing.T) {
t.Fatal("write lock acquisition timed out: cleanup goroutine may have leaked an RLock")
}
}
// =============================================================================
// REGRESSION TESTS FOR Fix #1: Embedding outside writeMu in AddDocuments
// =============================================================================
// TestAddDocuments_EmbeddingOutsideWriteLock verifies that AddDocuments does NOT
// hold the write lock during embedding computation. A concurrent Query call
// should complete quickly while AddDocuments is computing embeddings.
//
// The check is timing-based: a Query issued shortly after a background
// AddDocuments call starts must return without error in under one second.
func TestAddDocuments_EmbeddingOutsideWriteLock(t *testing.T) {
	db, dbCleanup := testDB(t)
	defer dbCleanup()

	embedSvc, embedCleanup := testEmbeddingService(t)
	defer embedCleanup()

	client, err := NewClient(Config{DB: db}, embedSvc)
	require.NoError(t, err)

	// Seed the DB with one document so Query has something to search.
	seedDocs := []Document{
		{ID: "seed-1", Content: "Seed document for concurrency test"},
	}
	err = client.AddDocuments(context.Background(), seedDocs)
	require.NoError(t, err)

	// Pre-warm the embedding cache for the query text so the Query call
	// itself doesn't need the embedding mutex — it only needs the DB read lock.
	_, err = client.Query(context.Background(), "concurrency test", 5, nil)
	require.NoError(t, err)

	// Prepare a batch of documents to trigger a slow AddDocuments call.
	batchDocs := make([]Document, 10)
	for i := range batchDocs {
		batchDocs[i] = Document{
			ID:      fmt.Sprintf("batch-%d", i),
			Content: fmt.Sprintf("Batch document number %d for write lock test with unique content", i),
		}
	}

	// Launch AddDocuments in background — embedding will take time.
	// addDone is closed rather than sent on, so it can be waited on both by
	// the deferred guard below and by the explicit wait at the end of the test.
	// addErr is written before the close, which makes the later read safe.
	var addErr error
	addDone := make(chan struct{})
	go func() {
		defer close(addDone)
		addErr = client.AddDocuments(context.Background(), batchDocs)
	}()
	// Always wait for the background call before the cleanups deferred above
	// run (defers are LIFO, so this one fires first). Without it, a fatal
	// require failure below would unwind the test and run dbCleanup and
	// embedCleanup — presumably tearing down the DB and embedding service —
	// while AddDocuments may still be using them.
	defer func() { <-addDone }()

	// Give AddDocuments a moment to start embedding computation.
	time.Sleep(10 * time.Millisecond)

	// Invalidate result cache so Query must go through to DB (tests read lock).
	client.InvalidateResultCache()

	// A concurrent Query should NOT be blocked by AddDocuments.
	// If the old code held writeMu during embedding, this would block until
	// embedding finishes. With the fix, it should complete quickly.
	queryCtx, queryCancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer queryCancel()

	start := time.Now()
	_, err = client.Query(queryCtx, "concurrency test", 5, nil)
	queryDuration := time.Since(start)
	require.NoError(t, err, "Query should succeed while AddDocuments is embedding")

	// The query should complete well within the timeout if writeMu is not held.
	assert.Less(t, queryDuration, 1*time.Second,
		"Query should complete quickly when writeMu is not held during embedding")

	// Wait for AddDocuments to finish and check its result.
	<-addDone
	require.NoError(t, addErr, "AddDocuments should succeed")
}
// =============================================================================
// REGRESSION TESTS FOR Fix #2a: DoChan + context select in getOrComputeEmbedding
// =============================================================================
// TestGetOrComputeEmbedding_ContextCancelDuringSingleflight issues two
// overlapping Query calls for the same text: one in the background with an
// unbounded context, and a second with a 20ms deadline. If the second call
// fails, the error must be context.DeadlineExceeded and it must have returned
// in under 500ms. A successful second call is accepted as well, since the
// shared computation may simply finish before the deadline.
func TestGetOrComputeEmbedding_ContextCancelDuringSingleflight(t *testing.T) {
	sqlDB, closeDB := testDB(t)
	defer closeDB()

	embedder, closeEmbedder := testEmbeddingService(t)
	defer closeEmbedder()

	client, err := NewClient(Config{DB: sqlDB}, embedder)
	require.NoError(t, err)

	// Index a single document so the queries below have something to match.
	err = client.AddDocuments(context.Background(), []Document{
		{ID: "sf-test-1", Content: "Singleflight context cancellation test document"},
	})
	require.NoError(t, err)

	// Drop both caches so the next query has to compute its embedding.
	client.ClearCache()
	client.InvalidateResultCache()

	const queryText = "unique singleflight context test query"

	// Background caller: unbounded context, result deliberately ignored.
	// Its only job is to get the shared embedding computation under way.
	backgroundDone := make(chan struct{})
	go func() {
		defer close(backgroundDone)
		_, _ = client.Query(context.Background(), queryText, 5, nil)
	}()

	// Brief pause so the background caller has a chance to start first.
	time.Sleep(5 * time.Millisecond)

	// Foreground caller: same query text, but with a tight deadline.
	deadlineCtx, cancelDeadline := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer cancelDeadline()

	// Invalidate the result cache once more so this call reaches the
	// embedding lookup instead of being served a cached result.
	client.InvalidateResultCache()

	began := time.Now()
	_, queryErr := client.Query(deadlineCtx, queryText, 5, nil)
	took := time.Since(began)

	// Only the failure path is checked. A nil error means the computation
	// finished before the deadline, which is an equally valid outcome; the
	// point of the test is that an expired context does not leave the caller
	// blocked for the full embedding duration.
	if queryErr != nil {
		assert.ErrorIs(t, queryErr, context.DeadlineExceeded,
			"Should return DeadlineExceeded when context expires during singleflight")
		assert.Less(t, took, 500*time.Millisecond,
			"Should return promptly on context cancellation, not wait for slow computation")
	}

	// Let the background caller finish before the deferred cleanups run.
	<-backgroundDone
}