mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-05 23:03:55 +00:00
fix: address 15 additional hang vectors found during deep audit (#45)
MCP server (5 fixes):
- Move semaphore acquisition inside goroutine so main loop stays
responsive when all slots are taken
- Add 10s write timeout to sendResponse to prevent pipe deadlock
when Claude Code pauses reading stdout
- Send fallback JSON-RPC error when json.Marshal fails instead of
silently swallowing the error and leaving caller waiting forever
- Silence unknown notification methods (req.ID == nil) instead of
sending unsolicited error responses that may desync the host
- Return MCP isError content for tool failures instead of top-level
JSON-RPC error, matching the MCP specification
Vector/embedding (3 fixes):
- Move EmbedBatchWithContext call before writeMu.Lock in AddDocuments
so ONNX inference runs outside the write lock
- Replace singleflight.Do with DoChan + ctx select in both
getOrComputeEmbedding and UnifiedSearch so callers can bail out
independently when their context expires
- Add activeQueries atomic counter; skip cache warming when user
queries are in-flight; reduce warming timeout from 5s to 2s
Hooks (4 fixes):
- Cap EnsureWorkerRunning at a 15s hard deadline enforced via context;
reduce StartupTimeout from 30s to 10s; reduce port-in-use retries
- Fix nil dereference panic in user-prompt hook when initResult is
nil (non-JSON worker response); use comma-ok assertions
- Use package-level hookClient/healthClient with DisableKeepAlives
to prevent FD leaks in short-lived hook processes
- Set SysProcAttr{Setpgid: true} to detach worker from hook process
group, preventing kill-cascade from Claude Code
Worker/DB (3 fixes):
- Replace os.Exit(0) in MCP config watcher with context cancellation
for clean protocol shutdown
- Add 60s context.WithTimeout around ProcessObservation calls in
processAllSessions to prevent hung CLI subprocesses from blocking
the queue processor forever
- Set explicit PRAGMA wal_autocheckpoint=1000 and add PASSIVE WAL
checkpoint to Optimize() to prevent checkpoint stalls
Adds 20+ regression tests across all fix areas.
This commit is contained in:
@@ -145,20 +145,24 @@ func (c *Client) AddDocuments(ctx context.Context, docs []Document) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
c.writeMu.Lock()
|
||||
defer c.writeMu.Unlock()
|
||||
|
||||
// Generate embeddings for all documents
|
||||
// Prepare texts for embedding
|
||||
texts := make([]string, len(docs))
|
||||
for i, doc := range docs {
|
||||
texts[i] = doc.Content
|
||||
}
|
||||
|
||||
embeddings, err := c.embedSvc.EmbedBatch(texts)
|
||||
// Compute embeddings OUTSIDE the write lock for better concurrency.
|
||||
// Embedding is ONNX inference (slow, mutex-protected internally) — holding
|
||||
// writeMu during inference blocks all concurrent writes and reads.
|
||||
embeddings, err := c.embedSvc.EmbedBatchWithContext(ctx, texts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("generate embeddings: %w", err)
|
||||
}
|
||||
|
||||
// Acquire write lock for DB operations only
|
||||
c.writeMu.Lock()
|
||||
defer c.writeMu.Unlock()
|
||||
|
||||
// Insert into vectors table with model version tracking
|
||||
const insertQuery = `
|
||||
INSERT OR REPLACE INTO vectors (doc_id, embedding, sqlite_id, doc_type, field_type, project, scope, model_version)
|
||||
@@ -906,8 +910,10 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
|
||||
}
|
||||
c.queryCacheMu.RUnlock()
|
||||
|
||||
// Cache miss - use singleflight to deduplicate concurrent embedding requests
|
||||
result, err, _ := c.embeddingGroup.Do(query, func() (any, error) {
|
||||
// Cache miss - use singleflight DoChan to deduplicate concurrent embedding requests.
|
||||
// DoChan + select allows per-caller context cancellation: if a caller's context
|
||||
// expires it can bail out without waiting for the shared computation to finish.
|
||||
ch := c.embeddingGroup.DoChan(query, func() (any, error) {
|
||||
// Double-check cache inside singleflight (another goroutine may have just cached it)
|
||||
c.queryCacheMu.RLock()
|
||||
if entry, ok := c.queryCache[query]; ok {
|
||||
@@ -921,8 +927,10 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
|
||||
// Record cache miss
|
||||
c.stats.embeddingMisses.Add(1)
|
||||
|
||||
// Compute embedding with context-aware lock acquisition
|
||||
emb, err := c.embedSvc.EmbedWithContext(ctx, query)
|
||||
// Compute embedding — use non-context Embed here because the singleflight
|
||||
// result is shared across callers with different contexts. Per-caller
|
||||
// cancellation is handled by the select below.
|
||||
emb, err := c.embedSvc.Embed(query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -969,10 +977,15 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
|
||||
return emb, nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
select {
|
||||
case res := <-ch:
|
||||
if res.Err != nil {
|
||||
return nil, res.Err
|
||||
}
|
||||
return res.Val.([]float32), nil
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
return result.([]float32), nil
|
||||
}
|
||||
|
||||
// ClearCache clears the embedding cache.
|
||||
|
||||
@@ -3,6 +3,7 @@ package sqlitevec
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
@@ -2013,3 +2014,141 @@ func TestAcquireRLockWithContext_CleanupOnCancel(t *testing.T) {
|
||||
t.Fatal("write lock acquisition timed out: cleanup goroutine may have leaked an RLock")
|
||||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// REGRESSION TESTS FOR Fix #1: Embedding outside writeMu in AddDocuments
|
||||
// =============================================================================
|
||||
|
||||
// TestAddDocuments_EmbeddingOutsideWriteLock verifies that AddDocuments does NOT
|
||||
// hold the write lock during embedding computation. A concurrent Query call
|
||||
// should complete quickly while AddDocuments is computing embeddings.
|
||||
func TestAddDocuments_EmbeddingOutsideWriteLock(t *testing.T) {
|
||||
db, dbCleanup := testDB(t)
|
||||
defer dbCleanup()
|
||||
|
||||
embedSvc, embedCleanup := testEmbeddingService(t)
|
||||
defer embedCleanup()
|
||||
|
||||
client, err := NewClient(Config{DB: db}, embedSvc)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Seed the DB with one document so Query has something to search
|
||||
seedDocs := []Document{
|
||||
{ID: "seed-1", Content: "Seed document for concurrency test"},
|
||||
}
|
||||
err = client.AddDocuments(context.Background(), seedDocs)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Pre-warm the embedding cache for the query text so the Query call
|
||||
// itself doesn't need the embedding mutex — it only needs the DB read lock.
|
||||
_, err = client.Query(context.Background(), "concurrency test", 5, nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Prepare a batch of documents to trigger a slow AddDocuments call
|
||||
batchDocs := make([]Document, 10)
|
||||
for i := range batchDocs {
|
||||
batchDocs[i] = Document{
|
||||
ID: fmt.Sprintf("batch-%d", i),
|
||||
Content: fmt.Sprintf("Batch document number %d for write lock test with unique content", i),
|
||||
}
|
||||
}
|
||||
|
||||
// Launch AddDocuments in background — embedding will take time
|
||||
addDone := make(chan error, 1)
|
||||
go func() {
|
||||
addDone <- client.AddDocuments(context.Background(), batchDocs)
|
||||
}()
|
||||
|
||||
// Give AddDocuments a moment to start embedding computation
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
// Invalidate result cache so Query must go through to DB (tests read lock)
|
||||
client.InvalidateResultCache()
|
||||
|
||||
// A concurrent Query should NOT be blocked by AddDocuments.
|
||||
// If the old code held writeMu during embedding, this would block until
|
||||
// embedding finishes. With the fix, it should complete quickly.
|
||||
queryCtx, queryCancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer queryCancel()
|
||||
|
||||
start := time.Now()
|
||||
_, err = client.Query(queryCtx, "concurrency test", 5, nil)
|
||||
queryDuration := time.Since(start)
|
||||
|
||||
require.NoError(t, err, "Query should succeed while AddDocuments is embedding")
|
||||
// The query should complete well within the timeout if writeMu is not held
|
||||
assert.Less(t, queryDuration, 1*time.Second,
|
||||
"Query should complete quickly when writeMu is not held during embedding")
|
||||
|
||||
// Wait for AddDocuments to finish
|
||||
err = <-addDone
|
||||
require.NoError(t, err, "AddDocuments should succeed")
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// REGRESSION TESTS FOR Fix #2a: DoChan + context select in getOrComputeEmbedding
|
||||
// =============================================================================
|
||||
|
||||
// TestGetOrComputeEmbedding_ContextCancelDuringSingleflight verifies that when
|
||||
// a singleflight embedding computation is in progress, a second caller with a
|
||||
// short-lived context returns context.DeadlineExceeded promptly rather than
|
||||
// waiting for the slow first call to finish.
|
||||
func TestGetOrComputeEmbedding_ContextCancelDuringSingleflight(t *testing.T) {
|
||||
db, dbCleanup := testDB(t)
|
||||
defer dbCleanup()
|
||||
|
||||
embedSvc, embedCleanup := testEmbeddingService(t)
|
||||
defer embedCleanup()
|
||||
|
||||
client, err := NewClient(Config{DB: db}, embedSvc)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Add a document so we can query
|
||||
docs := []Document{
|
||||
{ID: "sf-test-1", Content: "Singleflight context cancellation test document"},
|
||||
}
|
||||
err = client.AddDocuments(context.Background(), docs)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Clear cache to force embedding computation
|
||||
client.ClearCache()
|
||||
client.InvalidateResultCache()
|
||||
|
||||
queryText := "unique singleflight context test query"
|
||||
|
||||
// First call: start a normal query in background (will trigger singleflight)
|
||||
firstDone := make(chan struct{})
|
||||
go func() {
|
||||
defer close(firstDone)
|
||||
_, _ = client.Query(context.Background(), queryText, 5, nil)
|
||||
}()
|
||||
|
||||
// Give the first call a moment to start the singleflight computation
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
|
||||
// Second call: use a very short context that should expire quickly
|
||||
shortCtx, shortCancel := context.WithTimeout(context.Background(), 20*time.Millisecond)
|
||||
defer shortCancel()
|
||||
|
||||
// Clear result cache again so the second call hits getOrComputeEmbedding
|
||||
client.InvalidateResultCache()
|
||||
|
||||
start := time.Now()
|
||||
_, err = client.Query(shortCtx, queryText, 5, nil)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// With DoChan + select, the second caller should return quickly with context error.
|
||||
// Note: If the embedding completes fast enough, the second call may succeed
|
||||
// via singleflight sharing. That's also valid — the test primarily checks
|
||||
// that it doesn't BLOCK for the full embedding duration on context cancel.
|
||||
if err != nil {
|
||||
assert.ErrorIs(t, err, context.DeadlineExceeded,
|
||||
"Should return DeadlineExceeded when context expires during singleflight")
|
||||
assert.Less(t, elapsed, 500*time.Millisecond,
|
||||
"Should return promptly on context cancellation, not wait for slow computation")
|
||||
}
|
||||
// If err == nil, the singleflight completed before the context expired — also valid.
|
||||
|
||||
// Wait for first call to finish
|
||||
<-firstDone
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user