fix: address 15 additional hang vectors found during deep audit (#45)

MCP server (5 fixes):
- Move semaphore acquisition inside goroutine so main loop stays
  responsive when all slots are taken
- Add 10s write timeout to sendResponse to prevent pipe deadlock
  when Claude Code pauses reading stdout
- Send fallback JSON-RPC error when json.Marshal fails instead of
  silently swallowing the error and leaving caller waiting forever
- Silence unknown notification methods (req.ID == nil) instead of
  sending unsolicited error responses that may desync the host
- Return MCP isError content for tool failures instead of a top-level
  JSON-RPC error, matching the MCP specification

Vector/embedding (3 fixes):
- Move EmbedBatchWithContext call before writeMu.Lock in AddDocuments
  so ONNX inference runs outside the write lock
- Replace singleflight.Do with DoChan + ctx select in both
  getOrComputeEmbedding and UnifiedSearch so callers can bail out
  independently when their context expires
- Add activeQueries atomic counter; skip cache warming when user
  queries are in flight; reduce warming timeout from 5s to 2s

Hooks (4 fixes):
- Cap EnsureWorkerRunning at a 15s hard deadline with context; reduce
  StartupTimeout from 30s to 10s; reduce port-in-use retries
- Fix nil dereference panic in user-prompt hook when initResult is
  nil (non-JSON worker response); use comma-ok assertions
- Use package-level hookClient/healthClient with DisableKeepAlives
  to prevent FD leaks in short-lived hook processes
- Set SysProcAttr{Setpgid: true} to detach worker from hook process
  group, preventing kill-cascade from Claude Code

Worker/DB (3 fixes):
- Replace os.Exit(0) in MCP config watcher with context cancellation
  for clean protocol shutdown
- Add 60s context.WithTimeout around ProcessObservation calls in
  processAllSessions to prevent hung CLI subprocesses from blocking
  the queue processor forever
- Set explicit PRAGMA wal_autocheckpoint=1000 and add PASSIVE WAL
  checkpoint to Optimize() to prevent checkpoint stalls

Adds 20+ regression tests across all fix areas.
This commit is contained in:
2026-05-26 13:52:09 +01:00
parent de5796bbe6
commit a81482d06a
15 changed files with 952 additions and 92 deletions
+25 -12
View File
@@ -145,20 +145,24 @@ func (c *Client) AddDocuments(ctx context.Context, docs []Document) error {
return nil
}
c.writeMu.Lock()
defer c.writeMu.Unlock()
// Generate embeddings for all documents
// Prepare texts for embedding
texts := make([]string, len(docs))
for i, doc := range docs {
texts[i] = doc.Content
}
embeddings, err := c.embedSvc.EmbedBatch(texts)
// Compute embeddings OUTSIDE the write lock for better concurrency.
// Embedding is ONNX inference (slow, mutex-protected internally) — holding
// writeMu during inference blocks all concurrent writes and reads.
embeddings, err := c.embedSvc.EmbedBatchWithContext(ctx, texts)
if err != nil {
return fmt.Errorf("generate embeddings: %w", err)
}
// Acquire write lock for DB operations only
c.writeMu.Lock()
defer c.writeMu.Unlock()
// Insert into vectors table with model version tracking
const insertQuery = `
INSERT OR REPLACE INTO vectors (doc_id, embedding, sqlite_id, doc_type, field_type, project, scope, model_version)
@@ -906,8 +910,10 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
}
c.queryCacheMu.RUnlock()
// Cache miss - use singleflight to deduplicate concurrent embedding requests
result, err, _ := c.embeddingGroup.Do(query, func() (any, error) {
// Cache miss - use singleflight DoChan to deduplicate concurrent embedding requests.
// DoChan + select allows per-caller context cancellation: if a caller's context
// expires it can bail out without waiting for the shared computation to finish.
ch := c.embeddingGroup.DoChan(query, func() (any, error) {
// Double-check cache inside singleflight (another goroutine may have just cached it)
c.queryCacheMu.RLock()
if entry, ok := c.queryCache[query]; ok {
@@ -921,8 +927,10 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
// Record cache miss
c.stats.embeddingMisses.Add(1)
// Compute embedding with context-aware lock acquisition
emb, err := c.embedSvc.EmbedWithContext(ctx, query)
// Compute embedding — use non-context Embed here because the singleflight
// result is shared across callers with different contexts. Per-caller
// cancellation is handled by the select below.
emb, err := c.embedSvc.Embed(query)
if err != nil {
return nil, err
}
@@ -969,10 +977,15 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
return emb, nil
})
if err != nil {
return nil, err
select {
case res := <-ch:
if res.Err != nil {
return nil, res.Err
}
return res.Val.([]float32), nil
case <-ctx.Done():
return nil, ctx.Err()
}
return result.([]float32), nil
}
// ClearCache clears the embedding cache.