mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-05 23:03:55 +00:00
fix: address 15 additional hang vectors found during deep audit (#45)
MCP server (5 fixes):
- Move semaphore acquisition inside goroutine so main loop stays
responsive when all slots are taken
- Add 10s write timeout to sendResponse to prevent pipe deadlock
when Claude Code pauses reading stdout
- Send fallback JSON-RPC error when json.Marshal fails instead of
silently swallowing the error and leaving caller waiting forever
- Silently ignore unknown notification methods (req.ID == nil) instead of
sending unsolicited error responses that may desync the host
- Return MCP isError content for tool failures instead of a top-level
JSON-RPC error, matching the MCP specification
Vector/embedding (3 fixes):
- Move EmbedBatchWithContext call before writeMu.Lock in AddDocuments
so ONNX inference runs outside the write lock
- Replace singleflight.Do with DoChan + ctx select in both
getOrComputeEmbedding and UnifiedSearch so callers can bail out
independently when their context expires
- Add activeQueries atomic counter; skip cache warming when user
queries are in-flight; reduce warming timeout from 5s to 2s
Hooks (4 fixes):
- Cap EnsureWorkerRunning at a 15s hard deadline via context; reduce
StartupTimeout from 30s to 10s; reduce port-in-use retries
- Fix nil dereference panic in user-prompt hook when initResult is
nil (non-JSON worker response); use comma-ok assertions
- Use package-level hookClient/healthClient with DisableKeepAlives
to prevent FD leaks in short-lived hook processes
- Set SysProcAttr{Setpgid: true} to detach worker from hook process
group, preventing kill-cascade from Claude Code
Worker/DB (3 fixes):
- Replace os.Exit(0) in MCP config watcher with context cancellation
for clean protocol shutdown
- Add 60s context.WithTimeout around ProcessObservation calls in
processAllSessions to prevent hung CLI subprocesses from blocking
the queue processor forever
- Set explicit PRAGMA wal_autocheckpoint=1000 and add PASSIVE WAL
checkpoint to Optimize() to prevent checkpoint stalls
Adds 20+ regression tests across all fix areas.
This commit is contained in:
@@ -145,20 +145,24 @@ func (c *Client) AddDocuments(ctx context.Context, docs []Document) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
c.writeMu.Lock()
|
||||
defer c.writeMu.Unlock()
|
||||
|
||||
// Generate embeddings for all documents
|
||||
// Prepare texts for embedding
|
||||
texts := make([]string, len(docs))
|
||||
for i, doc := range docs {
|
||||
texts[i] = doc.Content
|
||||
}
|
||||
|
||||
embeddings, err := c.embedSvc.EmbedBatch(texts)
|
||||
// Compute embeddings OUTSIDE the write lock for better concurrency.
|
||||
// Embedding is ONNX inference (slow, mutex-protected internally) — holding
|
||||
// writeMu during inference blocks all concurrent writes and reads.
|
||||
embeddings, err := c.embedSvc.EmbedBatchWithContext(ctx, texts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("generate embeddings: %w", err)
|
||||
}
|
||||
|
||||
// Acquire write lock for DB operations only
|
||||
c.writeMu.Lock()
|
||||
defer c.writeMu.Unlock()
|
||||
|
||||
// Insert into vectors table with model version tracking
|
||||
const insertQuery = `
|
||||
INSERT OR REPLACE INTO vectors (doc_id, embedding, sqlite_id, doc_type, field_type, project, scope, model_version)
|
||||
@@ -906,8 +910,10 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
|
||||
}
|
||||
c.queryCacheMu.RUnlock()
|
||||
|
||||
// Cache miss - use singleflight to deduplicate concurrent embedding requests
|
||||
result, err, _ := c.embeddingGroup.Do(query, func() (any, error) {
|
||||
// Cache miss - use singleflight DoChan to deduplicate concurrent embedding requests.
|
||||
// DoChan + select allows per-caller context cancellation: if a caller's context
|
||||
// expires it can bail out without waiting for the shared computation to finish.
|
||||
ch := c.embeddingGroup.DoChan(query, func() (any, error) {
|
||||
// Double-check cache inside singleflight (another goroutine may have just cached it)
|
||||
c.queryCacheMu.RLock()
|
||||
if entry, ok := c.queryCache[query]; ok {
|
||||
@@ -921,8 +927,10 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
|
||||
// Record cache miss
|
||||
c.stats.embeddingMisses.Add(1)
|
||||
|
||||
// Compute embedding with context-aware lock acquisition
|
||||
emb, err := c.embedSvc.EmbedWithContext(ctx, query)
|
||||
// Compute embedding — use non-context Embed here because the singleflight
|
||||
// result is shared across callers with different contexts. Per-caller
|
||||
// cancellation is handled by the select below.
|
||||
emb, err := c.embedSvc.Embed(query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -969,10 +977,15 @@ func (c *Client) getOrComputeEmbedding(ctx context.Context, query string) ([]flo
|
||||
return emb, nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
select {
|
||||
case res := <-ch:
|
||||
if res.Err != nil {
|
||||
return nil, res.Err
|
||||
}
|
||||
return res.Val.([]float32), nil
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
return result.([]float32), nil
|
||||
}
|
||||
|
||||
// ClearCache clears the embedding cache.
|
||||
|
||||
Reference in New Issue
Block a user