fix: address 15 additional hang vectors found during deep audit (#45)

MCP server (5 fixes):
- Move semaphore acquisition inside goroutine so main loop stays
  responsive when all slots are taken
- Add 10s write timeout to sendResponse to prevent pipe deadlock
  when Claude Code pauses reading stdout
- Send fallback JSON-RPC error when json.Marshal fails instead of
  silently swallowing the error and leaving caller waiting forever
- Silence unknown notification methods (req.ID == nil) instead of
  sending unsolicited error responses that may desync the host
- Return MCP isError content for tool failures instead of top-level
  JSON-RPC error, matching the MCP specification

Vector/embedding (3 fixes):
- Move EmbedBatchWithContext call before writeMu.Lock in AddDocuments
  so ONNX inference runs outside the write lock
- Replace singleflight.Do with DoChan + ctx select in both
  getOrComputeEmbedding and UnifiedSearch so callers can bail out
  independently when their context expires
- Add activeQueries atomic counter; skip cache warming when user
  queries are in-flight; reduce warming timeout from 5s to 2s

Hooks (4 fixes):
- Cap EnsureWorkerRunning to 15s hard deadline with context; reduce
  StartupTimeout from 30s to 10s; reduce port-in-use retries
- Fix nil dereference panic in user-prompt hook when initResult is
  nil (non-JSON worker response); use comma-ok assertions
- Use package-level hookClient/healthClient with DisableKeepAlives
  to prevent FD leaks in short-lived hook processes
- Set SysProcAttr{Setpgid: true} to detach worker from hook process
  group, preventing kill-cascade from Claude Code

Worker/DB (3 fixes):
- Replace os.Exit(0) in MCP config watcher with context cancellation
  for clean protocol shutdown
- Add 60s context.WithTimeout around ProcessObservation calls in
  processAllSessions to prevent hung CLI subprocesses from blocking
  the queue processor forever
- Set explicit PRAGMA wal_autocheckpoint=1000 and add PASSIVE WAL
  checkpoint to Optimize() to prevent checkpoint stalls

Adds 20+ regression tests across all fix areas.
This commit is contained in:
2026-05-26 13:52:09 +01:00
parent de5796bbe6
commit a81482d06a
15 changed files with 952 additions and 92 deletions
+87 -27
View File
@@ -29,7 +29,11 @@ const (
HealthCheckTimeout = 2 * time.Second
// StartupTimeout is the timeout for worker startup.
StartupTimeout = 30 * time.Second
StartupTimeout = 10 * time.Second
// EnsureWorkerDeadline is the hard overall deadline for EnsureWorkerRunning.
// Must fit within Claude Code's hook timeout budget.
EnsureWorkerDeadline = 15 * time.Second
// workerCacheMaxAge is how long the worker cache is considered fresh.
workerCacheMaxAge = 10 * time.Second
@@ -48,6 +52,26 @@ var (
// circuitBreakerMu protects lastStartupFailure.
circuitBreakerMu sync.Mutex
lastStartupFailure time.Time
// hookClient is a shared HTTP client for hook->worker requests.
// DisableKeepAlives prevents TIME_WAIT connection leaks since each hook
// is a separate OS process that exits quickly.
hookClient = &http.Client{
Timeout: 10 * time.Second,
Transport: &http.Transport{
DisableKeepAlives: true,
MaxIdleConns: 1,
},
}
// healthClient is a shared HTTP client for health/version checks.
healthClient = &http.Client{
Timeout: HealthCheckTimeout,
Transport: &http.Transport{
DisableKeepAlives: true,
MaxIdleConns: 1,
},
}
)
// IsWorkerAvailable performs a fast check without network calls.
@@ -86,8 +110,7 @@ func GetWorkerPort() int {
// Parses the JSON health response to check the "ready" field when available.
// Falls back to HTTP status code check for backwards compatibility.
func IsWorkerRunning(port int) bool {
client := &http.Client{Timeout: HealthCheckTimeout}
resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d/api/health", port))
resp, err := healthClient.Get(fmt.Sprintf("http://127.0.0.1:%d/api/health", port))
if err != nil {
return false
}
@@ -200,7 +223,25 @@ func isWorkerRunningWithRetries(port int) bool {
// EnsureWorkerRunning makes sure a worker is running and returns the port and
// error produced by ensureWorkerRunningCtx.
//
// The whole operation runs under a context that expires after
// EnsureWorkerDeadline, so the call cannot outlast that hard limit. The
// context is always released before returning.
func EnsureWorkerRunning() (int, error) {
	deadlineCtx, release := context.WithTimeout(context.Background(), EnsureWorkerDeadline)
	defer release()

	port, err := ensureWorkerRunningCtx(deadlineCtx)
	return port, err
}
// sleepCtx blocks for d, or returns early if ctx is cancelled or its deadline
// passes first.
//
// It returns nil when the full duration elapsed and ctx.Err() otherwise. A
// context that is already done yields its error immediately, even when d is
// zero or negative, so the result is deterministic in that case.
func sleepCtx(ctx context.Context, d time.Duration) error {
	// Check up front so a finished context never races against a timer that
	// is already due.
	if err := ctx.Err(); err != nil {
		return err
	}

	// Use a stoppable timer instead of time.After so that an early
	// cancellation releases it right away rather than leaving it pending
	// until d expires.
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-timer.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
func ensureWorkerRunningCtx(ctx context.Context) (int, error) {
port := GetWorkerPort()
// Fast path: check PID cache before making any HTTP calls.
@@ -210,6 +251,10 @@ func EnsureWorkerRunning() (int, error) {
}
}
if ctx.Err() != nil {
return 0, ctx.Err()
}
// Circuit breaker: if we failed to start recently, don't retry immediately.
circuitBreakerMu.Lock()
if !lastStartupFailure.IsZero() && time.Since(lastStartupFailure) < circuitBreakerCooldown {
@@ -232,7 +277,9 @@ func EnsureWorkerRunning() (int, error) {
if err := KillProcessOnPort(port); err != nil {
fmt.Fprintf(os.Stderr, "[claude-mnemonic] Warning: failed to kill old worker: %v\n", err)
}
time.Sleep(500 * time.Millisecond)
if err := sleepCtx(ctx, 500*time.Millisecond); err != nil {
return 0, err
}
} else {
// Version matches, reuse existing worker
updateCacheFromPort(port)
@@ -245,14 +292,20 @@ func EnsureWorkerRunning() (int, error) {
}
}
if ctx.Err() != nil {
return 0, ctx.Err()
}
// Port is in use but health check failed -- worker may be slow, not dead.
if IsPortInUse(port) {
// The port is responding to TCP but health check timed out.
// Don't kill it -- it's likely just under load. Give it more time.
fmt.Fprintf(os.Stderr, "[claude-mnemonic] Worker on port %d is slow to respond, waiting...\n", port)
// Try a few more times with longer delays before giving up
for i := 0; i < 3; i++ {
time.Sleep(500 * time.Millisecond)
// Try a couple more times with shorter delays before giving up
for i := 0; i < 2; i++ {
if err := sleepCtx(ctx, 300*time.Millisecond); err != nil {
return 0, err
}
if IsWorkerRunning(port) {
updateCacheFromPort(port)
return port, nil
@@ -263,7 +316,13 @@ func EnsureWorkerRunning() (int, error) {
if err := KillProcessOnPort(port); err != nil {
fmt.Fprintf(os.Stderr, "[claude-mnemonic] Warning: failed to kill unhealthy process on port %d: %v\n", port, err)
}
time.Sleep(500 * time.Millisecond)
if err := sleepCtx(ctx, 500*time.Millisecond); err != nil {
return 0, err
}
}
if ctx.Err() != nil {
return 0, ctx.Err()
}
// Find worker binary
@@ -272,8 +331,10 @@ func EnsureWorkerRunning() (int, error) {
return 0, fmt.Errorf("worker binary not found")
}
// Start worker
// Start worker -- detach from hook's process group so Claude Code
// killing the hook doesn't take the worker down with it.
cmd := exec.Command(workerPath) // #nosec G204 -- workerPath is from internal findWorkerBinary
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Stdout = os.Stderr
cmd.Stderr = os.Stderr
if err := cmd.Start(); err != nil {
@@ -286,27 +347,32 @@ func EnsureWorkerRunning() (int, error) {
pid := cmd.Process.Pid
// Wait for worker to be ready with exponential backoff
deadline := time.Now().Add(StartupTimeout)
backoff := 50 * time.Millisecond
maxBackoff := 500 * time.Millisecond
for time.Now().Before(deadline) {
for {
if ctx.Err() != nil {
circuitBreakerMu.Lock()
lastStartupFailure = time.Now()
circuitBreakerMu.Unlock()
return 0, fmt.Errorf("worker failed to start within deadline: %w", ctx.Err())
}
if IsWorkerRunning(port) {
writeWorkerCache(port, pid)
return port, nil
}
time.Sleep(backoff)
if err := sleepCtx(ctx, backoff); err != nil {
circuitBreakerMu.Lock()
lastStartupFailure = time.Now()
circuitBreakerMu.Unlock()
return 0, fmt.Errorf("worker failed to start within deadline: %w", err)
}
// Exponential backoff with cap
backoff = backoff * 2
if backoff > maxBackoff {
backoff = maxBackoff
}
}
circuitBreakerMu.Lock()
lastStartupFailure = time.Now()
circuitBreakerMu.Unlock()
return 0, fmt.Errorf("worker failed to start within timeout")
}
// updateCacheFromPort finds the PID of the process on the port and updates the cache.
@@ -330,8 +396,7 @@ func updateCacheFromPort(port int) {
// GetWorkerVersion gets the version of the running worker.
func GetWorkerVersion(port int) string {
client := &http.Client{Timeout: HealthCheckTimeout}
resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d/api/version", port))
resp, err := healthClient.Get(fmt.Sprintf("http://127.0.0.1:%d/api/version", port))
if err != nil {
return ""
}
@@ -447,14 +512,12 @@ func findWorkerBinary() string {
// POST sends a POST request to the worker.
func POST(port int, path string, body interface{}) (map[string]interface{}, error) {
client := &http.Client{Timeout: 10 * time.Second}
jsonBody, err := json.Marshal(body)
if err != nil {
return nil, err
}
resp, err := client.Post(
resp, err := hookClient.Post(
fmt.Sprintf("http://127.0.0.1:%d%s", port, path),
"application/json",
bytes.NewReader(jsonBody),
@@ -493,8 +556,7 @@ func POSTWithContext(ctx context.Context, port int, path string, body interface{
}
req.Header.Set("Content-Type", "application/json")
client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Do(req)
resp, err := hookClient.Do(req)
if err != nil {
return err
}
@@ -504,9 +566,7 @@ func POSTWithContext(ctx context.Context, port int, path string, body interface{
// GET sends a GET request to the worker.
func GET(port int, path string) (map[string]interface{}, error) {
client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d%s", port, path))
resp, err := hookClient.Get(fmt.Sprintf("http://127.0.0.1:%d%s", port, path))
if err != nil {
return nil, err
}
+103 -2
View File
@@ -2,6 +2,7 @@
package hooks
import (
"context"
"encoding/json"
"fmt"
"net/http"
@@ -518,7 +519,8 @@ func TestProjectIDWithName_Uniqueness(t *testing.T) {
func TestHookConstants(t *testing.T) {
assert.Equal(t, 37777, DefaultWorkerPort)
assert.Equal(t, 2*time.Second, HealthCheckTimeout)
assert.Equal(t, 30*time.Second, StartupTimeout)
assert.Equal(t, 10*time.Second, StartupTimeout)
assert.Equal(t, 15*time.Second, EnsureWorkerDeadline)
}
// TestExitCodes tests exit code constants.
@@ -1200,5 +1202,104 @@ func TestHealthCheckTimeout(t *testing.T) {
// TestStartupTimeout tests the startup timeout is reasonable.
func TestStartupTimeout(t *testing.T) {
assert.Greater(t, StartupTimeout, 5*time.Second)
assert.LessOrEqual(t, StartupTimeout, time.Minute)
assert.LessOrEqual(t, StartupTimeout, 15*time.Second)
}
// TestEnsureWorkerDeadline checks that EnsureWorkerDeadline is strictly larger
// than StartupTimeout and does not exceed 20 seconds.
func TestEnsureWorkerDeadline(t *testing.T) {
	const upperBound = 20 * time.Second

	assert.Greater(t, EnsureWorkerDeadline, StartupTimeout, "deadline must exceed startup timeout")
	assert.LessOrEqual(t, EnsureWorkerDeadline, upperBound, "deadline must fit in hook timeout budget")
}
// --- Regression tests for Fixes 1, 3, 4 ---
// TestEnsureWorkerRunning_RespectsDeadline verifies that EnsureWorkerRunning
// fails with "worker binary not found" and returns in less than
// EnsureWorkerDeadline when the environment is arranged so that no worker is
// listening and no worker binary should be discoverable.
func TestEnsureWorkerRunning_RespectsDeadline(t *testing.T) {
	// Reset circuit breaker so the function actually tries to start.
	circuitBreakerMu.Lock()
	lastStartupFailure = time.Time{}
	circuitBreakerMu.Unlock()

	// Use a port that nothing listens on.
	t.Setenv("CLAUDE_MNEMONIC_WORKER_PORT", "19999")
	// Point HOME to a temp dir so findWorkerBinary finds nothing.
	t.Setenv("HOME", t.TempDir())
	// Clear plugin root to avoid that path too.
	t.Setenv("CLAUDE_PLUGIN_ROOT", "")

	start := time.Now()
	_, err := EnsureWorkerRunning()
	elapsed := time.Since(start)

	// Must error (no binary found). require stops the test here on a nil
	// error; with assert the err.Error() call below would panic instead of
	// reporting a clean failure.
	require.Error(t, err)
	assert.Contains(t, err.Error(), "worker binary not found")

	// Must complete within EnsureWorkerDeadline.
	assert.Less(t, elapsed, EnsureWorkerDeadline,
		"EnsureWorkerRunning took %v, exceeding deadline %v", elapsed, EnsureWorkerDeadline)
}
// TestEnsureWorkerRunningCtx_CancelledContext checks that ensureWorkerRunningCtx
// returns an error in under one second when handed a context that has already
// been cancelled.
func TestEnsureWorkerRunningCtx_CancelledContext(t *testing.T) {
	// Clear any recorded startup failure so the circuit breaker is not the
	// reason for the error.
	circuitBreakerMu.Lock()
	lastStartupFailure = time.Time{}
	circuitBreakerMu.Unlock()

	t.Setenv("CLAUDE_MNEMONIC_WORKER_PORT", "19998")

	// Build a context that is done before the call is made.
	deadCtx, stop := context.WithCancel(context.Background())
	stop()

	began := time.Now()
	_, callErr := ensureWorkerRunningCtx(deadCtx)
	took := time.Since(began)

	assert.Error(t, callErr)
	assert.Less(t, took, 1*time.Second, "cancelled context should return immediately")
}
// TestSleepCtx_Normal checks that sleepCtx with a background context returns
// nil and takes at least the requested 50ms.
func TestSleepCtx_Normal(t *testing.T) {
	const nap = 50 * time.Millisecond

	began := time.Now()
	sleepErr := sleepCtx(context.Background(), nap)
	took := time.Since(began)

	assert.NoError(t, sleepErr)
	assert.GreaterOrEqual(t, took, nap)
}
// TestSleepCtx_Cancelled checks that sleepCtx given an already-cancelled
// context reports an error and returns in under 500ms instead of waiting out
// the requested 5 seconds.
func TestSleepCtx_Cancelled(t *testing.T) {
	deadCtx, stop := context.WithCancel(context.Background())
	stop()

	began := time.Now()
	sleepErr := sleepCtx(deadCtx, 5*time.Second)
	took := time.Since(began)

	assert.Error(t, sleepErr)
	assert.Less(t, took, 500*time.Millisecond)
}
// TestHookClients_DisableKeepAlives asserts shared clients disable keep-alives
// to prevent TIME_WAIT connection leaks in short-lived hook processes.
func TestHookClients_DisableKeepAlives(t *testing.T) {
hTransport, ok := hookClient.Transport.(*http.Transport)
require.True(t, ok, "hookClient.Transport should be *http.Transport")
assert.True(t, hTransport.DisableKeepAlives, "hookClient must disable keep-alives")
assert.Equal(t, 1, hTransport.MaxIdleConns)
hcTransport, ok := healthClient.Transport.(*http.Transport)
require.True(t, ok, "healthClient.Transport should be *http.Transport")
assert.True(t, hcTransport.DisableKeepAlives, "healthClient must disable keep-alives")
assert.Equal(t, 1, hcTransport.MaxIdleConns)
}
// TestHookClient_Timeout checks the configured timeouts of the shared clients:
// hookClient must use 10 seconds and healthClient must use HealthCheckTimeout.
func TestHookClient_Timeout(t *testing.T) {
	const wantHookTimeout = 10 * time.Second

	assert.Equal(t, wantHookTimeout, hookClient.Timeout)
	assert.Equal(t, HealthCheckTimeout, healthClient.Timeout)
}