mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-22 03:41:52 +00:00
fix: address 15 additional hang vectors found during deep audit (#45)
MCP server (5 fixes):
- Move semaphore acquisition inside goroutine so main loop stays
responsive when all slots are taken
- Add 10s write timeout to sendResponse to prevent pipe deadlock
when Claude Code pauses reading stdout
- Send fallback JSON-RPC error when json.Marshal fails instead of
silently swallowing the error and leaving caller waiting forever
- Silence unknown notification methods (req.ID == nil) instead of
sending unsolicited error responses that may desync the host
- Return MCP isError content for tool failures instead of top-level
JSON-RPC error, matching the MCP specification
Vector/embedding (3 fixes):
- Move EmbedBatchWithContext call before writeMu.Lock in AddDocuments
so ONNX inference runs outside the write lock
- Replace singleflight.Do with DoChan + ctx select in both
getOrComputeEmbedding and UnifiedSearch so callers can bail out
independently when their context expires
- Add activeQueries atomic counter; skip cache warming when user
queries are in-flight; reduce warming timeout from 5s to 2s
Hooks (4 fixes):
- Cap EnsureWorkerRunning to 15s hard deadline with context; reduce
StartupTimeout from 30s to 10s; reduce port-in-use retries from
3 x 500ms to 2 x 300ms
- Fix nil dereference panic in user-prompt hook when initResult is
nil (non-JSON worker response); use comma-ok assertions
- Use package-level hookClient/healthClient with DisableKeepAlives
to prevent FD leaks in short-lived hook processes
- Set SysProcAttr{Setpgid: true} to detach worker from hook process
group, preventing kill-cascade from Claude Code
Worker/DB (3 fixes):
- Replace os.Exit(0) in MCP config watcher with context cancellation
for clean protocol shutdown
- Add 60s context.WithTimeout around ProcessObservation calls in
processAllSessions to prevent hung CLI subprocesses from blocking
the queue processor forever
- Set explicit PRAGMA wal_autocheckpoint=1000 and add PASSIVE WAL
checkpoint to Optimize() to prevent checkpoint stalls
Adds 20+ regression tests across all fix areas.
This commit is contained in:
+87
-27
@@ -29,7 +29,11 @@ const (
|
||||
HealthCheckTimeout = 2 * time.Second
|
||||
|
||||
// StartupTimeout is the timeout for worker startup.
|
||||
StartupTimeout = 30 * time.Second
|
||||
StartupTimeout = 10 * time.Second
|
||||
|
||||
// EnsureWorkerDeadline is the hard overall deadline for EnsureWorkerRunning.
|
||||
// Must fit within Claude Code's hook timeout budget.
|
||||
EnsureWorkerDeadline = 15 * time.Second
|
||||
|
||||
// workerCacheMaxAge is how long the worker cache is considered fresh.
|
||||
workerCacheMaxAge = 10 * time.Second
|
||||
@@ -48,6 +52,26 @@ var (
|
||||
// circuitBreakerMu protects lastStartupFailure.
|
||||
circuitBreakerMu sync.Mutex
|
||||
lastStartupFailure time.Time
|
||||
|
||||
// hookClient is a shared HTTP client for hook->worker requests.
|
||||
// DisableKeepAlives prevents TIME_WAIT connection leaks since each hook
|
||||
// is a separate OS process that exits quickly.
|
||||
hookClient = &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
Transport: &http.Transport{
|
||||
DisableKeepAlives: true,
|
||||
MaxIdleConns: 1,
|
||||
},
|
||||
}
|
||||
|
||||
// healthClient is a shared HTTP client for health/version checks.
|
||||
healthClient = &http.Client{
|
||||
Timeout: HealthCheckTimeout,
|
||||
Transport: &http.Transport{
|
||||
DisableKeepAlives: true,
|
||||
MaxIdleConns: 1,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
// IsWorkerAvailable performs a fast check without network calls.
|
||||
@@ -86,8 +110,7 @@ func GetWorkerPort() int {
|
||||
// Parses the JSON health response to check the "ready" field when available.
|
||||
// Falls back to HTTP status code check for backwards compatibility.
|
||||
func IsWorkerRunning(port int) bool {
|
||||
client := &http.Client{Timeout: HealthCheckTimeout}
|
||||
resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d/api/health", port))
|
||||
resp, err := healthClient.Get(fmt.Sprintf("http://127.0.0.1:%d/api/health", port))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
@@ -200,7 +223,25 @@ func isWorkerRunningWithRetries(port int) bool {
|
||||
// EnsureWorkerRunning ensures the worker is running, starting it if necessary.
|
||||
// If a worker is already running and healthy with matching version, it reuses it.
|
||||
// If version mismatch or unhealthy, it kills the old worker and starts fresh.
|
||||
// A hard deadline of EnsureWorkerDeadline prevents exceeding Claude Code's hook timeout.
|
||||
func EnsureWorkerRunning() (int, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), EnsureWorkerDeadline)
|
||||
defer cancel()
|
||||
|
||||
return ensureWorkerRunningCtx(ctx)
|
||||
}
|
||||
|
||||
// sleepCtx sleeps for d or returns early if ctx is cancelled.
|
||||
// sleepCtx blocks for d, or returns early if ctx ends first.
//
// It returns nil when the full duration elapsed and ctx.Err() when the
// context was cancelled or timed out before that. A context that is already
// done always wins, regardless of how short d is. A zero or negative d
// returns nil immediately when the context is still live.
func sleepCtx(ctx context.Context, d time.Duration) error {
	// Check cancellation up front: when both select cases are ready Go picks
	// one at random, which could let an already-cancelled caller continue.
	if err := ctx.Err(); err != nil {
		return err
	}
	if d <= 0 {
		return nil
	}

	// Use an explicit timer instead of time.After so it can be stopped on the
	// early-exit path rather than lingering until it fires.
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-timer.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
|
||||
|
||||
func ensureWorkerRunningCtx(ctx context.Context) (int, error) {
|
||||
port := GetWorkerPort()
|
||||
|
||||
// Fast path: check PID cache before making any HTTP calls.
|
||||
@@ -210,6 +251,10 @@ func EnsureWorkerRunning() (int, error) {
|
||||
}
|
||||
}
|
||||
|
||||
if ctx.Err() != nil {
|
||||
return 0, ctx.Err()
|
||||
}
|
||||
|
||||
// Circuit breaker: if we failed to start recently, don't retry immediately.
|
||||
circuitBreakerMu.Lock()
|
||||
if !lastStartupFailure.IsZero() && time.Since(lastStartupFailure) < circuitBreakerCooldown {
|
||||
@@ -232,7 +277,9 @@ func EnsureWorkerRunning() (int, error) {
|
||||
if err := KillProcessOnPort(port); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "[claude-mnemonic] Warning: failed to kill old worker: %v\n", err)
|
||||
}
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
if err := sleepCtx(ctx, 500*time.Millisecond); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
} else {
|
||||
// Version matches, reuse existing worker
|
||||
updateCacheFromPort(port)
|
||||
@@ -245,14 +292,20 @@ func EnsureWorkerRunning() (int, error) {
|
||||
}
|
||||
}
|
||||
|
||||
if ctx.Err() != nil {
|
||||
return 0, ctx.Err()
|
||||
}
|
||||
|
||||
// Port is in use but health check failed -- worker may be slow, not dead.
|
||||
if IsPortInUse(port) {
|
||||
// The port is responding to TCP but health check timed out.
|
||||
// Don't kill it -- it's likely just under load. Give it more time.
|
||||
fmt.Fprintf(os.Stderr, "[claude-mnemonic] Worker on port %d is slow to respond, waiting...\n", port)
|
||||
// Try a few more times with longer delays before giving up
|
||||
for i := 0; i < 3; i++ {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
// Try a couple more times with shorter delays before giving up
|
||||
for i := 0; i < 2; i++ {
|
||||
if err := sleepCtx(ctx, 300*time.Millisecond); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if IsWorkerRunning(port) {
|
||||
updateCacheFromPort(port)
|
||||
return port, nil
|
||||
@@ -263,7 +316,13 @@ func EnsureWorkerRunning() (int, error) {
|
||||
if err := KillProcessOnPort(port); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "[claude-mnemonic] Warning: failed to kill unhealthy process on port %d: %v\n", port, err)
|
||||
}
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
if err := sleepCtx(ctx, 500*time.Millisecond); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
if ctx.Err() != nil {
|
||||
return 0, ctx.Err()
|
||||
}
|
||||
|
||||
// Find worker binary
|
||||
@@ -272,8 +331,10 @@ func EnsureWorkerRunning() (int, error) {
|
||||
return 0, fmt.Errorf("worker binary not found")
|
||||
}
|
||||
|
||||
// Start worker
|
||||
// Start worker -- detach from hook's process group so Claude Code
|
||||
// killing the hook doesn't take the worker down with it.
|
||||
cmd := exec.Command(workerPath) // #nosec G204 -- workerPath is from internal findWorkerBinary
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Stdout = os.Stderr
|
||||
cmd.Stderr = os.Stderr
|
||||
if err := cmd.Start(); err != nil {
|
||||
@@ -286,27 +347,32 @@ func EnsureWorkerRunning() (int, error) {
|
||||
pid := cmd.Process.Pid
|
||||
|
||||
// Wait for worker to be ready with exponential backoff
|
||||
deadline := time.Now().Add(StartupTimeout)
|
||||
backoff := 50 * time.Millisecond
|
||||
maxBackoff := 500 * time.Millisecond
|
||||
|
||||
for time.Now().Before(deadline) {
|
||||
for {
|
||||
if ctx.Err() != nil {
|
||||
circuitBreakerMu.Lock()
|
||||
lastStartupFailure = time.Now()
|
||||
circuitBreakerMu.Unlock()
|
||||
return 0, fmt.Errorf("worker failed to start within deadline: %w", ctx.Err())
|
||||
}
|
||||
if IsWorkerRunning(port) {
|
||||
writeWorkerCache(port, pid)
|
||||
return port, nil
|
||||
}
|
||||
time.Sleep(backoff)
|
||||
if err := sleepCtx(ctx, backoff); err != nil {
|
||||
circuitBreakerMu.Lock()
|
||||
lastStartupFailure = time.Now()
|
||||
circuitBreakerMu.Unlock()
|
||||
return 0, fmt.Errorf("worker failed to start within deadline: %w", err)
|
||||
}
|
||||
// Exponential backoff with cap
|
||||
backoff = backoff * 2
|
||||
if backoff > maxBackoff {
|
||||
backoff = maxBackoff
|
||||
}
|
||||
}
|
||||
|
||||
circuitBreakerMu.Lock()
|
||||
lastStartupFailure = time.Now()
|
||||
circuitBreakerMu.Unlock()
|
||||
return 0, fmt.Errorf("worker failed to start within timeout")
|
||||
}
|
||||
|
||||
// updateCacheFromPort finds the PID of the process on the port and updates the cache.
|
||||
@@ -330,8 +396,7 @@ func updateCacheFromPort(port int) {
|
||||
|
||||
// GetWorkerVersion gets the version of the running worker.
|
||||
func GetWorkerVersion(port int) string {
|
||||
client := &http.Client{Timeout: HealthCheckTimeout}
|
||||
resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d/api/version", port))
|
||||
resp, err := healthClient.Get(fmt.Sprintf("http://127.0.0.1:%d/api/version", port))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
@@ -447,14 +512,12 @@ func findWorkerBinary() string {
|
||||
|
||||
// POST sends a POST request to the worker.
|
||||
func POST(port int, path string, body interface{}) (map[string]interface{}, error) {
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
|
||||
jsonBody, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp, err := client.Post(
|
||||
resp, err := hookClient.Post(
|
||||
fmt.Sprintf("http://127.0.0.1:%d%s", port, path),
|
||||
"application/json",
|
||||
bytes.NewReader(jsonBody),
|
||||
@@ -493,8 +556,7 @@ func POSTWithContext(ctx context.Context, port int, path string, body interface{
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
resp, err := hookClient.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -504,9 +566,7 @@ func POSTWithContext(ctx context.Context, port int, path string, body interface{
|
||||
|
||||
// GET sends a GET request to the worker.
|
||||
func GET(port int, path string) (map[string]interface{}, error) {
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
|
||||
resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d%s", port, path))
|
||||
resp, err := hookClient.Get(fmt.Sprintf("http://127.0.0.1:%d%s", port, path))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
+103
-2
@@ -2,6 +2,7 @@
|
||||
package hooks
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
@@ -518,7 +519,8 @@ func TestProjectIDWithName_Uniqueness(t *testing.T) {
|
||||
func TestHookConstants(t *testing.T) {
|
||||
assert.Equal(t, 37777, DefaultWorkerPort)
|
||||
assert.Equal(t, 2*time.Second, HealthCheckTimeout)
|
||||
assert.Equal(t, 30*time.Second, StartupTimeout)
|
||||
assert.Equal(t, 10*time.Second, StartupTimeout)
|
||||
assert.Equal(t, 15*time.Second, EnsureWorkerDeadline)
|
||||
}
|
||||
|
||||
// TestExitCodes tests exit code constants.
|
||||
@@ -1200,5 +1202,104 @@ func TestHealthCheckTimeout(t *testing.T) {
|
||||
// TestStartupTimeout tests the startup timeout is reasonable.
|
||||
func TestStartupTimeout(t *testing.T) {
|
||||
assert.Greater(t, StartupTimeout, 5*time.Second)
|
||||
assert.LessOrEqual(t, StartupTimeout, time.Minute)
|
||||
assert.LessOrEqual(t, StartupTimeout, 15*time.Second)
|
||||
}
|
||||
|
||||
// TestEnsureWorkerDeadline tests the deadline is within hook budget.
|
||||
func TestEnsureWorkerDeadline(t *testing.T) {
|
||||
assert.Greater(t, EnsureWorkerDeadline, StartupTimeout, "deadline must exceed startup timeout")
|
||||
assert.LessOrEqual(t, EnsureWorkerDeadline, 20*time.Second, "deadline must fit in hook timeout budget")
|
||||
}
|
||||
|
||||
// --- Regression tests for Fixes 1, 3, 4 ---
|
||||
|
||||
// TestEnsureWorkerRunning_RespectsDeadline verifies that EnsureWorkerRunning returns
|
||||
// within a bounded time even in worst case (no worker, startup fails).
|
||||
func TestEnsureWorkerRunning_RespectsDeadline(t *testing.T) {
|
||||
// Reset circuit breaker so the function actually tries to start.
|
||||
circuitBreakerMu.Lock()
|
||||
lastStartupFailure = time.Time{}
|
||||
circuitBreakerMu.Unlock()
|
||||
|
||||
// Use a port that nothing listens on.
|
||||
t.Setenv("CLAUDE_MNEMONIC_WORKER_PORT", "19999")
|
||||
// Point HOME to a temp dir so findWorkerBinary finds nothing.
|
||||
t.Setenv("HOME", t.TempDir())
|
||||
// Clear plugin root to avoid that path too.
|
||||
t.Setenv("CLAUDE_PLUGIN_ROOT", "")
|
||||
|
||||
start := time.Now()
|
||||
_, err := EnsureWorkerRunning()
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// Must error (no binary found).
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "worker binary not found")
|
||||
// Must complete well within EnsureWorkerDeadline.
|
||||
assert.Less(t, elapsed, EnsureWorkerDeadline,
|
||||
"EnsureWorkerRunning took %v, exceeding deadline %v", elapsed, EnsureWorkerDeadline)
|
||||
}
|
||||
|
||||
// TestEnsureWorkerRunningCtx_CancelledContext verifies immediate return on cancelled context.
|
||||
func TestEnsureWorkerRunningCtx_CancelledContext(t *testing.T) {
|
||||
// Reset circuit breaker.
|
||||
circuitBreakerMu.Lock()
|
||||
lastStartupFailure = time.Time{}
|
||||
circuitBreakerMu.Unlock()
|
||||
|
||||
t.Setenv("CLAUDE_MNEMONIC_WORKER_PORT", "19998")
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel() // cancel immediately
|
||||
|
||||
start := time.Now()
|
||||
_, err := ensureWorkerRunningCtx(ctx)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
assert.Error(t, err)
|
||||
assert.Less(t, elapsed, 1*time.Second, "cancelled context should return immediately")
|
||||
}
|
||||
|
||||
// TestSleepCtx_Normal verifies sleepCtx completes normally.
|
||||
func TestSleepCtx_Normal(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
start := time.Now()
|
||||
err := sleepCtx(ctx, 50*time.Millisecond)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
assert.NoError(t, err)
|
||||
assert.GreaterOrEqual(t, elapsed, 50*time.Millisecond)
|
||||
}
|
||||
|
||||
// TestSleepCtx_Cancelled verifies sleepCtx returns early on cancel.
|
||||
func TestSleepCtx_Cancelled(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
start := time.Now()
|
||||
err := sleepCtx(ctx, 5*time.Second)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
assert.Error(t, err)
|
||||
assert.Less(t, elapsed, 500*time.Millisecond)
|
||||
}
|
||||
|
||||
// TestHookClients_DisableKeepAlives asserts shared clients disable keep-alives
|
||||
// to prevent TIME_WAIT connection leaks in short-lived hook processes.
|
||||
func TestHookClients_DisableKeepAlives(t *testing.T) {
|
||||
hTransport, ok := hookClient.Transport.(*http.Transport)
|
||||
require.True(t, ok, "hookClient.Transport should be *http.Transport")
|
||||
assert.True(t, hTransport.DisableKeepAlives, "hookClient must disable keep-alives")
|
||||
assert.Equal(t, 1, hTransport.MaxIdleConns)
|
||||
|
||||
hcTransport, ok := healthClient.Transport.(*http.Transport)
|
||||
require.True(t, ok, "healthClient.Transport should be *http.Transport")
|
||||
assert.True(t, hcTransport.DisableKeepAlives, "healthClient must disable keep-alives")
|
||||
assert.Equal(t, 1, hcTransport.MaxIdleConns)
|
||||
}
|
||||
|
||||
// TestHookClient_Timeout verifies hookClient timeout is set.
|
||||
func TestHookClient_Timeout(t *testing.T) {
|
||||
assert.Equal(t, 10*time.Second, hookClient.Timeout)
|
||||
assert.Equal(t, HealthCheckTimeout, healthClient.Timeout)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user