fix: address 15 additional hang vectors found during deep audit (#45)

MCP server (5 fixes):
- Move semaphore acquisition inside goroutine so main loop stays
  responsive when all slots are taken
- Add 10s write timeout to sendResponse to prevent pipe deadlock
  when Claude Code pauses reading stdout
- Send fallback JSON-RPC error when json.Marshal fails instead of
  silently swallowing the error and leaving caller waiting forever
- Silence unknown notification methods (req.ID == nil) instead of
  sending unsolicited error responses that may desync the host
- Return MCP isError content for tool failures instead of top-level
  JSON-RPC error, matching the MCP specification

Vector/embedding (3 fixes):
- Move EmbedBatchWithContext call before writeMu.Lock in AddDocuments
  so ONNX inference runs outside the write lock
- Replace singleflight.Do with DoChan + ctx select in both
  getOrComputeEmbedding and UnifiedSearch so callers can bail out
  independently when their context expires
- Add activeQueries atomic counter; skip cache warming when user
  queries are in-flight; reduce warming timeout from 5s to 2s

Hooks (4 fixes):
- Cap EnsureWorkerRunning to 15s hard deadline with context; reduce
  StartupTimeout from 30s to 10s; reduce port-in-use retries
- Fix nil dereference panic in user-prompt hook when initResult is
  nil (non-JSON worker response); use comma-ok assertions
- Use package-level hookClient/healthClient with DisableKeepAlives
  to prevent FD leaks in short-lived hook processes
- Set SysProcAttr{Setpgid: true} to detach worker from hook process
  group, preventing kill-cascade from Claude Code

Worker/DB (3 fixes):
- Replace os.Exit(0) in MCP config watcher with context cancellation
  for clean protocol shutdown
- Add 60s context.WithTimeout around ProcessObservation calls in
  processAllSessions to prevent hung CLI subprocesses from blocking
  the queue processor forever
- Set explicit PRAGMA wal_autocheckpoint=1000 and add PASSIVE WAL
  checkpoint to Optimize() to prevent checkpoint stalls

Adds 20+ regression tests across all fix areas.
This commit is contained in:
2026-05-26 13:52:09 +01:00
parent de5796bbe6
commit a81482d06a
15 changed files with 952 additions and 92 deletions
+8 -2
View File
@@ -99,8 +99,9 @@ func NewStore(cfg Config) (*Store, error) {
"PRAGMA synchronous=NORMAL",
"PRAGMA cache_size=-64000", // 64MB cache (negative = KB)
"PRAGMA temp_store=MEMORY", // Store temp tables in memory
"PRAGMA mmap_size=268435456", // 256MB memory-mapped I/O
"PRAGMA page_size=4096", // 4KB pages (optimal for most systems)
"PRAGMA mmap_size=268435456", // 256MB memory-mapped I/O
"PRAGMA page_size=4096", // 4KB pages (optimal for most systems)
"PRAGMA wal_autocheckpoint=1000", // Explicit default; checkpoint every 1000 WAL frames
}
for _, pragma := range pragmas {
if _, err := sqlDB.Exec(pragma); err != nil {
@@ -192,6 +193,11 @@ func (s *Store) Optimize(ctx context.Context) error {
log.Warn().Err(err).Msg("PRAGMA optimize failed (non-fatal)")
}
// Passive WAL checkpoint — doesn't block readers/writers
if _, err := s.sqlDB.ExecContext(ctx, "PRAGMA wal_checkpoint(PASSIVE)"); err != nil {
log.Warn().Err(err).Msg("WAL checkpoint failed (non-fatal)")
}
log.Info().Dur("duration", time.Since(start)).Msg("Database optimization complete")
return nil
}
+89
View File
@@ -4,6 +4,7 @@
package gorm
import (
"context"
"os"
"path/filepath"
"testing"
@@ -150,3 +151,91 @@ func TestMigrationIdempotency(t *testing.T) {
t.Logf("✅ Migrations are idempotent")
}
func TestWALAutocheckpoint(t *testing.T) {
tmpDir, err := os.MkdirTemp("", "gorm_wal_checkpoint_*")
if err != nil {
t.Fatalf("create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)
dbPath := filepath.Join(tmpDir, "test.db")
store, err := NewStore(Config{
Path: dbPath,
MaxConns: 2,
LogLevel: logger.Silent,
})
if err != nil {
t.Fatalf("NewStore failed: %v", err)
}
defer store.Close()
// Verify wal_autocheckpoint is set to 1000
var checkpoint int
err = store.GetRawDB().QueryRow("PRAGMA wal_autocheckpoint").Scan(&checkpoint)
if err != nil {
t.Fatalf("query wal_autocheckpoint: %v", err)
}
if checkpoint != 1000 {
t.Errorf("expected wal_autocheckpoint=1000, got %d", checkpoint)
}
}
func TestOptimize_RunsWALCheckpoint(t *testing.T) {
tmpDir, err := os.MkdirTemp("", "gorm_optimize_*")
if err != nil {
t.Fatalf("create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)
dbPath := filepath.Join(tmpDir, "test.db")
store, err := NewStore(Config{
Path: dbPath,
MaxConns: 2,
LogLevel: logger.Silent,
})
if err != nil {
t.Fatalf("NewStore failed: %v", err)
}
defer store.Close()
// Insert some data to generate WAL frames
_, err = store.GetRawDB().Exec("INSERT INTO observations (sdk_session_id, title, scope, project, type, created_at, created_at_epoch) VALUES ('test-sess', 'test data', 'project', '/tmp/test', 'decision', '2026-01-01T00:00:00Z', 1735689600)")
if err != nil {
t.Fatalf("insert test data: %v", err)
}
// Optimize should succeed (includes PASSIVE WAL checkpoint)
err = store.Optimize(context.Background())
if err != nil {
t.Fatalf("Optimize failed: %v", err)
}
}
func TestOptimize_RespectsContextCancellation(t *testing.T) {
tmpDir, err := os.MkdirTemp("", "gorm_optimize_cancel_*")
if err != nil {
t.Fatalf("create temp dir: %v", err)
}
defer os.RemoveAll(tmpDir)
dbPath := filepath.Join(tmpDir, "test.db")
store, err := NewStore(Config{
Path: dbPath,
MaxConns: 2,
LogLevel: logger.Silent,
})
if err != nil {
t.Fatalf("NewStore failed: %v", err)
}
defer store.Close()
// Already-cancelled context should cause Optimize to fail
ctx, cancel := context.WithCancel(context.Background())
cancel()
err = store.Optimize(ctx)
if err == nil {
t.Error("expected error with cancelled context, got nil")
}
}