fix: bound SQLite WAL growth and prevent worker hangs (#49)

The worker's SQLite WAL could grow without bound (observed at 19MB) and wedge
the DB, hanging Claude Code on every prompt. There were three causes: no
checkpoint ever truncated the WAL (only the PASSIVE auto-checkpoint ran, which
cannot reclaim the file); the connection-scoped pragmas were set via a single
Exec, so only one pooled connection received them (e.g. busy_timeout=0 on the
rest); and the maintenance service that would optimize/checkpoint was never
wired up.

- Register a sqlite3 ConnectHook driver so all pragmas (busy_timeout,
  journal_mode, synchronous, cache_size, foreign_keys, journal_size_limit)
  apply to every pooled connection; enable safe connection recycling.
- Add Store.Checkpoint (TRUNCATE), checkpoint-on-Close, and a periodic
  size-gated checkpoint loop with configurable interval/threshold.
- Wire up the previously-dead maintenance service; make trigger_maintenance
  actually run DB maintenance instead of only recalculating scores.
- Harden the user-prompt hook to honor its deadline and fail open so a
  slow worker can never stall a prompt.
- Add regression tests for WAL truncation, checkpoint-on-close, and
  per-connection pragmas.
This commit is contained in:
2026-06-01 16:38:40 +01:00
parent f78370a531
commit b7b82ce22f
10 changed files with 957 additions and 93 deletions
+20 -12
View File
@@ -91,18 +91,21 @@ func handleUserPrompt(ctx *hooks.HookContext, input *Input) (string, error) {
observationCount int
)
// Start search in background
// Start search in background. Pass the deadline context so a wedged worker
// aborts the request at the deadline instead of blocking for the full
// hookClient timeout (10s). Errors are ignored -- fail open with no memory.
wg.Add(1)
go func() {
defer wg.Done()
searchResult, _ = hooks.GET(ctx.Port, searchURL)
searchResult, _ = hooks.GETWithContext(deadline, ctx.Port, searchURL)
}()
// Start session init in parallel (with observationCount=0; approximate is fine)
// Start session init in parallel (with observationCount=0; approximate is fine).
// Deadline context guards this call too.
wg.Add(1)
go func() {
defer wg.Done()
initResult, initErr = hooks.POST(ctx.Port, "/api/sessions/init", map[string]interface{}{
initResult, initErr = hooks.POSTWithContextResult(deadline, ctx.Port, "/api/sessions/init", map[string]interface{}{
"claudeSessionId": ctx.SessionID,
"project": ctx.Project,
"prompt": input.Prompt,
@@ -113,7 +116,8 @@ func handleUserPrompt(ctx *hooks.HookContext, input *Input) (string, error) {
// Wait for both to complete
wg.Wait()
// Check deadline after network calls
// Check deadline after network calls -- if exceeded, fail open (proceed with
// no injected memory) rather than blocking or erroring the user's prompt.
if deadline.Err() != nil {
return "", nil
}
@@ -173,9 +177,11 @@ func handleUserPrompt(ctx *hooks.HookContext, input *Input) (string, error) {
contextToInject = contextBuilder
}
// Check session init result
// Check session init result. A session-init failure must never block the
// prompt: degrade gracefully and still inject any memory we found.
if initErr != nil {
return "", initErr
fmt.Fprintf(os.Stderr, "[user-prompt] Session init failed: %v\n", initErr)
return contextToInject, nil
}
if initResult == nil {
return contextToInject, nil // Non-JSON response from worker, skip session init
@@ -201,13 +207,15 @@ func handleUserPrompt(ctx *hooks.HookContext, input *Input) (string, error) {
fmt.Fprintf(os.Stderr, "[user-prompt] Session %d, prompt #%d\n", sessionID, promptNumber)
// Start SDK agent (depends on session init result, so kept sequential)
_, err := hooks.POST(ctx.Port, fmt.Sprintf("/sessions/%d/init", sessionID), map[string]interface{}{
// Start SDK agent (depends on session init result, so kept sequential).
// Deadline-guarded so a wedged worker cannot stall past the hook budget.
// Failure here must not block the prompt: degrade gracefully, still inject memory.
if _, err := hooks.POSTWithContextResult(deadline, ctx.Port, fmt.Sprintf("/sessions/%d/init", sessionID), map[string]interface{}{
"userPrompt": input.Prompt,
"promptNumber": promptNumber,
})
if err != nil {
return "", err
}); err != nil {
fmt.Fprintf(os.Stderr, "[user-prompt] SDK agent init failed: %v\n", err)
return contextToInject, nil
}
// Return context if we found relevant observations