fix: bound SQLite WAL growth and prevent worker hangs (#49)

The worker's SQLite WAL could grow unbounded (observed 19MB) and wedge the
DB, hanging Claude Code on every prompt. No checkpoint ever truncated the
WAL (only PASSIVE auto-checkpoint, which cannot reclaim the file), the
connection-scoped pragmas were set via a single Exec so only one pooled
connection received them (e.g. busy_timeout=0 on the rest), and the
maintenance service that would optimize/checkpoint was never wired up.

- Register a sqlite3 ConnectHook driver so all pragmas (busy_timeout,
  journal_mode, synchronous, cache_size, foreign_keys, journal_size_limit)
  apply to every pooled connection; enable safe connection recycling.
- Add Store.Checkpoint (TRUNCATE), checkpoint-on-Close, and a periodic
  size-gated checkpoint loop with configurable interval/threshold.
- Wire up the previously-dead maintenance service; make trigger_maintenance
  actually run DB maintenance instead of only recalculating scores.
- Harden the user-prompt hook to honor its deadline and fail open so a
  slow worker can never stall a prompt.
- Add regression tests for WAL truncation, checkpoint-on-close, and
  per-connection pragmas.
This commit is contained in:
2026-06-01 16:38:40 +01:00
parent f78370a531
commit b7b82ce22f
10 changed files with 957 additions and 93 deletions
+67
View File
@@ -564,6 +564,44 @@ func POSTWithContext(ctx context.Context, port int, path string, body interface{
return nil
}
// POSTWithContextResult sends a JSON-encoded POST request to the worker on
// 127.0.0.1:port using the provided context and decodes the JSON response
// body, mirroring POST but honoring ctx for cancellation/deadline. Used on the
// prompt critical path so a wedged worker aborts at the hook deadline instead
// of blocking for the full client timeout.
//
// Return values:
//   - (result, nil) when the response body decodes as JSON;
//   - (nil, nil) when the body is not JSON, matching POST's behavior;
//   - (nil, err) when body cannot be marshaled, the request cannot be built or
//     sent, the response status is >= 400, or ctx is cancelled/expired before
//     the response body has been fully read.
func POSTWithContextResult(ctx context.Context, port int, path string, body interface{}) (map[string]interface{}, error) {
	payload, err := json.Marshal(body)
	if err != nil {
		return nil, err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		fmt.Sprintf("http://127.0.0.1:%d%s", port, path),
		bytes.NewReader(payload))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := hookClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("request failed: %s", resp.Status)
	}

	var result map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		// The context also governs reading the response body, so a deadline
		// or cancellation that fires mid-read shows up here as a decode
		// failure. Report it instead of disguising a stalled worker as a
		// successful empty response.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, ctxErr
		}
		// Not all endpoints return JSON
		return nil, nil
	}
	return result, nil
}
// GET sends a GET request to the worker.
func GET(port int, path string) (map[string]interface{}, error) {
resp, err := hookClient.Get(fmt.Sprintf("http://127.0.0.1:%d%s", port, path))
@@ -584,6 +622,35 @@ func GET(port int, path string) (map[string]interface{}, error) {
return result, nil
}
// GETWithContext issues a GET to the worker on 127.0.0.1:port with ctx
// attached to the request and returns the decoded JSON response body. It
// mirrors GET but honors ctx for cancellation/deadline, so on the prompt
// critical path a wedged worker aborts at the hook deadline instead of
// blocking for the full client timeout.
//
// An error is returned when the request cannot be built or sent, when the
// response status is >= 400, or when the body does not decode as JSON.
func GETWithContext(ctx context.Context, port int, path string) (map[string]interface{}, error) {
	endpoint := fmt.Sprintf("http://127.0.0.1:%d%s", port, path)

	request, buildErr := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if buildErr != nil {
		return nil, buildErr
	}

	response, sendErr := hookClient.Do(request)
	if sendErr != nil {
		return nil, sendErr
	}
	// Close errors are deliberately ignored: nothing useful can be done with
	// them once the response has been consumed.
	defer func() { _ = response.Body.Close() }()

	if response.StatusCode >= 400 {
		return nil, fmt.Errorf("request failed: %s", response.Status)
	}

	var decoded map[string]interface{}
	decodeErr := json.NewDecoder(response.Body).Decode(&decoded)
	if decodeErr != nil {
		return nil, decodeErr
	}
	return decoded, nil
}
// versionsCompatible checks if two versions are compatible for dev builds.
// Returns true if both versions share the same base version (ignoring -dirty, -dev, commit suffixes).
// This prevents unnecessary restarts during development.
+197
View File
@@ -952,6 +952,203 @@ func TestGET_Timeout(t *testing.T) {
require.Error(t, err)
}
// TestGETWithContext runs GETWithContext against an httptest server and covers
// three outcomes: a JSON success response, an HTTP error status, and a 200
// response whose body is not valid JSON.
func TestGETWithContext(t *testing.T) {
	cases := []struct {
		serverHandler  func(w http.ResponseWriter, r *http.Request)
		expectedResult map[string]interface{}
		name           string
		expectError    bool
	}{
		{
			name: "successful GET with JSON response",
			serverHandler: func(w http.ResponseWriter, r *http.Request) {
				assert.Equal(t, http.MethodGet, r.Method)
				w.WriteHeader(http.StatusOK)
				_ = json.NewEncoder(w).Encode(map[string]interface{}{"data": "test"})
			},
			expectedResult: map[string]interface{}{"data": "test"},
		},
		{
			name: "GET with 404 error",
			serverHandler: func(w http.ResponseWriter, r *http.Request) {
				w.WriteHeader(http.StatusNotFound)
			},
			expectError: true,
		},
		{
			name: "GET with invalid JSON",
			serverHandler: func(w http.ResponseWriter, r *http.Request) {
				w.WriteHeader(http.StatusOK)
				_, _ = w.Write([]byte("not valid json"))
			},
			expectError: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			srv := httptest.NewServer(http.HandlerFunc(tc.serverHandler))
			defer srv.Close()

			// GETWithContext takes a port, not a URL, so recover the port the
			// test server was bound to.
			var port int
			_, scanErr := fmt.Sscanf(srv.URL, "http://127.0.0.1:%d", &port)
			require.NoError(t, scanErr)

			got, err := GETWithContext(context.Background(), port, "/test")
			if tc.expectError {
				assert.Error(t, err)
				return
			}

			assert.NoError(t, err)
			if tc.expectedResult != nil {
				assert.Equal(t, tc.expectedResult["data"], got["data"])
			}
		})
	}
}
// TestGETWithContext_Timeout checks that the context deadline, rather than the
// hookClient timeout, ends a request to a server that never answers, so a
// wedged worker cannot stall the prompt.
func TestGETWithContext_Timeout(t *testing.T) {
	// The handler parks on this channel for the whole test; it is only closed
	// during cleanup.
	release := make(chan struct{})
	hang := func(w http.ResponseWriter, r *http.Request) {
		<-release
	}
	srv := httptest.NewServer(http.HandlerFunc(hang))
	// Deferred calls run last-in-first-out: the channel is closed first so the
	// parked handler returns, and only then is the server shut down.
	defer srv.Close()
	defer close(release)

	var port int
	_, err := fmt.Sscanf(srv.URL, "http://127.0.0.1:%d", &port)
	require.NoError(t, err)

	const deadline = 100 * time.Millisecond
	ctx, cancel := context.WithTimeout(context.Background(), deadline)
	defer cancel()

	begin := time.Now()
	_, err = GETWithContext(ctx, port, "/test")
	took := time.Since(begin)

	require.Error(t, err)
	// Should abort near the 100ms deadline, far below hookClient's 10s timeout.
	assert.Less(t, took, 2*time.Second, "context deadline must abort the request quickly")
}
// TestGETWithContext_CancelledContext verifies that a context cancelled before
// the call makes GETWithContext fail promptly with the cancellation error.
func TestGETWithContext_CancelledContext(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately

	start := time.Now()
	_, err := GETWithContext(ctx, 99994, "/test")
	elapsed := time.Since(start)

	require.Error(t, err)
	// Port 99994 is outside the valid TCP port range, so a bare error check
	// would also pass on a dial failure if the context were ignored. Pin the
	// error to the cancellation itself.
	assert.ErrorIs(t, err, context.Canceled)
	assert.Less(t, elapsed, 1*time.Second, "cancelled context should return immediately")
}
// TestPOSTWithContextResult runs POSTWithContextResult against an httptest
// server and covers three outcomes: a JSON success response, an HTTP error
// status, and a 200 response with a non-JSON body (expected to yield a nil
// result and no error).
func TestPOSTWithContextResult(t *testing.T) {
	cases := []struct {
		body           interface{}
		serverHandler  func(w http.ResponseWriter, r *http.Request)
		expectedResult map[string]interface{}
		name           string
		expectError    bool
	}{
		{
			name: "successful POST with JSON response",
			serverHandler: func(w http.ResponseWriter, r *http.Request) {
				assert.Equal(t, http.MethodPost, r.Method)
				assert.Equal(t, "application/json", r.Header.Get("Content-Type"))
				w.WriteHeader(http.StatusOK)
				_ = json.NewEncoder(w).Encode(map[string]interface{}{"status": "ok"})
			},
			body:           map[string]string{"key": "value"},
			expectedResult: map[string]interface{}{"status": "ok"},
		},
		{
			name: "POST with 400 error",
			serverHandler: func(w http.ResponseWriter, r *http.Request) {
				w.WriteHeader(http.StatusBadRequest)
			},
			body:        map[string]string{"key": "value"},
			expectError: true,
		},
		{
			name: "POST with non-JSON response returns nil",
			serverHandler: func(w http.ResponseWriter, r *http.Request) {
				w.WriteHeader(http.StatusOK)
				_, _ = w.Write([]byte("not json"))
			},
			body: map[string]string{"key": "value"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			srv := httptest.NewServer(http.HandlerFunc(tc.serverHandler))
			defer srv.Close()

			// POSTWithContextResult takes a port, not a URL, so recover the
			// port the test server was bound to.
			var port int
			_, scanErr := fmt.Sscanf(srv.URL, "http://127.0.0.1:%d", &port)
			require.NoError(t, scanErr)

			got, err := POSTWithContextResult(context.Background(), port, "/test", tc.body)
			if tc.expectError {
				assert.Error(t, err)
				return
			}

			assert.NoError(t, err)
			if tc.expectedResult == nil {
				assert.Nil(t, got)
				return
			}
			assert.Equal(t, tc.expectedResult["status"], got["status"])
		})
	}
}
// TestPOSTWithContextResult_MarshalError checks that a body which cannot be
// JSON-encoded makes POSTWithContextResult return an error.
func TestPOSTWithContextResult_MarshalError(t *testing.T) {
	// encoding/json cannot encode channel values, so marshaling this body fails.
	var unencodable interface{} = make(chan int)

	_, err := POSTWithContextResult(context.Background(), 99999, "/test", unencodable)

	require.Error(t, err)
}
// TestPOSTWithContextResult_Timeout checks that the context deadline ends a
// POST to a server that never answers, well before the hookClient timeout.
func TestPOSTWithContextResult_Timeout(t *testing.T) {
	// The handler parks on this channel for the whole test; it is only closed
	// during cleanup.
	release := make(chan struct{})
	hang := func(w http.ResponseWriter, r *http.Request) {
		<-release
	}
	srv := httptest.NewServer(http.HandlerFunc(hang))
	// Deferred calls run last-in-first-out: the channel is closed first so the
	// parked handler returns, and only then is the server shut down.
	defer srv.Close()
	defer close(release)

	var port int
	_, err := fmt.Sscanf(srv.URL, "http://127.0.0.1:%d", &port)
	require.NoError(t, err)

	const deadline = 100 * time.Millisecond
	ctx, cancel := context.WithTimeout(context.Background(), deadline)
	defer cancel()

	payload := map[string]string{"k": "v"}
	begin := time.Now()
	_, err = POSTWithContextResult(ctx, port, "/test", payload)
	took := time.Since(begin)

	require.Error(t, err)
	assert.Less(t, took, 2*time.Second, "context deadline must abort the request quickly")
}
// TestIsWorkerRunning_Timeout tests IsWorkerRunning with timeout.
func TestIsWorkerRunning_Timeout(t *testing.T) {
// Non-existent port should quickly return false