fix: bound SQLite WAL growth and prevent worker hangs (#49)

The worker's SQLite WAL could grow unbounded (observed 19MB) and wedge the
DB, hanging Claude Code on every prompt. No checkpoint ever truncated the
WAL (only PASSIVE auto-checkpoint, which cannot reclaim the file), the
connection-scoped pragmas were set via a single Exec so only one pooled
connection received them (e.g. busy_timeout=0 on the rest), and the
maintenance service that would optimize/checkpoint was never wired up.

- Register a sqlite3 ConnectHook driver so all pragmas (busy_timeout,
  journal_mode, synchronous, cache_size, foreign_keys, journal_size_limit)
  apply to every pooled connection; enable safe connection recycling.
- Add Store.Checkpoint (TRUNCATE), checkpoint-on-Close, and a periodic
  size-gated checkpoint loop with configurable interval/threshold.
- Wire up the previously-dead maintenance service; make trigger_maintenance
  actually run DB maintenance instead of only recalculating scores.
- Harden the user-prompt hook to honor its deadline and fail open so a
  slow worker can never stall a prompt.
- Add regression tests for WAL truncation, checkpoint-on-close, and
  per-connection pragmas.
This commit is contained in:
2026-06-01 16:38:40 +01:00
parent f78370a531
commit b7b82ce22f
10 changed files with 957 additions and 93 deletions
+43
View File
@@ -314,6 +314,49 @@ func (s *Service) handleTriggerRecalculation(w http.ResponseWriter, r *http.Requ
writeJSON(w, map[string]string{"status": "recalculation triggered"})
}
// handleRunMaintenance runs an immediate, synchronous database maintenance pass and
// reports the outcome as JSON (issue #49).
//
// The pass is delegated to the maintenance service's RunNowSync (described by the change
// that introduced it as Optimize/TRUNCATE checkpoint + prompt cleanup + any enabled
// retention/stale cleanup). It runs under a 60-second deadline derived from
// context.Background rather than the request context, so an HTTP client timeout does not
// abort an in-progress pass. To preserve the behavior of the previous trigger_maintenance
// tool, an importance-score recalculation is also started in the background whenever a
// recalculator is configured.
//
// Responses:
//   - http.StatusServiceUnavailable with a plain-text message when no maintenance
//     service has been wired up.
//   - Otherwise a JSON object (via writeJSON) with the keys "status",
//     "recalc_triggered" and "maintenance_stats". "status" is "maintenance completed"
//     unless the 60-second deadline had expired by the time RunNowSync returned, in
//     which case it is "maintenance timed out" so callers are not told that a
//     possibly truncated pass finished.
func (s *Service) handleRunMaintenance(w http.ResponseWriter, r *http.Request) {
	// initMu.RLock held by requireReady middleware
	maintSvc := s.maintenanceSvc
	recalculator := s.recalculator
	if maintSvc == nil {
		http.Error(w, "maintenance service not available", http.StatusServiceUnavailable)
		return
	}

	// Run maintenance synchronously with an independent, bounded context so the caller
	// receives a real completion status. Use context.Background so an HTTP client timeout
	// does not abort an in-progress DB maintenance pass.
	mctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()
	maintSvc.RunNowSync(mctx)

	// cancel is deferred, so the only way mctx can already be done here is that the
	// deadline elapsed while maintenance was running. Report that honestly instead of
	// claiming completion.
	status := "maintenance completed"
	if err := mctx.Err(); err != nil {
		status = "maintenance timed out"
		log.Warn().Err(err).Msg("Synchronous maintenance run exceeded its deadline")
	}

	// Preserve prior trigger_maintenance behavior: also recalculate importance scores.
	recalcTriggered := false
	if recalculator != nil {
		recalcTriggered = true
		// Tracked by s.wg so the goroutine is accounted for by whoever waits on it.
		s.wg.Add(1)
		go func() {
			defer s.wg.Done()
			// Fresh context: the recalculation must outlive this request.
			ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
			defer cancel()
			if err := recalculator.RecalculateNow(ctx); err != nil {
				log.Error().Err(err).Msg("Background recalculation during maintenance failed")
			}
		}()
	}

	writeJSON(w, map[string]any{
		"status":            status,
		"recalc_triggered":  recalcTriggered,
		"maintenance_stats": maintSvc.Stats(),
	})
}
// parseIntParam parses an integer query parameter with a default value.
func parseIntParam(r *http.Request, name string, defaultVal int) int {
if val := r.URL.Query().Get(name); val != "" {
+90
View File
@@ -16,6 +16,7 @@ import (
"github.com/lukaszraczylo/claude-mnemonic/internal/config"
"github.com/lukaszraczylo/claude-mnemonic/internal/db/gorm"
"github.com/lukaszraczylo/claude-mnemonic/internal/embedding"
"github.com/lukaszraczylo/claude-mnemonic/internal/maintenance"
"github.com/lukaszraczylo/claude-mnemonic/internal/pattern"
"github.com/lukaszraczylo/claude-mnemonic/internal/reranking"
"github.com/lukaszraczylo/claude-mnemonic/internal/scoring"
@@ -44,6 +45,14 @@ const (
// QueueProcessInterval is how often the background queue processor runs.
QueueProcessInterval = 2 * time.Second
// WALCheckpointInterval is how often the worker checks whether the SQLite WAL needs a
// TRUNCATE checkpoint to reclaim disk space and prevent unbounded growth (issue #49).
WALCheckpointInterval = 60 * time.Second
// WALCheckpointThreshold is the WAL file size at or above which the periodic check
// performs a TRUNCATE checkpoint. Keeps the steady-state WAL bounded to a few MB.
WALCheckpointThreshold = 4 << 20 // 4 MiB
// reinitializationDrainDelay is the delay after marking the service as not ready
// to allow in-flight requests to complete before reinitializing.
reinitializationDrainDelay = 200 * time.Millisecond
@@ -121,6 +130,7 @@ type Service struct {
patternStore *gorm.PatternStore
relationStore *gorm.RelationStore
patternDetector *pattern.Detector
maintenanceSvc *maintenance.Service
sessionManager *session.Manager
sseBroadcaster *sse.Broadcaster
processor *sdk.Processor
@@ -570,6 +580,34 @@ func (s *Service) initializeAsync() {
go s.processQueue()
}
// Start periodic WAL checkpoint loop to bound SQLite WAL file growth (issue #49).
s.wg.Add(1)
go s.walCheckpointLoop()
// Start the scheduled maintenance service (issue #49: was dead code, never instantiated).
// vectorCleanupFn mirrors the observation store's cleanup hook so age/stale deletions done
// directly via GORM still remove their vectors from sqlite-vec.
var vectorCleanupFn func(ctx context.Context, deletedIDs []int64)
if vectorSync != nil {
vectorCleanupFn = func(ctx context.Context, deletedIDs []int64) {
if err := retryWithBackoff(ctx, VectorSyncMaxRetries, VectorSyncInitialBackoff, func() error {
return vectorSync.DeleteObservations(ctx, deletedIDs)
}); err != nil {
log.Warn().Err(err).Ints64("ids", deletedIDs).Msg("Failed to delete observations from sqlite-vec during maintenance")
}
}
}
maintSvc := maintenance.NewService(store, observationStore, summaryStore, promptStore, vectorCleanupFn, s.config, log.Logger)
s.initMu.Lock()
s.maintenanceSvc = maintSvc
s.initMu.Unlock()
s.wg.Add(1)
go func() {
defer s.wg.Done()
maintSvc.Start(s.ctx)
}()
log.Info().Msg("Maintenance scheduler started")
// Start file watchers for auto-recreation on deletion
s.startWatchers()
@@ -1290,6 +1328,9 @@ func (s *Service) setupRoutes() {
r.Put("/api/scoring/concepts/{concept}", s.handleUpdateConceptWeight)
r.Post("/api/scoring/recalculate", s.handleTriggerRecalculation)
// Maintenance: run an immediate synchronous DB maintenance pass (issue #49)
r.Post("/api/maintenance/run", s.handleRunMaintenance)
// Context injection
r.Get("/api/context/count", s.handleContextCount)
r.Get("/api/context/inject", s.handleContextInject)
@@ -1621,6 +1662,52 @@ func (s *Service) processQueue() {
}
}
// walCheckpointLoop is the background goroutine that keeps the SQLite WAL bounded for
// long-lived sessions (issue #49). SQLite's own auto-checkpoint is PASSIVE and never
// shrinks the -wal file, so under sustained writes with overlapping readers the file can
// keep growing. On every tick this loop asks the store to checkpoint if the WAL has
// reached the size threshold (the store's CheckpointIfLarge call); below the threshold
// nothing is reported.
//
// The tick interval and size threshold come from s.config when set to positive values and
// otherwise default to WALCheckpointInterval and WALCheckpointThreshold. The loop returns
// when s.ctx is cancelled and signals s.wg on exit; callers must have done s.wg.Add(1).
func (s *Service) walCheckpointLoop() {
	defer s.wg.Done()

	// Package defaults first; positive config values override them (issue #49).
	var (
		tickEvery = WALCheckpointInterval
		maxWAL    = int64(WALCheckpointThreshold)
	)
	if cfg := s.config; cfg != nil {
		if cfg.WALCheckpointIntervalSeconds > 0 {
			tickEvery = time.Duration(cfg.WALCheckpointIntervalSeconds) * time.Second
		}
		if cfg.WALCheckpointThresholdBytes > 0 {
			maxWAL = cfg.WALCheckpointThresholdBytes
		}
	}

	tick := time.NewTicker(tickEvery)
	defer tick.Stop()

	for {
		// Block until either shutdown or the next tick.
		select {
		case <-s.ctx.Done():
			return
		case <-tick.C:
		}

		// Snapshot the store under the read lock; the lock is not held during the
		// checkpoint itself.
		s.initMu.RLock()
		db := s.store
		s.initMu.RUnlock()
		if db == nil {
			continue
		}

		// Bound each attempt to 15 seconds and release the timer immediately afterwards
		// (no defer: this runs once per iteration, not once per call).
		cpCtx, release := context.WithTimeout(s.ctx, 15*time.Second)
		truncated, err := db.CheckpointIfLarge(cpCtx, maxWAL)
		release()

		if err != nil {
			log.Warn().Err(err).Msg("Periodic WAL checkpoint failed (non-fatal)")
		} else if truncated {
			log.Debug().Msg("Periodic WAL checkpoint (TRUNCATE) completed")
		}
	}
}
// processAllSessions processes pending messages for all active sessions.
// Messages are processed in parallel using goroutines, with concurrency
// limited by a channel-based semaphore.
@@ -1748,6 +1835,9 @@ func (s *Service) Shutdown(ctx context.Context) error {
if s.patternDetector != nil {
s.patternDetector.Stop()
}
if s.maintenanceSvc != nil {
s.maintenanceSvc.Stop()
}
// Phase 4: Shutdown sessions (flush pending work)
log.Debug().Msg("Phase 4: Shutting down sessions...")