mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-05 23:03:55 +00:00
fix: bound SQLite WAL growth and prevent worker hangs (#49)
The worker's SQLite WAL could grow unbounded (observed 19MB) and wedge the DB, hanging Claude Code on every prompt. No checkpoint ever truncated the WAL (only PASSIVE auto-checkpoint, which cannot reclaim the file), the connection-scoped pragmas were set via a single Exec so only one pooled connection received them (e.g. busy_timeout=0 on the rest), and the maintenance service that would optimize/checkpoint was never wired up.

- Register a sqlite3 ConnectHook driver so all pragmas (busy_timeout, journal_mode, synchronous, cache_size, foreign_keys, journal_size_limit) apply to every pooled connection; enable safe connection recycling.
- Add Store.Checkpoint (TRUNCATE), checkpoint-on-Close, and a periodic size-gated checkpoint loop with configurable interval/threshold.
- Wire up the previously-dead maintenance service; make trigger_maintenance actually run DB maintenance instead of only recalculating scores.
- Harden the user-prompt hook to honor its deadline and fail open so a slow worker can never stall a prompt.
- Add regression tests for WAL truncation, checkpoint-on-close, and per-connection pragmas.
This commit is contained in:
@@ -314,6 +314,49 @@ func (s *Service) handleTriggerRecalculation(w http.ResponseWriter, r *http.Requ
|
||||
writeJSON(w, map[string]string{"status": "recalculation triggered"})
|
||||
}
|
||||
|
||||
// handleRunMaintenance triggers an immediate, synchronous database maintenance run
|
||||
// (Optimize/TRUNCATE checkpoint + prompt cleanup + any enabled retention/stale cleanup)
|
||||
// and also kicks off an importance-score recalculation in the background so the behavior
|
||||
// of the previous trigger_maintenance tool is preserved (issue #49).
|
||||
func (s *Service) handleRunMaintenance(w http.ResponseWriter, r *http.Request) {
|
||||
// initMu.RLock held by requireReady middleware
|
||||
maintSvc := s.maintenanceSvc
|
||||
recalculator := s.recalculator
|
||||
|
||||
if maintSvc == nil {
|
||||
http.Error(w, "maintenance service not available", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
// Run maintenance synchronously with an independent, bounded context so the caller
|
||||
// receives a real completion status. Use context.Background so an HTTP client timeout
|
||||
// does not abort an in-progress DB maintenance pass.
|
||||
mctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
maintSvc.RunNowSync(mctx)
|
||||
|
||||
// Preserve prior trigger_maintenance behavior: also recalculate importance scores.
|
||||
recalcTriggered := false
|
||||
if recalculator != nil {
|
||||
recalcTriggered = true
|
||||
s.wg.Add(1)
|
||||
go func() {
|
||||
defer s.wg.Done()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
if err := recalculator.RecalculateNow(ctx); err != nil {
|
||||
log.Error().Err(err).Msg("Background recalculation during maintenance failed")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
writeJSON(w, map[string]any{
|
||||
"status": "maintenance completed",
|
||||
"recalc_triggered": recalcTriggered,
|
||||
"maintenance_stats": maintSvc.Stats(),
|
||||
})
|
||||
}
|
||||
|
||||
// parseIntParam parses an integer query parameter with a default value.
|
||||
func parseIntParam(r *http.Request, name string, defaultVal int) int {
|
||||
if val := r.URL.Query().Get(name); val != "" {
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/config"
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/db/gorm"
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/embedding"
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/maintenance"
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/pattern"
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/reranking"
|
||||
"github.com/lukaszraczylo/claude-mnemonic/internal/scoring"
|
||||
@@ -44,6 +45,14 @@ const (
|
||||
// QueueProcessInterval is how often the background queue processor runs.
|
||||
QueueProcessInterval = 2 * time.Second
|
||||
|
||||
// WALCheckpointInterval is how often the worker checks whether the SQLite WAL needs a
|
||||
// TRUNCATE checkpoint to reclaim disk and prevent unbounded growth (issue #49).
|
||||
WALCheckpointInterval = 60 * time.Second
|
||||
|
||||
// WALCheckpointThreshold is the WAL file size at or above which the periodic check
|
||||
// performs a TRUNCATE checkpoint. Keeps the steady-state WAL bounded to a few MB.
|
||||
WALCheckpointThreshold = 4 << 20 // 4 MiB
|
||||
|
||||
// reinitializationDrainDelay is the delay after marking the service as not ready
|
||||
// to allow in-flight requests to complete before reinitializing.
|
||||
reinitializationDrainDelay = 200 * time.Millisecond
|
||||
@@ -121,6 +130,7 @@ type Service struct {
|
||||
patternStore *gorm.PatternStore
|
||||
relationStore *gorm.RelationStore
|
||||
patternDetector *pattern.Detector
|
||||
maintenanceSvc *maintenance.Service
|
||||
sessionManager *session.Manager
|
||||
sseBroadcaster *sse.Broadcaster
|
||||
processor *sdk.Processor
|
||||
@@ -570,6 +580,34 @@ func (s *Service) initializeAsync() {
|
||||
go s.processQueue()
|
||||
}
|
||||
|
||||
// Start periodic WAL checkpoint loop to bound SQLite WAL file growth (issue #49).
|
||||
s.wg.Add(1)
|
||||
go s.walCheckpointLoop()
|
||||
|
||||
// Start the scheduled maintenance service (issue #49: was dead code, never instantiated).
|
||||
// vectorCleanupFn mirrors the observation store's cleanup hook so age/stale deletions done
|
||||
// directly via GORM still remove their vectors from sqlite-vec.
|
||||
var vectorCleanupFn func(ctx context.Context, deletedIDs []int64)
|
||||
if vectorSync != nil {
|
||||
vectorCleanupFn = func(ctx context.Context, deletedIDs []int64) {
|
||||
if err := retryWithBackoff(ctx, VectorSyncMaxRetries, VectorSyncInitialBackoff, func() error {
|
||||
return vectorSync.DeleteObservations(ctx, deletedIDs)
|
||||
}); err != nil {
|
||||
log.Warn().Err(err).Ints64("ids", deletedIDs).Msg("Failed to delete observations from sqlite-vec during maintenance")
|
||||
}
|
||||
}
|
||||
}
|
||||
maintSvc := maintenance.NewService(store, observationStore, summaryStore, promptStore, vectorCleanupFn, s.config, log.Logger)
|
||||
s.initMu.Lock()
|
||||
s.maintenanceSvc = maintSvc
|
||||
s.initMu.Unlock()
|
||||
s.wg.Add(1)
|
||||
go func() {
|
||||
defer s.wg.Done()
|
||||
maintSvc.Start(s.ctx)
|
||||
}()
|
||||
log.Info().Msg("Maintenance scheduler started")
|
||||
|
||||
// Start file watchers for auto-recreation on deletion
|
||||
s.startWatchers()
|
||||
|
||||
@@ -1290,6 +1328,9 @@ func (s *Service) setupRoutes() {
|
||||
r.Put("/api/scoring/concepts/{concept}", s.handleUpdateConceptWeight)
|
||||
r.Post("/api/scoring/recalculate", s.handleTriggerRecalculation)
|
||||
|
||||
// Maintenance: run an immediate synchronous DB maintenance pass (issue #49)
|
||||
r.Post("/api/maintenance/run", s.handleRunMaintenance)
|
||||
|
||||
// Context injection
|
||||
r.Get("/api/context/count", s.handleContextCount)
|
||||
r.Get("/api/context/inject", s.handleContextInject)
|
||||
@@ -1621,6 +1662,52 @@ func (s *Service) processQueue() {
|
||||
}
|
||||
}
|
||||
|
||||
// walCheckpointLoop periodically checkpoints the SQLite WAL so it cannot grow unbounded
|
||||
// during long-lived sessions. SQLite's internal auto-checkpoint is PASSIVE and never
|
||||
// shrinks the -wal file; under sustained writes with overlapping readers it can leave the
|
||||
// WAL growing without limit (issue #49). This loop performs a TRUNCATE checkpoint whenever
|
||||
// the WAL has grown to WALCheckpointThreshold, and does nothing while it is small.
|
||||
func (s *Service) walCheckpointLoop() {
|
||||
defer s.wg.Done()
|
||||
|
||||
// Tunable via config; fall back to the package constants when unset/<=0 (issue #49).
|
||||
interval := WALCheckpointInterval
|
||||
if s.config != nil && s.config.WALCheckpointIntervalSeconds > 0 {
|
||||
interval = time.Duration(s.config.WALCheckpointIntervalSeconds) * time.Second
|
||||
}
|
||||
threshold := int64(WALCheckpointThreshold)
|
||||
if s.config != nil && s.config.WALCheckpointThresholdBytes > 0 {
|
||||
threshold = s.config.WALCheckpointThresholdBytes
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-s.ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
s.initMu.RLock()
|
||||
store := s.store
|
||||
s.initMu.RUnlock()
|
||||
if store == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(s.ctx, 15*time.Second)
|
||||
done, err := store.CheckpointIfLarge(ctx, threshold)
|
||||
cancel()
|
||||
switch {
|
||||
case err != nil:
|
||||
log.Warn().Err(err).Msg("Periodic WAL checkpoint failed (non-fatal)")
|
||||
case done:
|
||||
log.Debug().Msg("Periodic WAL checkpoint (TRUNCATE) completed")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// processAllSessions processes pending messages for all active sessions.
|
||||
// Messages are processed in parallel using goroutines, with concurrency
|
||||
// limited by a channel-based semaphore.
|
||||
@@ -1748,6 +1835,9 @@ func (s *Service) Shutdown(ctx context.Context) error {
|
||||
if s.patternDetector != nil {
|
||||
s.patternDetector.Stop()
|
||||
}
|
||||
if s.maintenanceSvc != nil {
|
||||
s.maintenanceSvc.Stop()
|
||||
}
|
||||
|
||||
// Phase 4: Shutdown sessions (flush pending work)
|
||||
log.Debug().Msg("Phase 4: Shutting down sessions...")
|
||||
|
||||
Reference in New Issue
Block a user