fix(refresh-coordinator): replace global RWMutex with sync.Map

Under Yaegi, the RefreshCoordinator.refreshMutex was held for tens of milliseconds per request because every operation inside the critical section (map access, isInCooldown, recordRefreshAttempt, isUnderMemoryPressure, atomic ops, struct allocation) is dispatched through reflect.Value.Call with full arg boxing/unboxing. Concurrent refreshes on the same coordinator serialized into a queue that grew without bound. Live capture in production (3 Grafana dashboards left open) showed: * 63 goroutines stuck on rc.refreshMutex.Lock() for 1-11 minutes * pod pinned at 1000m CPU (GOMAXPROCS=1) * 5.15M allocs/sec, 0.45 RPS effective throughput * yaegi.call.func9 accounting for 92.66% of cumulative allocs * mutex profile dominated by sync.(*Mutex).Unlock via the request chain Change inFlightRefreshes from map[string]*refreshOperation+RWMutex to sync.Map and rewrite getOrCreateOperation to: 1. Speculatively allocate the candidate operation. 2. Atomically LoadOrStore by tokenHash. Joiners take the existing operation; leader takes the new one. No global lock acquired. 3. Leader runs rate-limit / cooldown / memory-pressure gates AFTER the atomic store. Joiners share the leader's outcome via op.done. 4. Reserve the concurrent-refresh slot via CompareAndSwap so the count cannot overshoot in absence of the old serializing lock. 5. On any gate failure the leader calls failCandidate, which deletes the entry from sync.Map, records the error on op.result and closes op.done so any joiner that snuck in returns the same error. performCleanup becomes a single sync.Map.LoadAndDelete, eliminating the lock entirely on the cleanup path. Net effect: critical section is no longer Yaegi-interpreted; it collapses to atomic instructions on a sharded sync.Map. Refresh contention disappears even under Yaegi. All tests pass with -race; golangci-lint clean.
2026-06-05 22:44:17 +00:00 · 2026-05-23 02:34:49 +01:00
parent 984fd1c08f
commit ae4ccaa89d
1 changed files with 77 additions and 43 deletions
@@ -15,7 +15,13 @@ import (
 // It implements request coalescing, rate limiting, and circuit breaking
 // specifically for token refresh operations.
 type RefreshCoordinator struct {
-	inFlightRefreshes      map[string]*refreshOperation
+	// inFlightRefreshes maps tokenHash -> *refreshOperation. sync.Map is used
+	// instead of a plain map + RWMutex so concurrent refreshes do not
+	// serialize on a single global lock. Under Yaegi the previous
+	// refreshMutex.Lock() was held for tens of milliseconds per request due
+	// to interpreter overhead on the work inside the critical section,
+	// causing dozens of goroutines to stack up on it and pin one CPU core.
+	inFlightRefreshes      sync.Map
 	cleanupTimers          map[string]*time.Timer
 	sessionRefreshAttempts map[string]*refreshAttemptTracker
 	circuitBreaker         *RefreshCircuitBreaker
@@ -25,7 +31,6 @@ type RefreshCoordinator struct {
 	config                 RefreshCoordinatorConfig
 	wg                     sync.WaitGroup
 	attemptsMutex          sync.RWMutex
-	refreshMutex           sync.RWMutex
 	cleanupTimerMu         sync.Mutex
 }

@@ -130,7 +135,7 @@ func NewRefreshCoordinator(config RefreshCoordinatorConfig, logger *Logger) *Ref
 	}

 	rc := &RefreshCoordinator{
-		inFlightRefreshes:      make(map[string]*refreshOperation),
+		// inFlightRefreshes is a sync.Map; zero value is ready to use.
 		sessionRefreshAttempts: make(map[string]*refreshAttemptTracker),
 		config:                 config,
 		metrics:                &RefreshMetrics{},
@@ -227,13 +232,28 @@ func (rc *RefreshCoordinator) getOrCreateOperation(
 	tokenHash string,
 	refreshToken string,
 ) (*refreshOperation, bool, error) {
-	rc.refreshMutex.Lock()
-	defer rc.refreshMutex.Unlock()
+	// Speculatively construct the operation we WOULD register if we win the
+	// race. Allocating here keeps the LoadOrStore call below atomic and
+	// avoids any global lock — under Yaegi the previous map+RWMutex design
+	// held the write lock long enough (tens of ms per call) that concurrent
+	// refreshes on the same coordinator serialized into a queue that grew
+	// without bound. See struct comment on inFlightRefreshes.
+	candidate := &refreshOperation{
+		refreshToken: refreshToken,
+		done:         make(chan struct{}),
+		startTime:    time.Now(),
+		waiterCount:  1,
+	}

-	// Check for existing operation while holding the lock
-	if existingOp, exists := rc.inFlightRefreshes[tokenHash]; exists {
+	if existing, loaded := rc.inFlightRefreshes.LoadOrStore(tokenHash, candidate); loaded {
+		existingOp, ok := existing.(*refreshOperation)
+		if !ok {
+			// Defensive: anything stored here is always *refreshOperation, but
+			// keep the typed assert so a programming error elsewhere doesn't
+			// surface as a confusing panic in an interpreter frame.
+			return nil, false, fmt.Errorf("inFlightRefreshes corrupt: unexpected type %T", existing)
+		}
 		if existingOp.refreshToken == refreshToken {
-			// Join existing operation
 			atomic.AddInt32(&existingOp.waiterCount, 1)
 			return existingOp, false, nil
 		}
@@ -241,41 +261,60 @@ func (rc *RefreshCoordinator) getOrCreateOperation(
 		return nil, false, fmt.Errorf("refresh token mismatch")
 	}

-	// No existing operation - check if we can create a new one
-	// All checks happen while holding the lock to prevent races
+	// We won the race and registered `candidate`. Apply gates now. If any
+	// gate fails we must remove our entry from the map and signal failure
+	// to any joiners that snuck in between LoadOrStore and now.
+	if err := rc.applyLeaderGates(sessionID); err != nil {
+		rc.failCandidate(tokenHash, candidate, err)
+		return nil, false, err
+	}

-	// Check and record refresh attempt for rate limiting
+	// Reserve concurrent slot via CAS — without the old global lock we can
+	// no longer rely on mutex-mediated check-then-increment. If we lose the
+	// CAS race we retry; if the limit has since been reached we back out.
+	for {
+		current := atomic.LoadInt32(&rc.metrics.currentInFlightRefreshes)
+		if int(current) >= rc.config.MaxConcurrentRefreshes {
+			err := fmt.Errorf("maximum concurrent refresh operations reached")
+			rc.failCandidate(tokenHash, candidate, err)
+			return nil, false, err
+		}
+		if atomic.CompareAndSwapInt32(&rc.metrics.currentInFlightRefreshes, current, current+1) {
+			break
+		}
+	}
+
+	return candidate, true, nil
+}
+
+// applyLeaderGates runs the rate-limit, cooldown, and memory-pressure checks
+// that previously ran under the global refreshMutex. Only the leader (the
+// goroutine that just registered the operation) runs them; joiners share the
+// leader's outcome via operation.done.
+func (rc *RefreshCoordinator) applyLeaderGates(sessionID string) error {
 	rc.recordRefreshAttempt(sessionID)
 	if rc.isInCooldown(sessionID) {
 		atomic.AddInt64(&rc.metrics.cooldownsTriggered, 1)
-		return nil, false, fmt.Errorf("refresh attempts exceeded for session, in cooldown period")
+		return fmt.Errorf("refresh attempts exceeded for session, in cooldown period")
 	}
-
-	// Check memory pressure
 	if rc.config.EnableMemoryPressureDetection && rc.isUnderMemoryPressure() {
 		atomic.AddInt64(&rc.metrics.memoryPressureEvents, 1)
-		return nil, false, fmt.Errorf("system under memory pressure, refresh denied")
+		return fmt.Errorf("system under memory pressure, refresh denied")
 	}
+	return nil
+}

-	// Check and reserve concurrent refresh slot atomically
-	current := atomic.LoadInt32(&rc.metrics.currentInFlightRefreshes)
-	if int(current) >= rc.config.MaxConcurrentRefreshes {
-		return nil, false, fmt.Errorf("maximum concurrent refresh operations reached")
-	}
-
-	// Reserve the slot - we're still holding the lock so this is safe
-	atomic.AddInt32(&rc.metrics.currentInFlightRefreshes, 1)
-
-	// Create and register new operation
-	operation := &refreshOperation{
-		refreshToken: refreshToken,
-		done:         make(chan struct{}),
-		startTime:    time.Now(),
-		waiterCount:  1,
-	}
-	rc.inFlightRefreshes[tokenHash] = operation
-
-	return operation, true, nil
+// failCandidate removes the leader's just-registered operation from the
+// in-flight map and signals the error to any joiners by recording the result
+// and closing the done channel. This keeps the (nil, false, err) return path
+// equivalent to the pre-sync.Map version: callers see the error directly,
+// joiners see it via operation.done.
+func (rc *RefreshCoordinator) failCandidate(tokenHash string, op *refreshOperation, err error) {
+	rc.inFlightRefreshes.Delete(tokenHash)
+	op.mutex.Lock()
+	op.result = &refreshResult{err: err}
+	op.mutex.Unlock()
+	close(op.done)
 }

 // executeRefreshAsync performs the actual refresh operation asynchronously
@@ -367,16 +406,11 @@ func (rc *RefreshCoordinator) scheduleDelayedCleanup(tokenHash string) {

 // performCleanup removes the operation from the in-flight map.
 // Idempotent: only decrements the in-flight counter if an entry was actually
-// removed. This guards against any future path accidentally calling cleanup
-// twice for the same tokenHash (which would corrupt the refresh budget).
+// removed. LoadAndDelete is atomic so any concurrent failCandidate or repeat
+// cleanup call will see exactly one removal — the budget cannot be corrupted
+// by double-decrement.
 func (rc *RefreshCoordinator) performCleanup(tokenHash string) {
-	rc.refreshMutex.Lock()
-	_, existed := rc.inFlightRefreshes[tokenHash]
-	if existed {
-		delete(rc.inFlightRefreshes, tokenHash)
-	}
-	rc.refreshMutex.Unlock()
-	if existed {
+	if _, existed := rc.inFlightRefreshes.LoadAndDelete(tokenHash); existed {
 		atomic.AddInt32(&rc.metrics.currentInFlightRefreshes, -1)
 	}
 }