fix(refresh-coordinator): replace global RWMutex with sync.Map

Under Yaegi, the RefreshCoordinator.refreshMutex was held for tens of
milliseconds per request because every operation inside the critical
section (map access, isInCooldown, recordRefreshAttempt,
isUnderMemoryPressure, atomic ops, struct allocation) is dispatched
through reflect.Value.Call with full arg boxing/unboxing.

Concurrent refreshes on the same coordinator serialized into a queue
that grew without bound. Live capture in production (3 Grafana
dashboards left open) showed:
  * 63 goroutines stuck on rc.refreshMutex.Lock() for 1-11 minutes
  * pod pinned at 1000m CPU (GOMAXPROCS=1)
  * 5.15M allocs/sec, 0.45 RPS effective throughput
  * yaegi.call.func9 accounting for 92.66% of cumulative allocs
  * mutex profile dominated by sync.(*Mutex).Unlock via the request chain

Change inFlightRefreshes from map[string]*refreshOperation+RWMutex to
sync.Map and rewrite getOrCreateOperation to:
  1. Speculatively allocate the candidate operation.
  2. Atomically LoadOrStore by tokenHash. Joiners take the existing
     operation; leader takes the new one. No global lock acquired.
  3. Leader runs rate-limit / cooldown / memory-pressure gates AFTER
     the atomic store. Joiners share the leader's outcome via op.done.
  4. Reserve the concurrent-refresh slot via CompareAndSwap so the
     count cannot overshoot in absence of the old serializing lock.
  5. On any gate failure the leader calls failCandidate, which deletes
     the entry from sync.Map, records the error on op.result and closes
     op.done so any joiner that snuck in returns the same error.

performCleanup becomes a single sync.Map.LoadAndDelete, eliminating
the lock entirely on the cleanup path.

Net effect: critical section is no longer Yaegi-interpreted; it
collapses to atomic instructions on a sharded sync.Map. Refresh
contention disappears even under Yaegi.

All tests pass with -race; golangci-lint clean.
This commit is contained in:
2026-05-23 02:34:49 +01:00
parent 984fd1c08f
commit ae4ccaa89d
+77 -43
View File
@@ -15,7 +15,13 @@ import (
// It implements request coalescing, rate limiting, and circuit breaking
// specifically for token refresh operations.
type RefreshCoordinator struct {
inFlightRefreshes map[string]*refreshOperation
// inFlightRefreshes maps tokenHash -> *refreshOperation. sync.Map is used
// instead of a plain map + RWMutex so concurrent refreshes do not
// serialize on a single global lock. Under Yaegi the previous
// refreshMutex.Lock() was held for tens of milliseconds per request due
// to interpreter overhead on the work inside the critical section,
// causing dozens of goroutines to stack up on it and pin one CPU core.
inFlightRefreshes sync.Map
cleanupTimers map[string]*time.Timer
sessionRefreshAttempts map[string]*refreshAttemptTracker
circuitBreaker *RefreshCircuitBreaker
@@ -25,7 +31,6 @@ type RefreshCoordinator struct {
config RefreshCoordinatorConfig
wg sync.WaitGroup
attemptsMutex sync.RWMutex
refreshMutex sync.RWMutex
cleanupTimerMu sync.Mutex
}
@@ -130,7 +135,7 @@ func NewRefreshCoordinator(config RefreshCoordinatorConfig, logger *Logger) *Ref
}
rc := &RefreshCoordinator{
inFlightRefreshes: make(map[string]*refreshOperation),
// inFlightRefreshes is a sync.Map; zero value is ready to use.
sessionRefreshAttempts: make(map[string]*refreshAttemptTracker),
config: config,
metrics: &RefreshMetrics{},
@@ -227,13 +232,28 @@ func (rc *RefreshCoordinator) getOrCreateOperation(
tokenHash string,
refreshToken string,
) (*refreshOperation, bool, error) {
rc.refreshMutex.Lock()
defer rc.refreshMutex.Unlock()
// Speculatively construct the operation we WOULD register if we win the
// race. Allocating here keeps the LoadOrStore call below atomic and
// avoids any global lock — under Yaegi the previous map+RWMutex design
// held the write lock long enough (tens of ms per call) that concurrent
// refreshes on the same coordinator serialized into a queue that grew
// without bound. See struct comment on inFlightRefreshes.
candidate := &refreshOperation{
refreshToken: refreshToken,
done: make(chan struct{}),
startTime: time.Now(),
waiterCount: 1,
}
// Check for existing operation while holding the lock
if existingOp, exists := rc.inFlightRefreshes[tokenHash]; exists {
if existing, loaded := rc.inFlightRefreshes.LoadOrStore(tokenHash, candidate); loaded {
existingOp, ok := existing.(*refreshOperation)
if !ok {
// Defensive: anything stored here is always *refreshOperation, but
// keep the typed assert so a programming error elsewhere doesn't
// surface as a confusing panic in an interpreter frame.
return nil, false, fmt.Errorf("inFlightRefreshes corrupt: unexpected type %T", existing)
}
if existingOp.refreshToken == refreshToken {
// Join existing operation
atomic.AddInt32(&existingOp.waiterCount, 1)
return existingOp, false, nil
}
@@ -241,41 +261,60 @@ func (rc *RefreshCoordinator) getOrCreateOperation(
return nil, false, fmt.Errorf("refresh token mismatch")
}
// No existing operation - check if we can create a new one
// All checks happen while holding the lock to prevent races
// We won the race and registered `candidate`. Apply gates now. If any
// gate fails we must remove our entry from the map and signal failure
// to any joiners that snuck in between LoadOrStore and now.
if err := rc.applyLeaderGates(sessionID); err != nil {
rc.failCandidate(tokenHash, candidate, err)
return nil, false, err
}
// Check and record refresh attempt for rate limiting
// Reserve concurrent slot via CAS — without the old global lock we can
// no longer rely on mutex-mediated check-then-increment. If we lose the
// CAS race we retry; if the limit has since been reached we back out.
for {
current := atomic.LoadInt32(&rc.metrics.currentInFlightRefreshes)
if int(current) >= rc.config.MaxConcurrentRefreshes {
err := fmt.Errorf("maximum concurrent refresh operations reached")
rc.failCandidate(tokenHash, candidate, err)
return nil, false, err
}
if atomic.CompareAndSwapInt32(&rc.metrics.currentInFlightRefreshes, current, current+1) {
break
}
}
return candidate, true, nil
}
// applyLeaderGates runs the rate-limit, cooldown, and memory-pressure checks
// that previously ran under the global refreshMutex. Only the leader (the
// goroutine that just registered the operation) runs them; joiners share the
// leader's outcome via operation.done.
func (rc *RefreshCoordinator) applyLeaderGates(sessionID string) error {
rc.recordRefreshAttempt(sessionID)
if rc.isInCooldown(sessionID) {
atomic.AddInt64(&rc.metrics.cooldownsTriggered, 1)
return nil, false, fmt.Errorf("refresh attempts exceeded for session, in cooldown period")
return fmt.Errorf("refresh attempts exceeded for session, in cooldown period")
}
// Check memory pressure
if rc.config.EnableMemoryPressureDetection && rc.isUnderMemoryPressure() {
atomic.AddInt64(&rc.metrics.memoryPressureEvents, 1)
return nil, false, fmt.Errorf("system under memory pressure, refresh denied")
return fmt.Errorf("system under memory pressure, refresh denied")
}
return nil
}
// Check and reserve concurrent refresh slot atomically
current := atomic.LoadInt32(&rc.metrics.currentInFlightRefreshes)
if int(current) >= rc.config.MaxConcurrentRefreshes {
return nil, false, fmt.Errorf("maximum concurrent refresh operations reached")
}
// Reserve the slot - we're still holding the lock so this is safe
atomic.AddInt32(&rc.metrics.currentInFlightRefreshes, 1)
// Create and register new operation
operation := &refreshOperation{
refreshToken: refreshToken,
done: make(chan struct{}),
startTime: time.Now(),
waiterCount: 1,
}
rc.inFlightRefreshes[tokenHash] = operation
return operation, true, nil
// failCandidate removes the leader's just-registered operation from the
// in-flight map and signals the error to any joiners by recording the result
// and closing the done channel. This keeps the (nil, false, err) return path
// equivalent to the pre-sync.Map version: callers see the error directly,
// joiners see it via operation.done.
func (rc *RefreshCoordinator) failCandidate(tokenHash string, op *refreshOperation, err error) {
rc.inFlightRefreshes.Delete(tokenHash)
op.mutex.Lock()
op.result = &refreshResult{err: err}
op.mutex.Unlock()
close(op.done)
}
// executeRefreshAsync performs the actual refresh operation asynchronously
@@ -367,16 +406,11 @@ func (rc *RefreshCoordinator) scheduleDelayedCleanup(tokenHash string) {
// performCleanup removes the operation from the in-flight map.
// Idempotent: only decrements the in-flight counter if an entry was actually
// removed. This guards against any future path accidentally calling cleanup
// twice for the same tokenHash (which would corrupt the refresh budget).
// removed. LoadAndDelete is atomic so any concurrent failCandidate or repeat
// cleanup call will see exactly one removal — the budget cannot be corrupted
// by double-decrement.
func (rc *RefreshCoordinator) performCleanup(tokenHash string) {
rc.refreshMutex.Lock()
_, existed := rc.inFlightRefreshes[tokenHash]
if existed {
delete(rc.inFlightRefreshes, tokenHash)
}
rc.refreshMutex.Unlock()
if existed {
if _, existed := rc.inFlightRefreshes.LoadAndDelete(tokenHash); existed {
atomic.AddInt32(&rc.metrics.currentInFlightRefreshes, -1)
}
}