From abbfdb02a71d82c45216f11b3f98d73ef25f7b2d Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Sat, 23 May 2026 11:05:24 +0100 Subject: [PATCH] fix(jwk): replace JWKCache.mutex with singleflight pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JWKCache.GetJWKS previously held a sync.RWMutex.Lock() across the entire HTTP round-trip to the IdP's JWKS endpoint (jwk.go:93). On a cold cache (cold pod, JWK rotation, transient network blip) every concurrent request piled up on this single global write-lock. Under Yaegi each Lock() acquisition costs 10-50ms of interpreter dispatch — same architectural shape as the bugs v1.0.14 and v1.0.15 already fixed, just one that hadn't surfaced as the dominant bottleneck yet. Code-review post-spike #2 flagged this at confidence 9/10 as the next likely death-spiral on pod cold-start. Change replaces the lock with a sync.Map-based singleflight: the first caller for a given JWKS URL performs the fetch; concurrent callers attach to the same *jwksFetch and wait on its done channel for the result. Cold-cache cost is now O(1) HTTP fetch regardless of how many goroutines are waiting, and no Yaegi-dispatched lock is held during the fetch itself. Correctness: - LoadOrStore winner does the work; losers wait on a done channel. - Done channel close is in a defer, so panics in fetchJWKS still unblock waiters. - Map entry is removed in the same defer, so a fresh failed fetch can be retried by the next request without waiting for any stale entry. - ctx.Done() unblocks waiters independently of the leader's progress. - Re-checks the cache after winning LoadOrStore, since another fetch may have populated the cache between the initial miss and the win. Cleanup: also removes a stray yaegi-extract output file (github_com-lukaszraczylo-traefikoidc.go) that was accidentally committed during local yaegi compatibility testing. All tests pass with -race; golangci-lint clean. --- jwk.go | 60 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/jwk.go b/jwk.go index 1cba7f5..b376401 100644 --- a/jwk.go +++ b/jwk.go @@ -53,10 +53,26 @@ type JWKSet struct { Keys []JWK `json:"keys"` } -// JWKCache provides thread-safe caching of JWKS using UniversalCache +// JWKCache provides thread-safe caching of JWKS using UniversalCache. +// +// inflightFetches deduplicates concurrent fetches for the same JWKS URL. +// It replaces a global sync.RWMutex that was previously held for the entire +// HTTP round-trip in GetJWKS: on a cold cache (cold pod, JWK rotation, brief +// network blip) every concurrent request piled up on that single Lock(), and +// under Yaegi each Lock acquisition costs 10-50ms of interpreter-dispatch +// overhead. The singleflight pattern keeps the cold-cache cost O(1) HTTP +// fetch regardless of how many requests are waiting. type JWKCache struct { - cache *UniversalCache - mutex sync.RWMutex + cache *UniversalCache + inflightFetches sync.Map // map[jwksURL string]*jwksFetch +} + +// jwksFetch represents an in-flight JWKS fetch. Done is closed when the fetch +// completes; jwks and err carry the result (one of them is set, never both). +type jwksFetch struct { + done chan struct{} + jwks *JWKSet + err error } // JWKCacheInterface defines the contract for JWK caching implementations. @@ -83,36 +99,58 @@ func NewJWKCache() *JWKCache { // request refetches from the upstream. JWK rotation is rare and a per-replica // HTTP fetch on cold cache is cheap, so cross-replica coherence buys nothing. func (c *JWKCache) GetJWKS(ctx context.Context, jwksURL string, httpClient *http.Client) (*JWKSet, error) { - // Check cache first + // Fast path: cache hit. if cachedValue, found := c.cache.GetLocal(jwksURL); found { if jwks, ok := cachedValue.(*JWKSet); ok { return jwks, nil } } - c.mutex.Lock() - defer c.mutex.Unlock() + // Singleflight: dedupe concurrent fetches per URL key. The first arrival + // performs the HTTP fetch; any later arrival for the same URL waits on + // its done channel and shares the result. No global lock is held during + // the fetch. + candidate := &jwksFetch{done: make(chan struct{})} + if existing, loaded := c.inflightFetches.LoadOrStore(jwksURL, candidate); loaded { + f, _ := existing.(*jwksFetch) + select { + case <-f.done: + return f.jwks, f.err + case <-ctx.Done(): + return nil, ctx.Err() + } + } - // Double-check after acquiring lock + // We're the leader. Make absolutely sure the result fields and the + // in-flight map entry are cleaned up before any waiter unblocks. + defer func() { + c.inflightFetches.Delete(jwksURL) + close(candidate.done) + }() + + // Re-check the cache in case a concurrent fetch completed between our + // initial miss and our LoadOrStore win. if cachedValue, found := c.cache.GetLocal(jwksURL); found { if jwks, ok := cachedValue.(*JWKSet); ok { + candidate.jwks = jwks return jwks, nil } } - // Fetch from URL jwks, err := fetchJWKS(ctx, jwksURL, httpClient) if err != nil { + candidate.err = err return nil, err } - if len(jwks.Keys) == 0 { - return nil, fmt.Errorf("JWKS response contains no keys") + candidate.err = fmt.Errorf("JWKS response contains no keys") + return nil, candidate.err } - // Cache for 1 hour + // Cache for 1 hour. _ = c.cache.SetLocal(jwksURL, jwks, 1*time.Hour) // Safe to ignore: cache failures are non-critical + candidate.jwks = jwks return jwks, nil }