Files
traefikoidc/jwk.go
T
lukaszraczylo abbfdb02a7 fix(jwk): replace JWKCache.mutex with singleflight pattern
JWKCache.GetJWKS previously held a sync.RWMutex.Lock() across the entire
HTTP round-trip to the IdP's JWKS endpoint (jwk.go:93). On a cold cache
(cold pod, JWK rotation, transient network blip) every concurrent
request piled up on this single global write-lock. Under Yaegi each
Lock() acquisition costs 10-50ms of interpreter dispatch — same
architectural shape as the bugs v1.0.14 and v1.0.15 already fixed,
just one that hadn't surfaced as the dominant bottleneck yet.

Code-review post-spike #2 flagged this at confidence 9/10 as the next
likely death-spiral on pod cold-start.

Change replaces the lock with a sync.Map-based singleflight: the first
caller for a given JWKS URL performs the fetch; concurrent callers
attach to the same *jwksFetch and wait on its done channel for the
result. Cold-cache cost is now O(1) HTTP fetch regardless of how many
goroutines are waiting, and no Yaegi-dispatched lock is held during the
fetch itself.

Correctness:
- LoadOrStore winner does the work; losers wait on a done channel.
- Done channel close is in a defer, so panics in fetchJWKS still
  unblock waiters.
- Map entry is removed in the same defer, so a fresh failed fetch can
  be retried by the next request without waiting for any stale entry.
- ctx.Done() unblocks waiters independently of the leader's progress.
- Re-checks the cache after winning LoadOrStore, since another fetch
  may have populated the cache between the initial miss and the win.

Cleanup: also removes a stray yaegi-extract output file
(github_com-lukaszraczylo-traefikoidc.go) that was accidentally
committed during local yaegi compatibility testing.

All tests pass with -race; golangci-lint clean.
2026-05-23 11:05:24 +01:00

380 lines
11 KiB
Go

package traefikoidc
import (
"context"
"crypto"
"crypto/ecdsa"
"crypto/elliptic"
"crypto/rsa"
"crypto/x509"
"encoding/base64"
"encoding/binary"
"encoding/json"
"encoding/pem"
"fmt"
"io"
"math/big"
"net/http"
"sync"
"time"
)
// parsedKeysSuffix marks the parallel UniversalCache entry that stores
// pre-parsed public keys for a given JWKS URL.
const parsedKeysSuffix = ":parsed"
// parsedJWKS holds keys decoded from a JWKSet, indexed by kid. Storing the
// already-parsed crypto.PublicKey avoids re-running the DER/PEM round trip
// on every JWT verification — a costly operation under the yaegi interpreter
// that hosts Traefik plugins.
type parsedJWKS struct {
keys map[string]crypto.PublicKey
}
// JWK represents a JSON Web Key as defined in RFC 7517.
// It can represent different key types including RSA, EC, and symmetric keys.
type JWK struct {
Kty string `json:"kty"`
Use string `json:"use,omitempty"`
Alg string `json:"alg,omitempty"`
Kid string `json:"kid,omitempty"`
N string `json:"n,omitempty"`
E string `json:"e,omitempty"`
Crv string `json:"crv,omitempty"`
X string `json:"x,omitempty"`
Y string `json:"y,omitempty"`
KeyOps []string `json:"key_ops,omitempty"`
}
// JWKSet represents a set of JSON Web Keys.
// Typically fetched from an OIDC provider's JWKS endpoint.
type JWKSet struct {
// Keys contains the array of JWK objects
Keys []JWK `json:"keys"`
}
// JWKCache provides thread-safe caching of JWKS using UniversalCache.
//
// inflightFetches deduplicates concurrent fetches for the same JWKS URL.
// It replaces a global sync.RWMutex that was previously held for the entire
// HTTP round-trip in GetJWKS: on a cold cache (cold pod, JWK rotation, brief
// network blip) every concurrent request piled up on that single Lock(), and
// under Yaegi each Lock acquisition costs 10-50ms of interpreter-dispatch
// overhead. The singleflight pattern keeps the cold-cache cost O(1) HTTP
// fetch regardless of how many requests are waiting.
type JWKCache struct {
cache *UniversalCache
inflightFetches sync.Map // map[jwksURL string]*jwksFetch
}
// jwksFetch represents an in-flight JWKS fetch. Done is closed when the fetch
// completes; jwks and err carry the result (one of them is set, never both).
type jwksFetch struct {
done chan struct{}
jwks *JWKSet
err error
}
// JWKCacheInterface defines the contract for JWK caching implementations.
type JWKCacheInterface interface {
GetJWKS(ctx context.Context, jwksURL string, httpClient *http.Client) (*JWKSet, error)
GetPublicKey(ctx context.Context, jwksURL, kid string, httpClient *http.Client) (crypto.PublicKey, error)
Cleanup()
Close()
}
// NewJWKCache creates a new JWK cache using the global cache manager
func NewJWKCache() *JWKCache {
manager := GetUniversalCacheManager(nil)
return &JWKCache{
cache: manager.GetJWKCache(),
}
}
// GetJWKS retrieves JWKS from cache or fetches from the remote URL if not cached.
//
// The entry is stored locally only via SetLocal/GetLocal. Going through a
// distributed backend defeats the cache: JSON round-tripping turns *JWKSet
// into map[string]interface{}, the type assertion below fails, and every
// request refetches from the upstream. JWK rotation is rare and a per-replica
// HTTP fetch on cold cache is cheap, so cross-replica coherence buys nothing.
func (c *JWKCache) GetJWKS(ctx context.Context, jwksURL string, httpClient *http.Client) (*JWKSet, error) {
// Fast path: cache hit.
if cachedValue, found := c.cache.GetLocal(jwksURL); found {
if jwks, ok := cachedValue.(*JWKSet); ok {
return jwks, nil
}
}
// Singleflight: dedupe concurrent fetches per URL key. The first arrival
// performs the HTTP fetch; any later arrival for the same URL waits on
// its done channel and shares the result. No global lock is held during
// the fetch.
candidate := &jwksFetch{done: make(chan struct{})}
if existing, loaded := c.inflightFetches.LoadOrStore(jwksURL, candidate); loaded {
f, _ := existing.(*jwksFetch)
select {
case <-f.done:
return f.jwks, f.err
case <-ctx.Done():
return nil, ctx.Err()
}
}
// We're the leader. Make absolutely sure the result fields and the
// in-flight map entry are cleaned up before any waiter unblocks.
defer func() {
c.inflightFetches.Delete(jwksURL)
close(candidate.done)
}()
// Re-check the cache in case a concurrent fetch completed between our
// initial miss and our LoadOrStore win.
if cachedValue, found := c.cache.GetLocal(jwksURL); found {
if jwks, ok := cachedValue.(*JWKSet); ok {
candidate.jwks = jwks
return jwks, nil
}
}
jwks, err := fetchJWKS(ctx, jwksURL, httpClient)
if err != nil {
candidate.err = err
return nil, err
}
if len(jwks.Keys) == 0 {
candidate.err = fmt.Errorf("JWKS response contains no keys")
return nil, candidate.err
}
// Cache for 1 hour.
_ = c.cache.SetLocal(jwksURL, jwks, 1*time.Hour) // Safe to ignore: cache failures are non-critical
candidate.jwks = jwks
return jwks, nil
}
// GetPublicKey returns the parsed public key for a given kid, fetching and
// caching the JWKS plus its derived parsedJWKS on miss. The parsed entry is
// stored alongside the raw JWKSet under a sibling cache key with the same
// 1-hour TTL, so both invalidate together when the upstream JWKS rotates.
//
// parsedJWKS is stored locally only (SetLocal/GetLocal). Its values are
// crypto.PublicKey interfaces wrapping *rsa.PublicKey/*ecdsa.PublicKey,
// which contain *big.Int that marshals to a hundreds-digit JSON number.
// On a distributed backend round-trip, json.Unmarshal into interface{} would
// try to fit that into float64 and fail with UnmarshalTypeError. Under yaegi
// the unexported parsedJWKS.keys field is exposed via an X-prefixed name on
// Marshal, leaking the modulus into the cached payload (issue #134).
func (c *JWKCache) GetPublicKey(ctx context.Context, jwksURL, kid string, httpClient *http.Client) (crypto.PublicKey, error) {
parsedKey := jwksURL + parsedKeysSuffix
if v, found := c.cache.GetLocal(parsedKey); found {
if pj, ok := v.(*parsedJWKS); ok {
if k, ok := pj.keys[kid]; ok {
return k, nil
}
}
}
jwks, err := c.GetJWKS(ctx, jwksURL, httpClient)
if err != nil {
return nil, err
}
pj := buildParsedJWKS(jwks)
_ = c.cache.SetLocal(parsedKey, pj, 1*time.Hour) // Safe to ignore: cache failures are non-critical
if k, ok := pj.keys[kid]; ok {
return k, nil
}
return nil, fmt.Errorf("no matching public key found for kid: %s", kid)
}
// buildParsedJWKS pre-parses every JWK in the set into the matching
// crypto.PublicKey, indexed by kid. Errors on individual keys are skipped so
// a single bad key does not block the rest of the keyset.
func buildParsedJWKS(jwks *JWKSet) *parsedJWKS {
out := make(map[string]crypto.PublicKey, len(jwks.Keys))
for i := range jwks.Keys {
k := &jwks.Keys[i]
if k.Kid == "" {
continue
}
var pub crypto.PublicKey
var err error
switch k.Kty {
case "RSA":
pub, err = k.ToRSAPublicKey()
case "EC":
pub, err = k.ToECDSAPublicKey()
default:
continue
}
if err != nil {
continue
}
out[k.Kid] = pub
}
return &parsedJWKS{keys: out}
}
// Cleanup is a no-op as cleanup is handled by UniversalCache
func (c *JWKCache) Cleanup() {
// Handled internally by UniversalCache
}
// Close is a no-op as the cache is managed globally
func (c *JWKCache) Close() {
// Managed by global cache manager
}
// fetchJWKS fetches JWKS from a remote URL
func fetchJWKS(ctx context.Context, jwksURL string, httpClient *http.Client) (*JWKSet, error) {
req, err := http.NewRequestWithContext(ctx, "GET", jwksURL, nil)
if err != nil {
return nil, fmt.Errorf("error creating JWKS request: %w", err)
}
resp, err := httpClient.Do(req)
if err != nil {
return nil, fmt.Errorf("error fetching JWKS: %w", err)
}
defer func() { _ = resp.Body.Close() }() // Safe to ignore: closing body on defer
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body) // Safe to ignore: reading error body for diagnostics
return nil, fmt.Errorf("JWKS fetch failed with status %d: %s", resp.StatusCode, body)
}
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("error reading JWKS response: %w", err)
}
var jwks JWKSet
if err := json.Unmarshal(body, &jwks); err != nil {
return nil, fmt.Errorf("error parsing JWKS: %w", err)
}
return &jwks, nil
}
// ToRSAPublicKey converts a JWK to an RSA public key.
// Returns an error if the JWK is not an RSA key or if the key data is invalid.
func (jwk *JWK) ToRSAPublicKey() (*rsa.PublicKey, error) {
if jwk.Kty != "RSA" {
return nil, fmt.Errorf("not an RSA key")
}
nBytes, err := base64.RawURLEncoding.DecodeString(jwk.N)
if err != nil {
return nil, fmt.Errorf("error decoding modulus: %w", err)
}
eBytes, err := base64.RawURLEncoding.DecodeString(jwk.E)
if err != nil {
return nil, fmt.Errorf("error decoding exponent: %w", err)
}
// Convert exponent bytes to int
var e int
if len(eBytes) <= 8 {
// Pad to 8 bytes for uint64
paddedE := make([]byte, 8)
copy(paddedE[8-len(eBytes):], eBytes)
eUint64 := binary.BigEndian.Uint64(paddedE)
// RSA exponents are typically small (65537 is common), so overflow is not a concern
// #nosec G115 -- RSA public exponents are small values that fit in int
e = int(eUint64)
} else {
return nil, fmt.Errorf("exponent too large")
}
return &rsa.PublicKey{
N: new(big.Int).SetBytes(nBytes),
E: e,
}, nil
}
// ToECDSAPublicKey converts a JWK to an ECDSA public key.
// Returns an error if the JWK is not an EC key or if the key data is invalid.
func (jwk *JWK) ToECDSAPublicKey() (*ecdsa.PublicKey, error) {
if jwk.Kty != "EC" {
return nil, fmt.Errorf("not an EC key")
}
var curve elliptic.Curve
switch jwk.Crv {
case "P-256":
curve = elliptic.P256()
case "P-384":
curve = elliptic.P384()
case "P-521":
curve = elliptic.P521()
default:
return nil, fmt.Errorf("unsupported curve: %s", jwk.Crv)
}
xBytes, err := base64.RawURLEncoding.DecodeString(jwk.X)
if err != nil {
return nil, fmt.Errorf("error decoding X coordinate: %w", err)
}
yBytes, err := base64.RawURLEncoding.DecodeString(jwk.Y)
if err != nil {
return nil, fmt.Errorf("error decoding Y coordinate: %w", err)
}
return &ecdsa.PublicKey{
Curve: curve,
X: new(big.Int).SetBytes(xBytes),
Y: new(big.Int).SetBytes(yBytes),
}, nil
}
// GetKey finds a key by its ID (kid) in the JWKSet.
// Returns nil if no key with the given ID is found.
func (jwks *JWKSet) GetKey(kid string) *JWK {
for i := range jwks.Keys {
if jwks.Keys[i].Kid == kid {
return &jwks.Keys[i]
}
}
return nil
}
// jwkToPEM converts a JWK to PEM format for signature verification
func jwkToPEM(jwk *JWK) ([]byte, error) {
var publicKey interface{}
var err error
switch jwk.Kty {
case "RSA":
publicKey, err = jwk.ToRSAPublicKey()
if err != nil {
return nil, fmt.Errorf("failed to convert RSA JWK: %w", err)
}
case "EC":
publicKey, err = jwk.ToECDSAPublicKey()
if err != nil {
return nil, fmt.Errorf("failed to convert EC JWK: %w", err)
}
default:
return nil, fmt.Errorf("unsupported key type: %s", jwk.Kty)
}
// Marshal the public key to DER format
pubKeyBytes, err := x509.MarshalPKIXPublicKey(publicKey)
if err != nil {
return nil, fmt.Errorf("failed to marshal public key: %w", err)
}
// Encode to PEM format
pemBlock := &pem.Block{
Type: "PUBLIC KEY",
Bytes: pubKeyBytes,
}
return pem.EncodeToMemory(pemBlock), nil
}