mirror of
https://github.com/lukaszraczylo/traefikoidc.git
synced 2026-06-05 22:44:17 +00:00
72e2b682bb
The v1.0.14 fix replaced one contended sync.RWMutex
(RefreshCoordinator.refreshMutex) with sync.Map. Production showed the same
death-spiral signature recurring ~2 hours later — same shape, different mutex:
65 goroutines stuck on a sync.(*RWMutex).Lock at one address, pod
pinned at 1000m CPU, identical Yaegi runCfg/reflect.Value.Call stack
pattern. The mutex was RefreshCoordinator.attemptsMutex.
Generalising: under Yaegi (interpreted Go for traefik plugins), any
per-request global mutex acquisition is a latent serialization point.
reflect.Value.Call dispatch on a held lock turns a microsecond
critical section into a multi-millisecond one, and on a GOMAXPROCS=1
pod the queue is unbounded.
This commit removes every per-request global mutex on the hot path:
1. RefreshCoordinator.attemptsMutex (sync.RWMutex)
sessionRefreshAttempts: map -> sync.Map.
refreshAttemptTracker: all fields atomic (int32, int64 UnixNano,
cooldownEndNano == 0 as the not-in-cooldown sentinel, replacing
the inCooldown bool).
isInCooldown / recordRefreshAttempt / recordRefreshSuccess /
recordRefreshFailure all become lock-free. Cooldown entry uses
CompareAndSwapInt64 so only one goroutine logs the transition.
2. RefreshCircuitBreaker.mutex (sync.RWMutex)
lastFailureTime / lastSuccessTime -> atomic.Int64 UnixNano.
state and failures already atomic.
AllowRequest / RecordSuccess / RecordFailure now pure atomic ops.
3. TraefikOidc.firstRequestMutex (sync.Mutex)
firstRequestReceived bool -> firstRequestStarted int32.
metadataRefreshStarted bool -> metadataRefreshStartedAtomic int32.
ServeHTTP bootstrap path uses CompareAndSwapInt32 — fires once,
zero steady-state cost. Previously the mutex was acquired on
every non-health request forever.
4. TraefikOidc.metadataRetryMutex (sync.Mutex)
lastMetadataRetryTime time.Time -> lastMetadataRetryNano int64.
The 30-second retry throttle is now a CAS on lastMetadataRetryNano.
cleanupStaleEntries iterates via sync.Map.Range; eviction is a
CompareAndDelete by pointer identity so a tracker freshly re-used by
a concurrent caller is not lost.
Empirical evidence (analyses of the v1.0.14 spike by three specialist agents;
profiles in /tmp/traefik-spike-1779511683/):
* mutex profile: 97% delay in sync.(*Mutex).Unlock via
HTTPHandlerSwitcher -> accesslog -> metrics -> backoff.RetryNotify
* 65 stuck goroutines at one RWMutex address (0x40022eb648),
identical Yaegi CFG pointer, all on rc.attemptsMutex via
recordRefreshAttempt + isInCooldown
* traffic driver: long-lived in-cluster Go-http-client doing
~5.4 req/s POST embeddings via OIDC cookie session → same
sessionID → contention all funnels to one tracker entry
Yaegi support for sync/atomic confirmed at
github.com/traefik/yaegi@v0.16.1/stdlib/go1_22_sync_atomic.go:
AddInt32/Int64, LoadInt32/Int64, StoreInt32/Int64,
CompareAndSwapInt32/Int64 all exposed via reflect.ValueOf. Yaegi
dispatches each call through reflect.Value.Call to the COMPILED
atomic.* function, which executes a single hardware CAS/LOCK-XADD
instruction. Each atomic op still pays Yaegi dispatch cost but
cannot block — no queueing, no death spiral.
Trade-off acknowledged: v1.0.15 issues ~6-8 atomic/sync.Map ops per
leader-path request vs the 4 mutex ops of v1.0.14. Under low
contention this is a modest CPU bump. Under high contention it's
an unbounded → bounded transformation. Net win.
All tests pass with -race; golangci-lint clean.
174 lines
7.1 KiB
Go
174 lines
7.1 KiB
Go
// Package traefikoidc provides OIDC authentication middleware for Traefik.
|
|
package traefikoidc
|
|
|
|
import (
|
|
"context"
|
|
"net/http"
|
|
"sync"
|
|
"text/template"
|
|
"time"
|
|
|
|
"golang.org/x/time/rate"
|
|
)
|
|
|
|
// CacheInterface defines the common cache operations.
//
// Within this file it is the declared type of the tokenBlacklist,
// tokenTypeCache, introspectionCache, sessionInvalidationCache and
// refreshResultCache fields of TraefikOidc. The method comments below describe
// only the contract implied by the signatures; eviction policy, expiry
// handling and concurrency guarantees are left to each implementation.
type CacheInterface interface {
	// Set stores value under key. ttl is the entry's time-to-live as
	// interpreted by the implementation.
	Set(key string, value any, ttl time.Duration)

	// Get returns the value stored under key. The second result is expected
	// to report whether a usable entry was found.
	Get(key string) (any, bool)

	// Delete removes the entry stored under key.
	Delete(key string)

	// SetMaxSize sets the capacity limit of the cache to size.
	SetMaxSize(size int)

	// Size returns the number of entries as counted by the implementation.
	Size() int

	// Clear drops the cache contents.
	Clear()

	// Cleanup runs a maintenance pass.
	// NOTE(review): presumably evicts expired entries; confirm per implementation.
	Cleanup()

	// Close releases the cache.
	// NOTE(review): presumably stops background work; confirm per implementation.
	Close()

	GetStats() map[string]any // For testing and monitoring
}
|
|
|
|
// TokenVerifier interface defines token verification capabilities.
// Implementations should validate token format, signature, and claims.
type TokenVerifier interface {
	// VerifyToken checks the raw token string and returns nil when it is
	// accepted, or a non-nil error describing why it was rejected.
	VerifyToken(token string) error
}
|
|
|
|
// JWTVerifier interface defines JWT-specific verification capabilities.
|
|
// Implementations should validate JWT structure, signature using JWKs, and standard claims.
|
|
type JWTVerifier interface {
|
|
VerifyJWTSignatureAndClaims(jwt *JWT, token string) error
|
|
}
|
|
|
|
// TokenExchanger interface defines OAuth 2.0 and OpenID Connect token exchange capabilities.
|
|
// Implementations should handle authorization code exchange, refresh tokens, and revocation
|
|
// according to the OAuth 2.0 and OpenID Connect specifications.
|
|
type TokenExchanger interface {
|
|
ExchangeCodeForToken(ctx context.Context, grantType string, codeOrToken string, redirectURL string, codeVerifier string) (*TokenResponse, error)
|
|
GetNewTokenWithRefreshToken(refreshToken string) (*TokenResponse, error)
|
|
RevokeTokenWithProvider(token, tokenType string) error
|
|
}
|
|
|
|
// ProviderMetadata represents OIDC provider configuration data.
// This data is typically retrieved from the provider's .well-known/openid-configuration endpoint
// and contains essential URLs for authentication, token exchange, and key retrieval.
//
// The JSON tags use the provider's metadata member names. The introspection,
// registration and scopes_supported members carry omitempty, so they are left
// out when the value is marshaled with those fields empty; the remaining
// members are always emitted.
type ProviderMetadata struct {
	Issuer           string   `json:"issuer"`
	AuthURL          string   `json:"authorization_endpoint"`
	TokenURL         string   `json:"token_endpoint"`
	JWKSURL          string   `json:"jwks_uri"`
	RevokeURL        string   `json:"revocation_endpoint"`
	EndSessionURL    string   `json:"end_session_endpoint"`
	IntrospectionURL string   `json:"introspection_endpoint,omitempty"`
	RegistrationURL  string   `json:"registration_endpoint,omitempty"`
	ScopesSupported  []string `json:"scopes_supported,omitempty"`
}
|
|
|
|
// TraefikOidc is the main middleware struct that implements OIDC authentication for Traefik.
|
|
// It integrates with various OIDC providers, manages sessions, caches tokens, and handles
|
|
// the complete authentication flow. It's designed to work seamlessly with Traefik's
|
|
// plugin system and provides flexible configuration options.
|
|
type TraefikOidc struct {
|
|
// lastMetadataRetryNano is the UnixNano timestamp of the last metadata
|
|
// recovery attempt. Stored atomically so the hot ServeHTTP path can
|
|
// throttle retries without acquiring metadataRetryMutex on every request.
|
|
lastMetadataRetryNano int64
|
|
// firstRequestStarted is 0 until the very first non-health request fires
|
|
// the background-task bootstrap; then it flips to 1 via CAS. Replaces the
|
|
// firstRequestMutex + firstRequestReceived combo which previously took
|
|
// a write lock on every non-health request forever.
|
|
firstRequestStarted int32
|
|
// metadataRefreshStartedAtomic is the CAS-only variant of the old
|
|
// metadataRefreshStarted bool. Both flags live under the same atomic so
|
|
// concurrent first-request goroutines race exactly once.
|
|
metadataRefreshStartedAtomic int32
|
|
jwkCache JWKCacheInterface
|
|
jwtVerifier JWTVerifier
|
|
ctx context.Context
|
|
tokenVerifier TokenVerifier
|
|
next http.Handler
|
|
tokenExchanger TokenExchanger
|
|
tokenBlacklist CacheInterface
|
|
tokenTypeCache CacheInterface
|
|
introspectionCache CacheInterface
|
|
initComplete chan struct{}
|
|
limiter *rate.Limiter
|
|
headerTemplates map[string]*template.Template
|
|
sessionManager *SessionManager
|
|
tokenCleanupStopChan chan struct{}
|
|
excludedURLs map[string]struct{}
|
|
extractClaimsFunc func(tokenString string) (map[string]any, error)
|
|
initiateAuthenticationFunc func(rw http.ResponseWriter, req *http.Request, session *SessionData, redirectURL string)
|
|
metadataCache *MetadataCache
|
|
allowedRolesAndGroups map[string]struct{}
|
|
allowedUsers map[string]struct{}
|
|
allowedUserDomains map[string]struct{}
|
|
tokenCache *TokenCache
|
|
httpClient *http.Client
|
|
tokenHTTPClient *http.Client
|
|
logger *Logger
|
|
metadataRefreshStopChan chan struct{}
|
|
cancelFunc context.CancelFunc
|
|
errorRecoveryManager *ErrorRecoveryManager
|
|
tokenResilienceManager *TokenResilienceManager
|
|
refreshCoordinator *RefreshCoordinator
|
|
goroutineWG *sync.WaitGroup
|
|
dcrConfig *DynamicClientRegistrationConfig
|
|
dynamicClientRegistrar *DynamicClientRegistrar
|
|
scopeFilter *ScopeFilter
|
|
securityHeadersApplier func(http.ResponseWriter, *http.Request)
|
|
userIdentifierClaim string
|
|
revocationURL string
|
|
name string
|
|
redirURLPath string
|
|
logoutURLPath string
|
|
tokenURL string
|
|
authURL string
|
|
endSessionURL string
|
|
postLogoutRedirectURI string
|
|
jwksURL string
|
|
issuerURL string
|
|
groupClaimName string
|
|
introspectionURL string
|
|
providerURL string
|
|
roleClaimName string
|
|
audience string
|
|
clientID string
|
|
clientSecret string
|
|
clientAuthMethod string
|
|
clientAssertion *ClientAssertionSigner
|
|
registrationURL string
|
|
backchannelLogoutPath string
|
|
frontchannelLogoutPath string
|
|
scopesSupported []string
|
|
scopes []string
|
|
refreshGracePeriod time.Duration
|
|
maxRefreshTokenAge time.Duration
|
|
metadataMu sync.RWMutex
|
|
shutdownOnce sync.Once
|
|
sessionInvalidationCache CacheInterface
|
|
refreshResultCache CacheInterface
|
|
minimalHeaders bool
|
|
stripAuthCookies bool
|
|
enableBackchannelLogout bool
|
|
enableFrontchannelLogout bool
|
|
requireTokenIntrospection bool
|
|
allowPrivateIPAddresses bool
|
|
disableReplayDetection bool
|
|
allowOpaqueTokens bool
|
|
strictAudienceValidation bool
|
|
overrideScopes bool
|
|
enablePKCE bool
|
|
forceHTTPS bool
|
|
suppressDiagnosticLogs bool
|
|
|
|
// Bearer-auth runtime state (populated only when EnableBearerAuth=true).
|
|
bearerIdentifierClaim string
|
|
bearerFailureTracker *bearerFailureTracker
|
|
maxTokenAge time.Duration
|
|
maxIdentifierLength int
|
|
bearerFailureThreshold int
|
|
bearerFailureWindow time.Duration
|
|
bearerFailurePenalty time.Duration
|
|
enableBearerAuth bool
|
|
stripAuthorizationHeader bool
|
|
bearerEmitWWWAuthenticate bool
|
|
bearerOverridesCookie bool
|
|
}
|