Files
traefikoidc/metadata_cache.go
T
lukaszraczylo 9126c74723 December 2025 Improvements - Azure AD, Internal Networks, Startup Race Condition (#100)
* Allow internal IPs for OIDC configuration via an extra flag.

Addresses issue #97

* Allow for internal IPs in OIDC configuration.

Addresses issue #97.

* feat: Add allowPrivateIPAddresses config option for internal networks

Adds a new configuration option `allowPrivateIPAddresses` that allows
OIDC provider URLs to use private IP addresses (10.x.x.x, 172.16-31.x.x,
192.168.x.x). This is useful for internal deployments where Keycloak or
other OIDC providers run on private networks without DNS resolution.

Security considerations:
- Loopback addresses (127.0.0.1, localhost, ::1) remain blocked
- Link-local addresses (169.254.x.x) remain blocked
- Default is false (secure by default)

Fixes #97

* feat: Support non-email user identifiers for Azure AD

Add userIdentifierClaim configuration option to support Azure AD users
without email addresses. This allows using alternative JWT claims like
"sub", "oid", "upn", or "preferred_username" for user identification.

- Default behavior uses "email" claim (backward compatible)
- Falls back to "sub" claim if configured claim is missing
- allowedUsers matches against the configured claim value
- allowedUserDomains only applies when using email-based identification

Fixes #95

* Fix race condition on Traefik pod startup

When the plugin initializes and calls GetMetadataWithRecovery():

1. Checks cache first (if metadata is cached, returns immediately)
2. Creates a retry executor with startup-optimized settings (10 attempts, 1s delays)
3. Attempts to fetch metadata from the OIDC provider
4. If the fetch fails with a retryable error (connection refused, EOF, TLS/certificate errors, Traefik default cert), it waits and retries
5. After 10 attempts or on a non-retryable error, returns the error

This allows the plugin to handle the race condition where:
- Traefik initializes the plugin before routes are established
- Traefik serves its default certificate before loading real ones
- The OIDC provider pod isn't fully ready yet

Fixes issue #90

* Fix race condition on Traefik pod startup

When the plugin initializes and calls GetMetadataWithRecovery():

1. Checks cache first (if metadata is cached, returns immediately)
2. Creates a retry executor with startup-optimized settings (10 attempts, 1s delays)
3. Attempts to fetch metadata from the OIDC provider
4. If the fetch fails with a retryable error (connection refused, EOF, TLS/certificate errors, Traefik default cert), it waits and retries
5. After 10 attempts or on a non-retryable error, returns the error

This allows the plugin to handle the race condition where:
- Traefik initializes the plugin before routes are established
- Traefik serves its default certificate before loading real ones
- The OIDC provider pod isn't fully ready yet

Fixes issue #90

* Headers too big and 431 responses

Added a new option, `minimalHeaders`, that reduces the size of the headers forwarded from the auth middleware to backend services.

  - When minimalHeaders: false (default): All headers are forwarded as before
    - X-Forwarded-User (always set)
    - X-Auth-Request-Redirect
    - X-Auth-Request-User
    - X-Auth-Request-Token (the large ID token)
    - X-User-Groups, X-User-Roles (if configured)
  - When minimalHeaders: true: Reduces header overhead
    - X-Forwarded-User (always set)
    - X-User-Groups, X-User-Roles (still forwarded if configured)
    - Custom templated headers (still processed)
    - Skipped: X-Auth-Request-Token, X-Auth-Request-User, X-Auth-Request-Redirect

Fixes issues #64 and #86
2025-12-08 14:21:17 +00:00

263 lines
7.9 KiB
Go

package traefikoidc
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"sync"
	"time"
)
// metadataCacheVersion is the prefix placed in front of every metadata cache
// key. Bump it whenever the cached representation changes: entries written
// under the previous prefix are then never looked up again, so stale data is
// ignored without an explicit migration.
const metadataCacheVersion = "v2"
// MetadataCache is a thin, metadata-specific wrapper around a UniversalCache.
// It stores provider metadata as JSON under version-prefixed keys (see
// versionedKey) and reports cache activity through logger.
type MetadataCache struct {
// cache is the backing store; the constructors obtain it from the universal
// cache manager's GetMetadataCache().
cache *UniversalCache
// logger receives the debug/info/error messages emitted by the methods of
// this type.
logger *Logger
// wg is stored by the constructors; none of the methods visible in this file
// read it. NOTE(review): presumably kept for callers that coordinate
// goroutine shutdown - confirm before removing.
wg *sync.WaitGroup
}
// versionedKey namespaces key with the current cache-format version and
// returns "<metadataCacheVersion>:<key>". Reads, writes and deletes all go
// through it so that entries stored under an older version are never matched.
func (mc *MetadataCache) versionedKey(key string) string {
	const separator = ":"
	return metadataCacheVersion + separator + key
}
// MetadataCacheEntry is an empty placeholder type kept for compatibility.
// Nothing in the code visible in this file constructs or reads it; cached
// values are stored as JSON bytes instead (see Set).
type MetadataCacheEntry struct {
}
// NewMetadataCache returns a MetadataCache backed by the metadata cache of the
// universal cache manager. The manager is requested with a nil logger and the
// resulting cache logs through whatever logger that manager holds.
//
// wg is stored on the returned value unchanged.
func NewMetadataCache(wg *sync.WaitGroup) *MetadataCache {
	manager := GetUniversalCacheManager(nil)

	mc := new(MetadataCache)
	mc.cache = manager.GetMetadataCache()
	mc.logger = manager.logger
	mc.wg = wg
	return mc
}
// NewMetadataCacheWithLogger returns a MetadataCache backed by the metadata
// cache of the universal cache manager and logging through logger.
//
// A nil logger is tolerated: every method of MetadataCache calls mc.logger
// unconditionally, so instead of storing nil the cache falls back to the
// manager's own logger, which is what NewMetadataCache uses.
// NOTE(review): this assumes the manager's logger is itself non-nil - confirm
// against GetUniversalCacheManager.
//
// wg is stored on the returned value unchanged.
func NewMetadataCacheWithLogger(wg *sync.WaitGroup, logger *Logger) *MetadataCache {
	manager := GetUniversalCacheManager(logger)
	if logger == nil {
		logger = manager.logger
	}
	return &MetadataCache{
		cache:  manager.GetMetadataCache(),
		logger: logger,
		wg:     wg,
	}
}
// Set serialises metadata to JSON and stores it in the backing cache under the
// versioned form of providerURL, to expire after ttl.
//
// It returns an error when metadata is nil, when JSON encoding fails, or when
// the backing cache rejects the write.
func (mc *MetadataCache) Set(providerURL string, metadata *ProviderMetadata, ttl time.Duration) error {
	if metadata == nil {
		return fmt.Errorf("metadata cannot be nil")
	}
	mc.logger.Debugf("MetadataCache: Setting metadata for %s with TTL %v", providerURL, ttl)

	// The value is kept as JSON bytes, which is the representation Get decodes.
	encoded, marshalErr := json.Marshal(metadata)
	if marshalErr != nil {
		return fmt.Errorf("failed to marshal metadata: %w", marshalErr)
	}

	cacheKey := mc.versionedKey(providerURL)
	return mc.cache.Set(cacheKey, encoded, ttl)
}
// Get looks up the metadata cached for providerURL.
//
// It returns (metadata, true) on a hit and (nil, false) when the key is
// absent, when the stored value is neither []byte nor string, or when the
// stored bytes are not valid JSON for ProviderMetadata. In the last case the
// corrupt entry is deleted so the same error is not hit on every lookup.
// Each hit returns a freshly decoded value.
func (mc *MetadataCache) Get(providerURL string) (*ProviderMetadata, bool) {
	// The versioned key keeps entries written under an older cache format
	// from ever being matched.
	key := mc.versionedKey(providerURL)

	value, exists := mc.cache.Get(key)
	if !exists {
		mc.logger.Debugf("MetadataCache: MISS for %s", providerURL)
		return nil, false
	}

	// Set stores []byte; string is accepted as well.
	var data []byte
	switch v := value.(type) {
	case []byte:
		data = v
	case string:
		data = []byte(v)
	default:
		mc.logger.Errorf("MetadataCache: Invalid data type for %s: %T", providerURL, value)
		return nil, false
	}

	// Log only the first bytes of the payload to help diagnose unmarshal
	// problems. Slice before converting so that just the preview is copied
	// into a string, not the whole cached document on every lookup.
	const maxPreviewBytes = 100
	preview := data
	if len(preview) > maxPreviewBytes {
		preview = preview[:maxPreviewBytes]
	}
	dataPreview := string(preview)
	mc.logger.Debugf("MetadataCache: Attempting to unmarshal for %s, data preview: %s", providerURL, dataPreview)

	var metadata ProviderMetadata
	if err := json.Unmarshal(data, &metadata); err != nil {
		// Graceful degradation: corrupt data is reported, removed and then
		// treated like a cache miss.
		mc.logger.Errorf("MetadataCache: Corrupt data detected for %s: %v (preview: %s) - deleting and treating as miss", providerURL, err, dataPreview)
		mc.cache.Delete(key)
		return nil, false
	}

	mc.logger.Debugf("MetadataCache: HIT for %s", providerURL)
	return &metadata, true
}
// GetProviderMetadata returns the OIDC discovery document for providerURL,
// serving it from the cache when possible and fetching
// "<providerURL>/.well-known/openid-configuration" otherwise.
//
// ctx bounds the HTTP request. httpClient performs it and must be non-nil
// unless the metadata is already cached. A successful fetch is cached for one
// hour; a failure to cache is logged and does not fail the call.
//
// An error is returned when httpClient is nil, the request cannot be built or
// sent, the response status is not 200, or the body is not valid JSON. A body
// larger than the 1 MiB read limit is cut off and therefore also fails to
// decode.
func (mc *MetadataCache) GetProviderMetadata(ctx context.Context, providerURL string, httpClient *http.Client) (*ProviderMetadata, error) {
	// Check cache first.
	if metadata, exists := mc.Get(providerURL); exists {
		return metadata, nil
	}

	// Calling Do on a nil *http.Client panics; report it as an error instead.
	if httpClient == nil {
		return nil, fmt.Errorf("http client cannot be nil")
	}

	// Upper bound on how much of the response body is read. Discovery
	// documents are small; the cap keeps a misbehaving endpoint from making
	// the decoder buffer an arbitrarily large payload.
	const maxMetadataBytes = 1 << 20

	// Trim trailing slashes from the provider URL so the joined URL never
	// contains a double slash.
	metadataURL := strings.TrimRight(providerURL, "/") + "/.well-known/openid-configuration"
	mc.logger.Infof("Fetching provider metadata from: %s", metadataURL)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, metadataURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch metadata: %w", err)
	}
	defer func() { _ = resp.Body.Close() }() // Safe to ignore: closing body on defer

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("metadata fetch returned status %d", resp.StatusCode)
	}

	var metadata ProviderMetadata
	body := io.LimitReader(resp.Body, maxMetadataBytes)
	if err := json.NewDecoder(body).Decode(&metadata); err != nil {
		return nil, fmt.Errorf("failed to decode metadata: %w", err)
	}

	// Cache for 1 hour by default. The fetched metadata is still returned when
	// caching fails.
	if err := mc.Set(providerURL, &metadata, 1*time.Hour); err != nil {
		mc.logger.Errorf("Failed to cache metadata: %v", err)
	}
	return &metadata, nil
}
// Clear empties the backing cache and logs that it did so.
// NOTE(review): the backing cache is obtained from the universal cache manager,
// so entries written through other MetadataCache values sharing it are
// presumably dropped as well - confirm.
func (mc *MetadataCache) Clear() {
	backing := mc.cache
	backing.Clear()

	mc.logger.Info("MetadataCache: Cleared all entries")
}
// Close does not release anything: the backing cache is owned by the global
// cache manager, so the call only leaves a debug trace.
func (mc *MetadataCache) Close() {
	const notice = "MetadataCache: Close called (managed by global cache manager)"
	mc.logger.Debug(notice)
}
// GetMetrics returns the metrics map reported by the backing cache, unchanged.
func (mc *MetadataCache) GetMetrics() map[string]interface{} {
	metrics := mc.cache.GetMetrics()
	return metrics
}
// Size returns the entry count reported by the backing cache.
func (mc *MetadataCache) Size() int {
	entries := mc.cache.Size()
	return entries
}
// GetMetadata returns the metadata for providerURL, from the cache when it is
// there and otherwise via GetProviderMetadata with a 30-second timeout.
//
// The logger parameter is not used by this method; logging goes through the
// cache's own logger.
func (mc *MetadataCache) GetMetadata(providerURL string, httpClient *http.Client, logger *Logger) (*ProviderMetadata, error) {
	cached, hit := mc.Get(providerURL)
	if hit {
		return cached, nil
	}

	// No caller context is available here, so the fetch gets its own deadline.
	const fetchTimeout = 30 * time.Second
	ctx, cancel := context.WithTimeout(context.Background(), fetchTimeout)
	defer cancel()

	return mc.GetProviderMetadata(ctx, providerURL, httpClient)
}
// GetMetadataWithRecovery returns the metadata for providerURL, retrying the
// fetch to ride out startup races: Traefik may initialise the plugin before
// the OIDC provider routes are established or before its TLS certificates are
// loaded.
// See: https://github.com/lukaszraczylo/traefikoidc/issues/90
//
// Cached metadata is returned immediately. Otherwise the fetch runs under a
// retry executor configured by MetadataFetchRetryConfig, with an overall
// budget of two minutes and at most 15 seconds per attempt. When the executor
// gives up, the error of the last fetch attempt is returned in preference to
// the executor's own error.
//
// logger may be nil; it is passed to the retry executor and, when non-nil,
// used to log each failed attempt. errorRecoveryManager is not used by this
// method.
func (mc *MetadataCache) GetMetadataWithRecovery(providerURL string, httpClient *http.Client, logger *Logger, errorRecoveryManager *ErrorRecoveryManager) (*ProviderMetadata, error) {
	if cached, hit := mc.Get(providerURL); hit {
		return cached, nil
	}

	const (
		overallTimeout = 2 * time.Minute
		attemptTimeout = 15 * time.Second
	)

	executor := NewRetryExecutor(MetadataFetchRetryConfig(), logger)

	// One deadline covers the whole retry sequence.
	ctx, cancel := context.WithTimeout(context.Background(), overallTimeout)
	defer cancel()

	var (
		result  *ProviderMetadata
		lastErr error
	)

	attempt := func() error {
		// Each attempt derives a shorter deadline from the overall context, so
		// it can never outlive the overall budget.
		attemptCtx, attemptCancel := context.WithTimeout(ctx, attemptTimeout)
		defer attemptCancel()

		fetched, fetchErr := mc.GetProviderMetadata(attemptCtx, providerURL, httpClient)
		result = fetched
		if fetchErr == nil {
			return nil
		}

		lastErr = fetchErr
		if logger != nil {
			logger.Debugf("Metadata fetch attempt failed: %v", fetchErr)
		}
		return fetchErr
	}

	if err := executor.ExecuteWithContext(ctx, attempt); err != nil {
		// Prefer the last real fetch error over the retry wrapper error.
		if lastErr != nil {
			return nil, lastErr
		}
		return nil, err
	}
	return result, nil
}
// GetStats returns the same metrics map as GetMetrics; the original comment
// describes it as a statistics accessor for tests.
func (mc *MetadataCache) GetStats() map[string]interface{} {
	return mc.GetMetrics()
}
// CleanupExpired asks the backing cache to run its Cleanup pass, which is
// expected to evict expired entries.
func (mc *MetadataCache) CleanupExpired() {
	backing := mc.cache
	backing.Cleanup()
}
// Delete removes the entry for key from the backing cache. key is the
// unversioned key (the provider URL passed to Set); the version prefix is
// applied here.
func (mc *MetadataCache) Delete(key string) {
	versioned := mc.versionedKey(key)
	mc.cache.Delete(versioned)
}
// Mutex returns a pointer to the mu field of the backing cache. It exists for
// tests; the methods in this file never take the lock themselves.
func (mc *MetadataCache) Mutex() *sync.RWMutex {
	mu := &mc.cache.mu
	return mu
}