package traefikoidc

import (
	"context"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"math"
	"math/rand/v2"
	"net"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)

// ErrorRecoveryMechanism defines the interface for error recovery strategies.
// It provides a common contract for implementing various resilience patterns
// (circuit breaker, retry, graceful degradation) to handle transient failures
// and protect downstream services from cascading failures.
type ErrorRecoveryMechanism interface {
	// ExecuteWithContext executes fn under the mechanism's recovery policy and
	// returns the resulting error, if any.
	ExecuteWithContext(ctx context.Context, fn func() error) error
	// GetMetrics returns metrics about the recovery mechanism's performance.
	GetMetrics() map[string]interface{}
	// Reset resets the mechanism to its initial state.
	Reset()
	// IsAvailable returns whether the mechanism is available for requests.
	IsAvailable() bool
}

// BaseRecoveryMechanism provides common functionality and metrics tracking
// for all error recovery mechanisms. It handles request/failure/success counting,
// timing information, and logging capabilities for derived recovery mechanisms.
type BaseRecoveryMechanism struct {
	// startTime tracks when the mechanism was created.
	startTime time.Time
	// lastFailureTime records the most recent failure timestamp (guarded by mutex).
	lastFailureTime time.Time
	// lastSuccessTime records the most recent success timestamp (guarded by mutex).
	lastSuccessTime time.Time
	// logger is used for debugging and monitoring output.
	logger *Logger
	// name identifies this recovery mechanism instance and prefixes its log lines.
	name string
	// totalRequests counts all requests processed; updated via sync/atomic.
	totalRequests int64
	// totalFailures counts failed requests; updated via sync/atomic.
	totalFailures int64
	// totalSuccesses counts successful requests; updated via sync/atomic.
	totalSuccesses int64
	// mutex protects the timestamp fields. Types that embed this struct without
	// declaring their own mutex (CircuitBreaker) also use it for their own state.
	mutex sync.RWMutex
}

// NewBaseRecoveryMechanism creates a new base recovery mechanism with the given name and logger.
// This serves as the foundation for specific recovery mechanism implementations.
// Parameters: // - name: Identifier for this recovery mechanism instance // - logger: Logger for debugging and monitoring (nil creates no-op logger) // // Returns: // - A configured BaseRecoveryMechanism instance func NewBaseRecoveryMechanism(name string, logger *Logger) *BaseRecoveryMechanism { if logger == nil { logger = GetSingletonNoOpLogger() } return &BaseRecoveryMechanism{ name: name, logger: logger, startTime: time.Now(), } } // RecordRequest increments the total request counter. // This method is thread-safe using atomic operations. func (b *BaseRecoveryMechanism) RecordRequest() { atomic.AddInt64(&b.totalRequests, 1) } // RecordSuccess increments the success counter and updates the last success timestamp. // This method is thread-safe using atomic operations for counters // and mutex protection for timestamp updates. func (b *BaseRecoveryMechanism) RecordSuccess() { atomic.AddInt64(&b.totalSuccesses, 1) b.mutex.Lock() defer b.mutex.Unlock() b.lastSuccessTime = time.Now() } // RecordFailure increments the failure counter and updates the last failure timestamp. // This method is thread-safe using atomic operations for counters // and mutex protection for timestamp updates. func (b *BaseRecoveryMechanism) RecordFailure() { atomic.AddInt64(&b.totalFailures, 1) b.mutex.Lock() defer b.mutex.Unlock() b.lastFailureTime = time.Now() } // GetBaseMetrics returns comprehensive metrics about the recovery mechanism. // Includes request counts, success/failure rates, timing information, // and uptime statistics that are common to all recovery mechanisms. 
func (b *BaseRecoveryMechanism) GetBaseMetrics() map[string]interface{} { b.mutex.RLock() defer b.mutex.RUnlock() metrics := map[string]interface{}{ "total_requests": atomic.LoadInt64(&b.totalRequests), "total_failures": atomic.LoadInt64(&b.totalFailures), "total_successes": atomic.LoadInt64(&b.totalSuccesses), "uptime_seconds": time.Since(b.startTime).Seconds(), "name": b.name, } if !b.lastFailureTime.IsZero() { metrics["last_failure_time"] = b.lastFailureTime.Format(time.RFC3339) metrics["seconds_since_last_failure"] = time.Since(b.lastFailureTime).Seconds() } if !b.lastSuccessTime.IsZero() { metrics["last_success_time"] = b.lastSuccessTime.Format(time.RFC3339) metrics["seconds_since_last_success"] = time.Since(b.lastSuccessTime).Seconds() } totalReq, _ := metrics["total_requests"].(int64) // Safe to ignore: type assertion with fallback totalSucc, _ := metrics["total_successes"].(int64) // Safe to ignore: type assertion with fallback if totalReq > 0 { successRate := float64(totalSucc) / float64(totalReq) metrics["success_rate"] = successRate } else { metrics["success_rate"] = 1.0 } return metrics } // LogInfo logs an informational message with the mechanism name as prefix. // Provides consistent logging format across all recovery mechanisms. func (b *BaseRecoveryMechanism) LogInfo(format string, args ...interface{}) { if b.logger != nil { b.logger.Infof("%s: "+format, append([]interface{}{b.name}, args...)...) } } // LogError logs an error message with the mechanism name as prefix. // Used for reporting failures and error conditions in recovery mechanisms. func (b *BaseRecoveryMechanism) LogError(format string, args ...interface{}) { if b.logger != nil { b.logger.Errorf("%s: "+format, append([]interface{}{b.name}, args...)...) } } // LogDebug logs a debug message with the mechanism name as prefix. // Used for detailed debugging information about recovery mechanism operations. 
func (b *BaseRecoveryMechanism) LogDebug(format string, args ...interface{}) {
	if b.logger == nil {
		return
	}
	prefixed := make([]interface{}, 0, len(args)+1)
	prefixed = append(prefixed, b.name)
	prefixed = append(prefixed, args...)
	b.logger.Debugf("%s: "+format, prefixed...)
}

// CircuitBreakerState represents the current state of a circuit breaker.
// The circuit breaker pattern prevents cascading failures by monitoring
// failures and temporarily blocking requests to failing services.
type CircuitBreakerState int

// Circuit breaker states:
//
//	Closed:   normal operation, requests flow through
//	Open:     circuit is tripped, requests are blocked
//	HalfOpen: probing state entered after the open timeout elapses
const (
	// CircuitBreakerClosed allows all requests through (normal operation).
	CircuitBreakerClosed CircuitBreakerState = iota
	// CircuitBreakerOpen blocks requests (service is failing).
	CircuitBreakerOpen
	// CircuitBreakerHalfOpen lets requests through to test service recovery.
	CircuitBreakerHalfOpen
)

// CircuitBreaker implements the circuit breaker pattern for external service calls.
// It counts failures and opens the circuit once they reach maxFailures,
// rejecting further requests until the open timeout has elapsed.
type CircuitBreaker struct {
	// BaseRecoveryMechanism provides metrics, logging and the mutex that also
	// guards state and failures.
	*BaseRecoveryMechanism
	// maxFailures is the failure count at which the circuit opens.
	maxFailures int
	// timeout is how long the circuit stays open after the last recorded
	// failure before requests are allowed again in the half-open state.
	timeout time.Duration
	// resetTimeout is copied from the configuration and reported in metrics;
	// the state transitions in this file do not consult it.
	resetTimeout time.Duration
	// state tracks the current circuit breaker state.
	state CircuitBreakerState
	// failures counts failures since the last success in the closed or
	// half-open state.
	failures int64
}

// CircuitBreakerConfig holds configuration parameters for circuit breakers.
// These settings control when the circuit opens and how it recovers.
type CircuitBreakerConfig struct { // MaxFailures is the number of failures before opening the circuit MaxFailures int `json:"max_failures"` // Timeout is how long to wait before trying to recover (open -> half-open) Timeout time.Duration `json:"timeout"` // ResetTimeout is how long to wait before fully closing the circuit ResetTimeout time.Duration `json:"reset_timeout"` } // DefaultCircuitBreakerConfig returns sensible default configuration for circuit breakers. // Configured for typical web service scenarios with moderate tolerance for failures. func DefaultCircuitBreakerConfig() CircuitBreakerConfig { return CircuitBreakerConfig{ MaxFailures: 2, Timeout: 60 * time.Second, ResetTimeout: 30 * time.Second, } } // NewCircuitBreaker creates a new circuit breaker with the specified configuration. // The circuit breaker starts in the closed state, allowing all requests through. func NewCircuitBreaker(config CircuitBreakerConfig, logger *Logger) *CircuitBreaker { return &CircuitBreaker{ BaseRecoveryMechanism: NewBaseRecoveryMechanism("circuit-breaker", logger), maxFailures: config.MaxFailures, timeout: config.Timeout, resetTimeout: config.ResetTimeout, state: CircuitBreakerClosed, } } // ExecuteWithContext executes a function through the circuit breaker with context. // It checks if requests are allowed, executes the function, and updates the circuit state // based on the result. Implements the ErrorRecoveryMechanism interface. func (cb *CircuitBreaker) ExecuteWithContext(ctx context.Context, fn func() error) error { cb.RecordRequest() if !cb.allowRequest() { return fmt.Errorf("circuit breaker is open") } err := fn() if err != nil { cb.recordFailure() cb.RecordFailure() return err } cb.recordSuccess() cb.RecordSuccess() return nil } // Execute executes a function through the circuit breaker without context. // This is provided for backward compatibility with existing code. 
func (cb *CircuitBreaker) Execute(fn func() error) error {
	ctx := context.Background()
	return cb.ExecuteWithContext(ctx, fn)
}

// allowRequest reports whether a request may proceed in the current state.
// Closed and half-open always allow. Open allows only once more than
// cb.timeout has passed since the last recorded failure, in which case the
// breaker is moved to half-open first. Unknown states reject.
func (cb *CircuitBreaker) allowRequest() bool {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	if cb.state == CircuitBreakerClosed || cb.state == CircuitBreakerHalfOpen {
		return true
	}
	if cb.state != CircuitBreakerOpen {
		return false
	}

	// Open: stay closed to traffic until the timeout has fully elapsed.
	if time.Since(cb.lastFailureTime) <= cb.timeout {
		return false
	}
	cb.state = CircuitBreakerHalfOpen
	cb.logger.Infof("Circuit breaker transitioning to half-open state")
	return true
}

// recordFailure counts a failure and updates the circuit state.
// In the closed state the circuit opens once failures reach maxFailures;
// in the half-open state any failure reopens it immediately.
func (cb *CircuitBreaker) recordFailure() {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	cb.failures++

	if cb.state == CircuitBreakerHalfOpen {
		cb.state = CircuitBreakerOpen
		cb.LogError("Circuit breaker returned to open state after failure in half-open")
		return
	}
	if cb.state == CircuitBreakerClosed && cb.failures >= int64(cb.maxFailures) {
		cb.state = CircuitBreakerOpen
		cb.LogError("Circuit breaker opened after %d failures", cb.failures)
	}
}

// recordSuccess handles a successful request.
// In the closed or half-open state the failure count is cleared, and a
// half-open circuit is closed. A success observed while open changes nothing.
func (cb *CircuitBreaker) recordSuccess() {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	halfOpen := cb.state == CircuitBreakerHalfOpen
	if !halfOpen && cb.state != CircuitBreakerClosed {
		return
	}

	cb.failures = 0
	if halfOpen {
		cb.state = CircuitBreakerClosed
		cb.LogInfo("Circuit breaker closed after successful request in half-open state")
	}
}

// GetState returns the current state of the circuit breaker.
// Thread-safe method for monitoring circuit breaker status.
func (cb *CircuitBreaker) GetState() CircuitBreakerState { cb.mutex.RLock() defer cb.mutex.RUnlock() return cb.state } // Reset resets the circuit breaker to its initial closed state. // Clears failure count and state, effectively recovering from any open state. func (cb *CircuitBreaker) Reset() { cb.mutex.Lock() defer cb.mutex.Unlock() cb.state = CircuitBreakerClosed atomic.StoreInt64(&cb.failures, 0) cb.LogInfo("Circuit breaker has been reset") } // IsAvailable returns whether the circuit breaker is currently allowing requests. // This provides a quick way to check if the service is available. func (cb *CircuitBreaker) IsAvailable() bool { return cb.allowRequest() } // GetMetrics returns comprehensive metrics about the circuit breaker. // Includes state information, failure counts, configuration, and base metrics. func (cb *CircuitBreaker) GetMetrics() map[string]interface{} { cb.mutex.RLock() state := cb.state failures := cb.failures cb.mutex.RUnlock() metrics := cb.GetBaseMetrics() stateStr := "unknown" switch state { case CircuitBreakerClosed: stateStr = "closed" case CircuitBreakerOpen: stateStr = "open" case CircuitBreakerHalfOpen: stateStr = "half-open" } metrics["state"] = stateStr metrics["max_failures"] = cb.maxFailures metrics["current_failures"] = failures metrics["timeout_ms"] = cb.timeout.Milliseconds() metrics["reset_timeout_ms"] = cb.resetTimeout.Milliseconds() return metrics } // RetryConfig holds configuration parameters for retry mechanisms. // Controls retry behavior including which errors to retry, timing, and backoff strategy. 
type RetryConfig struct {
	// RetryableErrors lists substrings that mark an error message as retryable.
	RetryableErrors []string `json:"retryable_errors"`
	// MaxAttempts is the maximum number of attempts.
	MaxAttempts int `json:"max_attempts"`
	// InitialDelay is the delay before the first retry.
	InitialDelay time.Duration `json:"initial_delay"`
	// MaxDelay caps the backoff delay between retries.
	MaxDelay time.Duration `json:"max_delay"`
	// BackoffFactor multiplies the delay between attempts (exponential backoff).
	BackoffFactor float64 `json:"backoff_factor"`
	// EnableJitter adds randomness to delays to prevent thundering herd.
	EnableJitter bool `json:"enable_jitter"`
}

// DefaultRetryConfig returns the default retry configuration: 3 attempts,
// a 100ms initial delay doubling up to 5s, jitter enabled, and a small set of
// common transient-network error patterns.
func DefaultRetryConfig() RetryConfig {
	var cfg RetryConfig

	cfg.MaxAttempts = 3
	cfg.InitialDelay = 100 * time.Millisecond
	cfg.MaxDelay = 5 * time.Second
	cfg.BackoffFactor = 2.0
	cfg.EnableJitter = true
	// A fresh slice per call so callers can modify their copy freely.
	cfg.RetryableErrors = []string{
		"connection refused",
		"timeout",
		"temporary failure",
		"network unreachable",
	}

	return cfg
}

// MetadataFetchRetryConfig returns retry configuration optimized for OIDC metadata
// fetching during startup. Uses more aggressive retry settings to handle the race
// condition where Traefik initializes the plugin before routes are fully established,
// or before TLS certificates are properly loaded.
// See: https://github.com/lukaszraczylo/traefikoidc/issues/90 func MetadataFetchRetryConfig() RetryConfig { return RetryConfig{ MaxAttempts: 10, // More attempts for startup scenarios InitialDelay: 1 * time.Second, // 1 second between attempts as suggested MaxDelay: 10 * time.Second, // Cap at 10 seconds BackoffFactor: 1.5, // Gentler backoff for startup EnableJitter: true, // Prevent thundering herd RetryableErrors: []string{ "connection refused", "timeout", "temporary failure", "network unreachable", "EOF", "certificate", "x509", "tls", }, } } // RetryExecutor implements retry logic with exponential backoff and jitter. // It automatically retries failed operations based on configurable error patterns // and uses exponential backoff to avoid overwhelming failing services. type RetryExecutor struct { // BaseRecoveryMechanism provides common functionality *BaseRecoveryMechanism // config contains retry behavior configuration config RetryConfig } // NewRetryExecutor creates a new retry executor with the specified configuration. // The executor will retry operations according to the provided configuration. func NewRetryExecutor(config RetryConfig, logger *Logger) *RetryExecutor { return &RetryExecutor{ BaseRecoveryMechanism: NewBaseRecoveryMechanism("retry-executor", logger), config: config, } } // ExecuteWithContext executes a function with retry logic and exponential backoff. // Retries failed operations based on error patterns and respects context cancellation. // Implements the ErrorRecoveryMechanism interface. 
func (re *RetryExecutor) ExecuteWithContext(ctx context.Context, fn func() error) error { re.RecordRequest() var lastErr error for attempt := 1; attempt <= re.config.MaxAttempts; attempt++ { err := fn() if err == nil { if attempt > 1 { re.LogInfo("Operation succeeded after %d attempts", attempt) } re.RecordSuccess() return nil } lastErr = err if !re.isRetryableError(err) { re.RecordFailure() return err } if attempt == re.config.MaxAttempts { re.RecordFailure() break } delay := re.calculateDelay(attempt) if attempt == 1 || attempt%3 == 0 { re.LogDebug("Retrying operation after %v (attempt %d/%d): %v", delay, attempt, re.config.MaxAttempts, err) } select { case <-ctx.Done(): re.RecordFailure() return ctx.Err() case <-time.After(delay): } } finalErr := fmt.Errorf("operation failed after %d attempts: %w", re.config.MaxAttempts, lastErr) return finalErr } // Execute runs the given function with retry logic (for backward compatibility) // Execute executes a function with retry logic (backward compatibility). // This method provides the same functionality as ExecuteWithContext. func (re *RetryExecutor) Execute(ctx context.Context, fn func() error) error { return re.ExecuteWithContext(ctx, fn) } // isRetryableError checks if an error should trigger a retry // isRetryableError determines if an error should trigger a retry attempt. // Checks error message against configured retryable error patterns. // Also handles startup-specific errors like Traefik default certificate errors // and EOF errors that occur during service initialization. 
func (re *RetryExecutor) isRetryableError(err error) bool { if err == nil { return false } // Check for Traefik default certificate error (startup race condition) // See: https://github.com/lukaszraczylo/traefikoidc/issues/90 if isTraefikDefaultCertError(err) { return true } // Check for EOF errors (common during startup when services aren't ready) if isEOFError(err) { return true } // Check for certificate errors (transient during startup) if isCertificateError(err) { return true } errStr := strings.ToLower(err.Error()) for _, retryableErr := range re.config.RetryableErrors { if contains(errStr, strings.ToLower(retryableErr)) { return true } } if netErr, ok := err.(net.Error); ok { if netErr.Timeout() { return true } errStr := strings.ToLower(netErr.Error()) temporaryPatterns := []string{ "connection refused", "connection reset", "network is unreachable", "no route to host", "temporary failure", "try again", "resource temporarily unavailable", } for _, pattern := range temporaryPatterns { if contains(errStr, pattern) { return true } } } if httpErr, ok := err.(*HTTPError); ok { return httpErr.StatusCode >= 500 || httpErr.StatusCode == 429 } return false } // calculateDelay calculates the delay for the next retry attempt // calculateDelay computes the delay before the next retry attempt. // Uses exponential backoff with optional jitter to prevent thundering herd. func (re *RetryExecutor) calculateDelay(attempt int) time.Duration { delay := float64(re.config.InitialDelay) * math.Pow(re.config.BackoffFactor, float64(attempt-1)) if delay > float64(re.config.MaxDelay) { delay = float64(re.config.MaxDelay) } // #nosec G404 -- math/rand is acceptable for jitter timing, not security-sensitive if re.config.EnableJitter { jitter := delay * 0.1 * (2.0*rand.Float64() - 1.0) delay += jitter } return time.Duration(delay) } // Reset resets the retry executor state // Reset clears any internal state of the retry executor. 
// For RetryExecutor, this is primarily a logging operation. func (re *RetryExecutor) Reset() { re.LogDebug("Retry executor reset") } // IsAvailable always returns true for RetryExecutor // IsAvailable returns whether the retry executor is available. // Always returns true as retry executors don't have availability state. func (re *RetryExecutor) IsAvailable() bool { return true } // GetMetrics returns metrics about the retry executor // GetMetrics returns comprehensive metrics about the retry executor. // Includes base metrics plus retry-specific configuration information. func (re *RetryExecutor) GetMetrics() map[string]interface{} { metrics := re.GetBaseMetrics() metrics["max_attempts"] = re.config.MaxAttempts metrics["initial_delay_ms"] = re.config.InitialDelay.Milliseconds() metrics["max_delay_ms"] = re.config.MaxDelay.Milliseconds() metrics["backoff_factor"] = re.config.BackoffFactor metrics["enable_jitter"] = re.config.EnableJitter metrics["retryable_errors"] = re.config.RetryableErrors return metrics } // HTTPError represents an HTTP error with status code and message. // Used for categorizing HTTP-related errors in error recovery mechanisms. type HTTPError struct { // Message is the error description Message string // StatusCode is the HTTP status code StatusCode int } // Error returns the string representation of the HTTP error. // Implements the error interface. func (e *HTTPError) Error() string { return fmt.Sprintf("HTTP %d: %s", e.StatusCode, e.Message) } // OIDCError represents OIDC-specific errors with context information. // It provides structured error reporting for authentication and authorization failures. type OIDCError struct { Cause error Context map[string]interface{} Code string Message string } // Error returns the string representation of the OIDC error. // Implements the error interface. 
func (e *OIDCError) Error() string { if e.Cause != nil { return fmt.Sprintf("OIDC error [%s]: %s - caused by: %v", e.Code, e.Message, e.Cause) } return fmt.Sprintf("OIDC error [%s]: %s", e.Code, e.Message) } // Unwrap returns the underlying error for error chain unwrapping. func (e *OIDCError) Unwrap() error { return e.Cause } // SessionError represents session-related errors with context. // Used for session management, validation, and storage errors. type SessionError struct { Cause error Operation string Message string SessionID string } // Error returns the string representation of the session error. // Implements the error interface. func (e *SessionError) Error() string { if e.Cause != nil { return fmt.Sprintf("Session error in %s: %s - caused by: %v", e.Operation, e.Message, e.Cause) } return fmt.Sprintf("Session error in %s: %s", e.Operation, e.Message) } // Unwrap returns the underlying error for error chain unwrapping. func (e *SessionError) Unwrap() error { return e.Cause } // TokenError represents token-related errors with validation context. // Used for JWT validation, token refresh, and token format errors. type TokenError struct { Cause error TokenType string Reason string Message string } // Error returns the string representation of the token error. // Implements the error interface. func (e *TokenError) Error() string { if e.Cause != nil { return fmt.Sprintf("Token error (%s) - %s: %s - caused by: %v", e.TokenType, e.Reason, e.Message, e.Cause) } return fmt.Sprintf("Token error (%s) - %s: %s", e.TokenType, e.Reason, e.Message) } // Unwrap returns the underlying error for error chain unwrapping. func (e *TokenError) Unwrap() error { return e.Cause } // NewOIDCError creates a new OIDC error with context. func NewOIDCError(code, message string, cause error) *OIDCError { return &OIDCError{ Code: code, Message: message, Context: make(map[string]interface{}), Cause: cause, } } // WithContext adds context information to the OIDC error. 
func (e *OIDCError) WithContext(key string, value interface{}) *OIDCError { e.Context[key] = value return e } // NewSessionError creates a new session error with operation context. func NewSessionError(operation, message string, cause error) *SessionError { return &SessionError{ Operation: operation, Message: message, Cause: cause, } } // WithSessionID adds session ID to the session error. func (e *SessionError) WithSessionID(sessionID string) *SessionError { e.SessionID = sessionID return e } // NewTokenError creates a new token error with type and reason. func NewTokenError(tokenType, reason, message string, cause error) *TokenError { return &TokenError{ TokenType: tokenType, Reason: reason, Message: message, Cause: cause, } } // GracefulDegradation implements graceful degradation patterns for service resilience. // It provides fallback mechanisms when primary services are unavailable and monitors // service health to automatically recover when services become available again. type GracefulDegradation struct { *BaseRecoveryMechanism fallbacks map[string]func() (interface{}, error) healthChecks map[string]func() bool degradedServices map[string]time.Time healthCheckTask *BackgroundTask stopChan chan struct{} config GracefulDegradationConfig mutex sync.RWMutex shutdownOnce sync.Once } // GracefulDegradationConfig holds configuration for graceful degradation behavior. // Controls health checking frequency, recovery timing, and fallback enablement. type GracefulDegradationConfig struct { // HealthCheckInterval defines how often to check service health HealthCheckInterval time.Duration `json:"health_check_interval"` // RecoveryTimeout is how long to wait before attempting service recovery RecoveryTimeout time.Duration `json:"recovery_timeout"` // EnableFallbacks controls whether fallback mechanisms are active EnableFallbacks bool `json:"enable_fallbacks"` } // DefaultGracefulDegradationConfig returns sensible defaults for graceful degradation. 
// Configured with moderate health check frequency and recovery timeouts:
// health checks every 30 seconds, a 5 minute recovery timeout, fallbacks enabled.
func DefaultGracefulDegradationConfig() GracefulDegradationConfig {
	return GracefulDegradationConfig{
		HealthCheckInterval: 30 * time.Second,
		RecoveryTimeout:     5 * time.Minute,
		EnableFallbacks:     true,
	}
}

// NewGracefulDegradation creates a new graceful degradation mechanism.
// Initializes fallback and health check maps and starts background health
// monitoring in a separate goroutine. Call Close to stop the health check task.
func NewGracefulDegradation(config GracefulDegradationConfig, logger *Logger) *GracefulDegradation {
	gd := &GracefulDegradation{
		BaseRecoveryMechanism: NewBaseRecoveryMechanism("graceful-degradation", logger),
		fallbacks:             make(map[string]func() (interface{}, error)),
		healthChecks:          make(map[string]func() bool),
		degradedServices:      make(map[string]time.Time),
		config:                config,
	}
	gd.stopChan = make(chan struct{})
	// The task is created and started asynchronously, so healthCheckTask may
	// still be nil for a short time after this constructor returns.
	go gd.startHealthCheckRoutine()
	return gd
}

// RegisterFallback registers a fallback function for a service.
// A later registration for the same service name replaces the earlier one.
func (gd *GracefulDegradation) RegisterFallback(serviceName string, fallback func() (interface{}, error)) {
	gd.mutex.Lock()
	defer gd.mutex.Unlock()
	gd.fallbacks[serviceName] = fallback
}

// RegisterHealthCheck registers a health check function for a service.
// A later registration for the same service name replaces the earlier one.
func (gd *GracefulDegradation) RegisterHealthCheck(serviceName string, healthCheck func() bool) {
	gd.mutex.Lock()
	defer gd.mutex.Unlock()
	gd.healthChecks[serviceName] = healthCheck
}

// ExecuteWithContext implements the ErrorRecoveryMechanism interface.
// It runs fn through ExecuteWithFallback under the fixed service name
// "default" and records the outcome in the base metrics.
// ctx is not consulted by this method.
func (gd *GracefulDegradation) ExecuteWithContext(ctx context.Context, fn func() error) error {
	gd.RecordRequest()
	_, err := gd.ExecuteWithFallback("default", func() (interface{}, error) {
		return nil, fn()
	})
	if err != nil {
		gd.RecordFailure()
	} else {
		gd.RecordSuccess()
	}
	return err
}

// ExecuteWithFallback executes a function with fallback support.
//
// If the service is currently marked degraded, primary is skipped and the
// registered fallback is used. Otherwise primary runs; on failure the service
// is marked degraded and, when fallbacks are enabled, the fallback result is
// returned instead of the primary error.
//
// NOTE(review): the already-degraded path uses the fallback regardless of
// config.EnableFallbacks, and when no fallback is registered the returned
// error is executeFallback's "no fallback available" rather than the primary
// error — confirm both are intended.
func (gd *GracefulDegradation) ExecuteWithFallback(serviceName string, primary func() (interface{}, error)) (interface{}, error) {
	if gd.isServiceDegraded(serviceName) {
		gd.LogInfo("Service %s is degraded, using fallback", serviceName)
		return gd.executeFallback(serviceName)
	}
	result, err := primary()
	if err != nil {
		gd.markServiceDegraded(serviceName)
		gd.LogError("Service %s failed: %v", serviceName, err)
		if gd.config.EnableFallbacks {
			gd.LogInfo("Using fallback for service %s", serviceName)
			return gd.executeFallback(serviceName)
		}
		return nil, err
	}
	return result, nil
}

// isServiceDegraded checks if a service is currently degraded.
// A degraded entry older than config.RecoveryTimeout is removed and the
// service is reported as healthy again.
func (gd *GracefulDegradation) isServiceDegraded(serviceName string) bool {
	// Uses a write lock because the recovery-timeout branch deletes from the map.
	gd.mutex.Lock()
	defer gd.mutex.Unlock()
	degradedTime, exists := gd.degradedServices[serviceName]
	if !exists {
		return false
	}
	if time.Since(degradedTime) > gd.config.RecoveryTimeout {
		delete(gd.degradedServices, serviceName)
		return false
	}
	return true
}

// markServiceDegraded marks a service as degraded.
// The degraded timestamp is refreshed on every call; the error is logged only
// when the service was not already marked.
func (gd *GracefulDegradation) markServiceDegraded(serviceName string) {
	gd.mutex.Lock()
	defer gd.mutex.Unlock()
	if _, exists := gd.degradedServices[serviceName]; !exists {
		gd.LogError("Service %s marked as degraded", serviceName)
	}
	gd.degradedServices[serviceName] = time.Now()
}

// executeFallback executes the fallback function for a service.
// Returns an error when no fallback is registered for serviceName.
// The fallback itself runs outside the lock.
func (gd *GracefulDegradation) executeFallback(serviceName string) (interface{}, error) {
	gd.mutex.RLock()
	fallback, exists := gd.fallbacks[serviceName]
	gd.mutex.RUnlock()
	if !exists {
		return nil, fmt.Errorf("no fallback available for service %s", serviceName)
	}
	gd.LogInfo("Executing fallback for degraded service %s", serviceName)
	return fallback()
}

// startHealthCheckRoutine starts the background health check routine.
// It creates a task under the fixed name "graceful-degradation-health-check"
// that calls performHealthChecks every config.HealthCheckInterval, stores it
// on the receiver and starts it. On creation failure it logs and returns.
//
// NOTE(review): because the task name is fixed, multiple GracefulDegradation
// instances presumably contend for one singleton task — verify against
// CreateSingletonTask.
func (gd *GracefulDegradation) startHealthCheckRoutine() {
	// Use singleton task registry to prevent multiple instances
	registry := GetGlobalTaskRegistry()
	task, err := registry.CreateSingletonTask(
		"graceful-degradation-health-check",
		gd.config.HealthCheckInterval,
		gd.performHealthChecks,
		gd.BaseRecoveryMechanism.logger,
		nil, // No specific wait group
	)
	if err != nil {
		gd.BaseRecoveryMechanism.logger.Errorf("Failed to create health check task: %v", err)
		return
	}
	gd.mutex.Lock()
	gd.healthCheckTask = task
	gd.mutex.Unlock()
	task.Start()
}

// performHealthChecks runs health checks for all registered services.
// A passing check clears the service's degraded mark (logging the recovery);
// a failing check marks the service degraded.
func (gd *GracefulDegradation) performHealthChecks() {
	// Copy the registrations so the probes run without holding the lock.
	gd.mutex.RLock()
	healthChecks := make(map[string]func() bool)
	for k, v := range gd.healthChecks {
		healthChecks[k] = v
	}
	gd.mutex.RUnlock()
	for serviceName, healthCheck := range healthChecks {
		if healthCheck() {
			gd.mutex.Lock()
			if _, wasDegraded := gd.degradedServices[serviceName]; wasDegraded {
				delete(gd.degradedServices, serviceName)
				gd.logger.Infof("Service %s recovered from degraded state", serviceName)
			}
			gd.mutex.Unlock()
		} else {
			gd.markServiceDegraded(serviceName)
		}
	}
}

// GetDegradedServices returns a list of currently degraded services.
// The order of the returned names is unspecified (map iteration order).
func (gd *GracefulDegradation) GetDegradedServices() []string {
	gd.mutex.RLock()
	defer gd.mutex.RUnlock()
	degraded := make([]string, 0, len(gd.degradedServices))
	for serviceName := range gd.degradedServices {
		degraded = append(degraded, serviceName)
	}
	return degraded
}

// Reset resets the state of all degraded services.
// Registered fallbacks and health checks are left in place.
func (gd *GracefulDegradation) Reset() {
	gd.mutex.Lock()
	defer gd.mutex.Unlock()
	gd.degradedServices = make(map[string]time.Time)
	gd.LogInfo("Graceful degradation state has been reset")
}

// Close shuts down the graceful degradation system and cleans up resources.
// It is safe to call more than once; only the first call has any effect.
//
// NOTE(review): if Close runs before startHealthCheckRoutine has stored the
// task, healthCheckTask is still nil here and the task is started afterwards
// without being stopped — confirm whether this window matters in practice.
func (gd *GracefulDegradation) Close() {
	gd.shutdownOnce.Do(func() {
		// Signal shutdown
		select {
		case <-gd.stopChan:
			// Already closed
		default:
			close(gd.stopChan)
		}
		// Stop health check task
		gd.mutex.Lock()
		task := gd.healthCheckTask
		gd.mutex.Unlock()
		if task != nil {
			task.Stop()
			// Don't set to nil to avoid race conditions
		}
		gd.logger.Debug("GracefulDegradation shut down successfully")
	})
}

// IsAvailable returns whether the mechanism is available for use.
// It always returns true.
func (gd *GracefulDegradation) IsAvailable() bool {
	return true
}

// GetMetrics returns metrics about the graceful degradation mechanism:
// the base metrics plus degraded-service names and counts, registration
// counts, and the configured intervals.
func (gd *GracefulDegradation) GetMetrics() map[string]interface{} {
	// Snapshot the guarded maps first, then release the lock before calling
	// GetBaseMetrics.
	gd.mutex.RLock()
	degradedCount := len(gd.degradedServices)
	degradedServices := make([]string, 0, degradedCount)
	for service := range gd.degradedServices {
		degradedServices = append(degradedServices, service)
	}
	fallbackCount := len(gd.fallbacks)
	healthCheckCount := len(gd.healthChecks)
	gd.mutex.RUnlock()
	metrics := gd.GetBaseMetrics()
	metrics["degraded_services_count"] = degradedCount
	metrics["degraded_services"] = degradedServices
	metrics["registered_fallbacks_count"] = fallbackCount
	metrics["registered_health_checks_count"] = healthCheckCount
	metrics["health_check_interval_seconds"] = gd.config.HealthCheckInterval.Seconds()
	metrics["recovery_timeout_seconds"] = gd.config.RecoveryTimeout.Seconds()
	metrics["fallbacks_enabled"] = gd.config.EnableFallbacks
	return metrics
}

// ErrorRecoveryManager coordinates all error recovery mechanisms:
// per-service circuit breakers, a shared retry executor and a shared
// graceful degradation instance.
type ErrorRecoveryManager struct {
	// circuitBreakers holds one breaker per service name, created on demand.
	circuitBreakers map[string]*CircuitBreaker
	// retryExecutor is shared by all services.
	retryExecutor *RetryExecutor
	// gracefulDegradation is shared by all services.
	gracefulDegradation *GracefulDegradation
	// logger is handed to circuit breakers created later.
	logger *Logger
	// mutex guards circuitBreakers.
	mutex sync.RWMutex
}

// NewErrorRecoveryManager creates a comprehensive error recovery manager.
// Combines circuit breakers, retry logic, and graceful degradation into a
// unified system, using the default configuration for each.
//
// NOTE(review): this starts the graceful degradation health check goroutine;
// no method for stopping it through the manager is visible here — confirm
// how it is shut down.
func NewErrorRecoveryManager(logger *Logger) *ErrorRecoveryManager {
	return &ErrorRecoveryManager{
		circuitBreakers:     make(map[string]*CircuitBreaker),
		retryExecutor:       NewRetryExecutor(DefaultRetryConfig(), logger),
		gracefulDegradation: NewGracefulDegradation(DefaultGracefulDegradationConfig(), logger),
		logger:              logger,
	}
}

// GetCircuitBreaker returns the circuit breaker for a specific service.
// Creates a new circuit breaker if one doesn't exist for the service. func (erm *ErrorRecoveryManager) GetCircuitBreaker(serviceName string) *CircuitBreaker { erm.mutex.Lock() defer erm.mutex.Unlock() if cb, exists := erm.circuitBreakers[serviceName]; exists { return cb } cb := NewCircuitBreaker(DefaultCircuitBreakerConfig(), erm.logger) erm.circuitBreakers[serviceName] = cb return cb } // ExecuteWithRecovery executes a function with full error recovery support // ExecuteWithRecovery executes a function with comprehensive error recovery. // Applies circuit breaker protection and retry logic for the specified service. func (erm *ErrorRecoveryManager) ExecuteWithRecovery(ctx context.Context, serviceName string, fn func() error) error { cb := erm.GetCircuitBreaker(serviceName) return erm.retryExecutor.Execute(ctx, func() error { return cb.Execute(fn) }) } // GetRecoveryMetrics returns metrics for all recovery mechanisms // GetRecoveryMetrics returns comprehensive metrics for all recovery mechanisms. // Includes circuit breaker states, retry statistics, and graceful degradation status. 
func (erm *ErrorRecoveryManager) GetRecoveryMetrics() map[string]interface{} { erm.mutex.RLock() defer erm.mutex.RUnlock() metrics := make(map[string]interface{}) cbMetrics := make(map[string]interface{}) for name, cb := range erm.circuitBreakers { cbMetrics[name] = cb.GetMetrics() } metrics["circuit_breakers"] = cbMetrics metrics["degraded_services"] = erm.gracefulDegradation.GetDegradedServices() return metrics } // Helper function to check if a string contains a substring (case-insensitive) func contains(s, substr string) bool { return len(s) >= len(substr) && (s == substr || (len(s) > len(substr) && (s[:len(substr)] == substr || s[len(s)-len(substr):] == substr || containsSubstring(s, substr)))) } func containsSubstring(s, substr string) bool { for i := 0; i <= len(s)-len(substr); i++ { if s[i:i+len(substr)] == substr { return true } } return false } // isTraefikDefaultCertError detects when Traefik is serving its default self-signed // certificate during cold-start, before the real certificates are loaded. // This manifests as an x509.HostnameError where one of the certificate's DNS names // ends with "traefik.default" (the default Traefik certificate pattern). // See: https://github.com/lukaszraczylo/traefikoidc/issues/90 func isTraefikDefaultCertError(err error) bool { if err == nil { return false } var hostnameErr x509.HostnameError if errors.As(err, &hostnameErr) { if hostnameErr.Certificate != nil { for _, name := range hostnameErr.Certificate.DNSNames { if strings.HasSuffix(name, "traefik.default") { return true } } } } return false } // isEOFError checks if an error is an EOF error, which can occur during // connection establishment when the remote end closes unexpectedly. // This is common during service startup when endpoints aren't fully ready. 
func isEOFError(err error) bool {
	if err == nil {
		return false
	}

	// Typed match first: finds io.EOF / io.ErrUnexpectedEOF anywhere in the
	// wrap chain.
	if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
		return true
	}

	// Fall back to the message for errors whose chain does not expose the
	// sentinel. "unexpected EOF" itself contains "EOF", so one
	// case-sensitive check covers both spellings.
	return strings.Contains(err.Error(), "EOF")
}

// isCertificateError checks if an error is related to TLS certificate validation.
// These errors are often transient during startup when services are still initializing.
// It returns true when the chain contains one of the x509 validation error
// types, or when the lower-cased message mentions "certificate", "x509",
// "tls" or "ssl". A nil error yields false.
func isCertificateError(err error) bool {
	if err == nil {
		return false
	}

	// Check for x509 certificate errors by type before touching the message.
	var (
		certInvalidErr x509.CertificateInvalidError
		hostnameErr    x509.HostnameError
		unknownAuthErr x509.UnknownAuthorityError
	)
	if errors.As(err, &certInvalidErr) ||
		errors.As(err, &hostnameErr) ||
		errors.As(err, &unknownAuthErr) {
		return true
	}

	// Otherwise look for certificate-related keywords, ignoring case.
	msg := strings.ToLower(err.Error())
	for _, keyword := range [...]string{"certificate", "x509", "tls", "ssl"} {
		if strings.Contains(msg, keyword) {
			return true
		}
	}
	return false
}