mirror of
https://github.com/lukaszraczylo/traefikoidc.git
synced 2026-06-05 22:44:17 +00:00
e64fc7f730
* Add redis support for distributed caching * Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi. * fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi. * fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi. * fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi. * fixup! fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi. * fixup! fixup! fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi. * ... and another all nighter. * fixup! ... and another all nighter. * fixup! fixup! ... and another all nighter. * fixup! fixup! fixup! ... and another all nighter. * Resolve issue #85 by adding ability to set custom claims in JWT tokens * Remove redundant validation in auth middleware ( issue #89 ) * Add ability to set cookie prefix for session cookies ( #87 ) * fixup! Add ability to set cookie prefix for session cookies ( #87 ) * Add ability to set cookie max age - issue #91 * Potential fix for code scanning alert no. 10: Size computation for allocation may overflow Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> * fixup! Merge main into 0.8.0-redis: resolve conflicts --------- Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
392 lines
10 KiB
Go
392 lines
10 KiB
Go
// Package recovery provides error recovery and resilience mechanisms for OIDC authentication.
|
|
package recovery
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math"
|
|
"math/rand"
|
|
"net/http"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
)
|
|
|
|
// RetryConfig defines configuration for the retry executor
|
|
type RetryConfig struct {
|
|
// MaxAttempts is the maximum number of retry attempts
|
|
MaxAttempts int
|
|
// InitialDelay is the initial delay between retries
|
|
InitialDelay time.Duration
|
|
// MaxDelay is the maximum delay between retries
|
|
MaxDelay time.Duration
|
|
// Multiplier is the backoff multiplier
|
|
Multiplier float64
|
|
// RandomizationFactor adds jitter to delays (0.0 to 1.0)
|
|
RandomizationFactor float64
|
|
// RetryableErrors defines which errors should trigger a retry
|
|
RetryableErrors []string
|
|
// RetryableStatusCodes defines which HTTP status codes should trigger a retry
|
|
RetryableStatusCodes []int
|
|
}
|
|
|
|
// DefaultRetryConfig returns sensible default retry configuration
|
|
func DefaultRetryConfig() RetryConfig {
|
|
return RetryConfig{
|
|
MaxAttempts: 3,
|
|
InitialDelay: 100 * time.Millisecond,
|
|
MaxDelay: 30 * time.Second,
|
|
Multiplier: 2.0,
|
|
RandomizationFactor: 0.1,
|
|
RetryableErrors: []string{"connection refused", "timeout", "EOF"},
|
|
RetryableStatusCodes: []int{408, 429, 500, 502, 503, 504},
|
|
}
|
|
}
|
|
|
|
// RetryExecutor implements retry logic with exponential backoff
|
|
type RetryExecutor struct {
|
|
*BaseRecoveryMechanism
|
|
config RetryConfig
|
|
|
|
// Metrics
|
|
totalRetries int64
|
|
maxRetriesHit int64
|
|
lastRetryTime time.Time
|
|
retryTimeMutex sync.RWMutex
|
|
}
|
|
|
|
// NewRetryExecutor creates a new retry executor with the given configuration
|
|
func NewRetryExecutor(config RetryConfig, logger Logger) *RetryExecutor {
|
|
if config.MaxAttempts < 1 {
|
|
config.MaxAttempts = 1
|
|
}
|
|
if config.Multiplier < 1.0 {
|
|
config.Multiplier = 1.0
|
|
}
|
|
return &RetryExecutor{
|
|
BaseRecoveryMechanism: NewBaseRecoveryMechanism("RetryExecutor", logger),
|
|
config: config,
|
|
totalRetries: 0,
|
|
maxRetriesHit: 0,
|
|
}
|
|
}
|
|
|
|
// ExecuteWithContext executes a function with retry logic
|
|
func (re *RetryExecutor) ExecuteWithContext(ctx context.Context, fn func() error) error {
|
|
re.RecordRequest()
|
|
|
|
var lastErr error
|
|
for attempt := 1; attempt <= re.config.MaxAttempts; attempt++ {
|
|
// Check context before attempting
|
|
select {
|
|
case <-ctx.Done():
|
|
re.RecordFailure()
|
|
return ctx.Err()
|
|
default:
|
|
}
|
|
|
|
// Execute the function
|
|
lastErr = fn()
|
|
|
|
if lastErr == nil {
|
|
re.RecordSuccess()
|
|
if attempt > 1 {
|
|
re.LogInfo("Succeeded after %d attempts", attempt)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Check if error is retryable
|
|
if !re.isRetryableError(lastErr) {
|
|
re.LogDebug("Error is not retryable: %v", lastErr)
|
|
re.RecordFailure()
|
|
return lastErr
|
|
}
|
|
|
|
// Don't retry if this was the last attempt
|
|
if attempt >= re.config.MaxAttempts {
|
|
atomic.AddInt64(&re.maxRetriesHit, 1)
|
|
re.LogError("Max retries (%d) exhausted", re.config.MaxAttempts)
|
|
break
|
|
}
|
|
|
|
// Calculate and apply delay
|
|
delay := re.calculateDelay(attempt)
|
|
re.LogInfo("Attempt %d failed: %v, retrying in %v", attempt, lastErr, delay)
|
|
|
|
atomic.AddInt64(&re.totalRetries, 1)
|
|
re.retryTimeMutex.Lock()
|
|
re.lastRetryTime = time.Now()
|
|
re.retryTimeMutex.Unlock()
|
|
|
|
select {
|
|
case <-time.After(delay):
|
|
// Continue to next attempt
|
|
case <-ctx.Done():
|
|
re.RecordFailure()
|
|
return fmt.Errorf("retry cancelled: %w", ctx.Err())
|
|
}
|
|
}
|
|
|
|
re.RecordFailure()
|
|
return fmt.Errorf("all retry attempts failed: %w", lastErr)
|
|
}
|
|
|
|
// Execute executes a function with retry logic (legacy method)
|
|
func (re *RetryExecutor) Execute(ctx context.Context, fn func() error) error {
|
|
return re.ExecuteWithContext(ctx, fn)
|
|
}
|
|
|
|
// isRetryableError determines if an error should trigger a retry
|
|
func (re *RetryExecutor) isRetryableError(err error) bool {
|
|
if err == nil {
|
|
return false
|
|
}
|
|
|
|
errStr := strings.ToLower(err.Error())
|
|
|
|
// Check for retryable error patterns
|
|
for _, pattern := range re.config.RetryableErrors {
|
|
if strings.Contains(errStr, strings.ToLower(pattern)) {
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Check for HTTP errors
|
|
if httpErr, ok := err.(*HTTPError); ok {
|
|
for _, code := range re.config.RetryableStatusCodes {
|
|
if httpErr.StatusCode == code {
|
|
return true
|
|
}
|
|
}
|
|
// Also retry on any 5xx error
|
|
if httpErr.StatusCode >= 500 && httpErr.StatusCode < 600 {
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Check for OIDC errors
|
|
if oidcErr, ok := err.(*OIDCError); ok {
|
|
return oidcErr.IsRetryable()
|
|
}
|
|
|
|
// Check for context errors (don't retry these)
|
|
if err == context.Canceled || err == context.DeadlineExceeded {
|
|
return false
|
|
}
|
|
|
|
// Default: don't retry unknown errors
|
|
return false
|
|
}
|
|
|
|
// calculateDelay calculates the delay before the next retry attempt
|
|
func (re *RetryExecutor) calculateDelay(attempt int) time.Duration {
|
|
// Exponential backoff
|
|
delay := float64(re.config.InitialDelay) * math.Pow(re.config.Multiplier, float64(attempt-1))
|
|
|
|
// Cap at max delay
|
|
if delay > float64(re.config.MaxDelay) {
|
|
delay = float64(re.config.MaxDelay)
|
|
}
|
|
|
|
// Add jitter
|
|
if re.config.RandomizationFactor > 0 {
|
|
jitter := delay * re.config.RandomizationFactor
|
|
minDelay := delay - jitter
|
|
maxDelay := delay + jitter
|
|
delay = minDelay + rand.Float64()*(maxDelay-minDelay)
|
|
}
|
|
|
|
return time.Duration(delay)
|
|
}
|
|
|
|
// Reset resets the retry executor state
|
|
func (re *RetryExecutor) Reset() {
|
|
atomic.StoreInt64(&re.totalRetries, 0)
|
|
atomic.StoreInt64(&re.maxRetriesHit, 0)
|
|
atomic.StoreInt64(&re.totalRequests, 0)
|
|
atomic.StoreInt64(&re.successCount, 0)
|
|
atomic.StoreInt64(&re.failureCount, 0)
|
|
|
|
re.retryTimeMutex.Lock()
|
|
re.lastRetryTime = time.Time{}
|
|
re.retryTimeMutex.Unlock()
|
|
}
|
|
|
|
// IsAvailable always returns true for retry executor
|
|
func (re *RetryExecutor) IsAvailable() bool {
|
|
return true
|
|
}
|
|
|
|
// GetMetrics returns comprehensive metrics about the retry executor
|
|
func (re *RetryExecutor) GetMetrics() map[string]interface{} {
|
|
metrics := re.GetBaseMetrics()
|
|
|
|
metrics["totalRetries"] = atomic.LoadInt64(&re.totalRetries)
|
|
metrics["maxRetriesHit"] = atomic.LoadInt64(&re.maxRetriesHit)
|
|
|
|
re.retryTimeMutex.RLock()
|
|
if !re.lastRetryTime.IsZero() {
|
|
metrics["lastRetryTime"] = re.lastRetryTime.Format(time.RFC3339)
|
|
metrics["timeSinceLastRetry"] = time.Since(re.lastRetryTime).String()
|
|
} else {
|
|
metrics["lastRetryTime"] = "never"
|
|
}
|
|
re.retryTimeMutex.RUnlock()
|
|
|
|
// Configuration
|
|
metrics["config"] = map[string]interface{}{
|
|
"maxAttempts": re.config.MaxAttempts,
|
|
"initialDelay": re.config.InitialDelay.String(),
|
|
"maxDelay": re.config.MaxDelay.String(),
|
|
"multiplier": re.config.Multiplier,
|
|
"randomizationFactor": re.config.RandomizationFactor,
|
|
}
|
|
|
|
// Calculate average retries per request
|
|
totalRequests := atomic.LoadInt64(&re.totalRequests)
|
|
if totalRequests > 0 {
|
|
avgRetries := float64(atomic.LoadInt64(&re.totalRetries)) / float64(totalRequests)
|
|
metrics["averageRetriesPerRequest"] = fmt.Sprintf("%.2f", avgRetries)
|
|
}
|
|
|
|
return metrics
|
|
}
|
|
|
|
// RecoveryMetrics aggregates metrics from multiple recovery mechanisms
|
|
type RecoveryMetrics struct {
|
|
mechanisms map[string]ErrorRecoveryMechanism
|
|
mu sync.RWMutex
|
|
}
|
|
|
|
// NewRecoveryMetrics creates a new recovery metrics aggregator
|
|
func NewRecoveryMetrics() *RecoveryMetrics {
|
|
return &RecoveryMetrics{
|
|
mechanisms: make(map[string]ErrorRecoveryMechanism),
|
|
}
|
|
}
|
|
|
|
// RegisterMechanism registers a recovery mechanism for metrics collection
|
|
func (rm *RecoveryMetrics) RegisterMechanism(name string, mechanism ErrorRecoveryMechanism) {
|
|
rm.mu.Lock()
|
|
defer rm.mu.Unlock()
|
|
rm.mechanisms[name] = mechanism
|
|
}
|
|
|
|
// UnregisterMechanism removes a recovery mechanism from metrics collection
|
|
func (rm *RecoveryMetrics) UnregisterMechanism(name string) {
|
|
rm.mu.Lock()
|
|
defer rm.mu.Unlock()
|
|
delete(rm.mechanisms, name)
|
|
}
|
|
|
|
// GetAllMetrics returns aggregated metrics from all registered mechanisms
|
|
func (rm *RecoveryMetrics) GetAllMetrics() map[string]interface{} {
|
|
rm.mu.RLock()
|
|
defer rm.mu.RUnlock()
|
|
|
|
allMetrics := make(map[string]interface{})
|
|
for name, mechanism := range rm.mechanisms {
|
|
allMetrics[name] = mechanism.GetMetrics()
|
|
}
|
|
|
|
// Add summary statistics
|
|
totalRequests := int64(0)
|
|
totalSuccesses := int64(0)
|
|
totalFailures := int64(0)
|
|
|
|
for _, mechanism := range rm.mechanisms {
|
|
metrics := mechanism.GetMetrics()
|
|
if requests, ok := metrics["totalRequests"].(int64); ok {
|
|
totalRequests += requests
|
|
}
|
|
if successes, ok := metrics["successCount"].(int64); ok {
|
|
totalSuccesses += successes
|
|
}
|
|
if failures, ok := metrics["failureCount"].(int64); ok {
|
|
totalFailures += failures
|
|
}
|
|
}
|
|
|
|
allMetrics["summary"] = map[string]interface{}{
|
|
"totalMechanisms": len(rm.mechanisms),
|
|
"totalRequests": totalRequests,
|
|
"totalSuccesses": totalSuccesses,
|
|
"totalFailures": totalFailures,
|
|
}
|
|
|
|
if totalRequests > 0 {
|
|
successRate := float64(totalSuccesses) / float64(totalRequests) * 100
|
|
allMetrics["summary"].(map[string]interface{})["overallSuccessRate"] = fmt.Sprintf("%.2f%%", successRate)
|
|
}
|
|
|
|
return allMetrics
|
|
}
|
|
|
|
// GetMechanismMetrics returns metrics for a specific mechanism
|
|
func (rm *RecoveryMetrics) GetMechanismMetrics(name string) (map[string]interface{}, bool) {
|
|
rm.mu.RLock()
|
|
defer rm.mu.RUnlock()
|
|
|
|
if mechanism, exists := rm.mechanisms[name]; exists {
|
|
return mechanism.GetMetrics(), true
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
// HealthCheck performs a health check on all registered mechanisms
|
|
func (rm *RecoveryMetrics) HealthCheck() map[string]interface{} {
|
|
rm.mu.RLock()
|
|
defer rm.mu.RUnlock()
|
|
|
|
health := make(map[string]interface{})
|
|
healthyCount := 0
|
|
unhealthyCount := 0
|
|
|
|
for name, mechanism := range rm.mechanisms {
|
|
if mechanism.IsAvailable() {
|
|
health[name] = "healthy"
|
|
healthyCount++
|
|
} else {
|
|
health[name] = "unhealthy"
|
|
unhealthyCount++
|
|
}
|
|
}
|
|
|
|
overallHealth := "healthy"
|
|
if unhealthyCount > 0 {
|
|
if healthyCount > 0 {
|
|
overallHealth = "degraded"
|
|
} else {
|
|
overallHealth = "unhealthy"
|
|
}
|
|
}
|
|
|
|
return map[string]interface{}{
|
|
"status": overallHealth,
|
|
"mechanisms": health,
|
|
"healthy": healthyCount,
|
|
"unhealthy": unhealthyCount,
|
|
"timestamp": time.Now().Format(time.RFC3339),
|
|
}
|
|
}
|
|
|
|
// HTTPMetricsHandler creates an HTTP handler for serving recovery metrics
|
|
func (rm *RecoveryMetrics) HTTPMetricsHandler() http.HandlerFunc {
|
|
return func(w http.ResponseWriter, r *http.Request) {
|
|
metrics := rm.GetAllMetrics()
|
|
health := rm.HealthCheck()
|
|
|
|
response := map[string]interface{}{
|
|
"metrics": metrics,
|
|
"health": health,
|
|
}
|
|
|
|
// Would normally use json.Marshal here, but keeping it simple for the module
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(http.StatusOK)
|
|
fmt.Fprintf(w, "%v", response)
|
|
}
|
|
}
|