Files
traefikoidc/internal/recovery/metrics.go
T
lukaszraczylo e64fc7f730 Add redis support for distributed caching (#83)
* Add redis support for distributed caching

* Move towards the self-provided Redis connection pool and RESP protocol implementation.
Official redis client library won't work with yaegi.

* fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* fixup! fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* fixup! fixup! fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* ... and another all nighter.

* fixup! ... and another all nighter.

* fixup! fixup! ... and another all nighter.

* fixup! fixup! fixup! ... and another all nighter.

* Resolve issue #85 by adding ability to set custom claims in JWT tokens

* Remove redundant validation in auth middleware ( issue #89 )

* Add ability to set cookie prefix for session cookies ( #87 )

* fixup! Add ability to set cookie prefix for session cookies ( #87 )

* Add ability to set cookie max age - issue #91

* Potential fix for code scanning alert no. 10: Size computation for allocation may overflow

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

* fixup! Merge main into 0.8.0-redis: resolve conflicts

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-11-30 02:18:46 +00:00

392 lines
10 KiB
Go

// Package recovery provides error recovery and resilience mechanisms for OIDC authentication.
package recovery
import (
"context"
"fmt"
"math"
"math/rand"
"net/http"
"strings"
"sync"
"sync/atomic"
"time"
)
// RetryConfig defines configuration for the retry executor
type RetryConfig struct {
// MaxAttempts is the maximum number of retry attempts
MaxAttempts int
// InitialDelay is the initial delay between retries
InitialDelay time.Duration
// MaxDelay is the maximum delay between retries
MaxDelay time.Duration
// Multiplier is the backoff multiplier
Multiplier float64
// RandomizationFactor adds jitter to delays (0.0 to 1.0)
RandomizationFactor float64
// RetryableErrors defines which errors should trigger a retry
RetryableErrors []string
// RetryableStatusCodes defines which HTTP status codes should trigger a retry
RetryableStatusCodes []int
}
// DefaultRetryConfig returns sensible default retry configuration
func DefaultRetryConfig() RetryConfig {
return RetryConfig{
MaxAttempts: 3,
InitialDelay: 100 * time.Millisecond,
MaxDelay: 30 * time.Second,
Multiplier: 2.0,
RandomizationFactor: 0.1,
RetryableErrors: []string{"connection refused", "timeout", "EOF"},
RetryableStatusCodes: []int{408, 429, 500, 502, 503, 504},
}
}
// RetryExecutor implements retry logic with exponential backoff
type RetryExecutor struct {
*BaseRecoveryMechanism
config RetryConfig
// Metrics
totalRetries int64
maxRetriesHit int64
lastRetryTime time.Time
retryTimeMutex sync.RWMutex
}
// NewRetryExecutor creates a new retry executor with the given configuration
func NewRetryExecutor(config RetryConfig, logger Logger) *RetryExecutor {
if config.MaxAttempts < 1 {
config.MaxAttempts = 1
}
if config.Multiplier < 1.0 {
config.Multiplier = 1.0
}
return &RetryExecutor{
BaseRecoveryMechanism: NewBaseRecoveryMechanism("RetryExecutor", logger),
config: config,
totalRetries: 0,
maxRetriesHit: 0,
}
}
// ExecuteWithContext executes a function with retry logic
func (re *RetryExecutor) ExecuteWithContext(ctx context.Context, fn func() error) error {
re.RecordRequest()
var lastErr error
for attempt := 1; attempt <= re.config.MaxAttempts; attempt++ {
// Check context before attempting
select {
case <-ctx.Done():
re.RecordFailure()
return ctx.Err()
default:
}
// Execute the function
lastErr = fn()
if lastErr == nil {
re.RecordSuccess()
if attempt > 1 {
re.LogInfo("Succeeded after %d attempts", attempt)
}
return nil
}
// Check if error is retryable
if !re.isRetryableError(lastErr) {
re.LogDebug("Error is not retryable: %v", lastErr)
re.RecordFailure()
return lastErr
}
// Don't retry if this was the last attempt
if attempt >= re.config.MaxAttempts {
atomic.AddInt64(&re.maxRetriesHit, 1)
re.LogError("Max retries (%d) exhausted", re.config.MaxAttempts)
break
}
// Calculate and apply delay
delay := re.calculateDelay(attempt)
re.LogInfo("Attempt %d failed: %v, retrying in %v", attempt, lastErr, delay)
atomic.AddInt64(&re.totalRetries, 1)
re.retryTimeMutex.Lock()
re.lastRetryTime = time.Now()
re.retryTimeMutex.Unlock()
select {
case <-time.After(delay):
// Continue to next attempt
case <-ctx.Done():
re.RecordFailure()
return fmt.Errorf("retry cancelled: %w", ctx.Err())
}
}
re.RecordFailure()
return fmt.Errorf("all retry attempts failed: %w", lastErr)
}
// Execute executes a function with retry logic (legacy method)
func (re *RetryExecutor) Execute(ctx context.Context, fn func() error) error {
return re.ExecuteWithContext(ctx, fn)
}
// isRetryableError determines if an error should trigger a retry
func (re *RetryExecutor) isRetryableError(err error) bool {
if err == nil {
return false
}
errStr := strings.ToLower(err.Error())
// Check for retryable error patterns
for _, pattern := range re.config.RetryableErrors {
if strings.Contains(errStr, strings.ToLower(pattern)) {
return true
}
}
// Check for HTTP errors
if httpErr, ok := err.(*HTTPError); ok {
for _, code := range re.config.RetryableStatusCodes {
if httpErr.StatusCode == code {
return true
}
}
// Also retry on any 5xx error
if httpErr.StatusCode >= 500 && httpErr.StatusCode < 600 {
return true
}
}
// Check for OIDC errors
if oidcErr, ok := err.(*OIDCError); ok {
return oidcErr.IsRetryable()
}
// Check for context errors (don't retry these)
if err == context.Canceled || err == context.DeadlineExceeded {
return false
}
// Default: don't retry unknown errors
return false
}
// calculateDelay calculates the delay before the next retry attempt
func (re *RetryExecutor) calculateDelay(attempt int) time.Duration {
// Exponential backoff
delay := float64(re.config.InitialDelay) * math.Pow(re.config.Multiplier, float64(attempt-1))
// Cap at max delay
if delay > float64(re.config.MaxDelay) {
delay = float64(re.config.MaxDelay)
}
// Add jitter
if re.config.RandomizationFactor > 0 {
jitter := delay * re.config.RandomizationFactor
minDelay := delay - jitter
maxDelay := delay + jitter
delay = minDelay + rand.Float64()*(maxDelay-minDelay)
}
return time.Duration(delay)
}
// Reset resets the retry executor state
func (re *RetryExecutor) Reset() {
atomic.StoreInt64(&re.totalRetries, 0)
atomic.StoreInt64(&re.maxRetriesHit, 0)
atomic.StoreInt64(&re.totalRequests, 0)
atomic.StoreInt64(&re.successCount, 0)
atomic.StoreInt64(&re.failureCount, 0)
re.retryTimeMutex.Lock()
re.lastRetryTime = time.Time{}
re.retryTimeMutex.Unlock()
}
// IsAvailable always returns true for retry executor
func (re *RetryExecutor) IsAvailable() bool {
return true
}
// GetMetrics returns comprehensive metrics about the retry executor
func (re *RetryExecutor) GetMetrics() map[string]interface{} {
metrics := re.GetBaseMetrics()
metrics["totalRetries"] = atomic.LoadInt64(&re.totalRetries)
metrics["maxRetriesHit"] = atomic.LoadInt64(&re.maxRetriesHit)
re.retryTimeMutex.RLock()
if !re.lastRetryTime.IsZero() {
metrics["lastRetryTime"] = re.lastRetryTime.Format(time.RFC3339)
metrics["timeSinceLastRetry"] = time.Since(re.lastRetryTime).String()
} else {
metrics["lastRetryTime"] = "never"
}
re.retryTimeMutex.RUnlock()
// Configuration
metrics["config"] = map[string]interface{}{
"maxAttempts": re.config.MaxAttempts,
"initialDelay": re.config.InitialDelay.String(),
"maxDelay": re.config.MaxDelay.String(),
"multiplier": re.config.Multiplier,
"randomizationFactor": re.config.RandomizationFactor,
}
// Calculate average retries per request
totalRequests := atomic.LoadInt64(&re.totalRequests)
if totalRequests > 0 {
avgRetries := float64(atomic.LoadInt64(&re.totalRetries)) / float64(totalRequests)
metrics["averageRetriesPerRequest"] = fmt.Sprintf("%.2f", avgRetries)
}
return metrics
}
// RecoveryMetrics aggregates metrics from multiple recovery mechanisms
type RecoveryMetrics struct {
mechanisms map[string]ErrorRecoveryMechanism
mu sync.RWMutex
}
// NewRecoveryMetrics creates a new recovery metrics aggregator
func NewRecoveryMetrics() *RecoveryMetrics {
return &RecoveryMetrics{
mechanisms: make(map[string]ErrorRecoveryMechanism),
}
}
// RegisterMechanism registers a recovery mechanism for metrics collection
func (rm *RecoveryMetrics) RegisterMechanism(name string, mechanism ErrorRecoveryMechanism) {
rm.mu.Lock()
defer rm.mu.Unlock()
rm.mechanisms[name] = mechanism
}
// UnregisterMechanism removes a recovery mechanism from metrics collection
func (rm *RecoveryMetrics) UnregisterMechanism(name string) {
rm.mu.Lock()
defer rm.mu.Unlock()
delete(rm.mechanisms, name)
}
// GetAllMetrics returns aggregated metrics from all registered mechanisms
func (rm *RecoveryMetrics) GetAllMetrics() map[string]interface{} {
rm.mu.RLock()
defer rm.mu.RUnlock()
allMetrics := make(map[string]interface{})
for name, mechanism := range rm.mechanisms {
allMetrics[name] = mechanism.GetMetrics()
}
// Add summary statistics
totalRequests := int64(0)
totalSuccesses := int64(0)
totalFailures := int64(0)
for _, mechanism := range rm.mechanisms {
metrics := mechanism.GetMetrics()
if requests, ok := metrics["totalRequests"].(int64); ok {
totalRequests += requests
}
if successes, ok := metrics["successCount"].(int64); ok {
totalSuccesses += successes
}
if failures, ok := metrics["failureCount"].(int64); ok {
totalFailures += failures
}
}
allMetrics["summary"] = map[string]interface{}{
"totalMechanisms": len(rm.mechanisms),
"totalRequests": totalRequests,
"totalSuccesses": totalSuccesses,
"totalFailures": totalFailures,
}
if totalRequests > 0 {
successRate := float64(totalSuccesses) / float64(totalRequests) * 100
allMetrics["summary"].(map[string]interface{})["overallSuccessRate"] = fmt.Sprintf("%.2f%%", successRate)
}
return allMetrics
}
// GetMechanismMetrics returns metrics for a specific mechanism
func (rm *RecoveryMetrics) GetMechanismMetrics(name string) (map[string]interface{}, bool) {
rm.mu.RLock()
defer rm.mu.RUnlock()
if mechanism, exists := rm.mechanisms[name]; exists {
return mechanism.GetMetrics(), true
}
return nil, false
}
// HealthCheck performs a health check on all registered mechanisms
func (rm *RecoveryMetrics) HealthCheck() map[string]interface{} {
rm.mu.RLock()
defer rm.mu.RUnlock()
health := make(map[string]interface{})
healthyCount := 0
unhealthyCount := 0
for name, mechanism := range rm.mechanisms {
if mechanism.IsAvailable() {
health[name] = "healthy"
healthyCount++
} else {
health[name] = "unhealthy"
unhealthyCount++
}
}
overallHealth := "healthy"
if unhealthyCount > 0 {
if healthyCount > 0 {
overallHealth = "degraded"
} else {
overallHealth = "unhealthy"
}
}
return map[string]interface{}{
"status": overallHealth,
"mechanisms": health,
"healthy": healthyCount,
"unhealthy": unhealthyCount,
"timestamp": time.Now().Format(time.RFC3339),
}
}
// HTTPMetricsHandler creates an HTTP handler for serving recovery metrics
func (rm *RecoveryMetrics) HTTPMetricsHandler() http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
metrics := rm.GetAllMetrics()
health := rm.HealthCheck()
response := map[string]interface{}{
"metrics": metrics,
"health": health,
}
// Would normally use json.Marshal here, but keeping it simple for the module
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
fmt.Fprintf(w, "%v", response)
}
}