Files
traefikoidc/internal/cache/resilience/health_check.go
T
lukaszraczylo e64fc7f730 Add redis support for distributed caching (#83)
* Add redis support for distributed caching

* Move towards the self-provided Redis connection pool and RESP protocol implementation.
Official redis client library won't work with yaegi.

* fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* fixup! fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* fixup! fixup! fixup! fixup! fixup! Move towards the self-provided Redis connection pool and RESP protocol implementation. Official redis client library won't work with yaegi.

* ... and another all nighter.

* fixup! ... and another all nighter.

* fixup! fixup! ... and another all nighter.

* fixup! fixup! fixup! ... and another all nighter.

* Resolve issue #85 by adding ability to set custom claims in JWT tokens

* Remove redundant validation in auth middleware ( issue #89 )

* Add ability to set cookie prefix for session cookies ( #87 )

* fixup! Add ability to set cookie prefix for session cookies ( #87 )

* Add ability to set cookie max age - issue #91

* Potential fix for code scanning alert no. 10: Size computation for allocation may overflow

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

* fixup! Merge main into 0.8.0-redis: resolve conflicts

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-11-30 02:18:46 +00:00

376 lines
9.0 KiB
Go

// Package resilience provides resilience patterns for cache backends.
package resilience
import (
"context"
"sync"
"sync/atomic"
"time"
)
// HealthStatus represents the health status of a backend
type HealthStatus int32
const (
// HealthUnknown indicates unknown health status
HealthUnknown HealthStatus = iota
// HealthHealthy indicates the backend is healthy
HealthHealthy
// HealthDegraded indicates the backend is degraded but operational
HealthDegraded
// HealthUnhealthy indicates the backend is unhealthy
HealthUnhealthy
)
// String returns the string representation of the health status
func (h HealthStatus) String() string {
switch h {
case HealthHealthy:
return "healthy"
case HealthDegraded:
return "degraded"
case HealthUnhealthy:
return "unhealthy"
default:
return "unknown"
}
}
// HealthCheckConfig holds configuration for the health checker
type HealthCheckConfig struct {
// CheckInterval is how often to check health
CheckInterval time.Duration
// Timeout is the timeout for each health check
Timeout time.Duration
// HealthyThreshold is the number of consecutive successes to become healthy
HealthyThreshold int
// UnhealthyThreshold is the number of consecutive failures to become unhealthy
UnhealthyThreshold int
// DegradedThreshold is the latency threshold in ms to mark as degraded
DegradedThreshold time.Duration
// OnStatusChange is called when health status changes
OnStatusChange func(from, to HealthStatus)
// CheckFunc is the function to check health
CheckFunc func(ctx context.Context) error
}
// DefaultHealthCheckConfig returns default configuration
func DefaultHealthCheckConfig() *HealthCheckConfig {
return &HealthCheckConfig{
CheckInterval: 30 * time.Second,
Timeout: 5 * time.Second,
HealthyThreshold: 3,
UnhealthyThreshold: 3,
DegradedThreshold: 100 * time.Millisecond,
}
}
// HealthChecker monitors the health of a backend
type HealthChecker struct {
config *HealthCheckConfig
// Status tracking
status atomic.Int32
consecutiveSuccesses atomic.Int32
consecutiveFailures atomic.Int32
// Timing
lastCheckTime time.Time
lastSuccessTime time.Time
lastFailureTime time.Time
averageLatency atomic.Int64
timeMu sync.RWMutex
// Metrics
totalChecks atomic.Int64
totalSuccesses atomic.Int64
totalFailures atomic.Int64
statusChanges atomic.Int64
// Lifecycle
ticker *time.Ticker
stopChan chan struct{}
stopped atomic.Bool
wg sync.WaitGroup
}
// NewHealthChecker creates a new health checker
func NewHealthChecker(config *HealthCheckConfig) *HealthChecker {
if config == nil {
config = DefaultHealthCheckConfig()
}
hc := &HealthChecker{
config: config,
stopChan: make(chan struct{}),
}
hc.status.Store(int32(HealthUnknown))
return hc
}
// Start begins health checking
func (hc *HealthChecker) Start() {
if hc.stopped.Load() {
return
}
hc.ticker = time.NewTicker(hc.config.CheckInterval)
hc.wg.Add(1)
go hc.checkLoop()
}
// Stop stops health checking
func (hc *HealthChecker) Stop() {
if hc.stopped.Swap(true) {
return // Already stopped
}
close(hc.stopChan)
if hc.ticker != nil {
hc.ticker.Stop()
}
hc.wg.Wait()
}
// checkLoop runs periodic health checks
func (hc *HealthChecker) checkLoop() {
defer hc.wg.Done()
// Initial check - log error but continue
if err := hc.Check(context.Background()); err != nil {
// Error is already tracked in Check() method, no need to log again
_ = err
}
for {
select {
case <-hc.stopChan:
return
case <-hc.ticker.C:
ctx, cancel := context.WithTimeout(context.Background(), hc.config.Timeout)
if err := hc.Check(ctx); err != nil {
// Error is already tracked in Check() method, no need to log again
_ = err
}
cancel()
}
}
}
// Check performs a health check
func (hc *HealthChecker) Check(ctx context.Context) error {
if hc.config.CheckFunc == nil {
return nil
}
hc.totalChecks.Add(1)
start := time.Now()
// Create timeout context if not already set
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, hc.config.Timeout)
defer cancel()
}
// Perform health check
err := hc.config.CheckFunc(ctx)
latency := time.Since(start)
hc.timeMu.Lock()
hc.lastCheckTime = time.Now()
hc.timeMu.Unlock()
// Update average latency
hc.updateAverageLatency(latency)
if err != nil {
hc.recordFailure()
} else {
hc.recordSuccess(latency)
}
return err
}
// recordSuccess records a successful health check
func (hc *HealthChecker) recordSuccess(latency time.Duration) {
hc.totalSuccesses.Add(1)
successes := hc.consecutiveSuccesses.Add(1)
hc.consecutiveFailures.Store(0)
hc.timeMu.Lock()
hc.lastSuccessTime = time.Now()
hc.timeMu.Unlock()
currentStatus := hc.GetStatus()
newStatus := currentStatus
// Check if we should become healthy
if successes >= int32(hc.config.HealthyThreshold) {
if latency > hc.config.DegradedThreshold {
newStatus = HealthDegraded
} else {
newStatus = HealthHealthy
}
}
if newStatus != currentStatus {
hc.setStatus(newStatus)
}
}
// recordFailure records a failed health check
func (hc *HealthChecker) recordFailure() {
hc.totalFailures.Add(1)
failures := hc.consecutiveFailures.Add(1)
hc.consecutiveSuccesses.Store(0)
hc.timeMu.Lock()
hc.lastFailureTime = time.Now()
hc.timeMu.Unlock()
// Check if we should become unhealthy
if failures >= int32(hc.config.UnhealthyThreshold) {
hc.setStatus(HealthUnhealthy)
}
}
// updateAverageLatency updates the rolling average latency
func (hc *HealthChecker) updateAverageLatency(latency time.Duration) {
// Simple exponential moving average
currentAvg := time.Duration(hc.averageLatency.Load())
if currentAvg == 0 {
hc.averageLatency.Store(int64(latency))
} else {
// Weight: 0.2 for new value, 0.8 for old average
newAvg := (currentAvg*4 + latency) / 5
hc.averageLatency.Store(int64(newAvg))
}
}
// GetStatus returns the current health status
func (hc *HealthChecker) GetStatus() HealthStatus {
return HealthStatus(hc.status.Load())
}
// setStatus changes the health status
func (hc *HealthChecker) setStatus(newStatus HealthStatus) {
oldStatus := HealthStatus(hc.status.Swap(int32(newStatus)))
if oldStatus != newStatus {
hc.statusChanges.Add(1)
if hc.config.OnStatusChange != nil {
hc.config.OnStatusChange(oldStatus, newStatus)
}
}
}
// IsHealthy returns true if the backend is healthy or degraded
func (hc *HealthChecker) IsHealthy() bool {
status := hc.GetStatus()
return status == HealthHealthy || status == HealthDegraded
}
// LastCheckTime returns the time of the last health check
func (hc *HealthChecker) LastCheckTime() time.Time {
hc.timeMu.RLock()
defer hc.timeMu.RUnlock()
return hc.lastCheckTime
}
// HealthScore returns a health score between 0.0 (unhealthy) and 1.0 (healthy)
func (hc *HealthChecker) HealthScore() float64 {
status := hc.GetStatus()
switch status {
case HealthHealthy:
return 1.0
case HealthDegraded:
return 0.7
case HealthUnhealthy:
return 0.0
default:
return 0.5
}
}
// Stats returns health checker statistics
func (hc *HealthChecker) Stats() HealthCheckerStats {
hc.timeMu.RLock()
lastCheck := hc.lastCheckTime
lastSuccess := hc.lastSuccessTime
lastFailure := hc.lastFailureTime
hc.timeMu.RUnlock()
totalChecks := hc.totalChecks.Load()
totalSuccesses := hc.totalSuccesses.Load()
totalFailures := hc.totalFailures.Load()
successRate := float64(0)
if totalChecks > 0 {
successRate = float64(totalSuccesses) / float64(totalChecks)
}
return HealthCheckerStats{
Status: hc.GetStatus(),
ConsecutiveSuccesses: hc.consecutiveSuccesses.Load(),
ConsecutiveFailures: hc.consecutiveFailures.Load(),
TotalChecks: totalChecks,
TotalSuccesses: totalSuccesses,
TotalFailures: totalFailures,
SuccessRate: successRate,
AverageLatency: time.Duration(hc.averageLatency.Load()),
StatusChanges: hc.statusChanges.Load(),
LastCheckTime: lastCheck,
LastSuccessTime: lastSuccess,
LastFailureTime: lastFailure,
HealthScore: hc.HealthScore(),
}
}
// HealthCheckerStats holds statistics for the health checker
type HealthCheckerStats struct {
Status HealthStatus
ConsecutiveSuccesses int32
ConsecutiveFailures int32
TotalChecks int64
TotalSuccesses int64
TotalFailures int64
SuccessRate float64
AverageLatency time.Duration
StatusChanges int64
LastCheckTime time.Time
LastSuccessTime time.Time
LastFailureTime time.Time
HealthScore float64
}
// Reset resets the health checker statistics
func (hc *HealthChecker) Reset() {
hc.status.Store(int32(HealthUnknown))
hc.consecutiveSuccesses.Store(0)
hc.consecutiveFailures.Store(0)
hc.totalChecks.Store(0)
hc.totalSuccesses.Store(0)
hc.totalFailures.Store(0)
hc.statusChanges.Store(0)
hc.averageLatency.Store(0)
now := time.Now()
hc.timeMu.Lock()
hc.lastCheckTime = now
hc.lastSuccessTime = now
hc.lastFailureTime = now
hc.timeMu.Unlock()
}