mirror of
https://github.com/lukaszraczylo/kportal.git
synced 2026-06-14 02:31:39 +00:00
healtcheck improvements (#4)
* Advanced healtchecks. * Add watchdog for stale connections handling.
This commit is contained in:
+109
-11
@@ -12,11 +12,6 @@ import (
|
||||
"github.com/nvm/kportal/internal/logger"
|
||||
)
|
||||
|
||||
const (
|
||||
healthCheckInterval = 5 * time.Second
|
||||
healthCheckTimeout = 2 * time.Second
|
||||
)
|
||||
|
||||
// StatusUpdater is an interface for updating forward status
|
||||
type StatusUpdater interface {
|
||||
UpdateStatus(id string, status string)
|
||||
@@ -34,12 +29,15 @@ type Manager struct {
|
||||
portForwarder *k8s.PortForwarder
|
||||
portChecker *PortChecker
|
||||
healthChecker *healthcheck.Checker
|
||||
watchdog *Watchdog
|
||||
verbose bool
|
||||
currentConfig *config.Config
|
||||
statusUI StatusUpdater
|
||||
}
|
||||
|
||||
// NewManager creates a new forward Manager.
|
||||
// The health checker will be created with default settings and can be
|
||||
// reconfigured via SetConfig().
|
||||
func NewManager(verbose bool) (*Manager, error) {
|
||||
clientPool, err := k8s.NewClientPool()
|
||||
if err != nil {
|
||||
@@ -49,8 +47,13 @@ func NewManager(verbose bool) (*Manager, error) {
|
||||
resolver := k8s.NewResourceResolver(clientPool)
|
||||
portForwarder := k8s.NewPortForwarder(clientPool, resolver)
|
||||
|
||||
// Create health checker: check every 5 seconds with 2 second timeout
|
||||
healthChecker := healthcheck.NewChecker(healthCheckInterval, healthCheckTimeout)
|
||||
// Create health checker with defaults: check every 3 seconds with 2 second timeout
|
||||
// Will be reconfigured when config is loaded
|
||||
healthChecker := healthcheck.NewChecker(3*time.Second, 2*time.Second)
|
||||
|
||||
// Create watchdog with default settings: check every 30 seconds, 60 second hang threshold
|
||||
// Will be reconfigured when config is loaded
|
||||
watchdog := NewWatchdog(30*time.Second, 60*time.Second)
|
||||
|
||||
return &Manager{
|
||||
workers: make(map[string]*ForwardWorker),
|
||||
@@ -59,10 +62,56 @@ func NewManager(verbose bool) (*Manager, error) {
|
||||
portForwarder: portForwarder,
|
||||
portChecker: NewPortChecker(),
|
||||
healthChecker: healthChecker,
|
||||
watchdog: watchdog,
|
||||
verbose: verbose,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// configureHealthChecker creates a new health checker with settings from config
|
||||
func (m *Manager) configureHealthChecker(cfg *config.Config) {
|
||||
// Stop existing health checker
|
||||
if m.healthChecker != nil {
|
||||
m.healthChecker.Stop()
|
||||
}
|
||||
|
||||
// Parse check method
|
||||
methodStr := cfg.GetHealthCheckMethod()
|
||||
var method healthcheck.CheckMethod
|
||||
switch methodStr {
|
||||
case "tcp-dial":
|
||||
method = healthcheck.CheckMethodTCPDial
|
||||
case "data-transfer":
|
||||
method = healthcheck.CheckMethodDataTransfer
|
||||
default:
|
||||
method = healthcheck.CheckMethodDataTransfer
|
||||
}
|
||||
|
||||
// Create new health checker with config settings
|
||||
m.healthChecker = healthcheck.NewCheckerWithOptions(healthcheck.CheckerOptions{
|
||||
Interval: cfg.GetHealthCheckIntervalOrDefault(),
|
||||
Timeout: cfg.GetHealthCheckTimeoutOrDefault(),
|
||||
Method: method,
|
||||
MaxConnectionAge: cfg.GetMaxConnectionAge(),
|
||||
MaxIdleTime: cfg.GetMaxIdleTime(),
|
||||
})
|
||||
|
||||
// Configure TCP settings on port forwarder
|
||||
tcpKeepalive := cfg.GetTCPKeepalive()
|
||||
dialTimeout := cfg.GetDialTimeout()
|
||||
m.portForwarder.SetTCPKeepalive(tcpKeepalive)
|
||||
m.portForwarder.SetDialTimeout(dialTimeout)
|
||||
|
||||
logger.Info("Health checker and reliability configured", map[string]interface{}{
|
||||
"interval": cfg.GetHealthCheckIntervalOrDefault().String(),
|
||||
"timeout": cfg.GetHealthCheckTimeoutOrDefault().String(),
|
||||
"method": methodStr,
|
||||
"max_connection_age": cfg.GetMaxConnectionAge().String(),
|
||||
"max_idle_time": cfg.GetMaxIdleTime().String(),
|
||||
"tcp_keepalive": tcpKeepalive.String(),
|
||||
"dial_timeout": dialTimeout.String(),
|
||||
})
|
||||
}
|
||||
|
||||
// SetStatusUI sets the status updater for the manager
|
||||
func (m *Manager) SetStatusUI(ui StatusUpdater) {
|
||||
m.statusUI = ui
|
||||
@@ -76,6 +125,20 @@ func (m *Manager) Start(cfg *config.Config) error {
|
||||
|
||||
m.currentConfig = cfg
|
||||
|
||||
// Configure health checker with settings from config
|
||||
m.configureHealthChecker(cfg)
|
||||
|
||||
// Start watchdog
|
||||
watchdogPeriod := cfg.GetWatchdogPeriod()
|
||||
m.watchdog.checkInterval = watchdogPeriod
|
||||
m.watchdog.hangThreshold = watchdogPeriod * 2 // Hang threshold is 2x check interval
|
||||
m.watchdog.Start()
|
||||
|
||||
logger.Info("Watchdog started", map[string]interface{}{
|
||||
"check_interval": watchdogPeriod.String(),
|
||||
"hang_threshold": (watchdogPeriod * 2).String(),
|
||||
})
|
||||
|
||||
// Get all forwards from config
|
||||
forwards := cfg.GetAllForwards()
|
||||
|
||||
@@ -119,8 +182,9 @@ func (m *Manager) Start(cfg *config.Config) error {
|
||||
func (m *Manager) Stop() {
|
||||
log.Printf("Stopping all port-forwards...")
|
||||
|
||||
// Stop health checker first
|
||||
// Stop health checker and watchdog first
|
||||
m.healthChecker.Stop()
|
||||
m.watchdog.Stop()
|
||||
|
||||
m.workersMu.Lock()
|
||||
workers := make([]*ForwardWorker, 0, len(m.workers))
|
||||
@@ -273,21 +337,54 @@ func (m *Manager) startWorker(fwd config.Forward) error {
|
||||
m.statusUI.AddForward(fwd.ID(), &fwd)
|
||||
}
|
||||
|
||||
// Register with watchdog
|
||||
m.watchdog.RegisterWorker(fwd.ID(), func(forwardID string) {
|
||||
logger.Warn("Watchdog triggered reconnection for hung worker", map[string]interface{}{
|
||||
"forward_id": forwardID,
|
||||
})
|
||||
|
||||
// Find and trigger reconnect on hung worker
|
||||
m.workersMu.RLock()
|
||||
worker, exists := m.workers[forwardID]
|
||||
m.workersMu.RUnlock()
|
||||
|
||||
if exists {
|
||||
worker.TriggerReconnect("watchdog detected hung worker")
|
||||
}
|
||||
})
|
||||
|
||||
// Register with health checker
|
||||
m.healthChecker.Register(fwd.ID(), fwd.LocalPort, func(forwardID string, status healthcheck.Status, errorMsg string) {
|
||||
if m.statusUI != nil {
|
||||
m.statusUI.UpdateStatus(forwardID, string(status))
|
||||
// Send error separately if there is one
|
||||
if status == healthcheck.StatusUnhealthy && errorMsg != "" {
|
||||
if (status == healthcheck.StatusUnhealthy || status == healthcheck.StatusStale) && errorMsg != "" {
|
||||
if ui, ok := m.statusUI.(interface{ SetError(id, msg string) }); ok {
|
||||
ui.SetError(forwardID, errorMsg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle stale connections: trigger reconnection if retryOnStale is enabled
|
||||
if status == healthcheck.StatusStale && m.currentConfig.GetRetryOnStale() {
|
||||
logger.Info("Stale connection detected, triggering reconnection", map[string]interface{}{
|
||||
"forward_id": forwardID,
|
||||
"reason": errorMsg,
|
||||
})
|
||||
|
||||
// Find and notify the worker to reconnect
|
||||
m.workersMu.RLock()
|
||||
worker, exists := m.workers[forwardID]
|
||||
m.workersMu.RUnlock()
|
||||
|
||||
if exists {
|
||||
worker.TriggerReconnect("stale connection")
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Create and start worker
|
||||
worker := NewForwardWorker(fwd, m.portForwarder, m.verbose, m.statusUI, m.healthChecker)
|
||||
worker := NewForwardWorker(fwd, m.portForwarder, m.verbose, m.statusUI, m.healthChecker, m.watchdog)
|
||||
worker.Start()
|
||||
|
||||
// Store worker
|
||||
@@ -312,8 +409,9 @@ func (m *Manager) stopWorkerInternal(id string, removeFromUI bool) error {
|
||||
delete(m.workers, id)
|
||||
m.workersMu.Unlock()
|
||||
|
||||
// Unregister from health checker
|
||||
// Unregister from health checker and watchdog
|
||||
m.healthChecker.Unregister(id)
|
||||
m.watchdog.UnregisterWorker(id)
|
||||
|
||||
// Notify UI - either remove or update to disabled status
|
||||
if m.statusUI != nil {
|
||||
|
||||
@@ -0,0 +1,158 @@
|
||||
package forward
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/nvm/kportal/internal/logger"
|
||||
)
|
||||
|
||||
// Watchdog monitors worker goroutines to detect hung workers
|
||||
type Watchdog struct {
|
||||
mu sync.RWMutex
|
||||
workers map[string]*workerState // key: forward ID
|
||||
checkInterval time.Duration
|
||||
hangThreshold time.Duration // How long without heartbeat before considered hung
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// workerState tracks the health of a single worker
|
||||
type workerState struct {
|
||||
forwardID string
|
||||
lastHeartbeat time.Time
|
||||
heartbeatCount uint64
|
||||
isHung bool
|
||||
onHungCallback func(forwardID string)
|
||||
}
|
||||
|
||||
// NewWatchdog creates a new goroutine watchdog
|
||||
func NewWatchdog(checkInterval, hangThreshold time.Duration) *Watchdog {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
return &Watchdog{
|
||||
workers: make(map[string]*workerState),
|
||||
checkInterval: checkInterval,
|
||||
hangThreshold: hangThreshold,
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
}
|
||||
}
|
||||
|
||||
// Start begins the watchdog monitoring loop
|
||||
func (w *Watchdog) Start() {
|
||||
w.wg.Add(1)
|
||||
go w.monitorLoop()
|
||||
}
|
||||
|
||||
// Stop stops the watchdog
|
||||
func (w *Watchdog) Stop() {
|
||||
w.cancel()
|
||||
w.wg.Wait()
|
||||
}
|
||||
|
||||
// RegisterWorker adds a worker to monitor
|
||||
func (w *Watchdog) RegisterWorker(forwardID string, onHungCallback func(string)) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
|
||||
w.workers[forwardID] = &workerState{
|
||||
forwardID: forwardID,
|
||||
lastHeartbeat: time.Now(),
|
||||
heartbeatCount: 0,
|
||||
isHung: false,
|
||||
onHungCallback: onHungCallback,
|
||||
}
|
||||
|
||||
logger.Debug("Watchdog registered worker", map[string]interface{}{
|
||||
"forward_id": forwardID,
|
||||
})
|
||||
}
|
||||
|
||||
// UnregisterWorker removes a worker from monitoring
|
||||
func (w *Watchdog) UnregisterWorker(forwardID string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
|
||||
delete(w.workers, forwardID)
|
||||
|
||||
logger.Debug("Watchdog unregistered worker", map[string]interface{}{
|
||||
"forward_id": forwardID,
|
||||
})
|
||||
}
|
||||
|
||||
// Heartbeat records that a worker is alive and processing
|
||||
// Workers should call this periodically (e.g., in their main loop)
|
||||
func (w *Watchdog) Heartbeat(forwardID string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
|
||||
if state, exists := w.workers[forwardID]; exists {
|
||||
state.lastHeartbeat = time.Now()
|
||||
state.heartbeatCount++
|
||||
state.isHung = false
|
||||
}
|
||||
}
|
||||
|
||||
// GetWorkerState returns the current state of a worker (for testing)
|
||||
func (w *Watchdog) GetWorkerState(forwardID string) (lastHeartbeat time.Time, count uint64, exists bool) {
|
||||
w.mu.RLock()
|
||||
defer w.mu.RUnlock()
|
||||
|
||||
if state, ok := w.workers[forwardID]; ok {
|
||||
return state.lastHeartbeat, state.heartbeatCount, true
|
||||
}
|
||||
return time.Time{}, 0, false
|
||||
}
|
||||
|
||||
// monitorLoop periodically checks all workers
|
||||
func (w *Watchdog) monitorLoop() {
|
||||
defer w.wg.Done()
|
||||
|
||||
ticker := time.NewTicker(w.checkInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-w.ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
w.checkWorkers()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// checkWorkers checks all registered workers for hung state
|
||||
func (w *Watchdog) checkWorkers() {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
for forwardID, state := range w.workers {
|
||||
timeSinceHeartbeat := now.Sub(state.lastHeartbeat)
|
||||
|
||||
// Check if worker is hung
|
||||
if timeSinceHeartbeat > w.hangThreshold {
|
||||
if !state.isHung {
|
||||
// First time detecting hung state
|
||||
state.isHung = true
|
||||
|
||||
logger.Warn("Watchdog detected hung worker", map[string]interface{}{
|
||||
"forward_id": forwardID,
|
||||
"time_since_heartbeat": timeSinceHeartbeat.String(),
|
||||
"hang_threshold": w.hangThreshold.String(),
|
||||
"heartbeat_count": state.heartbeatCount,
|
||||
})
|
||||
|
||||
// Trigger callback to handle hung worker (without holding lock)
|
||||
if state.onHungCallback != nil {
|
||||
callback := state.onHungCallback
|
||||
w.mu.Unlock()
|
||||
callback(forwardID)
|
||||
w.mu.Lock()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,310 @@
|
||||
package forward
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"github.com/stretchr/testify/suite"
|
||||
)
|
||||
|
||||
// WatchdogTestSuite contains tests for the watchdog
|
||||
type WatchdogTestSuite struct {
|
||||
suite.Suite
|
||||
watchdog *Watchdog
|
||||
}
|
||||
|
||||
func TestWatchdogSuite(t *testing.T) {
|
||||
suite.Run(t, new(WatchdogTestSuite))
|
||||
}
|
||||
|
||||
func (s *WatchdogTestSuite) SetupTest() {
|
||||
// Create watchdog with fast intervals for testing
|
||||
s.watchdog = NewWatchdog(100*time.Millisecond, 300*time.Millisecond)
|
||||
s.watchdog.Start()
|
||||
}
|
||||
|
||||
func (s *WatchdogTestSuite) TearDownTest() {
|
||||
if s.watchdog != nil {
|
||||
s.watchdog.Stop()
|
||||
}
|
||||
}
|
||||
|
||||
// TestRegisterUnregister tests basic registration and unregistration
|
||||
func (s *WatchdogTestSuite) TestRegisterUnregister() {
|
||||
callbackCalled := false
|
||||
callback := func(forwardID string) {
|
||||
callbackCalled = true
|
||||
}
|
||||
|
||||
// Register worker
|
||||
s.watchdog.RegisterWorker("test-forward", callback)
|
||||
|
||||
// Verify worker is registered
|
||||
_, _, exists := s.watchdog.GetWorkerState("test-forward")
|
||||
assert.True(s.T(), exists, "Worker should be registered")
|
||||
|
||||
// Unregister worker
|
||||
s.watchdog.UnregisterWorker("test-forward")
|
||||
|
||||
// Verify worker is unregistered
|
||||
_, _, exists = s.watchdog.GetWorkerState("test-forward")
|
||||
assert.False(s.T(), exists, "Worker should be unregistered")
|
||||
assert.False(s.T(), callbackCalled, "Callback should not have been called")
|
||||
}
|
||||
|
||||
// TestHeartbeat tests that heartbeats update worker state
|
||||
func (s *WatchdogTestSuite) TestHeartbeat() {
|
||||
s.watchdog.RegisterWorker("test-forward", nil)
|
||||
|
||||
// Send initial heartbeat
|
||||
s.watchdog.Heartbeat("test-forward")
|
||||
|
||||
lastHeartbeat1, count1, exists := s.watchdog.GetWorkerState("test-forward")
|
||||
require.True(s.T(), exists)
|
||||
assert.Equal(s.T(), uint64(1), count1)
|
||||
|
||||
// Wait a bit
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// Send another heartbeat
|
||||
s.watchdog.Heartbeat("test-forward")
|
||||
|
||||
lastHeartbeat2, count2, exists := s.watchdog.GetWorkerState("test-forward")
|
||||
require.True(s.T(), exists)
|
||||
assert.Equal(s.T(), uint64(2), count2)
|
||||
assert.True(s.T(), lastHeartbeat2.After(lastHeartbeat1), "Second heartbeat should be after first")
|
||||
}
|
||||
|
||||
// TestHungWorkerDetection tests that hung workers are detected
|
||||
func (s *WatchdogTestSuite) TestHungWorkerDetection() {
|
||||
callbackCalled := make(chan string, 1)
|
||||
callback := func(forwardID string) {
|
||||
callbackCalled <- forwardID
|
||||
}
|
||||
|
||||
s.watchdog.RegisterWorker("test-forward", callback)
|
||||
|
||||
// Send initial heartbeat
|
||||
s.watchdog.Heartbeat("test-forward")
|
||||
|
||||
// Wait for worker to be considered hung (300ms threshold + 100ms check interval)
|
||||
timeout := time.After(1 * time.Second)
|
||||
|
||||
select {
|
||||
case forwardID := <-callbackCalled:
|
||||
assert.Equal(s.T(), "test-forward", forwardID)
|
||||
case <-timeout:
|
||||
s.T().Fatal("Timeout waiting for hung worker callback")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHealthyWorkerNotDetectedAsHung tests that workers sending heartbeats are not considered hung
|
||||
func (s *WatchdogTestSuite) TestHealthyWorkerNotDetectedAsHung() {
|
||||
callbackCalled := false
|
||||
var mu sync.Mutex
|
||||
callback := func(forwardID string) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callbackCalled = true
|
||||
}
|
||||
|
||||
s.watchdog.RegisterWorker("test-forward", callback)
|
||||
|
||||
// Send periodic heartbeats (faster than hang threshold)
|
||||
ticker := time.NewTicker(50 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
|
||||
done := make(chan bool)
|
||||
go func() {
|
||||
for i := 0; i < 10; i++ {
|
||||
<-ticker.C
|
||||
s.watchdog.Heartbeat("test-forward")
|
||||
}
|
||||
done <- true
|
||||
}()
|
||||
|
||||
// Wait for all heartbeats to complete
|
||||
<-done
|
||||
|
||||
// Check that callback was not called
|
||||
mu.Lock()
|
||||
assert.False(s.T(), callbackCalled, "Callback should not be called for healthy worker")
|
||||
mu.Unlock()
|
||||
}
|
||||
|
||||
// TestMultipleWorkers tests monitoring multiple workers simultaneously
|
||||
func (s *WatchdogTestSuite) TestMultipleWorkers() {
|
||||
callbacks := make(map[string]int)
|
||||
var mu sync.Mutex
|
||||
|
||||
makeCallback := func(id string) func(string) {
|
||||
return func(forwardID string) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callbacks[id]++
|
||||
}
|
||||
}
|
||||
|
||||
// Register multiple workers
|
||||
s.watchdog.RegisterWorker("worker-1", makeCallback("worker-1"))
|
||||
s.watchdog.RegisterWorker("worker-2", makeCallback("worker-2"))
|
||||
s.watchdog.RegisterWorker("worker-3", makeCallback("worker-3"))
|
||||
|
||||
// worker-1: Keep sending heartbeats (healthy)
|
||||
ticker1 := time.NewTicker(50 * time.Millisecond)
|
||||
defer ticker1.Stop()
|
||||
go func() {
|
||||
for i := 0; i < 10; i++ {
|
||||
<-ticker1.C
|
||||
s.watchdog.Heartbeat("worker-1")
|
||||
}
|
||||
}()
|
||||
|
||||
// worker-2: Send initial heartbeat then stop (will become hung)
|
||||
s.watchdog.Heartbeat("worker-2")
|
||||
|
||||
// worker-3: Send initial heartbeat then stop (will become hung)
|
||||
s.watchdog.Heartbeat("worker-3")
|
||||
|
||||
// Wait for hung workers to be detected
|
||||
time.Sleep(600 * time.Millisecond)
|
||||
|
||||
// Check results
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
assert.Equal(s.T(), 0, callbacks["worker-1"], "worker-1 should not trigger callback (healthy)")
|
||||
assert.Greater(s.T(), callbacks["worker-2"], 0, "worker-2 should trigger callback (hung)")
|
||||
assert.Greater(s.T(), callbacks["worker-3"], 0, "worker-3 should trigger callback (hung)")
|
||||
}
|
||||
|
||||
// TestCallbackOnlyOnFirstDetection tests that callback is only called once when hung is first detected
|
||||
func (s *WatchdogTestSuite) TestCallbackOnlyOnFirstDetection() {
|
||||
callbackCount := 0
|
||||
var mu sync.Mutex
|
||||
callback := func(forwardID string) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callbackCount++
|
||||
}
|
||||
|
||||
s.watchdog.RegisterWorker("test-forward", callback)
|
||||
|
||||
// Send initial heartbeat
|
||||
s.watchdog.Heartbeat("test-forward")
|
||||
|
||||
// Wait for multiple check cycles
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
// Check that callback was only called once
|
||||
mu.Lock()
|
||||
assert.Equal(s.T(), 1, callbackCount, "Callback should only be called once")
|
||||
mu.Unlock()
|
||||
}
|
||||
|
||||
// TestHeartbeatResetsHungState tests that sending heartbeat after hung detection resets state
|
||||
func (s *WatchdogTestSuite) TestHeartbeatResetsHungState() {
|
||||
callbackCount := 0
|
||||
var mu sync.Mutex
|
||||
callback := func(forwardID string) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callbackCount++
|
||||
}
|
||||
|
||||
s.watchdog.RegisterWorker("test-forward", callback)
|
||||
|
||||
// Send initial heartbeat
|
||||
s.watchdog.Heartbeat("test-forward")
|
||||
|
||||
// Wait for hung detection
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
|
||||
mu.Lock()
|
||||
firstCount := callbackCount
|
||||
mu.Unlock()
|
||||
|
||||
assert.Equal(s.T(), 1, firstCount, "First hung detection should trigger callback")
|
||||
|
||||
// Send heartbeat to reset hung state
|
||||
s.watchdog.Heartbeat("test-forward")
|
||||
|
||||
// Wait for worker to become hung again
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
|
||||
mu.Lock()
|
||||
secondCount := callbackCount
|
||||
mu.Unlock()
|
||||
|
||||
assert.Equal(s.T(), 2, secondCount, "Second hung detection should trigger callback again")
|
||||
}
|
||||
|
||||
// TestConcurrentOperations tests thread safety
|
||||
func (s *WatchdogTestSuite) TestConcurrentOperations() {
|
||||
var wg sync.WaitGroup
|
||||
numWorkers := 10
|
||||
|
||||
for i := 0; i < numWorkers; i++ {
|
||||
wg.Add(1)
|
||||
go func(id int) {
|
||||
defer wg.Done()
|
||||
forwardID := string(rune('a' + id))
|
||||
s.watchdog.RegisterWorker(forwardID, nil)
|
||||
for j := 0; j < 10; j++ {
|
||||
s.watchdog.Heartbeat(forwardID)
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
s.watchdog.UnregisterWorker(forwardID)
|
||||
}(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
// If we get here without deadlocks or panics, test passes
|
||||
}
|
||||
|
||||
// TestStopWatchdog tests that stopping watchdog cleans up properly
|
||||
func TestStopWatchdog(t *testing.T) {
|
||||
watchdog := NewWatchdog(100*time.Millisecond, 300*time.Millisecond)
|
||||
watchdog.Start()
|
||||
|
||||
callbackCalled := false
|
||||
callback := func(forwardID string) {
|
||||
callbackCalled = true
|
||||
}
|
||||
|
||||
watchdog.RegisterWorker("test-forward", callback)
|
||||
watchdog.Heartbeat("test-forward")
|
||||
|
||||
// Stop watchdog before hang detection
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
watchdog.Stop()
|
||||
|
||||
// Wait to ensure no more callbacks after stop
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
|
||||
assert.False(t, callbackCalled, "Callback should not be called after watchdog is stopped")
|
||||
}
|
||||
|
||||
// TestWatchdogWithZeroHeartbeats tests detecting hung worker that never sends heartbeats
|
||||
func (s *WatchdogTestSuite) TestWatchdogWithZeroHeartbeats() {
|
||||
callbackCalled := make(chan string, 1)
|
||||
callback := func(forwardID string) {
|
||||
callbackCalled <- forwardID
|
||||
}
|
||||
|
||||
// Register worker but never send heartbeat
|
||||
s.watchdog.RegisterWorker("test-forward", callback)
|
||||
|
||||
// Wait for hung detection
|
||||
timeout := time.After(1 * time.Second)
|
||||
|
||||
select {
|
||||
case forwardID := <-callbackCalled:
|
||||
assert.Equal(s.T(), "test-forward", forwardID)
|
||||
case <-timeout:
|
||||
s.T().Fatal("Timeout waiting for hung worker callback")
|
||||
}
|
||||
}
|
||||
+59
-13
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/nvm/kportal/internal/config"
|
||||
@@ -20,21 +21,25 @@ const (
|
||||
|
||||
// ForwardWorker manages a single port-forward connection with automatic retry.
|
||||
type ForwardWorker struct {
|
||||
forward config.Forward
|
||||
portForwarder *k8s.PortForwarder
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
stopChan chan struct{}
|
||||
doneChan chan struct{}
|
||||
verbose bool
|
||||
lastPod string // Track the last pod we connected to
|
||||
statusUI StatusUpdater
|
||||
healthChecker *healthcheck.Checker
|
||||
startTime time.Time // Track when the worker started
|
||||
forward config.Forward
|
||||
portForwarder *k8s.PortForwarder
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
stopChan chan struct{}
|
||||
doneChan chan struct{}
|
||||
reconnectChan chan string // Channel to trigger reconnection
|
||||
verbose bool
|
||||
lastPod string // Track the last pod we connected to
|
||||
statusUI StatusUpdater
|
||||
healthChecker *healthcheck.Checker
|
||||
watchdog *Watchdog
|
||||
startTime time.Time // Track when the worker started
|
||||
forwardCancel context.CancelFunc // Cancel function for current forward attempt
|
||||
forwardCancelMu sync.Mutex // Protects forwardCancel
|
||||
}
|
||||
|
||||
// NewForwardWorker creates a new ForwardWorker for a single forward configuration.
|
||||
func NewForwardWorker(fwd config.Forward, portForwarder *k8s.PortForwarder, verbose bool, statusUI StatusUpdater, healthChecker *healthcheck.Checker) *ForwardWorker {
|
||||
func NewForwardWorker(fwd config.Forward, portForwarder *k8s.PortForwarder, verbose bool, statusUI StatusUpdater, healthChecker *healthcheck.Checker, watchdog *Watchdog) *ForwardWorker {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
return &ForwardWorker{
|
||||
@@ -44,13 +49,32 @@ func NewForwardWorker(fwd config.Forward, portForwarder *k8s.PortForwarder, verb
|
||||
cancel: cancel,
|
||||
stopChan: make(chan struct{}),
|
||||
doneChan: make(chan struct{}),
|
||||
reconnectChan: make(chan string, 1), // Buffered to avoid blocking
|
||||
verbose: verbose,
|
||||
statusUI: statusUI,
|
||||
healthChecker: healthChecker,
|
||||
watchdog: watchdog,
|
||||
startTime: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// TriggerReconnect triggers a reconnection (e.g., due to stale connection)
|
||||
func (w *ForwardWorker) TriggerReconnect(reason string) {
|
||||
// Cancel current forward if running
|
||||
w.forwardCancelMu.Lock()
|
||||
if w.forwardCancel != nil {
|
||||
w.forwardCancel()
|
||||
}
|
||||
w.forwardCancelMu.Unlock()
|
||||
|
||||
// Send reconnect signal (non-blocking)
|
||||
select {
|
||||
case w.reconnectChan <- reason:
|
||||
default:
|
||||
// Channel already has pending reconnect
|
||||
}
|
||||
}
|
||||
|
||||
// Start begins the port-forward worker in a goroutine.
|
||||
// The worker will continuously retry on failures with exponential backoff.
|
||||
func (w *ForwardWorker) Start() {
|
||||
@@ -71,6 +95,11 @@ func (w *ForwardWorker) run() {
|
||||
backoff := retry.NewBackoff()
|
||||
|
||||
for {
|
||||
// Send heartbeat to watchdog to indicate we're alive
|
||||
if w.watchdog != nil {
|
||||
w.watchdog.Heartbeat(w.forward.ID())
|
||||
}
|
||||
|
||||
// Check if we should stop
|
||||
select {
|
||||
case <-w.ctx.Done():
|
||||
@@ -184,11 +213,24 @@ func (w *ForwardWorker) establishForward(podName string) error {
|
||||
forwardCtx, forwardCancel := context.WithCancel(w.ctx)
|
||||
defer forwardCancel()
|
||||
|
||||
// Start a goroutine to monitor for stop signal
|
||||
// Store cancel function so TriggerReconnect can use it
|
||||
w.forwardCancelMu.Lock()
|
||||
w.forwardCancel = forwardCancel
|
||||
w.forwardCancelMu.Unlock()
|
||||
|
||||
defer func() {
|
||||
w.forwardCancelMu.Lock()
|
||||
w.forwardCancel = nil
|
||||
w.forwardCancelMu.Unlock()
|
||||
}()
|
||||
|
||||
// Start a goroutine to monitor for stop signal and reconnect triggers
|
||||
go func() {
|
||||
select {
|
||||
case <-w.stopChan:
|
||||
close(stopChan)
|
||||
case <-w.reconnectChan:
|
||||
close(stopChan)
|
||||
case <-forwardCtx.Done():
|
||||
close(stopChan)
|
||||
}
|
||||
@@ -230,6 +272,10 @@ func (w *ForwardWorker) establishForward(podName string) error {
|
||||
if w.verbose {
|
||||
log.Printf("[%s] Port-forward connection established", w.forward.ID())
|
||||
}
|
||||
// Mark connection as established in health checker
|
||||
if w.healthChecker != nil {
|
||||
w.healthChecker.MarkConnected(w.forward.ID())
|
||||
}
|
||||
case err := <-errChan:
|
||||
return fmt.Errorf("failed to establish forward: %w", err)
|
||||
case <-w.ctx.Done():
|
||||
|
||||
Reference in New Issue
Block a user