healtcheck improvements (#4)

* Advanced healtchecks. * Add watchdog for stale connections handling.
2026-06-14 02:31:39 +00:00 · 2025-11-24 13:00:19 +00:00
parent 7df161aee0
commit 2fdc5912e7
10 changed files with 1577 additions and 74 deletions
@@ -12,11 +12,6 @@ import (
 	"github.com/nvm/kportal/internal/logger"
 )

-const (
-	healthCheckInterval = 5 * time.Second
-	healthCheckTimeout  = 2 * time.Second
-)
-
 // StatusUpdater is an interface for updating forward status
 type StatusUpdater interface {
 	UpdateStatus(id string, status string)
@@ -34,12 +29,15 @@ type Manager struct {
 	portForwarder *k8s.PortForwarder
 	portChecker   *PortChecker
 	healthChecker *healthcheck.Checker
+	watchdog      *Watchdog
 	verbose       bool
 	currentConfig *config.Config
 	statusUI      StatusUpdater
 }

 // NewManager creates a new forward Manager.
+// The health checker will be created with default settings and can be
+// reconfigured via SetConfig().
 func NewManager(verbose bool) (*Manager, error) {
 	clientPool, err := k8s.NewClientPool()
 	if err != nil {
@@ -49,8 +47,13 @@ func NewManager(verbose bool) (*Manager, error) {
 	resolver := k8s.NewResourceResolver(clientPool)
 	portForwarder := k8s.NewPortForwarder(clientPool, resolver)

-	// Create health checker: check every 5 seconds with 2 second timeout
-	healthChecker := healthcheck.NewChecker(healthCheckInterval, healthCheckTimeout)
+	// Create health checker with defaults: check every 3 seconds with 2 second timeout
+	// Will be reconfigured when config is loaded
+	healthChecker := healthcheck.NewChecker(3*time.Second, 2*time.Second)
+
+	// Create watchdog with default settings: check every 30 seconds, 60 second hang threshold
+	// Will be reconfigured when config is loaded
+	watchdog := NewWatchdog(30*time.Second, 60*time.Second)

 	return &Manager{
 		workers:       make(map[string]*ForwardWorker),
@@ -59,10 +62,56 @@ func NewManager(verbose bool) (*Manager, error) {
 		portForwarder: portForwarder,
 		portChecker:   NewPortChecker(),
 		healthChecker: healthChecker,
+		watchdog:      watchdog,
 		verbose:       verbose,
 	}, nil
 }

+// configureHealthChecker creates a new health checker with settings from config
+func (m *Manager) configureHealthChecker(cfg *config.Config) {
+	// Stop existing health checker
+	if m.healthChecker != nil {
+		m.healthChecker.Stop()
+	}
+
+	// Parse check method
+	methodStr := cfg.GetHealthCheckMethod()
+	var method healthcheck.CheckMethod
+	switch methodStr {
+	case "tcp-dial":
+		method = healthcheck.CheckMethodTCPDial
+	case "data-transfer":
+		method = healthcheck.CheckMethodDataTransfer
+	default:
+		method = healthcheck.CheckMethodDataTransfer
+	}
+
+	// Create new health checker with config settings
+	m.healthChecker = healthcheck.NewCheckerWithOptions(healthcheck.CheckerOptions{
+		Interval:         cfg.GetHealthCheckIntervalOrDefault(),
+		Timeout:          cfg.GetHealthCheckTimeoutOrDefault(),
+		Method:           method,
+		MaxConnectionAge: cfg.GetMaxConnectionAge(),
+		MaxIdleTime:      cfg.GetMaxIdleTime(),
+	})
+
+	// Configure TCP settings on port forwarder
+	tcpKeepalive := cfg.GetTCPKeepalive()
+	dialTimeout := cfg.GetDialTimeout()
+	m.portForwarder.SetTCPKeepalive(tcpKeepalive)
+	m.portForwarder.SetDialTimeout(dialTimeout)
+
+	logger.Info("Health checker and reliability configured", map[string]interface{}{
+		"interval":           cfg.GetHealthCheckIntervalOrDefault().String(),
+		"timeout":            cfg.GetHealthCheckTimeoutOrDefault().String(),
+		"method":             methodStr,
+		"max_connection_age": cfg.GetMaxConnectionAge().String(),
+		"max_idle_time":      cfg.GetMaxIdleTime().String(),
+		"tcp_keepalive":      tcpKeepalive.String(),
+		"dial_timeout":       dialTimeout.String(),
+	})
+}
+
 // SetStatusUI sets the status updater for the manager
 func (m *Manager) SetStatusUI(ui StatusUpdater) {
 	m.statusUI = ui
@@ -76,6 +125,20 @@ func (m *Manager) Start(cfg *config.Config) error {

 	m.currentConfig = cfg

+	// Configure health checker with settings from config
+	m.configureHealthChecker(cfg)
+
+	// Start watchdog
+	watchdogPeriod := cfg.GetWatchdogPeriod()
+	m.watchdog.checkInterval = watchdogPeriod
+	m.watchdog.hangThreshold = watchdogPeriod * 2 // Hang threshold is 2x check interval
+	m.watchdog.Start()
+
+	logger.Info("Watchdog started", map[string]interface{}{
+		"check_interval": watchdogPeriod.String(),
+		"hang_threshold": (watchdogPeriod * 2).String(),
+	})
+
 	// Get all forwards from config
 	forwards := cfg.GetAllForwards()

@@ -119,8 +182,9 @@ func (m *Manager) Start(cfg *config.Config) error {
 func (m *Manager) Stop() {
 	log.Printf("Stopping all port-forwards...")

-	// Stop health checker first
+	// Stop health checker and watchdog first
 	m.healthChecker.Stop()
+	m.watchdog.Stop()

 	m.workersMu.Lock()
 	workers := make([]*ForwardWorker, 0, len(m.workers))
@@ -273,21 +337,54 @@ func (m *Manager) startWorker(fwd config.Forward) error {
 		m.statusUI.AddForward(fwd.ID(), &fwd)
 	}

+	// Register with watchdog
+	m.watchdog.RegisterWorker(fwd.ID(), func(forwardID string) {
+		logger.Warn("Watchdog triggered reconnection for hung worker", map[string]interface{}{
+			"forward_id": forwardID,
+		})
+
+		// Find and trigger reconnect on hung worker
+		m.workersMu.RLock()
+		worker, exists := m.workers[forwardID]
+		m.workersMu.RUnlock()
+
+		if exists {
+			worker.TriggerReconnect("watchdog detected hung worker")
+		}
+	})
+
 	// Register with health checker
 	m.healthChecker.Register(fwd.ID(), fwd.LocalPort, func(forwardID string, status healthcheck.Status, errorMsg string) {
 		if m.statusUI != nil {
 			m.statusUI.UpdateStatus(forwardID, string(status))
 			// Send error separately if there is one
-			if status == healthcheck.StatusUnhealthy && errorMsg != "" {
+			if (status == healthcheck.StatusUnhealthy || status == healthcheck.StatusStale) && errorMsg != "" {
 				if ui, ok := m.statusUI.(interface{ SetError(id, msg string) }); ok {
 					ui.SetError(forwardID, errorMsg)
 				}
 			}
 		}
+
+		// Handle stale connections: trigger reconnection if retryOnStale is enabled
+		if status == healthcheck.StatusStale && m.currentConfig.GetRetryOnStale() {
+			logger.Info("Stale connection detected, triggering reconnection", map[string]interface{}{
+				"forward_id": forwardID,
+				"reason":     errorMsg,
+			})
+
+			// Find and notify the worker to reconnect
+			m.workersMu.RLock()
+			worker, exists := m.workers[forwardID]
+			m.workersMu.RUnlock()
+
+			if exists {
+				worker.TriggerReconnect("stale connection")
+			}
+		}
 	})

 	// Create and start worker
-	worker := NewForwardWorker(fwd, m.portForwarder, m.verbose, m.statusUI, m.healthChecker)
+	worker := NewForwardWorker(fwd, m.portForwarder, m.verbose, m.statusUI, m.healthChecker, m.watchdog)
 	worker.Start()

 	// Store worker
@@ -312,8 +409,9 @@ func (m *Manager) stopWorkerInternal(id string, removeFromUI bool) error {
 	delete(m.workers, id)
 	m.workersMu.Unlock()

-	// Unregister from health checker
+	// Unregister from health checker and watchdog
 	m.healthChecker.Unregister(id)
+	m.watchdog.UnregisterWorker(id)

 	// Notify UI - either remove or update to disabled status
 	if m.statusUI != nil {
@@ -0,0 +1,158 @@
+package forward
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"github.com/nvm/kportal/internal/logger"
+)
+
+// Watchdog monitors worker goroutines to detect hung workers
+type Watchdog struct {
+	mu            sync.RWMutex
+	workers       map[string]*workerState // key: forward ID
+	checkInterval time.Duration
+	hangThreshold time.Duration // How long without heartbeat before considered hung
+	ctx           context.Context
+	cancel        context.CancelFunc
+	wg            sync.WaitGroup
+}
+
+// workerState tracks the health of a single worker
+type workerState struct {
+	forwardID      string
+	lastHeartbeat  time.Time
+	heartbeatCount uint64
+	isHung         bool
+	onHungCallback func(forwardID string)
+}
+
+// NewWatchdog creates a new goroutine watchdog
+func NewWatchdog(checkInterval, hangThreshold time.Duration) *Watchdog {
+	ctx, cancel := context.WithCancel(context.Background())
+	return &Watchdog{
+		workers:       make(map[string]*workerState),
+		checkInterval: checkInterval,
+		hangThreshold: hangThreshold,
+		ctx:           ctx,
+		cancel:        cancel,
+	}
+}
+
+// Start begins the watchdog monitoring loop
+func (w *Watchdog) Start() {
+	w.wg.Add(1)
+	go w.monitorLoop()
+}
+
+// Stop stops the watchdog
+func (w *Watchdog) Stop() {
+	w.cancel()
+	w.wg.Wait()
+}
+
+// RegisterWorker adds a worker to monitor
+func (w *Watchdog) RegisterWorker(forwardID string, onHungCallback func(string)) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	w.workers[forwardID] = &workerState{
+		forwardID:      forwardID,
+		lastHeartbeat:  time.Now(),
+		heartbeatCount: 0,
+		isHung:         false,
+		onHungCallback: onHungCallback,
+	}
+
+	logger.Debug("Watchdog registered worker", map[string]interface{}{
+		"forward_id": forwardID,
+	})
+}
+
+// UnregisterWorker removes a worker from monitoring
+func (w *Watchdog) UnregisterWorker(forwardID string) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	delete(w.workers, forwardID)
+
+	logger.Debug("Watchdog unregistered worker", map[string]interface{}{
+		"forward_id": forwardID,
+	})
+}
+
+// Heartbeat records that a worker is alive and processing
+// Workers should call this periodically (e.g., in their main loop)
+func (w *Watchdog) Heartbeat(forwardID string) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	if state, exists := w.workers[forwardID]; exists {
+		state.lastHeartbeat = time.Now()
+		state.heartbeatCount++
+		state.isHung = false
+	}
+}
+
+// GetWorkerState returns the current state of a worker (for testing)
+func (w *Watchdog) GetWorkerState(forwardID string) (lastHeartbeat time.Time, count uint64, exists bool) {
+	w.mu.RLock()
+	defer w.mu.RUnlock()
+
+	if state, ok := w.workers[forwardID]; ok {
+		return state.lastHeartbeat, state.heartbeatCount, true
+	}
+	return time.Time{}, 0, false
+}
+
+// monitorLoop periodically checks all workers
+func (w *Watchdog) monitorLoop() {
+	defer w.wg.Done()
+
+	ticker := time.NewTicker(w.checkInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-w.ctx.Done():
+			return
+		case <-ticker.C:
+			w.checkWorkers()
+		}
+	}
+}
+
+// checkWorkers checks all registered workers for hung state
+func (w *Watchdog) checkWorkers() {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	now := time.Now()
+	for forwardID, state := range w.workers {
+		timeSinceHeartbeat := now.Sub(state.lastHeartbeat)
+
+		// Check if worker is hung
+		if timeSinceHeartbeat > w.hangThreshold {
+			if !state.isHung {
+				// First time detecting hung state
+				state.isHung = true
+
+				logger.Warn("Watchdog detected hung worker", map[string]interface{}{
+					"forward_id":           forwardID,
+					"time_since_heartbeat": timeSinceHeartbeat.String(),
+					"hang_threshold":       w.hangThreshold.String(),
+					"heartbeat_count":      state.heartbeatCount,
+				})
+
+				// Trigger callback to handle hung worker (without holding lock)
+				if state.onHungCallback != nil {
+					callback := state.onHungCallback
+					w.mu.Unlock()
+					callback(forwardID)
+					w.mu.Lock()
+				}
+			}
+		}
+	}
+}
@@ -0,0 +1,310 @@
+package forward
+
+import (
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/stretchr/testify/suite"
+)
+
+// WatchdogTestSuite contains tests for the watchdog
+type WatchdogTestSuite struct {
+	suite.Suite
+	watchdog *Watchdog
+}
+
+func TestWatchdogSuite(t *testing.T) {
+	suite.Run(t, new(WatchdogTestSuite))
+}
+
+func (s *WatchdogTestSuite) SetupTest() {
+	// Create watchdog with fast intervals for testing
+	s.watchdog = NewWatchdog(100*time.Millisecond, 300*time.Millisecond)
+	s.watchdog.Start()
+}
+
+func (s *WatchdogTestSuite) TearDownTest() {
+	if s.watchdog != nil {
+		s.watchdog.Stop()
+	}
+}
+
+// TestRegisterUnregister tests basic registration and unregistration
+func (s *WatchdogTestSuite) TestRegisterUnregister() {
+	callbackCalled := false
+	callback := func(forwardID string) {
+		callbackCalled = true
+	}
+
+	// Register worker
+	s.watchdog.RegisterWorker("test-forward", callback)
+
+	// Verify worker is registered
+	_, _, exists := s.watchdog.GetWorkerState("test-forward")
+	assert.True(s.T(), exists, "Worker should be registered")
+
+	// Unregister worker
+	s.watchdog.UnregisterWorker("test-forward")
+
+	// Verify worker is unregistered
+	_, _, exists = s.watchdog.GetWorkerState("test-forward")
+	assert.False(s.T(), exists, "Worker should be unregistered")
+	assert.False(s.T(), callbackCalled, "Callback should not have been called")
+}
+
+// TestHeartbeat tests that heartbeats update worker state
+func (s *WatchdogTestSuite) TestHeartbeat() {
+	s.watchdog.RegisterWorker("test-forward", nil)
+
+	// Send initial heartbeat
+	s.watchdog.Heartbeat("test-forward")
+
+	lastHeartbeat1, count1, exists := s.watchdog.GetWorkerState("test-forward")
+	require.True(s.T(), exists)
+	assert.Equal(s.T(), uint64(1), count1)
+
+	// Wait a bit
+	time.Sleep(50 * time.Millisecond)
+
+	// Send another heartbeat
+	s.watchdog.Heartbeat("test-forward")
+
+	lastHeartbeat2, count2, exists := s.watchdog.GetWorkerState("test-forward")
+	require.True(s.T(), exists)
+	assert.Equal(s.T(), uint64(2), count2)
+	assert.True(s.T(), lastHeartbeat2.After(lastHeartbeat1), "Second heartbeat should be after first")
+}
+
+// TestHungWorkerDetection tests that hung workers are detected
+func (s *WatchdogTestSuite) TestHungWorkerDetection() {
+	callbackCalled := make(chan string, 1)
+	callback := func(forwardID string) {
+		callbackCalled <- forwardID
+	}
+
+	s.watchdog.RegisterWorker("test-forward", callback)
+
+	// Send initial heartbeat
+	s.watchdog.Heartbeat("test-forward")
+
+	// Wait for worker to be considered hung (300ms threshold + 100ms check interval)
+	timeout := time.After(1 * time.Second)
+
+	select {
+	case forwardID := <-callbackCalled:
+		assert.Equal(s.T(), "test-forward", forwardID)
+	case <-timeout:
+		s.T().Fatal("Timeout waiting for hung worker callback")
+	}
+}
+
+// TestHealthyWorkerNotDetectedAsHung tests that workers sending heartbeats are not considered hung
+func (s *WatchdogTestSuite) TestHealthyWorkerNotDetectedAsHung() {
+	callbackCalled := false
+	var mu sync.Mutex
+	callback := func(forwardID string) {
+		mu.Lock()
+		defer mu.Unlock()
+		callbackCalled = true
+	}
+
+	s.watchdog.RegisterWorker("test-forward", callback)
+
+	// Send periodic heartbeats (faster than hang threshold)
+	ticker := time.NewTicker(50 * time.Millisecond)
+	defer ticker.Stop()
+
+	done := make(chan bool)
+	go func() {
+		for i := 0; i < 10; i++ {
+			<-ticker.C
+			s.watchdog.Heartbeat("test-forward")
+		}
+		done <- true
+	}()
+
+	// Wait for all heartbeats to complete
+	<-done
+
+	// Check that callback was not called
+	mu.Lock()
+	assert.False(s.T(), callbackCalled, "Callback should not be called for healthy worker")
+	mu.Unlock()
+}
+
+// TestMultipleWorkers tests monitoring multiple workers simultaneously
+func (s *WatchdogTestSuite) TestMultipleWorkers() {
+	callbacks := make(map[string]int)
+	var mu sync.Mutex
+
+	makeCallback := func(id string) func(string) {
+		return func(forwardID string) {
+			mu.Lock()
+			defer mu.Unlock()
+			callbacks[id]++
+		}
+	}
+
+	// Register multiple workers
+	s.watchdog.RegisterWorker("worker-1", makeCallback("worker-1"))
+	s.watchdog.RegisterWorker("worker-2", makeCallback("worker-2"))
+	s.watchdog.RegisterWorker("worker-3", makeCallback("worker-3"))
+
+	// worker-1: Keep sending heartbeats (healthy)
+	ticker1 := time.NewTicker(50 * time.Millisecond)
+	defer ticker1.Stop()
+	go func() {
+		for i := 0; i < 10; i++ {
+			<-ticker1.C
+			s.watchdog.Heartbeat("worker-1")
+		}
+	}()
+
+	// worker-2: Send initial heartbeat then stop (will become hung)
+	s.watchdog.Heartbeat("worker-2")
+
+	// worker-3: Send initial heartbeat then stop (will become hung)
+	s.watchdog.Heartbeat("worker-3")
+
+	// Wait for hung workers to be detected
+	time.Sleep(600 * time.Millisecond)
+
+	// Check results
+	mu.Lock()
+	defer mu.Unlock()
+
+	assert.Equal(s.T(), 0, callbacks["worker-1"], "worker-1 should not trigger callback (healthy)")
+	assert.Greater(s.T(), callbacks["worker-2"], 0, "worker-2 should trigger callback (hung)")
+	assert.Greater(s.T(), callbacks["worker-3"], 0, "worker-3 should trigger callback (hung)")
+}
+
+// TestCallbackOnlyOnFirstDetection tests that callback is only called once when hung is first detected
+func (s *WatchdogTestSuite) TestCallbackOnlyOnFirstDetection() {
+	callbackCount := 0
+	var mu sync.Mutex
+	callback := func(forwardID string) {
+		mu.Lock()
+		defer mu.Unlock()
+		callbackCount++
+	}
+
+	s.watchdog.RegisterWorker("test-forward", callback)
+
+	// Send initial heartbeat
+	s.watchdog.Heartbeat("test-forward")
+
+	// Wait for multiple check cycles
+	time.Sleep(1 * time.Second)
+
+	// Check that callback was only called once
+	mu.Lock()
+	assert.Equal(s.T(), 1, callbackCount, "Callback should only be called once")
+	mu.Unlock()
+}
+
+// TestHeartbeatResetsHungState tests that sending heartbeat after hung detection resets state
+func (s *WatchdogTestSuite) TestHeartbeatResetsHungState() {
+	callbackCount := 0
+	var mu sync.Mutex
+	callback := func(forwardID string) {
+		mu.Lock()
+		defer mu.Unlock()
+		callbackCount++
+	}
+
+	s.watchdog.RegisterWorker("test-forward", callback)
+
+	// Send initial heartbeat
+	s.watchdog.Heartbeat("test-forward")
+
+	// Wait for hung detection
+	time.Sleep(500 * time.Millisecond)
+
+	mu.Lock()
+	firstCount := callbackCount
+	mu.Unlock()
+
+	assert.Equal(s.T(), 1, firstCount, "First hung detection should trigger callback")
+
+	// Send heartbeat to reset hung state
+	s.watchdog.Heartbeat("test-forward")
+
+	// Wait for worker to become hung again
+	time.Sleep(500 * time.Millisecond)
+
+	mu.Lock()
+	secondCount := callbackCount
+	mu.Unlock()
+
+	assert.Equal(s.T(), 2, secondCount, "Second hung detection should trigger callback again")
+}
+
+// TestConcurrentOperations tests thread safety
+func (s *WatchdogTestSuite) TestConcurrentOperations() {
+	var wg sync.WaitGroup
+	numWorkers := 10
+
+	for i := 0; i < numWorkers; i++ {
+		wg.Add(1)
+		go func(id int) {
+			defer wg.Done()
+			forwardID := string(rune('a' + id))
+			s.watchdog.RegisterWorker(forwardID, nil)
+			for j := 0; j < 10; j++ {
+				s.watchdog.Heartbeat(forwardID)
+				time.Sleep(10 * time.Millisecond)
+			}
+			s.watchdog.UnregisterWorker(forwardID)
+		}(i)
+	}
+
+	wg.Wait()
+	// If we get here without deadlocks or panics, test passes
+}
+
+// TestStopWatchdog tests that stopping watchdog cleans up properly
+func TestStopWatchdog(t *testing.T) {
+	watchdog := NewWatchdog(100*time.Millisecond, 300*time.Millisecond)
+	watchdog.Start()
+
+	callbackCalled := false
+	callback := func(forwardID string) {
+		callbackCalled = true
+	}
+
+	watchdog.RegisterWorker("test-forward", callback)
+	watchdog.Heartbeat("test-forward")
+
+	// Stop watchdog before hang detection
+	time.Sleep(100 * time.Millisecond)
+	watchdog.Stop()
+
+	// Wait to ensure no more callbacks after stop
+	time.Sleep(500 * time.Millisecond)
+
+	assert.False(t, callbackCalled, "Callback should not be called after watchdog is stopped")
+}
+
+// TestWatchdogWithZeroHeartbeats tests detecting hung worker that never sends heartbeats
+func (s *WatchdogTestSuite) TestWatchdogWithZeroHeartbeats() {
+	callbackCalled := make(chan string, 1)
+	callback := func(forwardID string) {
+		callbackCalled <- forwardID
+	}
+
+	// Register worker but never send heartbeat
+	s.watchdog.RegisterWorker("test-forward", callback)
+
+	// Wait for hung detection
+	timeout := time.After(1 * time.Second)
+
+	select {
+	case forwardID := <-callbackCalled:
+		assert.Equal(s.T(), "test-forward", forwardID)
+	case <-timeout:
+		s.T().Fatal("Timeout waiting for hung worker callback")
+	}
+}
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"sync"
 	"time"

 	"github.com/nvm/kportal/internal/config"
@@ -20,21 +21,25 @@ const (

 // ForwardWorker manages a single port-forward connection with automatic retry.
 type ForwardWorker struct {
-	forward       config.Forward
-	portForwarder *k8s.PortForwarder
-	ctx           context.Context
-	cancel        context.CancelFunc
-	stopChan      chan struct{}
-	doneChan      chan struct{}
-	verbose       bool
-	lastPod       string // Track the last pod we connected to
-	statusUI      StatusUpdater
-	healthChecker *healthcheck.Checker
-	startTime     time.Time // Track when the worker started
+	forward         config.Forward
+	portForwarder   *k8s.PortForwarder
+	ctx             context.Context
+	cancel          context.CancelFunc
+	stopChan        chan struct{}
+	doneChan        chan struct{}
+	reconnectChan   chan string // Channel to trigger reconnection
+	verbose         bool
+	lastPod         string // Track the last pod we connected to
+	statusUI        StatusUpdater
+	healthChecker   *healthcheck.Checker
+	watchdog        *Watchdog
+	startTime       time.Time          // Track when the worker started
+	forwardCancel   context.CancelFunc // Cancel function for current forward attempt
+	forwardCancelMu sync.Mutex         // Protects forwardCancel
 }

 // NewForwardWorker creates a new ForwardWorker for a single forward configuration.
-func NewForwardWorker(fwd config.Forward, portForwarder *k8s.PortForwarder, verbose bool, statusUI StatusUpdater, healthChecker *healthcheck.Checker) *ForwardWorker {
+func NewForwardWorker(fwd config.Forward, portForwarder *k8s.PortForwarder, verbose bool, statusUI StatusUpdater, healthChecker *healthcheck.Checker, watchdog *Watchdog) *ForwardWorker {
 	ctx, cancel := context.WithCancel(context.Background())

 	return &ForwardWorker{
@@ -44,13 +49,32 @@ func NewForwardWorker(fwd config.Forward, portForwarder *k8s.PortForwarder, verb
 		cancel:        cancel,
 		stopChan:      make(chan struct{}),
 		doneChan:      make(chan struct{}),
+		reconnectChan: make(chan string, 1), // Buffered to avoid blocking
 		verbose:       verbose,
 		statusUI:      statusUI,
 		healthChecker: healthChecker,
+		watchdog:      watchdog,
 		startTime:     time.Now(),
 	}
 }

+// TriggerReconnect triggers a reconnection (e.g., due to stale connection)
+func (w *ForwardWorker) TriggerReconnect(reason string) {
+	// Cancel current forward if running
+	w.forwardCancelMu.Lock()
+	if w.forwardCancel != nil {
+		w.forwardCancel()
+	}
+	w.forwardCancelMu.Unlock()
+
+	// Send reconnect signal (non-blocking)
+	select {
+	case w.reconnectChan <- reason:
+	default:
+		// Channel already has pending reconnect
+	}
+}
+
 // Start begins the port-forward worker in a goroutine.
 // The worker will continuously retry on failures with exponential backoff.
 func (w *ForwardWorker) Start() {
@@ -71,6 +95,11 @@ func (w *ForwardWorker) run() {
 	backoff := retry.NewBackoff()

 	for {
+		// Send heartbeat to watchdog to indicate we're alive
+		if w.watchdog != nil {
+			w.watchdog.Heartbeat(w.forward.ID())
+		}
+
 		// Check if we should stop
 		select {
 		case <-w.ctx.Done():
@@ -184,11 +213,24 @@ func (w *ForwardWorker) establishForward(podName string) error {
 	forwardCtx, forwardCancel := context.WithCancel(w.ctx)
 	defer forwardCancel()

-	// Start a goroutine to monitor for stop signal
+	// Store cancel function so TriggerReconnect can use it
+	w.forwardCancelMu.Lock()
+	w.forwardCancel = forwardCancel
+	w.forwardCancelMu.Unlock()
+
+	defer func() {
+		w.forwardCancelMu.Lock()
+		w.forwardCancel = nil
+		w.forwardCancelMu.Unlock()
+	}()
+
+	// Start a goroutine to monitor for stop signal and reconnect triggers
 	go func() {
 		select {
 		case <-w.stopChan:
 			close(stopChan)
+		case <-w.reconnectChan:
+			close(stopChan)
 		case <-forwardCtx.Done():
 			close(stopChan)
 		}
@@ -230,6 +272,10 @@ func (w *ForwardWorker) establishForward(podName string) error {
 		if w.verbose {
 			log.Printf("[%s] Port-forward connection established", w.forward.ID())
 		}
+		// Mark connection as established in health checker
+		if w.healthChecker != nil {
+			w.healthChecker.MarkConnected(w.forward.ID())
+		}
 	case err := <-errChan:
 		return fmt.Errorf("failed to establish forward: %w", err)
 	case <-w.ctx.Done():