Files
kportal/internal/forward/manager.go
T
lukaszraczylo 7a33e01863 fix: 4 P0 concurrency races in forward + k8s
P0 #2 — currentConfig data race
  Manager.currentConfig was written without locking in Start/Reload but
  read from the health-checker callback goroutine. All accesses now go
  through workersMu (read or write as appropriate).

P0 #3 — Reload kills health checker permanently
  Reload's zero-forward branch called m.Stop() which tore down the
  health checker, watchdog, and event bus. After that, EnableForward
  silently registered callbacks against dead components. Now the branch
  stops only the running workers; the supervisory infrastructure stays
  alive across config changes.

P0 #4 — rest.Config write-write race
  executePortForward was mutating .Dial on the cached *rest.Config
  shared by all forwards in the same kube context. Cloning the config
  with rest.CopyConfig before mutation isolates per-forward dialers.

P0 #5 — ForwardWorker.Stop() double-close panic
  close(w.stopChan) is now wrapped in sync.Once, so concurrent Stop
  calls (Manager.Stop racing stopWorkerInternal) are safe.

New tests in internal/forward/concurrency_test.go exercise each fix
under -race: 16 concurrent worker Stops, repeated sequential Stops,
empty-Reload preserves infra pointers, and concurrent currentConfig
read/write.
2026-05-06 10:45:10 +01:00

616 lines
17 KiB
Go

// Package forward provides the core port-forwarding orchestration for kportal.
// It manages the lifecycle of port-forward workers, handles hot-reload of
// configuration changes, and coordinates with the health checker and watchdog.
//
// The Manager is the central orchestrator that:
// - Creates and manages ForwardWorker instances for each configured forward
// - Handles graceful startup, shutdown, and reconfiguration
// - Coordinates with the HealthChecker for connection monitoring
// - Integrates with mDNS for hostname publishing
//
// ForwardWorker handles individual port-forward connections with:
// - Automatic retry with exponential backoff (1s → 2s → 4s → 8s → 10s max)
// - Pod restart detection and re-resolution
// - Graceful shutdown support
package forward
import (
"fmt"
"log"
"sync"
"time"
"github.com/lukaszraczylo/kportal/internal/config"
"github.com/lukaszraczylo/kportal/internal/events"
"github.com/lukaszraczylo/kportal/internal/healthcheck"
"github.com/lukaszraczylo/kportal/internal/k8s"
"github.com/lukaszraczylo/kportal/internal/logger"
"github.com/lukaszraczylo/kportal/internal/mdns"
)
// StatusUpdater is an interface for updating forward status
type StatusUpdater interface {
UpdateStatus(id string, status string)
AddForward(id string, fwd *config.Forward)
Remove(id string)
}
// Manager orchestrates all port-forward workers.
// It handles starting, stopping, and hot-reloading forwards.
type Manager struct {
statusUI StatusUpdater
healthChecker *healthcheck.Checker
clientPool *k8s.ClientPool
resolver *k8s.ResourceResolver
portForwarder *k8s.PortForwarder
portChecker *PortChecker
workers map[string]*ForwardWorker
watchdog *Watchdog
mdnsPublisher *mdns.Publisher
eventBus *events.Bus
// currentConfig holds the active configuration. Access MUST be guarded by
// workersMu — it is read from the health-checker callback goroutine
// (registered in startWorker) and written by Start/Reload.
currentConfig *config.Config
workersMu sync.RWMutex
verbose bool
}
// NewManager creates a new forward Manager.
// The health checker will be created with default settings and can be
// reconfigured via SetConfig().
func NewManager(verbose bool) (*Manager, error) {
clientPool, err := k8s.NewClientPool()
if err != nil {
return nil, fmt.Errorf("failed to create client pool: %w", err)
}
resolver := k8s.NewResourceResolver(clientPool)
portForwarder := k8s.NewPortForwarder(clientPool, resolver)
// Create health checker with defaults: check every 3 seconds with 2 second timeout
// Will be reconfigured when config is loaded
healthChecker := healthcheck.NewChecker(3*time.Second, 2*time.Second)
// Create watchdog with default settings: check every 30 seconds, 60 second hang threshold
// Will be reconfigured when config is loaded
watchdog := NewWatchdog(30*time.Second, 60*time.Second)
// Create event bus for decoupled communication between components
eventBus := events.NewBus()
// Wire up event bus to components
healthChecker.SetEventBus(eventBus)
watchdog.SetEventBus(eventBus)
return &Manager{
workers: make(map[string]*ForwardWorker),
clientPool: clientPool,
resolver: resolver,
portForwarder: portForwarder,
portChecker: NewPortChecker(),
healthChecker: healthChecker,
watchdog: watchdog,
eventBus: eventBus,
verbose: verbose,
}, nil
}
// configureHealthChecker creates a new health checker with settings from config
func (m *Manager) configureHealthChecker(cfg *config.Config) {
// Stop existing health checker
if m.healthChecker != nil {
m.healthChecker.Stop()
}
// Parse check method
methodStr := cfg.GetHealthCheckMethod()
var method healthcheck.CheckMethod
switch methodStr {
case "tcp-dial":
method = healthcheck.CheckMethodTCPDial
case "data-transfer":
method = healthcheck.CheckMethodDataTransfer
default:
method = healthcheck.CheckMethodDataTransfer
}
// Create new health checker with config settings
m.healthChecker = healthcheck.NewCheckerWithOptions(healthcheck.CheckerOptions{
Interval: cfg.GetHealthCheckIntervalOrDefault(),
Timeout: cfg.GetHealthCheckTimeoutOrDefault(),
Method: method,
MaxConnectionAge: cfg.GetMaxConnectionAge(),
MaxIdleTime: cfg.GetMaxIdleTime(),
})
// Reconnect event bus to new health checker
if m.eventBus != nil {
m.healthChecker.SetEventBus(m.eventBus)
}
// Configure TCP settings on port forwarder
tcpKeepalive := cfg.GetTCPKeepalive()
dialTimeout := cfg.GetDialTimeout()
m.portForwarder.SetTCPKeepalive(tcpKeepalive)
m.portForwarder.SetDialTimeout(dialTimeout)
logger.Info("Health checker and reliability configured", map[string]interface{}{
"interval": cfg.GetHealthCheckIntervalOrDefault().String(),
"timeout": cfg.GetHealthCheckTimeoutOrDefault().String(),
"method": methodStr,
"max_connection_age": cfg.GetMaxConnectionAge().String(),
"max_idle_time": cfg.GetMaxIdleTime().String(),
"tcp_keepalive": tcpKeepalive.String(),
"dial_timeout": dialTimeout.String(),
})
}
// SetStatusUI sets the status updater for the manager
func (m *Manager) SetStatusUI(ui StatusUpdater) {
m.statusUI = ui
}
// SetMDNSPublisher sets the mDNS publisher for the manager
func (m *Manager) SetMDNSPublisher(publisher *mdns.Publisher) {
m.mdnsPublisher = publisher
}
// Start initializes and starts all port-forwards from the configuration.
func (m *Manager) Start(cfg *config.Config) error {
if cfg == nil {
return fmt.Errorf("configuration is nil")
}
m.workersMu.Lock()
m.currentConfig = cfg
m.workersMu.Unlock()
// Configure health checker with settings from config
m.configureHealthChecker(cfg)
// Start watchdog
watchdogPeriod := cfg.GetWatchdogPeriod()
m.watchdog.checkInterval = watchdogPeriod
m.watchdog.hangThreshold = watchdogPeriod * 2 // Hang threshold is 2x check interval
m.watchdog.Start()
logger.Info("Watchdog started", map[string]interface{}{
"check_interval": watchdogPeriod.String(),
"hang_threshold": (watchdogPeriod * 2).String(),
})
// Get all forwards from config
forwards := cfg.GetAllForwards()
// Empty config is valid - user can add forwards later via TUI
if len(forwards) == 0 {
log.Printf("No forwards configured - use 'n' to add forwards")
return nil
}
// Check port availability before starting
ports := m.extractPorts(forwards)
conflicts := m.portChecker.CheckAvailability(ports, nil)
if len(conflicts) > 0 {
// Add resource information to conflicts
for i := range conflicts {
conflicts[i].Resource = m.getResourceForPort(forwards, conflicts[i].Port)
}
return fmt.Errorf("port conflicts detected:\n%s", FormatConflicts(conflicts))
}
// Start all workers
log.Printf("Starting %d port-forward(s)...", len(forwards))
for _, fwd := range forwards {
if err := m.startWorker(fwd); err != nil {
logger.Error("Failed to start worker", map[string]interface{}{
"forward_id": fwd.ID(),
"context": fwd.GetContext(),
"namespace": fwd.GetNamespace(),
"resource": fwd.Resource,
"local_port": fwd.LocalPort,
"error": err.Error(),
})
// Continue with other workers
}
}
log.Printf("All port-forwards started")
return nil
}
// Stop gracefully stops all port-forward workers.
func (m *Manager) Stop() {
log.Printf("Stopping all port-forwards...")
// Stop health checker and watchdog first
m.healthChecker.Stop()
m.watchdog.Stop()
// Close event bus
if m.eventBus != nil {
m.eventBus.Close()
}
// Stop mDNS publisher
if m.mdnsPublisher != nil {
m.mdnsPublisher.Stop()
}
m.workersMu.Lock()
workers := make([]*ForwardWorker, 0, len(m.workers))
for _, worker := range m.workers {
workers = append(workers, worker)
}
m.workersMu.Unlock()
// Stop all workers with limited concurrency to avoid unbounded goroutine creation
var wg sync.WaitGroup
sem := make(chan struct{}, 10) // Limit to 10 concurrent stops
for _, worker := range workers {
wg.Add(1)
sem <- struct{}{} // Acquire semaphore
go func(w *ForwardWorker) {
defer wg.Done()
defer func() { <-sem }() // Release semaphore
w.Stop()
}(worker)
}
wg.Wait()
// Clear workers map
m.workersMu.Lock()
m.workers = make(map[string]*ForwardWorker)
m.workersMu.Unlock()
log.Printf("All port-forwards stopped")
}
// Reload applies a new configuration with hot-reload logic.
// It diffs the new config against the current one and:
// - Stops removed forwards
// - Keeps unchanged forwards running
// - Starts new forwards
func (m *Manager) Reload(newCfg *config.Config) error {
if newCfg == nil {
return fmt.Errorf("new configuration is nil")
}
logger.Info("Reloading configuration", map[string]interface{}{
"new_forwards_count": len(newCfg.GetAllForwards()),
})
// Get all forwards from new config
newForwards := newCfg.GetAllForwards()
if len(newForwards) == 0 {
log.Printf("New configuration has no forwards, stopping all workers")
// Do NOT call m.Stop() here: it tears down healthChecker, watchdog
// and eventBus, which must remain alive so subsequent
// EnableForward / Reload calls can register against them.
// Only stop currently-running workers and update currentConfig.
m.workersMu.RLock()
ids := make([]string, 0, len(m.workers))
for id := range m.workers {
ids = append(ids, id)
}
m.workersMu.RUnlock()
for _, id := range ids {
if err := m.stopWorkerInternal(id, true); err != nil {
log.Printf("Failed to stop worker %s: %v", id, err)
}
}
m.workersMu.Lock()
m.currentConfig = newCfg
m.workersMu.Unlock()
return nil
}
// Create maps for easier comparison
newForwardsMap := make(map[string]config.Forward)
for _, fwd := range newForwards {
newForwardsMap[fwd.ID()] = fwd
}
m.workersMu.RLock()
currentForwardsMap := make(map[string]config.Forward)
for id, worker := range m.workers {
currentForwardsMap[id] = worker.GetForward()
}
m.workersMu.RUnlock()
// Determine changes
var toAdd []config.Forward
var toRemove []string
var toKeep []string
// Find forwards to add and keep
for id, fwd := range newForwardsMap {
if _, exists := currentForwardsMap[id]; exists {
toKeep = append(toKeep, id)
} else {
toAdd = append(toAdd, fwd)
}
}
// Find forwards to remove
for id := range currentForwardsMap {
if _, exists := newForwardsMap[id]; !exists {
toRemove = append(toRemove, id)
}
}
// Check port availability for new forwards
if len(toAdd) > 0 {
// Get currently managed ports to skip in availability check
managedPorts := make(map[int]bool)
for _, id := range toKeep {
managedPorts[currentForwardsMap[id].LocalPort] = true
}
// Check new ports
newPorts := m.extractPorts(toAdd)
conflicts := m.portChecker.CheckAvailability(newPorts, managedPorts)
if len(conflicts) > 0 {
// Add resource information to conflicts
for i := range conflicts {
conflicts[i].Resource = m.getResourceForPort(toAdd, conflicts[i].Port)
}
log.Printf("Config change rejected due to port conflicts:\n%s", FormatConflicts(conflicts))
log.Printf("Keeping previous configuration active")
return fmt.Errorf("port conflicts detected")
}
}
// Apply changes
log.Printf("Configuration diff: %d to add, %d to remove, %d to keep",
len(toAdd), len(toRemove), len(toKeep))
// Stop removed forwards
for _, id := range toRemove {
if err := m.stopWorker(id); err != nil {
log.Printf("Failed to stop worker %s: %v", id, err)
} else {
log.Printf("Stopped: %s", id)
}
}
// Start new forwards
for _, fwd := range toAdd {
if err := m.startWorker(fwd); err != nil {
log.Printf("Failed to start worker for %s: %v", fwd.ID(), err)
} else {
log.Printf("Started: %s", fwd.ID())
}
}
// Update current config
m.workersMu.Lock()
m.currentConfig = newCfg
m.workersMu.Unlock()
log.Printf("Configuration reloaded successfully")
return nil
}
// startWorker creates and starts a new forward worker.
func (m *Manager) startWorker(fwd config.Forward) error {
m.workersMu.Lock()
defer m.workersMu.Unlock()
// Check if worker already exists
if _, exists := m.workers[fwd.ID()]; exists {
return fmt.Errorf("worker already exists for %s", fwd.ID())
}
// Notify UI about new forward
if m.statusUI != nil {
m.statusUI.AddForward(fwd.ID(), &fwd)
}
// Create worker first so we can pass it to watchdog
worker := NewForwardWorker(fwd, m.portForwarder, m.verbose, m.statusUI, m.healthChecker, m.watchdog)
// Register with watchdog using the new responder interface
// This allows the watchdog to poll the worker for heartbeats centrally
// instead of each worker spawning its own heartbeat goroutine
m.watchdog.RegisterWorkerWithResponder(fwd.ID(), worker, func(forwardID string) {
logger.Warn("Watchdog triggered reconnection for hung worker", map[string]interface{}{
"forward_id": forwardID,
})
// Find and trigger reconnect on hung worker
m.workersMu.RLock()
w, exists := m.workers[forwardID]
m.workersMu.RUnlock()
if exists {
w.TriggerReconnect("watchdog detected hung worker")
}
})
// Register with health checker
m.healthChecker.Register(fwd.ID(), fwd.LocalPort, func(forwardID string, status healthcheck.Status, errorMsg string) {
if m.statusUI != nil {
m.statusUI.UpdateStatus(forwardID, string(status))
// Send error separately if there is one
if (status == healthcheck.StatusUnhealthy || status == healthcheck.StatusStale) && errorMsg != "" {
if ui, ok := m.statusUI.(interface{ SetError(id, msg string) }); ok {
ui.SetError(forwardID, errorMsg)
}
}
}
// Handle stale connections: trigger reconnection if retryOnStale is enabled.
// Read currentConfig and worker map under a single lock acquisition
// to avoid racing with Reload/Start writes.
if status == healthcheck.StatusStale {
m.workersMu.RLock()
retryOnStale := m.currentConfig != nil && m.currentConfig.GetRetryOnStale()
staleWorker, exists := m.workers[forwardID]
m.workersMu.RUnlock()
if retryOnStale {
logger.Info("Stale connection detected, triggering reconnection", map[string]interface{}{
"forward_id": forwardID,
"reason": errorMsg,
})
if exists {
staleWorker.TriggerReconnect("stale connection")
}
}
}
})
// Start the worker (already created above)
worker.Start()
// Store worker
m.workers[fwd.ID()] = worker
// Register mDNS hostname if enabled
// Uses explicit alias if set, otherwise generates from resource name
if m.mdnsPublisher != nil {
mdnsAlias := fwd.GetMDNSAlias()
if mdnsAlias != "" {
if err := m.mdnsPublisher.Register(fwd.ID(), mdnsAlias, fwd.LocalPort); err != nil {
logger.Warn("Failed to register mDNS hostname", map[string]interface{}{
"forward_id": fwd.ID(),
"alias": mdnsAlias,
"error": err.Error(),
})
// Don't fail the forward start - mDNS is optional
}
}
}
return nil
}
// stopWorker stops and removes a forward worker.
func (m *Manager) stopWorker(id string) error {
return m.stopWorkerInternal(id, true)
}
// stopWorkerInternal stops a worker with option to remove from UI or just update status
func (m *Manager) stopWorkerInternal(id string, removeFromUI bool) error {
m.workersMu.Lock()
worker, exists := m.workers[id]
if !exists {
m.workersMu.Unlock()
return fmt.Errorf("worker not found: %s", id)
}
delete(m.workers, id)
m.workersMu.Unlock()
// Unregister from health checker and watchdog
m.healthChecker.Unregister(id)
m.watchdog.UnregisterWorker(id)
// Unregister mDNS hostname
if m.mdnsPublisher != nil {
m.mdnsPublisher.Unregister(id)
}
// Notify UI - either remove or update to disabled status
if m.statusUI != nil {
if removeFromUI {
m.statusUI.Remove(id)
} else {
m.statusUI.UpdateStatus(id, "Disabled")
}
}
// Stop the worker
worker.Stop()
return nil
}
// GetWorker returns a worker by ID, or nil if not found.
func (m *Manager) GetWorker(id string) *ForwardWorker {
m.workersMu.RLock()
defer m.workersMu.RUnlock()
return m.workers[id]
}
// extractPorts extracts all local ports from a list of forwards.
func (m *Manager) extractPorts(forwards []config.Forward) []int {
ports := make([]int, len(forwards))
for i, fwd := range forwards {
ports[i] = fwd.LocalPort
}
return ports
}
// getResourceForPort finds the resource (forward ID) that uses a given port.
func (m *Manager) getResourceForPort(forwards []config.Forward, port int) string {
for _, fwd := range forwards {
if fwd.LocalPort == port {
return fwd.ID()
}
}
return "unknown"
}
// DisableForward temporarily stops a forward by ID
func (m *Manager) DisableForward(id string) error {
if err := m.stopWorkerInternal(id, false); err != nil {
return err
}
log.Printf("Disabled: %s", id)
return nil
}
// EnableForward re-enables a previously disabled forward
func (m *Manager) EnableForward(id string) error {
// Find the forward configuration in current config (read under lock)
m.workersMu.RLock()
cfg := m.currentConfig
m.workersMu.RUnlock()
if cfg == nil {
return fmt.Errorf("no configuration available")
}
forwards := cfg.GetAllForwards()
var targetFwd *config.Forward
for _, fwd := range forwards {
if fwd.ID() == id {
targetFwd = &fwd
break
}
}
if targetFwd == nil {
return fmt.Errorf("forward not found in configuration: %s", id)
}
// Check if already running
m.workersMu.RLock()
_, exists := m.workers[id]
m.workersMu.RUnlock()
if exists {
return fmt.Errorf("forward already enabled: %s", id)
}
// Start the worker
if err := m.startWorker(*targetFwd); err != nil {
return fmt.Errorf("failed to enable forward: %w", err)
}
log.Printf("Enabled: %s", id)
return nil
}