Files
jobs-manager-operator/controllers/metrics.go
T
lukaszraczylo 2b36071647 Multiple fixes (#29)
* Multiple fixes

- add goreleaser to the build / release process
- add kubectl plugin for job graphs visualization
- add installation scripts
- update dependencies

* Update the release & CRD content.

* Next set of improvements.

  Code Quality

  - Label constants: Added LabelWorkflowName, LabelGroupName, LabelJobName, LabelJobID in controllers/definitions.go
  - Removed commented debug code: Cleaned up dead code from multiple files
  - Removed unused dependencyTree field: Cleaned connPackage struct
  - Fixed snake_case variables: Changed to camelCase (runGroup, groupDep, runJob, jobDep, k8sJob)

  Kubernetes Best Practices

  - Finalizers: Implemented handleDeletion() and deleteChildJobs() for proper cleanup
  - Status enum validation: Added +kubebuilder:validation:Enum=pending;running;succeeded;failed;aborted
  - ImagePullPolicy default: Created getImagePullPolicy() helper that defaults to IfNotPresent
  - Resource limits support: Added Resources *corev1.ResourceRequirements to ManagedJobParameters

  Observability

  - Prometheus metrics: Created controllers/metrics.go with counters (jobs created/succeeded/failed), histogram (reconciliation duration), and gauge (active jobs)
  - Structured logging: Added logger field to connPackage, used context-based logging throughout

  Configuration

  - Leader election ID: Made configurable via --leader-election-id flag
  - Development mode: Made configurable via --dev-mode flag and LOG_LEVEL env var

  Performance

  - Dependency lookup optimization: Changed from O(n*m) to O(1) using lookup maps (jobDepMap, groupDepMap)
  - Reconciliation backoff: Added RequeueAfter: 30*time.Second when workflow is running

  Documentation & Testing

  - Godoc documentation: Added comprehensive comments to API types and controller
  - Unit tests: Added helpers_test.go with tests for all helper functions
  - Integration tests: Added managedjob_controller_test.go with Ginkgo/Gomega tests

* Add the helm chart release.

* Add reasonable test coverage.
2025-12-17 22:33:23 +00:00

86 lines
2.6 KiB
Go

package controllers
import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)
// Prometheus collectors exported by the operator. All of them are registered
// with the controller-runtime metrics registry in init.
var (
	// JobsCreatedTotal is a counter vector for Kubernetes jobs created by the
	// operator, labelled by namespace, workflow and group.
	JobsCreatedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "managedjob_jobs_created_total",
			Help: "Total number of Kubernetes jobs created by the operator",
		},
		[]string{"namespace", "workflow", "group"},
	)

	// JobsSucceededTotal is a counter vector for jobs that completed
	// successfully, labelled by namespace, workflow and group.
	JobsSucceededTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "managedjob_jobs_succeeded_total",
			Help: "Total number of jobs that completed successfully",
		},
		[]string{"namespace", "workflow", "group"},
	)

	// JobsFailedTotal is a counter vector for jobs that failed, labelled by
	// namespace, workflow and group.
	JobsFailedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "managedjob_jobs_failed_total",
			Help: "Total number of jobs that failed",
		},
		[]string{"namespace", "workflow", "group"},
	)

	// ReconciliationDuration is a histogram vector of reconciliation times in
	// seconds, labelled by namespace and workflow.
	ReconciliationDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "managedjob_reconciliation_duration_seconds",
			Help: "Time spent reconciling ManagedJob resources",
			// 15 buckets doubling from 1ms; the largest finite bound is ~16.4s.
			Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
		},
		[]string{"namespace", "workflow"},
	)

	// ActiveJobs is a gauge vector holding the number of currently running
	// jobs, labelled by namespace and workflow.
	ActiveJobs = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "managedjob_active_jobs",
			Help: "Number of currently active (running) jobs per workflow",
		},
		[]string{"namespace", "workflow"},
	)
)
func init() {
// Register custom metrics with the controller-runtime metrics registry
metrics.Registry.MustRegister(
JobsCreatedTotal,
JobsSucceededTotal,
JobsFailedTotal,
ReconciliationDuration,
ActiveJobs,
)
}
// RecordJobCreated adds one to JobsCreatedTotal for the series identified by
// the given namespace, workflow and group label values.
func RecordJobCreated(namespace, workflow, group string) {
	created := JobsCreatedTotal.WithLabelValues(namespace, workflow, group)
	created.Inc()
}
// RecordJobSucceeded adds one to JobsSucceededTotal for the series identified
// by the given namespace, workflow and group label values.
func RecordJobSucceeded(namespace, workflow, group string) {
	succeeded := JobsSucceededTotal.WithLabelValues(namespace, workflow, group)
	succeeded.Inc()
}
// RecordJobFailed adds one to JobsFailedTotal for the series identified by
// the given namespace, workflow and group label values.
func RecordJobFailed(namespace, workflow, group string) {
	failed := JobsFailedTotal.WithLabelValues(namespace, workflow, group)
	failed.Inc()
}
// SetActiveJobs overwrites the ActiveJobs gauge for the given namespace and
// workflow with count.
func SetActiveJobs(namespace, workflow string, count float64) {
	active := ActiveJobs.WithLabelValues(namespace, workflow)
	active.Set(count)
}