mirror of
https://github.com/lukaszraczylo/jobs-manager-operator.git
synced 2026-06-06 22:39:22 +00:00
2b36071647
* Multiple fixes - add goreleaser to the build / release process - add kubectl plugin for job graphs visualization - add installation scripts - update dependencies * Update the release & CRD content. * Next set of improvements. Code Quality - Label constants: Added LabelWorkflowName, LabelGroupName, LabelJobName, LabelJobID in controllers/definitions.go - Removed commented debug code: Cleaned up dead code from multiple files - Removed unused dependencyTree field: Cleaned connPackage struct - Fixed snake_case variables: Changed to camelCase (runGroup, groupDep, runJob, jobDep, k8sJob) Kubernetes Best Practices - Finalizers: Implemented handleDeletion() and deleteChildJobs() for proper cleanup - Status enum validation: Added +kubebuilder:validation:Enum=pending;running;succeeded;failed;aborted - ImagePullPolicy default: Created getImagePullPolicy() helper that defaults to IfNotPresent - Resource limits support: Added Resources *corev1.ResourceRequirements to ManagedJobParameters Observability - Prometheus metrics: Created controllers/metrics.go with counters (jobs created/succeeded/failed), histogram (reconciliation duration), and gauge (active jobs) - Structured logging: Added logger field to connPackage, used context-based logging throughout Configuration - Leader election ID: Made configurable via --leader-election-id flag - Development mode: Made configurable via --dev-mode flag and LOG_LEVEL env var Performance - Dependency lookup optimization: Changed from O(n*m) to O(1) using lookup maps (jobDepMap, groupDepMap) - Reconciliation backoff: Added RequeueAfter: 30*time.Second when workflow is running Documentation & Testing - Godoc documentation: Added comprehensive comments to API types and controller - Unit tests: Added helpers_test.go with tests for all helper functions - Integration tests: Added managedjob_controller_test.go with Ginkgo/Gomega tests * Add the helm chart release. * Add reasonable test coverage.
86 lines
2.6 KiB
Go
86 lines
2.6 KiB
Go
package controllers
|
|
|
|
import (
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"sigs.k8s.io/controller-runtime/pkg/metrics"
|
|
)
|
|
|
|
var (
|
|
// JobsCreatedTotal tracks the total number of jobs created
|
|
JobsCreatedTotal = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "managedjob_jobs_created_total",
|
|
Help: "Total number of Kubernetes jobs created by the operator",
|
|
},
|
|
[]string{"namespace", "workflow", "group"},
|
|
)
|
|
|
|
// JobsSucceededTotal tracks the total number of jobs that succeeded
|
|
JobsSucceededTotal = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "managedjob_jobs_succeeded_total",
|
|
Help: "Total number of jobs that completed successfully",
|
|
},
|
|
[]string{"namespace", "workflow", "group"},
|
|
)
|
|
|
|
// JobsFailedTotal tracks the total number of jobs that failed
|
|
JobsFailedTotal = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "managedjob_jobs_failed_total",
|
|
Help: "Total number of jobs that failed",
|
|
},
|
|
[]string{"namespace", "workflow", "group"},
|
|
)
|
|
|
|
// ReconciliationDuration tracks how long reconciliations take
|
|
ReconciliationDuration = prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "managedjob_reconciliation_duration_seconds",
|
|
Help: "Time spent reconciling ManagedJob resources",
|
|
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15), // 1ms to ~16s
|
|
},
|
|
[]string{"namespace", "workflow"},
|
|
)
|
|
|
|
// ActiveJobs tracks the number of currently running jobs per workflow
|
|
ActiveJobs = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "managedjob_active_jobs",
|
|
Help: "Number of currently active (running) jobs per workflow",
|
|
},
|
|
[]string{"namespace", "workflow"},
|
|
)
|
|
)
|
|
|
|
func init() {
|
|
// Register custom metrics with the controller-runtime metrics registry
|
|
metrics.Registry.MustRegister(
|
|
JobsCreatedTotal,
|
|
JobsSucceededTotal,
|
|
JobsFailedTotal,
|
|
ReconciliationDuration,
|
|
ActiveJobs,
|
|
)
|
|
}
|
|
|
|
// RecordJobCreated increments the job created counter
|
|
func RecordJobCreated(namespace, workflow, group string) {
|
|
JobsCreatedTotal.WithLabelValues(namespace, workflow, group).Inc()
|
|
}
|
|
|
|
// RecordJobSucceeded increments the job succeeded counter
|
|
func RecordJobSucceeded(namespace, workflow, group string) {
|
|
JobsSucceededTotal.WithLabelValues(namespace, workflow, group).Inc()
|
|
}
|
|
|
|
// RecordJobFailed increments the job failed counter
|
|
func RecordJobFailed(namespace, workflow, group string) {
|
|
JobsFailedTotal.WithLabelValues(namespace, workflow, group).Inc()
|
|
}
|
|
|
|
// SetActiveJobs sets the number of active jobs for a workflow
|
|
func SetActiveJobs(namespace, workflow string, count float64) {
|
|
ActiveJobs.WithLabelValues(namespace, workflow).Set(count)
|
|
}
|