mirror of
https://github.com/lukaszraczylo/jobs-manager-operator.git
synced 2026-07-05 06:05:14 +00:00
Multiple fixes (#29)
* Multiple fixes
  - add goreleaser to the build / release process
  - add kubectl plugin for job graphs visualization
  - add installation scripts
  - update dependencies
* Update the release & CRD content.
* Next set of improvements.
  Code Quality
  - Label constants: Added LabelWorkflowName, LabelGroupName, LabelJobName, LabelJobID in controllers/definitions.go
  - Removed commented debug code: Cleaned up dead code from multiple files
  - Removed unused dependencyTree field: Cleaned connPackage struct
  - Fixed snake_case variables: Changed to camelCase (runGroup, groupDep, runJob, jobDep, k8sJob)
  Kubernetes Best Practices
  - Finalizers: Implemented handleDeletion() and deleteChildJobs() for proper cleanup
  - Status enum validation: Added +kubebuilder:validation:Enum=pending;running;succeeded;failed;aborted
  - ImagePullPolicy default: Created getImagePullPolicy() helper that defaults to IfNotPresent
  - Resource limits support: Added Resources *corev1.ResourceRequirements to ManagedJobParameters
  Observability
  - Prometheus metrics: Created controllers/metrics.go with counters (jobs created/succeeded/failed), histogram (reconciliation duration), and gauge (active jobs)
  - Structured logging: Added logger field to connPackage, used context-based logging throughout
  Configuration
  - Leader election ID: Made configurable via --leader-election-id flag
  - Development mode: Made configurable via --dev-mode flag and LOG_LEVEL env var
  Performance
  - Dependency lookup optimization: Changed from O(n*m) to O(1) using lookup maps (jobDepMap, groupDepMap)
  - Reconciliation backoff: Added RequeueAfter: 30*time.Second when workflow is running
  Documentation & Testing
  - Godoc documentation: Added comprehensive comments to API types and controller
  - Unit tests: Added helpers_test.go with tests for all helper functions
  - Integration tests: Added managedjob_controller_test.go with Ginkgo/Gomega tests
* Add the helm chart release.
* Add reasonable test coverage.
This commit is contained in:
@@ -18,18 +18,26 @@ package controllers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/lukaszraczylo/pandati"
|
||||
kbatch "k8s.io/api/batch/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/client-go/tools/record"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
|
||||
"sigs.k8s.io/controller-runtime/pkg/log"
|
||||
|
||||
jobsmanagerv1beta1 "raczylo.com/jobs-manager-operator/api/v1beta1"
|
||||
)
|
||||
|
||||
// RequeueDelay is how long Reconcile waits before re-checking a workflow
// whose status is still ExecutionStatusRunning.
const RequeueDelay = 30 * time.Second
|
||||
|
||||
// ManagedJobReconciler reconciles a ManagedJob object
|
||||
type ManagedJobReconciler struct {
|
||||
client.Client
|
||||
@@ -43,46 +51,125 @@ type ManagedJobReconciler struct {
|
||||
//+kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
|
||||
//+kubebuilder:rbac:groups="",resources=events,verbs=create;update;patch;delete;get;list;watch
|
||||
|
||||
// Reconcile ensures ManagedJob workflows progress toward completion.
|
||||
// It orchestrates job execution respecting dependencies, manages retries,
|
||||
// and tracks overall workflow status.
|
||||
func (r *ManagedJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
|
||||
_ = log.FromContext(ctx)
|
||||
|
||||
cp := &connPackage{
|
||||
r: r,
|
||||
ctx: ctx,
|
||||
req: req,
|
||||
dependencyTree: nil,
|
||||
}
|
||||
logger := log.FromContext(ctx).WithValues("managedJob", req.NamespacedName)
|
||||
|
||||
var managedJob jobsmanagerv1beta1.ManagedJob
|
||||
if err := r.Get(ctx, req.NamespacedName, &managedJob); err != nil {
|
||||
return ctrl.Result{}, client.IgnoreNotFound(err)
|
||||
}
|
||||
|
||||
cp.mj = &managedJob
|
||||
cp := &connPackage{
|
||||
r: r,
|
||||
ctx: ctx,
|
||||
req: req,
|
||||
logger: logger,
|
||||
mj: &managedJob,
|
||||
}
|
||||
|
||||
// Handle deletion with finalizer
|
||||
if !managedJob.DeletionTimestamp.IsZero() {
|
||||
return r.handleDeletion(ctx, cp)
|
||||
}
|
||||
|
||||
// Add finalizer if not present
|
||||
if !controllerutil.ContainsFinalizer(&managedJob, FinalizerName) {
|
||||
controllerutil.AddFinalizer(&managedJob, FinalizerName)
|
||||
if err := r.Update(ctx, &managedJob); err != nil {
|
||||
logger.Error(err, "Failed to add finalizer")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
return ctrl.Result{RequeueAfter: time.Second}, nil
|
||||
}
|
||||
|
||||
originalMainJobDefinition := cp.mj.DeepCopy()
|
||||
cp.generateDependencyTree()
|
||||
cp.buildDependencyMaps() // Build lookup maps for O(1) dependency updates
|
||||
_, theSame, _ := pandati.CompareStructsReplaced(originalMainJobDefinition, cp.mj)
|
||||
if !theSame {
|
||||
cp.updateCRDStatusDirectly()
|
||||
if err := cp.updateCRDStatusDirectly(); err != nil {
|
||||
logger.Error(err, "Failed to update CRD status after dependency tree generation")
|
||||
}
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
originalMainJobDefinition = cp.mj.DeepCopy()
|
||||
|
||||
// TODO: Re-enable after testing
|
||||
cp.checkRunningJobsStatus()
|
||||
cp.runPendingJobs()
|
||||
|
||||
_, theSame, _ = pandati.CompareStructsReplaced(originalMainJobDefinition, cp.mj)
|
||||
if !theSame {
|
||||
cp.updateCRDStatusDirectly()
|
||||
if err := cp.updateCRDStatusDirectly(); err != nil {
|
||||
logger.Error(err, "Failed to update CRD status after job processing")
|
||||
}
|
||||
}
|
||||
|
||||
cp.checkOverallStatus()
|
||||
// fmt.Printf("Reconcile: %# v", pretty.Formatter(r.Updater))
|
||||
|
||||
// If workflow is still running, requeue after a delay to check status
|
||||
if cp.mj.Status == ExecutionStatusRunning {
|
||||
return ctrl.Result{RequeueAfter: RequeueDelay}, nil
|
||||
}
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
// handleDeletion cleans up child jobs before removing the finalizer
|
||||
func (r *ManagedJobReconciler) handleDeletion(ctx context.Context, cp *connPackage) (ctrl.Result, error) {
|
||||
if !controllerutil.ContainsFinalizer(cp.mj, FinalizerName) {
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
cp.logger.Info("Cleaning up child jobs before deletion")
|
||||
|
||||
// Delete all child jobs
|
||||
if err := r.deleteChildJobs(ctx, cp); err != nil {
|
||||
cp.logger.Error(err, "Failed to delete child jobs")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
// Remove finalizer
|
||||
controllerutil.RemoveFinalizer(cp.mj, FinalizerName)
|
||||
if err := r.Update(ctx, cp.mj); err != nil {
|
||||
cp.logger.Error(err, "Failed to remove finalizer")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
cp.logger.Info("Successfully cleaned up ManagedJob")
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
// deleteChildJobs removes all jobs owned by this ManagedJob
|
||||
func (r *ManagedJobReconciler) deleteChildJobs(ctx context.Context, cp *connPackage) error {
|
||||
var childJobs kbatch.JobList
|
||||
labelSelector := labels.SelectorFromSet(labels.Set{
|
||||
LabelWorkflowName: cp.mj.Name,
|
||||
})
|
||||
listOptions := &client.ListOptions{
|
||||
LabelSelector: labelSelector,
|
||||
Namespace: cp.mj.Namespace,
|
||||
}
|
||||
|
||||
if err := r.Client.List(ctx, &childJobs, listOptions); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for i := range childJobs.Items {
|
||||
job := &childJobs.Items[i]
|
||||
if err := r.Client.Delete(ctx, job, client.PropagationPolicy("Background")); err != nil {
|
||||
cp.logger.Error(err, "Failed to delete child job", "job", job.Name)
|
||||
// Continue trying to delete other jobs
|
||||
} else {
|
||||
cp.logger.Info("Deleted child job", "job", job.Name)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetupWithManager sets up the controller with the Manager.
|
||||
func (r *ManagedJobReconciler) SetupWithManager(mgr ctrl.Manager) error {
|
||||
return ctrl.NewControllerManagedBy(mgr).
|
||||
|
||||
Reference in New Issue
Block a user