diff --git a/chart/Chart.yaml b/chart/Chart.yaml index 1e16bae..9677e0d 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -10,9 +10,9 @@ description: | type: application -version: 0.2.33 +version: 0.2.34 -appVersion: "0.2.33" +appVersion: "0.2.34" home: https://github.com/lukaszraczylo/kubernetes-images-sync-operator diff --git a/chart/values.yaml b/chart/values.yaml index a06e49f..45df93d 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -12,7 +12,7 @@ sa: - ALL image: repository: ghcr.io/lukaszraczylo/kubernetes-images-sync-operator - tag: 0.2.33 + tag: 0.2.34 resources: limits: cpu: 500m diff --git a/internal/controller/raczylo.com/clusterimage_controller.go b/internal/controller/raczylo.com/clusterimage_controller.go index 069ab6c..06e614f 100644 --- a/internal/controller/raczylo.com/clusterimage_controller.go +++ b/internal/controller/raczylo.com/clusterimage_controller.go @@ -157,6 +157,13 @@ func (r *ClusterImageReconciler) handlePendingClusterImage(ctx context.Context, } func (r *ClusterImageReconciler) handleRunningClusterImage(ctx context.Context, clusterImage *raczylocomv1.ClusterImage, l logr.Logger) (ctrl.Result, error) { + // Get the latest version before proceeding + latest := &raczylocomv1.ClusterImage{} + if err := r.Get(ctx, types.NamespacedName{Name: clusterImage.Name, Namespace: clusterImage.Namespace}, latest); err != nil { + return ctrl.Result{}, err + } + clusterImage = latest + // Check for existing job for this ClusterImage existingJob := &v1batch.Job{} jobName := fmt.Sprintf("img-export-%s", clusterImage.Name) @@ -171,16 +178,26 @@ func (r *ClusterImageReconciler) handleRunningClusterImage(ctx context.Context, return ctrl.Result{}, nil } - // If we have retries left, consider retrying - if clusterImage.Status.RetryCount < 3 { - clusterImage.Status.Progress = shared.STATUS_RETRYING - clusterImage.Status.RetryCount++ - } else { - // Exceeded retries; mark as FAILED - clusterImage.Status.Progress = shared.STATUS_FAILED + // Get latest version before updating status + latest := &raczylocomv1.ClusterImage{} + if err := r.Get(ctx, types.NamespacedName{Name: clusterImage.Name, Namespace: clusterImage.Namespace}, latest); err != nil { + return ctrl.Result{}, err } - if err := r.Status().Update(ctx, clusterImage); err != nil { + // If we have retries left, consider retrying + if latest.Status.RetryCount < 3 { + latest.Status.Progress = shared.STATUS_RETRYING + latest.Status.RetryCount++ + } else { + // Exceeded retries; mark as FAILED + latest.Status.Progress = shared.STATUS_FAILED + } + + if err := r.Status().Update(ctx, latest); err != nil { + if errors.IsConflict(err) { + // Resource was modified, requeue and try again + return ctrl.Result{Requeue: true}, nil + } l.Error(err, "unable to update ClusterImage status after job not found") return ctrl.Result{}, err } @@ -192,18 +209,17 @@ func (r *ClusterImageReconciler) handleRunningClusterImage(ctx context.Context, } // Check job status and update ClusterImage accordingly - // Get latest version before updating status - latest := &raczylocomv1.ClusterImage{} - if err := r.Get(ctx, types.NamespacedName{Name: clusterImage.Name, Namespace: clusterImage.Namespace}, latest); err != nil { - return ctrl.Result{}, err - } - clusterImage = latest - if existingJob.Status.Succeeded > 0 { - clusterImage.Status.Progress = shared.STATUS_SUCCESS + // Get latest version before updating status + latest := &raczylocomv1.ClusterImage{} + if err := r.Get(ctx, types.NamespacedName{Name: clusterImage.Name, Namespace: clusterImage.Namespace}, latest); err != nil { + return ctrl.Result{}, err + } + + latest.Status.Progress = shared.STATUS_SUCCESS r.ActiveJobs-- // Update the status before cleaning up the job - if err := r.Status().Update(ctx, clusterImage); err != nil { + if err := r.Status().Update(ctx, latest); err != nil { if errors.IsConflict(err) { // Resource was modified, requeue and try again return ctrl.Result{Requeue: true}, nil @@ -431,6 +447,14 @@ func (r *ClusterImageReconciler) updateClusterImageExportStatus(ctx context.Cont func (r *ClusterImageReconciler) handleJobRestarts(ctx context.Context, job *v1batch.Job, clusterImage *raczylocomv1.ClusterImage) error { l := log.FromContext(ctx) + + // Get the latest version before proceeding + latest := &raczylocomv1.ClusterImage{} + if err := r.Get(ctx, types.NamespacedName{Name: clusterImage.Name, Namespace: clusterImage.Namespace}, latest); err != nil { + return err + } + clusterImage = latest + podList := &v1.PodList{} if err := r.List(ctx, podList, client.InNamespace(job.Namespace), client.MatchingLabels(job.Spec.Selector.MatchLabels)); err != nil { return err @@ -449,22 +473,21 @@ func (r *ClusterImageReconciler) handleJobRestarts(ctx context.Context, job *v1b if newRestarts > 0 { l.Info("Container restarts detected", "job", job.Name, "newRestarts", newRestarts, "totalRestarts", totalRestarts) + // Get latest version before updating + latest := &raczylocomv1.ClusterImage{} + if err := r.Get(ctx, types.NamespacedName{Name: clusterImage.Name, Namespace: clusterImage.Namespace}, latest); err != nil { + return err + } + // Update retry count with new restarts - clusterImage.Status.RetryCount += newRestarts - - if clusterImage.Status.RetryCount >= 3 { - // Get latest version before updating status - latest := &raczylocomv1.ClusterImage{} - if err := r.Get(ctx, types.NamespacedName{Name: clusterImage.Name, Namespace: clusterImage.Namespace}, latest); err != nil { - return err - } + latest.Status.RetryCount = clusterImage.Status.RetryCount + newRestarts + if latest.Status.RetryCount >= 3 { // Max retries reached latest.Status.Progress = shared.STATUS_FAILED - latest.Status.RetryCount = clusterImage.Status.RetryCount if err := r.Status().Update(ctx, latest); err != nil { if errors.IsConflict(err) { - // Resource was modified, try again + // Resource was modified, requeue and try again return nil } return fmt.Errorf("failed to update status to FAILED: %w", err) @@ -475,18 +498,11 @@ func (r *ClusterImageReconciler) handleJobRestarts(ctx context.Context, job *v1b return fmt.Errorf("failed to cleanup resources: %w", err) } } else { - // Get latest version before updating status - latest := &raczylocomv1.ClusterImage{} - if err := r.Get(ctx, types.NamespacedName{Name: clusterImage.Name, Namespace: clusterImage.Namespace}, latest); err != nil { - return err - } - // Still have retries left latest.Status.Progress = shared.STATUS_RETRYING - latest.Status.RetryCount = clusterImage.Status.RetryCount if err := r.Status().Update(ctx, latest); err != nil { if errors.IsConflict(err) { - // Resource was modified, try again + // Resource was modified, requeue and try again return nil } return fmt.Errorf("failed to update status to RETRYING: %w", err)