Fix the issue with job continuously restarting.

This commit is contained in:
2024-09-11 23:51:06 +01:00
parent 9cf5975def
commit e8cda5952f
4 changed files with 22 additions and 25 deletions
+2 -2
View File
@@ -10,9 +10,9 @@ description: |
type: application
version: 0.1.30
version: 0.1.33
appVersion: "0.1.30"
appVersion: "0.1.33"
home: https://github.com/lukaszraczylo/kubernetes-images-sync-operator
+1 -1
View File
@@ -12,7 +12,7 @@ sa:
- ALL
image:
repository: ghcr.io/lukaszraczylo/kubernetes-images-sync-operator
tag: 0.1.30
tag: 0.1.33
resources:
limits:
cpu: 500m
@@ -155,6 +155,11 @@ func (r *ClusterImageReconciler) handleRunningClusterImage(ctx context.Context,
if existingJob.Status.Succeeded > 0 {
clusterImage.Status.Progress = shared.STATUS_SUCCESS
r.ActiveJobs--
// Update the status before cleaning up the job
if err := r.Status().Update(ctx, clusterImage); err != nil {
l.Error(err, "unable to update ClusterImage status to SUCCESS")
return ctrl.Result{}, err
}
if err := r.cleanupJobAndPods(ctx, existingJob); err != nil {
l.Error(err, "unable to cleanup job and pods")
return ctrl.Result{}, err
@@ -163,6 +168,11 @@ func (r *ClusterImageReconciler) handleRunningClusterImage(ctx context.Context,
if clusterImage.Status.RetryCount < 3 {
clusterImage.Status.Progress = shared.STATUS_RETRYING
clusterImage.Status.RetryCount++
// Update the status before cleaning up the job
if err := r.Status().Update(ctx, clusterImage); err != nil {
l.Error(err, "unable to update ClusterImage status for retry")
return ctrl.Result{}, err
}
if err := r.cleanupJobAndPods(ctx, existingJob); err != nil {
l.Error(err, "unable to cleanup failed job and pods for retry")
return ctrl.Result{}, err
@@ -172,6 +182,11 @@ func (r *ClusterImageReconciler) handleRunningClusterImage(ctx context.Context,
} else {
clusterImage.Status.Progress = shared.STATUS_FAILED
r.ActiveJobs--
// Update the status before cleaning up the job
if err := r.Status().Update(ctx, clusterImage); err != nil {
l.Error(err, "unable to update ClusterImage status to FAILED")
return ctrl.Result{}, err
}
if err := r.cleanupJobAndPods(ctx, existingJob); err != nil {
l.Error(err, "unable to cleanup failed job and pods")
return ctrl.Result{}, err
@@ -192,8 +207,10 @@ func (r *ClusterImageReconciler) handleRunningClusterImage(ctx context.Context,
return r.updateClusterImageExportStatus(ctx, clusterImage)
}
func (r *ClusterImageReconciler) cleanupJobAndPods(ctx context.Context, job *v1batch.Job) error {
// Add a short delay to allow status updates to propagate
time.Sleep(2 * time.Second)
// Delete the job
if err := r.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil && !errors.IsNotFound(err) {
return fmt.Errorf("failed to delete job: %w", err)
@@ -410,23 +427,3 @@ func (r *ClusterImageReconciler) checkImageExists(ctx context.Context, clusterIm
return false, nil
}
// isJobStarted reports whether at least one pod matching the job's label
// selector currently has phase v1.PodRunning.
//
// Pods are fetched with r.List, restricted by client.InNamespace(job.Namespace)
// and client.MatchingLabels(job.Spec.Selector.MatchLabels).
//
// It returns an error when the job has no selector (rather than panicking on
// the nil pointer) or when the list call fails. It returns (false, nil) when
// the list succeeds but no listed pod is running.
func (r *ClusterImageReconciler) isJobStarted(ctx context.Context, job *v1batch.Job) (bool, error) {
	// Guard the selector: dereferencing a nil Selector below would panic.
	if job.Spec.Selector == nil {
		return false, fmt.Errorf("job %s/%s has no pod selector", job.Namespace, job.Name)
	}

	podList := &v1.PodList{}
	if err := r.List(ctx, podList, client.InNamespace(job.Namespace), client.MatchingLabels(job.Spec.Selector.MatchLabels)); err != nil {
		return false, err
	}

	// Index into the slice instead of ranging by value so each Pod struct is
	// not copied just to read its phase.
	for i := range podList.Items {
		if podList.Items[i].Status.Phase == v1.PodRunning {
			return true, nil
		}
	}
	return false, nil
}
// hasJobTimedOut reports whether more than five minutes have elapsed since the
// job's creation timestamp.
//
// NOTE(review): only the job's age is measured here; whether the job's pods
// have started is not inspected by this function — callers presumably combine
// it with a separate "started" check; confirm at the call site.
func (r *ClusterImageReconciler) hasJobTimedOut(job *v1batch.Job) bool {
	const startTimeout = 5 * time.Minute

	age := time.Since(job.CreationTimestamp.Time)
	return age > startTimeout
}
+1 -1
View File
@@ -1,5 +1,5 @@
version: 1
force:
minor: 1
existing: false
existing: true
strict: false