diff --git a/chart/Chart.yaml b/chart/Chart.yaml index 32755ed..962ac42 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -10,9 +10,9 @@ description: | type: application -version: 0.2.40 +version: 0.2.42 -appVersion: "0.2.40" +appVersion: "0.2.42" home: https://github.com/lukaszraczylo/kubernetes-images-sync-operator diff --git a/chart/values.yaml b/chart/values.yaml index 9a3fd47..7d38789 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -12,7 +12,7 @@ sa: - ALL image: repository: ghcr.io/lukaszraczylo/kubernetes-images-sync-operator - tag: 0.2.40 + tag: 0.2.42 resources: limits: cpu: 500m diff --git a/docker-image-worker/s3_utils.py b/docker-image-worker/s3_utils.py index 0da8f78..6891d86 100644 --- a/docker-image-worker/s3_utils.py +++ b/docker-image-worker/s3_utils.py @@ -69,16 +69,23 @@ def get_s3_client(use_role=False, role_name=None, use_current_role=False, aws_ac logger.info(f"Environment: {key}={value}") # Get the AWS region from environment or parameter - aws_region = region or os.environ.get('AWS_REGION') or os.environ.get('AWS_DEFAULT_REGION') - if not aws_region: + aws_region = os.environ.get('AWS_REGION') or os.environ.get('AWS_DEFAULT_REGION') + if not aws_region and not region: raise ValueError("AWS region must be specified either through region parameter or AWS_REGION environment variable") + # Use region from parameter only if not set in environment + if not aws_region: + aws_region = region + # Set it in environment for other AWS clients + os.environ['AWS_REGION'] = region + logger.info(f"Using AWS region: {aws_region}") # Create an STS client in the correct region - sts = boto3.client('sts', - region_name=aws_region, - endpoint_url=f'https://sts.{aws_region}.amazonaws.com') + sts_kwargs = {'endpoint_url': f'https://sts.{aws_region}.amazonaws.com'} + if not os.environ.get('AWS_REGION') and not os.environ.get('AWS_DEFAULT_REGION'): + sts_kwargs['region_name'] = aws_region + sts = boto3.client('sts', **sts_kwargs) # Read the web identity token token_file = os.environ.get('AWS_WEB_IDENTITY_TOKEN_FILE') @@ -105,14 +112,17 @@ def get_s3_client(use_role=False, role_name=None, use_current_role=False, aws_ac credentials = response['Credentials'] # Create the S3 client with the temporary credentials - client = boto3.client( - 's3', - region_name=aws_region, - aws_access_key_id=credentials['AccessKeyId'], - aws_secret_access_key=credentials['SecretAccessKey'], - aws_session_token=credentials['SessionToken'], - **client_kwargs - ) + s3_kwargs = { + 'aws_access_key_id': credentials['AccessKeyId'], + 'aws_secret_access_key': credentials['SecretAccessKey'], + 'aws_session_token': credentials['SessionToken'] + } + # Only set region_name if not already in environment + if not os.environ.get('AWS_REGION') and not os.environ.get('AWS_DEFAULT_REGION'): + s3_kwargs['region_name'] = aws_region + # Add any additional kwargs + s3_kwargs.update(client_kwargs) + client = boto3.client('s3', **s3_kwargs) logger.info(f"Successfully assumed role with web identity: {response['AssumedRoleUser']['Arn']}") diff --git a/internal/controller/raczylo.com/clusterimageexport_controller.go b/internal/controller/raczylo.com/clusterimageexport_controller.go index 457d51e..04c479a 100644 --- a/internal/controller/raczylo.com/clusterimageexport_controller.go +++ b/internal/controller/raczylo.com/clusterimageexport_controller.go @@ -308,7 +308,7 @@ func (r *ClusterImageExportReconciler) handleDeletion(ctx context.Context, clust // Continue with deletion even if cleanup fails } - // Delete existing cleanup job if it exists + // Create or recreate cleanup job jobName := "cleanup-" + shared.NormalizeImageName(clusterImageExport.Name) existingJob := &batchv1.Job{} err := r.Get(ctx, client.ObjectKey{ @@ -317,26 +317,35 @@ func (r *ClusterImageExportReconciler) handleDeletion(ctx context.Context, clust }, existingJob) if err == nil { - // Job exists, delete it - if err := r.Delete(ctx, existingJob); err != nil && !errors.IsNotFound(err) { - l.Error(err, "Failed to delete existing cleanup job") - // Continue with deletion even if cleanup job deletion fails - } else { - l.Info("Successfully deleted existing cleanup job", "job", jobName) + // Job exists, delete it first + deletePolicy := metav1.DeletePropagationForeground + deleteOptions := client.DeleteOptions{ + PropagationPolicy: &deletePolicy, } + if err := r.Delete(ctx, existingJob, &deleteOptions); err != nil && !errors.IsNotFound(err) { + l.Error(err, "Failed to delete existing cleanup job") + return ctrl.Result{Requeue: true}, nil + } + l.Info("Successfully deleted existing cleanup job", "job", jobName) + // Requeue to wait for job deletion to complete + return ctrl.Result{Requeue: true}, nil } else if !errors.IsNotFound(err) { - // Unexpected error, log but continue + // Unexpected error l.Error(err, "Error checking for existing cleanup job") + return ctrl.Result{Requeue: true}, nil } // Create new cleanup job - r.runCleanupJob(ctx, clusterImageExport) + if err := r.runCleanupJob(ctx, clusterImageExport); err != nil { + l.Error(err, "Failed to create cleanup job") + return ctrl.Result{Requeue: true}, nil + } - // Remove the finalizer regardless of cleanup job status + // Only remove finalizer after cleanup job is created successfully controllerutil.RemoveFinalizer(clusterImageExport, clusterImageExportFinalizer) if err := r.Update(ctx, clusterImageExport); err != nil { if errors.IsNotFound(err) { - // CRD is already gone, which is fine + // Object is already gone, which is fine return ctrl.Result{}, nil } return ctrl.Result{}, err @@ -367,7 +376,7 @@ func (r *ClusterImageExportReconciler) deleteAssociatedClusterImages(ctx context return nil } -func (r *ClusterImageExportReconciler) runCleanupJob(ctx context.Context, clusterImageExport *raczylocomv1.ClusterImageExport) { +func (r *ClusterImageExportReconciler) runCleanupJob(ctx context.Context, clusterImageExport *raczylocomv1.ClusterImageExport) error { l := log.FromContext(ctx) normalisedImageName := "cleanup-" + shared.NormalizeImageName(clusterImageExport.Name) @@ -413,11 +422,12 @@ func (r *ClusterImageExportReconciler) runCleanupJob(ctx context.Context, cluste cleanupJob := shared.CreateJob(jobParams, func(raczylocomv1.ClusterImageExport) []string { return nil }) - // Try to create the cleanup job but don't block on errors + // Try to create the cleanup job if err := r.Create(ctx, cleanupJob); err != nil { - l.Error(err, "Failed to create cleanup job, continuing with deletion anyway") - return + l.Error(err, "Failed to create cleanup job") + return err } l.Info("Created cleanup job with retry limit and TTL") + return nil }