From 5499734eef1e3279f8fb45851c6bb578454ec2fc Mon Sep 17 00:00:00 2001
From: Gianluca Mardente
Date: Thu, 19 Mar 2026 12:11:19 +0100
Subject: [PATCH] Fix hash evaluation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If hash evaluation fails because of a missing referenced resource, continue by
returning nil instead of failing. This will make sure the error is reported in
the clusterSummary status.

This PR also changes the logic in the clusterSummary controller's
`scope.Close()` handling. When a conflict is detected in scope.Close(), instead
of returning nothing, set `result = ctrl.Result{RequeueAfter: time.Minute}`.
This ensures re-reconciliation without bypassing the NextReconcileTime backoff
(since on the success path no setNextReconcileTime is called, so
skipReconciliation won't block the 1-minute requeue).

Test failure in the tier-change scenario:
1. Tier changes → ClusterSummary spec updated → reconcile fires
2. Deployment succeeds (no setNextReconcileTime called on success path → no
   cooldown set)
3. scope.Close() tries to patch status → conflict (the ClusterSummary was
   modified between read and patch by the controller's own in-flight logic)
4.
Old (pre PR): conflict swallowed → no requeue scheduled, no watch event comes → status never reaches Provisioned → test times out --- api/v1beta1/zz_generated.deepcopy.go | 5 +- cmd/main.go | 9 +-- controllers/clustersummary_controller.go | 20 +++++-- controllers/clustersummary_deployer.go | 3 +- controllers/controllers_suite_test.go | 10 ++-- controllers/handlers_kustomize.go | 1 + controllers/handlers_resources.go | 71 +++++++++++++--------- controllers/profile_utils.go | 2 +- controllers/utils.go | 2 +- test/fv/second_tier_test.go | 76 ++++++++++++++++++++++-- test/fv/utils_test.go | 9 +-- 11 files changed, 148 insertions(+), 60 deletions(-) diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index dba06be0..9241f874 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -21,11 +21,12 @@ limitations under the License. package v1beta1 import ( - apiv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" + + apiv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
diff --git a/cmd/main.go b/cmd/main.go index 6ff6df85..2e4c0348 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -781,16 +781,11 @@ func runInitContainerWork(ctx context.Context, config *rest.Config, func setupLogging() { klog.InitFlags(nil) - _ = flag.Set("logtostderr", "false") // set default, but still overridable via CLI + initFlags(pflag.CommandLine) pflag.CommandLine.SetNormalizeFunc(cliflag.WordSepNormalizeFunc) pflag.CommandLine.AddGoFlagSet(flag.CommandLine) pflag.Parse() - if flag.Lookup("logtostderr").Value.String() == "false" { - klog.SetOutputBySeverity("INFO", os.Stdout) - klog.SetOutputBySeverity("WARNING", os.Stdout) - klog.SetOutputBySeverity("ERROR", os.Stderr) - klog.SetOutputBySeverity("FATAL", os.Stderr) - } + ctrl.SetLogger(klog.Background()) } diff --git a/controllers/clustersummary_controller.go b/controllers/clustersummary_controller.go index 24058321..97849def 100644 --- a/controllers/clustersummary_controller.go +++ b/controllers/clustersummary_controller.go @@ -155,7 +155,7 @@ type ClusterSummaryReconciler struct { //+kubebuilder:rbac:groups="source.toolkit.fluxcd.io",resources=buckets,verbs=get;watch;list //+kubebuilder:rbac:groups="source.toolkit.fluxcd.io",resources=buckets/status,verbs=get;watch;list -func (r *ClusterSummaryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { +func (r *ClusterSummaryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, reterr error) { logger := ctrl.LoggerFrom(ctx) logger.V(logs.LogDebug).Info("Reconciling") @@ -210,14 +210,17 @@ func (r *ClusterSummaryReconciler) Reconcile(ctx context.Context, req ctrl.Reque } // Always close the scope when exiting this function so we can persist any ClusterSummary - // changes. Conflict errors are swallowed because the watch event from whatever caused the - // conflict will re-enqueue this resource, and the next reconciliation will recompute status. 
- // Propagating the conflict would cause controller-runtime to immediately requeue, bypassing - // the intended NextReconcileTime backoff. + // changes. Conflict errors are swallowed and a 1-minute requeue is scheduled instead of + // propagating the error (which would cause controller-runtime to immediately requeue, + // bypassing the intended NextReconcileTime backoff). We cannot rely solely on a watch event + // to re-enqueue because the conflict may be caused by the controller's own logic (e.g. a + // tier change), in which case no further watch event will arrive. defer func() { if err = clusterSummaryScope.Close(ctx); err != nil { if apierrors.IsConflict(err) { - logger.V(logs.LogDebug).Info("conflict patching ClusterSummary status, will reconcile on next event") + logger.V(logs.LogDebug).Info("conflict patching ClusterSummary status, will retry in 1 minute") + r.setNextReconcileTime(clusterSummaryScope, time.Minute) + result = ctrl.Result{RequeueAfter: time.Minute} return } reterr = err @@ -295,6 +298,11 @@ func (r *ClusterSummaryReconciler) updateDeletedInstancs(clusterSummaryScope *sc Namespace: clusterSummaryScope.Namespace(), Name: clusterSummaryScope.Name(), }] = time.Now() + + delete(r.NextReconcileTimes, types.NamespacedName{ + Namespace: clusterSummaryScope.Namespace(), + Name: clusterSummaryScope.Name(), + }) } func (r *ClusterSummaryReconciler) reconcileDelete( diff --git a/controllers/clustersummary_deployer.go b/controllers/clustersummary_deployer.go index ed5b1c5f..3f28b392 100644 --- a/controllers/clustersummary_deployer.go +++ b/controllers/clustersummary_deployer.go @@ -98,7 +98,8 @@ func (r *ClusterSummaryReconciler) deployFeature(ctx context.Context, clusterSum // Get hash of current configuration (at this very precise moment) currentHash, err := f.currentHash(ctx, r.Client, clusterSummary, logger) if err != nil { - if !apierrors.IsNotFound(err) { + var nrErr *configv1beta1.NonRetriableError + if !errors.As(err, &nrErr) && 
!apierrors.IsNotFound(err) { return err } } diff --git a/controllers/controllers_suite_test.go b/controllers/controllers_suite_test.go index 52491a96..dc5c101d 100644 --- a/controllers/controllers_suite_test.go +++ b/controllers/controllers_suite_test.go @@ -171,11 +171,11 @@ var _ = AfterSuite(func() { func getClusterSummaryReconciler(c client.Client, dep deployer.DeployerInterface) *controllers.ClusterSummaryReconciler { return &controllers.ClusterSummaryReconciler{ - Client: c, - Scheme: scheme, - Deployer: dep, - ClusterMap: make(map[corev1.ObjectReference]*libsveltosset.Set), - ReferenceMap: make(map[corev1.ObjectReference]*libsveltosset.Set), + Client: c, + Scheme: scheme, + Deployer: dep, + ClusterMap: make(map[corev1.ObjectReference]*libsveltosset.Set), + ReferenceMap: make(map[corev1.ObjectReference]*libsveltosset.Set), DeletedInstances: make(map[types.NamespacedName]time.Time), NextReconcileTimes: make(map[types.NamespacedName]time.Time), PolicyMux: sync.Mutex{}, diff --git a/controllers/handlers_kustomize.go b/controllers/handlers_kustomize.go index 1e72dd44..6272ffa2 100644 --- a/controllers/handlers_kustomize.go +++ b/controllers/handlers_kustomize.go @@ -367,6 +367,7 @@ func kustomizationHash(ctx context.Context, c client.Client, clusterSummary *con continue } config += string(result) + config += fmt.Sprintf("%d", kustomizationRef.Tier) valueFromHash, err := getKustomizeReferenceResourceHash(ctx, c, clusterSummary, kustomizationRef, logger) diff --git a/controllers/handlers_resources.go b/controllers/handlers_resources.go index 174cad86..bb685d90 100644 --- a/controllers/handlers_resources.go +++ b/controllers/handlers_resources.go @@ -533,34 +533,7 @@ func resourcesHash(ctx context.Context, c client.Client, clusterSummary *configv hash := sha256.Sum256(raw) config += hex.EncodeToString((hash[:])) - referencedObjects := make([]corev1.ObjectReference, 0, len(clusterSummary.Spec.ClusterProfileSpec.PolicyRefs)) - for i := range sortedPolicyRefs { - 
reference := &sortedPolicyRefs[i] - namespace, err := libsveltostemplate.GetReferenceResourceNamespace(ctx, c, - clusterSummary.Spec.ClusterNamespace, clusterSummary.Spec.ClusterName, reference.Namespace, - clusterSummary.Spec.ClusterType) - if err != nil { - logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to instantiate namespace for %s %s/%s: %v", - reference.Kind, reference.Namespace, reference.Name, err)) - // Ignore template instantiation error - continue - } - - name, err := libsveltostemplate.GetReferenceResourceName(ctx, c, clusterSummary.Spec.ClusterNamespace, - clusterSummary.Spec.ClusterName, reference.Name, clusterSummary.Spec.ClusterType) - if err != nil { - logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to instantiate name for %s %s/%s: %v", - reference.Kind, reference.Namespace, reference.Name, err)) - // Ignore template instantiation error - continue - } - - referencedObjects = append(referencedObjects, corev1.ObjectReference{ - Kind: sortedPolicyRefs[i].Kind, - Namespace: namespace, - Name: name, - }) - } + referencedObjects, referencedObjectTiers := getInstantiatedPolicyRefInfo(ctx, c, clusterSummary, sortedPolicyRefs, logger) sort.Sort(dependencymanager.SortedCorev1ObjectReference(referencedObjects)) @@ -573,12 +546,14 @@ func resourcesHash(ctx context.Context, c client.Client, clusterSummary *configv err = c.Get(ctx, types.NamespacedName{Namespace: reference.Namespace, Name: reference.Name}, configmap) if err == nil { config += getConfigMapHash(configmap) + config += fmt.Sprintf("%d", referencedObjectTiers[*reference]) } } else if reference.Kind == string(libsveltosv1beta1.SecretReferencedResourceKind) { secret := &corev1.Secret{} err = c.Get(ctx, types.NamespacedName{Namespace: reference.Namespace, Name: reference.Name}, secret) if err == nil { config += getSecretHash(secret) + config += fmt.Sprintf("%d", referencedObjectTiers[*reference]) } } else { var source client.Object @@ -591,6 +566,7 @@ func resourcesHash(ctx context.Context, c 
client.Client, clusterSummary *configv if source.GetAnnotations() != nil { config += getDataSectionHash(source.GetAnnotations()) } + config += fmt.Sprintf("%d", referencedObjectTiers[*reference]) } } if err != nil { @@ -623,6 +599,45 @@ func resourcesHash(ctx context.Context, c client.Client, clusterSummary *configv return h.Sum(nil), nil } +func getInstantiatedPolicyRefInfo(ctx context.Context, c client.Client, clusterSummary *configv1beta1.ClusterSummary, + sortedPolicyRefs []configv1beta1.PolicyRef, logger logr.Logger, +) (referencedObjects []corev1.ObjectReference, referencedObjectTiers map[corev1.ObjectReference]int32) { + + referencedObjects = make([]corev1.ObjectReference, 0, len(clusterSummary.Spec.ClusterProfileSpec.PolicyRefs)) + referencedObjectTiers = make(map[corev1.ObjectReference]int32, len(clusterSummary.Spec.ClusterProfileSpec.PolicyRefs)) + for i := range sortedPolicyRefs { + reference := &sortedPolicyRefs[i] + namespace, err := libsveltostemplate.GetReferenceResourceNamespace(ctx, c, + clusterSummary.Spec.ClusterNamespace, clusterSummary.Spec.ClusterName, reference.Namespace, + clusterSummary.Spec.ClusterType) + if err != nil { + logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to instantiate namespace for %s %s/%s: %v", + reference.Kind, reference.Namespace, reference.Name, err)) + // Ignore template instantiation error + continue + } + + name, err := libsveltostemplate.GetReferenceResourceName(ctx, c, clusterSummary.Spec.ClusterNamespace, + clusterSummary.Spec.ClusterName, reference.Name, clusterSummary.Spec.ClusterType) + if err != nil { + logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to instantiate name for %s %s/%s: %v", + reference.Kind, reference.Namespace, reference.Name, err)) + // Ignore template instantiation error + continue + } + + obj := corev1.ObjectReference{ + Kind: sortedPolicyRefs[i].Kind, + Namespace: namespace, + Name: name, + } + referencedObjects = append(referencedObjects, obj) + referencedObjectTiers[obj] = 
reference.Tier + } + + return referencedObjects, referencedObjectTiers +} + func getResourceRefs(clusterSummary *configv1beta1.ClusterSummary) []configv1beta1.PolicyRef { return clusterSummary.Spec.ClusterProfileSpec.PolicyRefs } diff --git a/controllers/profile_utils.go b/controllers/profile_utils.go index bd1fb0af..e23aae80 100644 --- a/controllers/profile_utils.go +++ b/controllers/profile_utils.go @@ -398,7 +398,7 @@ func cleanClusterConfigurations(ctx context.Context, c client.Client, profileSco } err = cleanClusterConfiguration(ctx, c, profileScope.Profile, cc) - if err != nil { + if err != nil && !apierrors.IsNotFound(err) { return err } } diff --git a/controllers/utils.go b/controllers/utils.go index 5cfb2b58..2d147182 100644 --- a/controllers/utils.go +++ b/controllers/utils.go @@ -236,7 +236,7 @@ func isNamespaced(ctx context.Context, r *unstructured.Unstructured, clusterName // 2. RETRY LOOP: Give it 3 attempts with increasing wait times // Total wait time: 1s + 2s + 3s = 6 seconds. 
for i := range 3 { - // Log that we are attempting a refresh (MGIANLUC style) + // Log that we are attempting a refresh logger.V(logs.LogInfo).Info(fmt.Sprintf("GVK %s not found, refreshing discovery (attempt %d)", gvk.String(), i+1)) // IMPORTANT: Invalidate the Discovery Client FIRST, then Reset the Mapper diff --git a/test/fv/second_tier_test.go b/test/fv/second_tier_test.go index 99d31980..ecf356f1 100644 --- a/test/fv/second_tier_test.go +++ b/test/fv/second_tier_test.go @@ -135,7 +135,7 @@ var _ = Describe("PolicyRef Tier", func() { currentServiceAccount) }, timeout, pollingInterval).Should(BeNil()) - Byf("Verifying ServicdAccount has proper labels") + Byf("Verifying ServiceAccount has proper labels") currentServiceAccount := &corev1.ServiceAccount{} Expect(workloadClient.Get(context.TODO(), types.NamespacedName{Namespace: saNamespace, Name: saName}, @@ -144,6 +144,8 @@ var _ = Describe("PolicyRef Tier", func() { v, ok := currentServiceAccount.Labels[firstConfigMapLabelKey] Expect(ok).To(BeTrue()) Expect(v).To(Equal(firstConfigMapLabelValue)) + v, ok = currentServiceAccount.Labels[secondConfigMapLabelKey] + Expect(ok).To(BeFalse()) Byf("Verifying ClusterSummary %s status reports conflict for Resources feature", clusterSummary.Name) Eventually(func() bool { @@ -165,7 +167,7 @@ var _ = Describe("PolicyRef Tier", func() { return false }, timeout, pollingInterval).Should(BeTrue()) - By("Updating second ConfigMap tier") + By(fmt.Sprintf("Updating ConfigMap %s/%s tier", secondConfigMap.Namespace, secondConfigMap.Name)) const lowerTier = 90 err = retry.RetryOnConflict(retry.DefaultRetry, func() error { Expect(k8sClient.Get(context.TODO(), @@ -177,18 +179,22 @@ var _ = Describe("PolicyRef Tier", func() { Name: firstConfigMap.Name, }, { - Tier: lowerTier, Kind: string(libsveltosv1beta1.ConfigMapReferencedResourceKind), Namespace: secondConfigMap.Namespace, Name: secondConfigMap.Name, + Tier: lowerTier, }, } return k8sClient.Update(context.TODO(), 
currentClusterProfile) }) Expect(err).To(BeNil()) - Byf("Verifying ClusterSummary %s status is set to Deployed for Resources feature", clusterSummary.Name) - verifyFeatureStatusIsProvisioned(kindWorkloadCluster.GetNamespace(), clusterSummary.Name, libsveltosv1beta1.FeatureResources) + Expect(k8sClient.Get(context.TODO(), + types.NamespacedName{Name: clusterProfile.Name}, currentClusterProfile)).To(Succeed()) + + clusterSummary = verifyClusterSummary(clusterops.ClusterProfileLabelName, + currentClusterProfile.Name, ¤tClusterProfile.Spec, + kindWorkloadCluster.GetNamespace(), kindWorkloadCluster.GetName(), getClusterType()) Byf("Verifying proper ServiceAccount is still present in the workload cluster with correct labels") Eventually(func() bool { @@ -203,12 +209,63 @@ var _ = Describe("PolicyRef Tier", func() { if currentServiceAccount.Labels == nil { return false } + _, ok = currentServiceAccount.Labels[firstConfigMapLabelKey] + if ok { + return false + } v, ok = currentServiceAccount.Labels[secondConfigMapLabelKey] return ok && v == secondConfigMapLabelValue }, timeout, pollingInterval).Should(BeTrue()) + By("Changing first ConfigMap so there is no conflict anymore") + newSaNamespace := randomString() + firstConfigMap = createConfigMapWithPolicy(configMapNs, namePrefix+randomString(), + fmt.Sprintf(resource, newSaNamespace, saName, firstConfigMapLabelKey, firstConfigMapLabelValue)) + Expect(k8sClient.Create(context.TODO(), firstConfigMap)).To(Succeed()) + + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + Expect(k8sClient.Get(context.TODO(), + types.NamespacedName{Name: clusterProfile.Name}, currentClusterProfile)).To(Succeed()) + currentClusterProfile.Spec.PolicyRefs = []configv1beta1.PolicyRef{ + { + Kind: string(libsveltosv1beta1.ConfigMapReferencedResourceKind), + Namespace: firstConfigMap.Namespace, + Name: firstConfigMap.Name, + }, + { + Kind: string(libsveltosv1beta1.ConfigMapReferencedResourceKind), + Namespace: secondConfigMap.Namespace, + 
Name: secondConfigMap.Name, + Tier: lowerTier, + }, + } + return k8sClient.Update(context.TODO(), currentClusterProfile) + }) + Expect(err).To(BeNil()) + + Byf("Verifying new ServiceAccount is present in the workload cluster with correct labels") + Eventually(func() bool { + currentServiceAccount := &corev1.ServiceAccount{} + err = workloadClient.Get(context.TODO(), + types.NamespacedName{Namespace: newSaNamespace, Name: saName}, + currentServiceAccount) + if err != nil { + return false + } + + if currentServiceAccount.Labels == nil { + return false + } + v, ok = currentServiceAccount.Labels[firstConfigMapLabelKey] + return ok && v == firstConfigMapLabelValue + }, timeout, pollingInterval).Should(BeTrue()) + + Byf("Verifying ClusterSummary %s status is set to Deployed for Resources feature", clusterSummary.Name) + verifyFeatureStatusIsProvisioned(clusterSummary.Namespace, clusterSummary.Name, libsveltosv1beta1.FeatureResources) + policies := []policy{ {kind: "ServiceAccount", name: saName, namespace: saNamespace, group: ""}, + {kind: "ServiceAccount", name: saName, namespace: newSaNamespace, group: ""}, } verifyClusterConfiguration(configv1beta1.ClusterProfileKind, clusterProfile.Name, clusterSummary.Spec.ClusterNamespace, clusterSummary.Spec.ClusterName, libsveltosv1beta1.FeatureResources, @@ -228,5 +285,14 @@ var _ = Describe("PolicyRef Tier", func() { currentServiceAccount) return err != nil && apierrors.IsNotFound(err) }, timeout, pollingInterval).Should(BeTrue()) + + Byf("Verifying second ServiceAccount is removed from the workload cluster") + Eventually(func() bool { + currentServiceAccount := &corev1.ServiceAccount{} + err = workloadClient.Get(context.TODO(), + types.NamespacedName{Namespace: newSaNamespace, Name: saName}, + currentServiceAccount) + return err != nil && apierrors.IsNotFound(err) + }, timeout, pollingInterval).Should(BeTrue()) }) }) diff --git a/test/fv/utils_test.go b/test/fv/utils_test.go index 77674b37..6e9cb4dd 100644 --- 
a/test/fv/utils_test.go +++ b/test/fv/utils_test.go @@ -165,11 +165,12 @@ func verifyFeatureStatusIsProvisioned(clusterSummaryNamespace, clusterSummaryNam return false } for i := range currentClusterSummary.Status.FeatureSummaries { - if currentClusterSummary.Status.FeatureSummaries[i].FeatureID == featureID && - currentClusterSummary.Status.FeatureSummaries[i].Status == libsveltosv1beta1.FeatureStatusProvisioned && - currentClusterSummary.Status.FeatureSummaries[i].FailureMessage == nil { + if currentClusterSummary.Status.FeatureSummaries[i].FeatureID == featureID { + if currentClusterSummary.Status.FeatureSummaries[i].Status == libsveltosv1beta1.FeatureStatusProvisioned && + currentClusterSummary.Status.FeatureSummaries[i].FailureMessage == nil { - return true + return true + } } } return false