diff --git a/api/external/nova/messages.go b/api/external/nova/messages.go index 5b4736ef7..452046b5e 100644 --- a/api/external/nova/messages.go +++ b/api/external/nova/messages.go @@ -9,6 +9,7 @@ import ( "log/slog" "strings" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" ) @@ -129,23 +130,21 @@ func (req ExternalSchedulerRequest) GetHypervisorType() (HypervisorType, error) return "", errors.New("hypervisor type not specified in flavor extra specs") } -type RequestIntent string - const ( // LiveMigrationIntent indicates that the request is intended for live migration. - LiveMigrationIntent RequestIntent = "live_migration" + LiveMigrationIntent v1alpha1.SchedulingIntent = "live_migration" // RebuildIntent indicates that the request is intended for rebuilding a VM. - RebuildIntent RequestIntent = "rebuild" + RebuildIntent v1alpha1.SchedulingIntent = "rebuild" // ResizeIntent indicates that the request is intended for resizing a VM. - ResizeIntent RequestIntent = "resize" + ResizeIntent v1alpha1.SchedulingIntent = "resize" // EvacuateIntent indicates that the request is intended for evacuating a VM. - EvacuateIntent RequestIntent = "evacuate" + EvacuateIntent v1alpha1.SchedulingIntent = "evacuate" // CreateIntent indicates that the request is intended for creating a new VM. - CreateIntent RequestIntent = "create" + CreateIntent v1alpha1.SchedulingIntent = "create" ) // GetIntent analyzes the request spec and determines the intent of the scheduling request. -func (req ExternalSchedulerRequest) GetIntent() (RequestIntent, error) { +func (req ExternalSchedulerRequest) GetIntent() (v1alpha1.SchedulingIntent, error) { str, err := req.Spec.Data.GetSchedulerHintStr("_nova_check_type") if err != nil { return "", err diff --git a/api/external/nova/messages_test.go b/api/external/nova/messages_test.go index 1802c5a13..13040da11 100644 --- a/api/external/nova/messages_test.go +++ b/api/external/nova/messages_test.go @@ -6,13 +6,15 @@ package api import ( "encoding/json" "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" ) func TestGetIntent(t *testing.T) { tests := []struct { name string schedulerHints map[string]any - expectedIntent RequestIntent + expectedIntent v1alpha1.SchedulingIntent expectError bool }{ { diff --git a/api/v1alpha1/history_types.go b/api/v1alpha1/history_types.go new file mode 100644 index 000000000..237d30aa4 --- /dev/null +++ b/api/v1alpha1/history_types.go @@ -0,0 +1,116 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// SchedulingIntent defines the intent of a scheduling decision. +type SchedulingIntent string + +// Other intents can be defined by the operators. +const ( + // Used as default intent if the operator does not specify one. + SchedulingIntentUnknown SchedulingIntent = "Unknown" +) + +type SchedulingHistoryEntry struct { + // The timestamp of when the decision was made. + Timestamp metav1.Time `json:"timestamp"` + // The pipeline that was used for the decision. + PipelineRef corev1.ObjectReference `json:"pipelineRef"` + // The intent of the decision (e.g., initial scheduling, rescheduling, etc.). + Intent SchedulingIntent `json:"intent"` + // The top hosts ordered by score for the decision (limited to 3). + // This is not a complete list of all candidates — only the highest-ranked + // hosts are retained to keep the history compact. + // +kubebuilder:validation:Optional + OrderedHosts []string `json:"orderedHosts"` + // Whether the scheduling decision was successful. + // +kubebuilder:validation:Optional + Successful bool `json:"successful"` +} + +type HistorySpec struct { + // The scheduling domain this object with the history belongs to. + SchedulingDomain SchedulingDomain `json:"schedulingDomain"` + // The resource ID this history belongs to (e.g., the UUID of a nova instance). + ResourceID string `json:"resourceID"` +} + +// CurrentDecision holds the full context of the most recent scheduling +// decision. When a new decision arrives the previous CurrentDecision is +// compacted into a SchedulingHistoryEntry and appended to History. +type CurrentDecision struct { + // The timestamp of when the decision was made. + Timestamp metav1.Time `json:"timestamp"` + // The pipeline that was used for the decision. + PipelineRef corev1.ObjectReference `json:"pipelineRef"` + // The intent of the decision (e.g., initial scheduling, rescheduling, etc.). + Intent SchedulingIntent `json:"intent"` + // Whether the scheduling decision was successful. + Successful bool `json:"successful"` + // The target host selected for the resource. nil when no host was found. + // +kubebuilder:validation:Optional + TargetHost *string `json:"targetHost,omitempty"` + // A human-readable explanation of the scheduling decision. + // +kubebuilder:validation:Optional + Explanation string `json:"explanation,omitempty"` + // The top hosts ordered by score (limited to 3). + // +kubebuilder:validation:Optional + OrderedHosts []string `json:"orderedHosts,omitempty"` +} + +type HistoryStatus struct { + // Current represents the latest scheduling decision with full context. + // +kubebuilder:validation:Optional + Current CurrentDecision `json:"current,omitempty"` + // History of past scheduling decisions (limited to last 10). + // +kubebuilder:validation:Optional + History []SchedulingHistoryEntry `json:"history,omitempty"` + + // Conditions represent the latest available observations of the history's state. + // +kubebuilder:validation:Optional + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster +// +kubebuilder:printcolumn:name="Domain",type="string",JSONPath=".spec.schedulingDomain" +// +kubebuilder:printcolumn:name="Resource ID",type="string",JSONPath=".spec.resourceID" +// +kubebuilder:printcolumn:name="Target Host",type="string",JSONPath=".status.current.targetHost" +// +kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].reason" +// +kubebuilder:printcolumn:name="Created",type="date",JSONPath=".metadata.creationTimestamp" + +// History is the Schema for the history API +type History struct { + metav1.TypeMeta `json:",inline"` + + // Standard object metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Spec defines the desired state of History. + // +required + Spec HistorySpec `json:"spec"` + // Status defines the observed state of History. + // +optional + Status HistoryStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// HistoryList contains a list of History +type HistoryList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []History `json:"items"` +} + +func init() { + SchemeBuilder.Register(&History{}, &HistoryList{}) +} diff --git a/api/v1alpha1/pipeline_types.go b/api/v1alpha1/pipeline_types.go index 47c069184..1b86501c8 100644 --- a/api/v1alpha1/pipeline_types.go +++ b/api/v1alpha1/pipeline_types.go @@ -80,6 +80,9 @@ type PipelineSpec struct { // If this pipeline should create decision objects. // When this is false, the pipeline will still process requests. + // NOTE: This flag is intentionally kept as "createDecisions" to avoid + // breaking changes. It will be renamed when the deprecated Decision CRD + // is fully replaced in a future refactoring. // +kubebuilder:default=false CreateDecisions bool `json:"createDecisions,omitempty"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 564f30cac..39efe3080 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -96,6 +96,33 @@ func (in *CommittedResourceReservationStatus) DeepCopy() *CommittedResourceReser return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CurrentDecision) DeepCopyInto(out *CurrentDecision) { + *out = *in + in.Timestamp.DeepCopyInto(&out.Timestamp) + out.PipelineRef = in.PipelineRef + if in.TargetHost != nil { + in, out := &in.TargetHost, &out.TargetHost + *out = new(string) + **out = **in + } + if in.OrderedHosts != nil { + in, out := &in.OrderedHosts, &out.OrderedHosts + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CurrentDecision. +func (in *CurrentDecision) DeepCopy() *CurrentDecision { + if in == nil { + return nil + } + out := new(CurrentDecision) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Datasource) DeepCopyInto(out *Datasource) { *out = *in @@ -573,6 +600,110 @@ func (in *FilterSpec) DeepCopy() *FilterSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *History) DeepCopyInto(out *History) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new History. +func (in *History) DeepCopy() *History { + if in == nil { + return nil + } + out := new(History) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *History) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *HistoryList) DeepCopyInto(out *HistoryList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]History, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HistoryList. +func (in *HistoryList) DeepCopy() *HistoryList { + if in == nil { + return nil + } + out := new(HistoryList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *HistoryList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *HistorySpec) DeepCopyInto(out *HistorySpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HistorySpec. +func (in *HistorySpec) DeepCopy() *HistorySpec { + if in == nil { + return nil + } + out := new(HistorySpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *HistoryStatus) DeepCopyInto(out *HistoryStatus) { + *out = *in + in.Current.DeepCopyInto(&out.Current) + if in.History != nil { + in, out := &in.History, &out.History + *out = make([]SchedulingHistoryEntry, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HistoryStatus. +func (in *HistoryStatus) DeepCopy() *HistoryStatus { + if in == nil { + return nil + } + out := new(HistoryStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *IdentityDatasource) DeepCopyInto(out *IdentityDatasource) { *out = *in @@ -1285,6 +1416,28 @@ func (in *ReservationStatus) DeepCopy() *ReservationStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SchedulingHistoryEntry) DeepCopyInto(out *SchedulingHistoryEntry) { + *out = *in + in.Timestamp.DeepCopyInto(&out.Timestamp) + out.PipelineRef = in.PipelineRef + if in.OrderedHosts != nil { + in, out := &in.OrderedHosts, &out.OrderedHosts + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingHistoryEntry. +func (in *SchedulingHistoryEntry) DeepCopy() *SchedulingHistoryEntry { + if in == nil { + return nil + } + out := new(SchedulingHistoryEntry) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *StepResult) DeepCopyInto(out *StepResult) { *out = *in diff --git a/cmd/main.go b/cmd/main.go index 43ae63f9c..b6db924e8 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -41,7 +41,6 @@ import ( "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor" "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis" "github.com/cobaltcore-dev/cortex/internal/scheduling/cinder" - "github.com/cobaltcore-dev/cortex/internal/scheduling/explanation" schedulinglib "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" "github.com/cobaltcore-dev/cortex/internal/scheduling/machines" "github.com/cobaltcore-dev/cortex/internal/scheduling/manila" @@ -443,19 +442,6 @@ func main() { os.Exit(1) } } - if slices.Contains(mainConfig.EnabledControllers, "explanation-controller") { - // Setup a controller which will reconcile the history and explanation for - // decision resources. - explanationControllerConfig := conf.GetConfigOrDie[explanation.ControllerConfig]() - explanationController := &explanation.Controller{ - Client: multiclusterClient, - Config: explanationControllerConfig, - } - if err := explanationController.SetupWithManager(mgr, multiclusterClient); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "ExplanationController") - os.Exit(1) - } - } if slices.Contains(mainConfig.EnabledControllers, "reservations-controller") { monitor := reservationscontroller.NewControllerMonitor(multiclusterClient) metrics.Registry.MustRegister(&monitor) diff --git a/helm/bundles/cortex-cinder/values.yaml b/helm/bundles/cortex-cinder/values.yaml index b01656205..59472f368 100644 --- a/helm/bundles/cortex-cinder/values.yaml +++ b/helm/bundles/cortex-cinder/values.yaml @@ -97,7 +97,6 @@ cortex-scheduling-controllers: component: cinder-scheduling enabledControllers: - cinder-decisions-pipeline-controller - - explanation-controller enabledTasks: - cinder-decisions-cleanup-task diff --git a/helm/bundles/cortex-ironcore/values.yaml b/helm/bundles/cortex-ironcore/values.yaml index 82e490585..6f53ac20c 100644 --- a/helm/bundles/cortex-ironcore/values.yaml +++ b/helm/bundles/cortex-ironcore/values.yaml @@ -32,7 +32,6 @@ cortex: schedulingDomain: machines enabledControllers: - ironcore-decisions-pipeline-controller - - explanation-controller monitoring: labels: github_org: cobaltcore-dev diff --git a/helm/bundles/cortex-manila/values.yaml b/helm/bundles/cortex-manila/values.yaml index 50d16352e..326a8e61b 100644 --- a/helm/bundles/cortex-manila/values.yaml +++ b/helm/bundles/cortex-manila/values.yaml @@ -97,7 +97,6 @@ cortex-scheduling-controllers: component: manila-scheduling enabledControllers: - manila-decisions-pipeline-controller - - explanation-controller enabledTasks: - manila-decisions-cleanup-task diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 200ba3ff3..a88d8d195 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -113,7 +113,6 @@ cortex-scheduling-controllers: enabledControllers: - nova-pipeline-controllers - nova-deschedulings-executor - - explanation-controller - reservations-controller enabledTasks: - nova-decisions-cleanup-task diff --git a/helm/bundles/cortex-pods/values.yaml b/helm/bundles/cortex-pods/values.yaml index 4c381f539..6ae8e8d6c 100644 --- a/helm/bundles/cortex-pods/values.yaml +++ b/helm/bundles/cortex-pods/values.yaml @@ -32,7 +32,6 @@ cortex: schedulingDomain: pods enabledControllers: - pods-decisions-pipeline-controller - - explanation-controller monitoring: labels: github_org: cobaltcore-dev diff --git a/helm/library/cortex/files/crds/cortex.cloud_histories.yaml b/helm/library/cortex/files/crds/cortex.cloud_histories.yaml new file mode 100644 index 000000000..19baa97e7 --- /dev/null +++ b/helm/library/cortex/files/crds/cortex.cloud_histories.yaml @@ -0,0 +1,287 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.0 + name: histories.cortex.cloud +spec: + group: cortex.cloud + names: + kind: History + listKind: HistoryList + plural: histories + singular: history + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.schedulingDomain + name: Domain + type: string + - jsonPath: .spec.resourceID + name: Resource ID + type: string + - jsonPath: .status.current.targetHost + name: Target Host + type: string + - jsonPath: .status.conditions[?(@.type=='Ready')].reason + name: Status + type: string + - jsonPath: .metadata.creationTimestamp + name: Created + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: History is the Schema for the history API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the desired state of History. + properties: + resourceID: + description: The resource ID this history belongs to (e.g., the UUID + of a nova instance). + type: string + schedulingDomain: + description: The scheduling domain this object with the history belongs + to. + type: string + required: + - resourceID + - schedulingDomain + type: object + status: + description: Status defines the observed state of History. + properties: + conditions: + description: Conditions represent the latest available observations + of the history's state. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + current: + description: Current represents the latest scheduling decision with + full context. + properties: + explanation: + description: A human-readable explanation of the scheduling decision. + type: string + intent: + description: The intent of the decision (e.g., initial scheduling, + rescheduling, etc.). + type: string + orderedHosts: + description: The top hosts ordered by score (limited to 3). + items: + type: string + type: array + pipelineRef: + description: The pipeline that was used for the decision. + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: |- + If referring to a piece of an object instead of an entire object, this string + should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container within a pod, this would take on a value like: + "spec.containers{name}" (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" (container with + index 2 in this pod). This syntax is chosen only to have some well-defined way of + referencing a part of an object. + type: string + kind: + description: |- + Kind of the referent. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + name: + description: |- + Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + namespace: + description: |- + Namespace of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ + type: string + resourceVersion: + description: |- + Specific resourceVersion to which this reference is made, if any. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency + type: string + uid: + description: |- + UID of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids + type: string + type: object + x-kubernetes-map-type: atomic + successful: + description: Whether the scheduling decision was successful. + type: boolean + targetHost: + description: The target host selected for the resource. nil when + no host was found. + type: string + timestamp: + description: The timestamp of when the decision was made. + format: date-time + type: string + required: + - intent + - pipelineRef + - successful + - timestamp + type: object + history: + description: History of past scheduling decisions (limited to last + 10). + items: + properties: + intent: + description: The intent of the decision (e.g., initial scheduling, + rescheduling, etc.). + type: string + orderedHosts: + description: |- + The top hosts ordered by score for the decision (limited to 3). + This is not a complete list of all candidates — only the highest-ranked + hosts are retained to keep the history compact. + items: + type: string + type: array + pipelineRef: + description: The pipeline that was used for the decision. + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: |- + If referring to a piece of an object instead of an entire object, this string + should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container within a pod, this would take on a value like: + "spec.containers{name}" (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" (container with + index 2 in this pod). This syntax is chosen only to have some well-defined way of + referencing a part of an object. + type: string + kind: + description: |- + Kind of the referent. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + name: + description: |- + Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + namespace: + description: |- + Namespace of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ + type: string + resourceVersion: + description: |- + Specific resourceVersion to which this reference is made, if any. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency + type: string + uid: + description: |- + UID of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids + type: string + type: object + x-kubernetes-map-type: atomic + successful: + description: Whether the scheduling decision was successful. + type: boolean + timestamp: + description: The timestamp of when the decision was made. + format: date-time + type: string + required: + - intent + - pipelineRef + - timestamp + type: object + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/helm/library/cortex/files/crds/cortex.cloud_pipelines.yaml b/helm/library/cortex/files/crds/cortex.cloud_pipelines.yaml index 60b84a75d..9fadae42a 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_pipelines.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_pipelines.yaml @@ -63,6 +63,9 @@ spec: description: |- If this pipeline should create decision objects. When this is false, the pipeline will still process requests. + NOTE: This flag is intentionally kept as "createDecisions" to avoid + breaking changes. It will be renamed when the deprecated Decision CRD + is fully replaced in a future refactoring. type: boolean description: description: An optional description of the pipeline, helping understand diff --git a/helm/library/cortex/templates/rbac/role.yaml b/helm/library/cortex/templates/rbac/role.yaml index c01ab9062..920604f2b 100644 --- a/helm/library/cortex/templates/rbac/role.yaml +++ b/helm/library/cortex/templates/rbac/role.yaml @@ -17,6 +17,7 @@ rules: - deschedulings - pipelines - kpis + - histories verbs: - create - delete @@ -35,6 +36,7 @@ rules: - deschedulings/finalizers - pipelines/finalizers - kpis/finalizers + - histories/finalizers verbs: - update - apiGroups: @@ -47,8 +49,23 @@ rules: - deschedulings/status - pipelines/status - kpis/status + - histories/status verbs: - get - patch - update +- apiGroups: + - events.k8s.io + resources: + - events + verbs: + - create + - patch +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch {{- end -}} diff --git a/internal/scheduling/cinder/decisions_cleanup.go b/internal/scheduling/cinder/decisions_cleanup.go index ffe6119dc..405f47048 100644 --- a/internal/scheduling/cinder/decisions_cleanup.go +++ b/internal/scheduling/cinder/decisions_cleanup.go @@ -107,22 +107,22 @@ func DecisionsCleanup(ctx context.Context, client client.Client, conf DecisionsC } // List all decisions and delete those whose volume no longer exists. - decisionList := &v1alpha1.DecisionList{} - if err := client.List(ctx, decisionList); err != nil { + historyList := &v1alpha1.HistoryList{} + if err := client.List(ctx, historyList); err != nil { return err } - for _, decision := range decisionList.Items { + for _, history := range historyList.Items { // Skip non-cinder decisions. - if decision.Spec.SchedulingDomain != v1alpha1.SchedulingDomainCinder { + if history.Spec.SchedulingDomain != v1alpha1.SchedulingDomainCinder { continue } // Skip decisions for which the volume still exists. - if _, ok := volumesByID[decision.Spec.ResourceID]; ok { + if _, ok := volumesByID[history.Spec.ResourceID]; ok { continue } // Delete the decision since the volume no longer exists. - slog.Info("deleting decision for deleted volume", "decision", decision.Name, "volumeID", decision.Spec.ResourceID) - if err := client.Delete(ctx, &decision); err != nil { + slog.Info("deleting decision for deleted volume", "decision", history.Name, "volumeID", history.Spec.ResourceID) + if err := client.Delete(ctx, &history); err != nil { return err } } diff --git a/internal/scheduling/cinder/decisions_cleanup_test.go b/internal/scheduling/cinder/decisions_cleanup_test.go index 570320ae5..edf8d1a81 100644 --- a/internal/scheduling/cinder/decisions_cleanup_test.go +++ b/internal/scheduling/cinder/decisions_cleanup_test.go @@ -32,7 +32,7 @@ func TestCleanupCinder(t *testing.T) { tests := []struct { name string - decisions []v1alpha1.Decision + histories []v1alpha1.History expectError bool authError bool endpointError bool @@ -43,36 +43,36 @@ func TestCleanupCinder(t *testing.T) { }{ { name: "handle authentication error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, authError: true, expectError: true, }, { name: "handle endpoint error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, endpointError: true, expectError: true, }, { name: "handle server error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, mockServerError: true, expectError: true, }, { name: "handle empty volumes case", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, emptyVolumesError: true, expectError: false, }, { name: "delete decisions for non-existent volumes", - decisions: []v1alpha1.Decision{ + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ Name: "decision-existing-volume", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainCinder, ResourceID: "volume-exists", }, @@ -81,7 +81,7 @@ func TestCleanupCinder(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "decision-deleted-volume", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainCinder, ResourceID: "volume-deleted", }, @@ -95,12 +95,12 @@ func TestCleanupCinder(t *testing.T) { }, { name: "keep decisions for existing volumes", - decisions: []v1alpha1.Decision{ + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ Name: "decision-volume-1", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainCinder, ResourceID: "volume-1", }, @@ -109,7 +109,7 @@ func TestCleanupCinder(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "decision-volume-2", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainCinder, ResourceID: "volume-2", }, @@ -126,9 +126,9 @@ func TestCleanupCinder(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - objects := make([]client.Object, len(tt.decisions)) - for i := range tt.decisions { - objects[i] = &tt.decisions[i] + objects := make([]client.Object, len(tt.histories)) + for i := range tt.histories { + objects[i] = &tt.histories[i] } // Create mock Cinder server first @@ -308,21 +308,21 @@ func TestCleanupCinder(t *testing.T) { } // Verify other decisions still exist - for _, originalDecision := range tt.decisions { + for _, originalHistory := range tt.histories { shouldBeDeleted := false for _, expectedDeleted := range tt.expectedDeleted { - if originalDecision.Name == expectedDeleted { + if originalHistory.Name == expectedDeleted { shouldBeDeleted = true break } } if !shouldBeDeleted { - var decision v1alpha1.Decision + var history v1alpha1.History err := client.Get(context.Background(), - types.NamespacedName{Name: originalDecision.Name}, &decision) + types.NamespacedName{Name: originalHistory.Name}, &history) if err != nil { - t.Errorf("Expected decision %s to still exist but got error: %v", - originalDecision.Name, err) + t.Errorf("Expected history %s to still exist but got error: %v", + originalHistory.Name, err) } } } diff --git a/internal/scheduling/cinder/filter_weigher_pipeline_controller.go b/internal/scheduling/cinder/filter_weigher_pipeline_controller.go index 0d8771081..da8297568 100644 --- a/internal/scheduling/cinder/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/cinder/filter_weigher_pipeline_controller.go @@ -79,12 +79,6 @@ func (c *FilterWeigherPipelineController) ProcessNewDecisionFromAPI(ctx context. if !ok { return fmt.Errorf("pipeline %s not configured", decision.Spec.PipelineRef.Name) } - if pipelineConf.Spec.CreateDecisions { - if err := c.Create(ctx, decision); err != nil { - return err - } - } - old := decision.DeepCopy() err := c.process(ctx, decision) if err != nil { meta.SetStatusCondition(&decision.Status.Conditions, metav1.Condition{ @@ -102,10 +96,11 @@ func (c *FilterWeigherPipelineController) ProcessNewDecisionFromAPI(ctx context. }) } if pipelineConf.Spec.CreateDecisions { - patch := client.MergeFrom(old) - if err := c.Status().Patch(ctx, decision, patch); err != nil { - return err - } + go func() { + if upsertErr := c.HistoryManager.Upsert(context.Background(), decision, v1alpha1.SchedulingIntentUnknown, err); upsertErr != nil { + ctrl.LoggerFrom(ctx).Error(upsertErr, "failed to create/update history") + } + }() } return err } @@ -156,6 +151,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainCinder + c.HistoryManager = lib.HistoryManager{Client: mgr.GetClient(), Recorder: mgr.GetEventRecorder("cortex-cinder-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/cinder/filter_weigher_pipeline_controller_test.go b/internal/scheduling/cinder/filter_weigher_pipeline_controller_test.go index dadd74e70..fa447c530 100644 --- a/internal/scheduling/cinder/filter_weigher_pipeline_controller_test.go +++ b/internal/scheduling/cinder/filter_weigher_pipeline_controller_test.go @@ -7,6 +7,7 @@ import ( "context" "encoding/json" "testing" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" @@ -247,13 +248,13 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) } tests := []struct { - name string - decision *v1alpha1.Decision - pipelineConfig *v1alpha1.Pipeline - createDecisions bool - expectError bool - expectDecisionCreated bool - expectResult bool + name string + decision *v1alpha1.Decision + pipelineConfig *v1alpha1.Pipeline + createDecisions bool + expectError bool + expectHistoryCreated bool + expectResult bool }{ { name: "successful decision processing with creation", @@ -264,6 +265,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) }, Spec: v1alpha1.DecisionSpec{ SchedulingDomain: v1alpha1.SchedulingDomainCinder, + ResourceID: "test-uuid-1", PipelineRef: corev1.ObjectReference{ Name: "test-pipeline", }, @@ -284,10 +286,10 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: true, - expectError: false, - expectDecisionCreated: true, - expectResult: true, + createDecisions: true, + expectError: false, + expectHistoryCreated: true, + expectResult: true, }, { name: "successful decision processing without creation", @@ -318,10 +320,10 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: false, - expectError: false, - expectDecisionCreated: false, - expectResult: true, + createDecisions: false, + expectError: false, + expectHistoryCreated: false, + expectResult: true, }, { name: "pipeline not configured", @@ -340,10 +342,10 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) }, }, }, - pipelineConfig: nil, - expectError: true, - expectDecisionCreated: false, - expectResult: false, + pipelineConfig: nil, + expectError: true, + expectHistoryCreated: false, + expectResult: false, }, { name: "decision without cinderRaw spec", @@ -372,10 +374,10 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: true, - expectError: true, - expectDecisionCreated: false, - expectResult: false, + createDecisions: true, + expectError: true, + expectHistoryCreated: false, + expectResult: false, }, } @@ -389,7 +391,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) client := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(objects...). - WithStatusSubresource(&v1alpha1.Decision{}). + WithStatusSubresource(&v1alpha1.Decision{}, &v1alpha1.History{}). Build() controller := &FilterWeigherPipelineController{ @@ -397,6 +399,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Client: client, Pipelines: make(map[string]lib.FilterWeigherPipeline[api.ExternalSchedulerRequest]), PipelineConfigs: make(map[string]v1alpha1.Pipeline), + HistoryManager: lib.HistoryManager{Client: client}, }, Monitor: lib.FilterWeigherPipelineMonitor{}, } @@ -419,41 +422,23 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) t.Errorf("Expected no error but got: %v", err) } - // Check if decision was created (if expected) - if tt.expectDecisionCreated { - var decisions v1alpha1.DecisionList - err := client.List(context.Background(), &decisions) - if err != nil { - t.Errorf("Failed to list decisions: %v", err) - return - } - - found := false - for _, decision := range decisions.Items { - if decision.Spec.SchedulingDomain == v1alpha1.SchedulingDomainCinder { - found = true - - // Verify decision properties - if decision.Spec.PipelineRef.Name != "test-pipeline" { - t.Errorf("expected pipeline ref %q, got %q", "test-pipeline", decision.Spec.PipelineRef.Name) - } - - // Check if result was set - if tt.expectResult { - if decision.Status.Result == nil { - t.Error("expected decision result to be set") - return - } - } + // Check if history CRD was created when expected + if tt.expectHistoryCreated { + var histories v1alpha1.HistoryList + deadline := time.Now().Add(2 * time.Second) + for { + if err := client.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) + } + if len(histories.Items) > 0 { break } - } - - if !found { - t.Error("expected decision to be created but was not found") + if time.Now().After(deadline) { + t.Fatal("timed out waiting for history CRD to be created") + } + time.Sleep(5 * time.Millisecond) } } else if !tt.expectError { - // For cases without creation, check that the decision has the right status if tt.expectResult && tt.decision.Status.Result == nil { t.Error("expected decision result to be set in original decision object") } diff --git a/internal/scheduling/explanation/controller.go b/internal/scheduling/explanation/controller.go deleted file mode 100644 index 7b71e0c4a..000000000 --- a/internal/scheduling/explanation/controller.go +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package explanation - -import ( - "context" - "sort" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/pkg/multicluster" - corev1 "k8s.io/api/core/v1" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/predicate" -) - -type ControllerConfig struct { - // The controller will scope to objects using this scheduling domain name. - // This allows multiple controllers to coexist in the same cluster without - // interfering with each other's decisions. - SchedulingDomain v1alpha1.SchedulingDomain `json:"schedulingDomain"` -} - -// The explanation controller populates two fields of the decision status. -// -// First, it reconstructs the history of each decision. It will look for -// previous decisions for the same resource (based on ResourceID) and provide -// them through the decision history field. -// -// Second, it will use the available context for a decision to generate a -// human-readable explanation of why the decision was made the way it was. -// This explanation is intended to help operators understand the reasoning -// behind scheduling decisions. -type Controller struct { - // The kubernetes client to use for processing decisions. - client.Client - // Config for the controller. - Config ControllerConfig - // If the field indexing should be skipped (useful for testing). - SkipIndexFields bool -} - -// Check if a decision should be processed by this controller. -func (c *Controller) shouldReconcileDecision(decision *v1alpha1.Decision) bool { - // Ignore decisions not created by this operator. - if decision.Spec.SchedulingDomain != c.Config.SchedulingDomain { - return false - } - // Ignore decisions that already have an explanation. - if decision.Status.Explanation != "" { - return false - } - // Ignore decisions that have no result yet. - if decision.Status.Result == nil { - return false - } - return true -} - -// This loop will be called by the controller-runtime for each decision -// resource that needs to be reconciled. -func (c *Controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - log := ctrl.LoggerFrom(ctx) - decision := &v1alpha1.Decision{} - if err := c.Get(ctx, req.NamespacedName, decision); err != nil { - log.Error(err, "failed to get decision", "name", req.NamespacedName) - return ctrl.Result{}, client.IgnoreNotFound(err) - } - // Reconcile the history. - if err := c.reconcileHistory(ctx, decision); err != nil { - return ctrl.Result{}, err - } - // Reconcile the explanation. - if err := c.reconcileExplanation(ctx, decision); err != nil { - return ctrl.Result{}, err - } - log.Info("successfully reconciled decision explanation", "name", req.NamespacedName) - return ctrl.Result{}, nil -} - -// Process the history for the given decision. -func (c *Controller) reconcileHistory(ctx context.Context, decision *v1alpha1.Decision) error { - log := ctrl.LoggerFrom(ctx) - // Get all previous decisions for the same ResourceID. - var previousDecisions v1alpha1.DecisionList - if c.SkipIndexFields { - // When field indexing is skipped, list all decisions and filter manually - if err := c.List(ctx, &previousDecisions); err != nil { - log.Error(err, "failed to list all decisions", "resourceID", decision.Spec.ResourceID) - return err - } - // Filter to only decisions with matching ResourceID - var filteredDecisions []v1alpha1.Decision - for _, prevDecision := range previousDecisions.Items { - if prevDecision.Spec.ResourceID == decision.Spec.ResourceID { - filteredDecisions = append(filteredDecisions, prevDecision) - } - } - previousDecisions.Items = filteredDecisions - } else { - // Use field indexing for efficient lookup - if err := c.List(ctx, &previousDecisions, client.MatchingFields{"spec.resourceID": decision.Spec.ResourceID}); err != nil { - log.Error(err, "failed to list previous decisions", "resourceID", decision.Spec.ResourceID) - return err - } - } - history := []corev1.ObjectReference{} // Not var-init so we see the empty slice. - // Make sure the resulting history will be in chronological order. - sort.Slice(previousDecisions.Items, func(i, j int) bool { - t1 := previousDecisions.Items[i].CreationTimestamp - t2 := previousDecisions.Items[j].CreationTimestamp - return t1.Before(&t2) - }) - for _, prevDecision := range previousDecisions.Items { - // Skip the current decision. - if prevDecision.Name == decision.Name && prevDecision.Namespace == decision.Namespace { - continue - } - // Skip decisions that were made after the current one. - if prevDecision.CreationTimestamp.After(decision.CreationTimestamp.Time) { - continue - } - history = append(history, corev1.ObjectReference{ - Kind: "Decision", - Namespace: prevDecision.Namespace, - Name: prevDecision.Name, - UID: prevDecision.UID, - }) - } - old := decision.DeepCopy() - decision.Status.History = &history - precedence := len(history) - decision.Status.Precedence = &precedence - patch := client.MergeFrom(old) - if err := c.Status().Patch(ctx, decision, patch); err != nil { - log.Error(err, "failed to patch decision status with history", "name", decision.Name) - return err - } - log.Info("successfully reconciled decision history", "name", decision.Name) - return nil -} - -// Process the explanation for the given decision. -func (c *Controller) reconcileExplanation(ctx context.Context, decision *v1alpha1.Decision) error { - log := ctrl.LoggerFrom(ctx) - explainer, err := NewExplainer(c.Client) - if err != nil { - log.Error(err, "failed to create explainer", "name", decision.Name) - return err - } - explanationText, err := explainer.Explain(ctx, decision) - if err != nil { - log.Error(err, "failed to explain decision", "name", decision.Name) - return err - } - old := decision.DeepCopy() - decision.Status.Explanation = explanationText - patch := client.MergeFrom(old) - if err := c.Status().Patch(ctx, decision, patch); err != nil { - log.Error(err, "failed to patch decision status with explanation", "name", decision.Name) - return err - } - log.Info("successfully reconciled decision explanation", "name", decision.Name) - return nil -} - -// This function will be called when the manager starts up. Must block. -func (c *Controller) StartupCallback(ctx context.Context) error { - // Reprocess all existing decisions that need an explanation. - var decisions v1alpha1.DecisionList - if err := c.List(ctx, &decisions); err != nil { - return err - } - for _, decision := range decisions.Items { - if !c.shouldReconcileDecision(&decision) { - continue - } - if _, err := c.Reconcile(ctx, ctrl.Request{ - NamespacedName: client.ObjectKey{ - Namespace: decision.Namespace, - Name: decision.Name, - }, - }); err != nil { - return err - } - } - return nil -} - -// This function sets up the controller with the provided manager. -func (c *Controller) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { - if !c.SkipIndexFields { - ctx := context.Background() - obj := &v1alpha1.Decision{} - lst := &v1alpha1.DecisionList{} - idx := "spec.resourceID" - fnc := func(obj client.Object) []string { - decision := obj.(*v1alpha1.Decision) - return []string{decision.Spec.ResourceID} - } - if err := mcl.IndexField(ctx, obj, lst, idx, fnc); err != nil { - return err - } - } - if err := mgr.Add(manager.RunnableFunc(c.StartupCallback)); err != nil { - return err - } - return multicluster.BuildController(mcl, mgr). - Named("explanation-controller"). - For( - &v1alpha1.Decision{}, - builder.WithPredicates(predicate.NewPredicateFuncs(func(obj client.Object) bool { - decision := obj.(*v1alpha1.Decision) - return c.shouldReconcileDecision(decision) - })), - ). - Complete(c) -} diff --git a/internal/scheduling/explanation/controller_test.go b/internal/scheduling/explanation/controller_test.go deleted file mode 100644 index f287b4995..000000000 --- a/internal/scheduling/explanation/controller_test.go +++ /dev/null @@ -1,589 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package explanation - -import ( - "context" - "testing" - "time" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestController_shouldReconcileDecision(t *testing.T) { - controller := &Controller{ - Config: ControllerConfig{SchedulingDomain: v1alpha1.SchedulingDomainNova}, - } - - tests := []struct { - name string - decision *v1alpha1.Decision - expected bool - }{ - { - name: "should reconcile nova decision without explanation", - decision: &v1alpha1.Decision{ - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - }, - Status: v1alpha1.DecisionStatus{ - Explanation: "", - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), - }, - }, - }, - expected: true, - }, - { - name: "should not reconcile decision from different operator", - decision: &v1alpha1.Decision{ - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: "different-operator", - }, - Status: v1alpha1.DecisionStatus{ - Explanation: "", - }, - }, - expected: false, - }, - { - name: "should not reconcile decision with existing explanation", - decision: &v1alpha1.Decision{ - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - }, - Status: v1alpha1.DecisionStatus{ - Explanation: "Already has explanation", - }, - }, - expected: false, - }, - { - name: "should not reconcile non-nova decision", - decision: &v1alpha1.Decision{ - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - }, - Status: v1alpha1.DecisionStatus{ - Explanation: "", - }, - }, - expected: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := controller.shouldReconcileDecision(tt.decision) - if result != tt.expected { - t.Errorf("shouldReconcileDecision() = %v, expected %v", result, tt.expected) - } - }) - } -} - -func TestController_Reconcile(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - tests := []struct { - name string - decision *v1alpha1.Decision - existingDecisions []v1alpha1.Decision - expectError bool - expectRequeue bool - expectedExplanation string - expectedHistoryLength int - }{ - { - name: "decision not found", - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "nonexistent-decision", - Namespace: "default", - }, - }, - expectError: false, // controller-runtime ignores not found errors - }, - { - name: "reconcile decision without history", - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision", - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource-1", - }, - Status: v1alpha1.DecisionStatus{}, - }, - expectedExplanation: "Initial placement of the nova server", - expectedHistoryLength: 0, - }, - { - name: "reconcile decision with history", - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-2", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now().Add(time.Hour)}, - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource-2", - }, - Status: v1alpha1.DecisionStatus{}, - }, - existingDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-1", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now()}, - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource-2", - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), - }, - }, - }, - }, - expectedHistoryLength: 1, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var objects []runtime.Object - if tt.name != "decision not found" { - objects = append(objects, tt.decision) - } - for i := range tt.existingDecisions { - objects = append(objects, &tt.existingDecisions[i]) - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(objects...). - WithStatusSubresource(&v1alpha1.Decision{}). - Build() - - controller := &Controller{ - Client: client, - Config: ControllerConfig{SchedulingDomain: v1alpha1.SchedulingDomainNova}, - SkipIndexFields: true, // Skip field indexing for testing - } - - req := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: tt.decision.Name, - Namespace: tt.decision.Namespace, - }, - } - - result, err := controller.Reconcile(context.Background(), req) - - if tt.expectError && err == nil { - t.Errorf("Expected error but got none") - return - } - if !tt.expectError && err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - if tt.expectRequeue && result.RequeueAfter == 0 { - t.Errorf("Expected requeue but got none") - } - if !tt.expectRequeue && result.RequeueAfter > 0 { - t.Errorf("Expected no requeue but got %v", result.RequeueAfter) - } - - // Only check results if we expect the decision to exist - if tt.name != "decision not found" { - // Verify the decision was updated - var updated v1alpha1.Decision - err = client.Get(context.Background(), req.NamespacedName, &updated) - if err != nil { - t.Errorf("Failed to get updated decision: %v", err) - return - } - - if tt.expectedExplanation != "" && !contains(updated.Status.Explanation, tt.expectedExplanation) { - t.Errorf("Expected explanation to contain '%s', but got: %s", tt.expectedExplanation, updated.Status.Explanation) - } - - if updated.Status.History != nil && len(*updated.Status.History) != tt.expectedHistoryLength { - t.Errorf("Expected history length %d, got %d", tt.expectedHistoryLength, len(*updated.Status.History)) - } - } - }) - } -} - -func TestController_reconcileHistory(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - tests := []struct { - name string - decision *v1alpha1.Decision - existingDecisions []v1alpha1.Decision - expectedHistory int - expectError bool - }{ - { - name: "no previous decisions", - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision", - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-1", - }, - }, - expectedHistory: 0, - }, - { - name: "one previous decision", - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-2", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now().Add(time.Hour)}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-2", - }, - }, - existingDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-1", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now()}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-2", - }, - }, - }, - expectedHistory: 1, - }, - { - name: "multiple previous decisions in correct order", - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-3", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now().Add(2 * time.Hour)}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-3", - }, - }, - existingDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-1", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now()}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-3", - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-2", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now().Add(time.Hour)}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-3", - }, - }, - }, - expectedHistory: 2, - }, - { - name: "exclude future decisions", - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-2", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now().Add(time.Hour)}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-4", - }, - }, - existingDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-1", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now()}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-4", - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-3", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now().Add(2 * time.Hour)}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "test-resource-4", - }, - }, - }, - expectedHistory: 1, // Only test-decision-1 should be included - }, - { - name: "exclude decisions with different ResourceID", - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-target", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now().Add(time.Hour)}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "target-resource", - }, - }, - existingDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-same", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now()}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "target-resource", - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-different", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: time.Now()}, - }, - Spec: v1alpha1.DecisionSpec{ - ResourceID: "different-resource", - }, - }, - }, - expectedHistory: 1, // Only same ResourceID should be included - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - objects := []runtime.Object{tt.decision} - for i := range tt.existingDecisions { - objects = append(objects, &tt.existingDecisions[i]) - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(objects...). - WithStatusSubresource(&v1alpha1.Decision{}). - Build() - - controller := &Controller{ - Client: client, - Config: ControllerConfig{SchedulingDomain: v1alpha1.SchedulingDomainNova}, - SkipIndexFields: true, // Skip field indexing for testing - } - - err := controller.reconcileHistory(context.Background(), tt.decision) - - if tt.expectError && err == nil { - t.Errorf("Expected error but got none") - return - } - if !tt.expectError && err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - if tt.decision.Status.History == nil { - if tt.expectedHistory != 0 { - t.Errorf("Expected history length %d, got nil", tt.expectedHistory) - } - } else if len(*tt.decision.Status.History) != tt.expectedHistory { - t.Errorf("Expected history length %d, got %d", tt.expectedHistory, len(*tt.decision.Status.History)) - } - }) - } -} - -func TestController_reconcileExplanation(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - decision := &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision", - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource", - }, - Status: v1alpha1.DecisionStatus{ - History: nil, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(decision). - WithStatusSubresource(&v1alpha1.Decision{}). - Build() - - controller := &Controller{ - Client: client, - Config: ControllerConfig{SchedulingDomain: v1alpha1.SchedulingDomainNova}, - } - - err := controller.reconcileExplanation(context.Background(), decision) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - } - - if decision.Status.Explanation == "" { - t.Error("Expected explanation to be set but it was empty") - } - - if !contains(decision.Status.Explanation, "Initial placement of the nova server") { - t.Errorf("Expected explanation to contain nova server text, got: %s", decision.Status.Explanation) - } -} - -func TestController_StartupCallback(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - // Create a decision that should be reconciled - decision1 := &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-1", - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource-1", - }, - Status: v1alpha1.DecisionStatus{ - Explanation: "", // Empty explanation means it should be reconciled - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), - }, - }, - } - - // Create a decision that should not be reconciled (already has explanation) - decision2 := &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-2", - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource-2", - }, - Status: v1alpha1.DecisionStatus{ - Explanation: "Already has explanation", - }, - } - - // Create a decision from different operator that should not be reconciled - decision3 := &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision-3", - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: "different-operator", - ResourceID: "test-resource-3", - }, - Status: v1alpha1.DecisionStatus{ - Explanation: "", - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(decision1, decision2, decision3). - WithStatusSubresource(&v1alpha1.Decision{}). - Build() - - controller := &Controller{ - Client: client, - Config: ControllerConfig{SchedulingDomain: v1alpha1.SchedulingDomainNova}, - SkipIndexFields: true, // Skip field indexing for testing - } - - err := controller.StartupCallback(context.Background()) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - } - - // Verify that decision1 now has an explanation - var updated1 v1alpha1.Decision - err = client.Get(context.Background(), types.NamespacedName{Name: "test-decision-1", Namespace: "default"}, &updated1) - if err != nil { - t.Errorf("Failed to get updated decision1: %v", err) - } - - if updated1.Status.Explanation == "" { - t.Error("Expected decision1 to have explanation after startup callback") - } - - // Verify that decision2 explanation remains unchanged - var updated2 v1alpha1.Decision - err = client.Get(context.Background(), types.NamespacedName{Name: "test-decision-2", Namespace: "default"}, &updated2) - if err != nil { - t.Errorf("Failed to get updated decision2: %v", err) - } - - if updated2.Status.Explanation != "Already has explanation" { - t.Errorf("Expected decision2 explanation to remain unchanged, got: %s", updated2.Status.Explanation) - } - - // Verify that decision3 explanation remains empty (different operator) - var updated3 v1alpha1.Decision - err = client.Get(context.Background(), types.NamespacedName{Name: "test-decision-3", Namespace: "default"}, &updated3) - if err != nil { - t.Errorf("Failed to get updated decision3: %v", err) - } - - if updated3.Status.Explanation != "" { - t.Errorf("Expected decision3 explanation to remain empty, got: %s", updated3.Status.Explanation) - } -} diff --git a/internal/scheduling/explanation/explainer.go b/internal/scheduling/explanation/explainer.go deleted file mode 100644 index a5f199fae..000000000 --- a/internal/scheduling/explanation/explainer.go +++ /dev/null @@ -1,729 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package explanation - -import ( - "context" - "fmt" - "sort" - "time" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "k8s.io/apimachinery/pkg/api/errors" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" -) - -// The explainer gets a scheduling decision and produces a human-readable -// explanation of why the decision was made the way it was. -type Explainer struct { - // The kubernetes client to use for fetching related data. - client.Client - // The template manager to use for rendering explanations. - templateManager *TemplateManager -} - -// NewExplainer creates a new explainer with template support. -func NewExplainer(client client.Client) (*Explainer, error) { - templateManager, err := NewTemplateManager() - if err != nil { - return nil, fmt.Errorf("failed to create template manager: %w", err) - } - - return &Explainer{ - Client: client, - templateManager: templateManager, - }, nil -} - -// Explain the given decision and return a human-readable explanation. -func (e *Explainer) Explain(ctx context.Context, decision *v1alpha1.Decision) (string, error) { - return e.ExplainWithTemplates(ctx, decision) -} - -// getResourceType returns a human-readable resource type. -func (e *Explainer) getResourceType(schedulingDomain v1alpha1.SchedulingDomain) string { - switch schedulingDomain { - case v1alpha1.SchedulingDomainNova: - return "nova server" - case v1alpha1.SchedulingDomainManila: - return "manila share" - case v1alpha1.SchedulingDomainCinder: - return "cinder volume" - case v1alpha1.SchedulingDomainMachines: - return "ironcore machine" - default: - return "resource" - } -} - -// calculateScoreGap calculates the gap between first and second place. -func (e *Explainer) calculateScoreGap(weights map[string]float64) float64 { - if len(weights) < 2 { - return 0.0 - } - - scores := make([]float64, 0, len(weights)) - for _, score := range weights { - scores = append(scores, score) - } - - sort.Slice(scores, func(i, j int) bool { - return scores[i] > scores[j] - }) - - return scores[0] - scores[1] -} - -// fetchDecisionChain retrieves all decisions in the history chain. -func (e *Explainer) fetchDecisionChain(ctx context.Context, decision *v1alpha1.Decision) ([]*v1alpha1.Decision, error) { - var chainDecisions []*v1alpha1.Decision - logger := log.FromContext(ctx) - - // Add all historical decisions - if decision.Status.History != nil { - for _, ref := range *decision.Status.History { - histDecision := &v1alpha1.Decision{} - if err := e.Get(ctx, client.ObjectKey{ - Namespace: ref.Namespace, - Name: ref.Name, - }, histDecision); err != nil { - if errors.IsNotFound(err) { - logger.Info("History decision not found, skipping from chain analysis", - "decision", ref.Name, - "namespace", ref.Namespace, - "uid", ref.UID) - continue // Skip missing decisions instead of failing - } - // For other errors, still fail - return nil, err - } - chainDecisions = append(chainDecisions, histDecision) - } - } - - // Add current decision - chainDecisions = append(chainDecisions, decision) - - return chainDecisions, nil -} - -// HostSegment represents a segment in the host chain with duration and decision count. -type HostSegment struct { - host string - duration time.Duration // Full precision duration - decisions int -} - -// buildHostSegments creates host segments from decisions with durations. -func (e *Explainer) buildHostSegments(decisions []*v1alpha1.Decision) []HostSegment { - if len(decisions) < 2 { - return []HostSegment{} - } - - // Extract host chain - hostChain := make([]string, 0, len(decisions)) - for _, decision := range decisions { - host := "(n/a)" - if decision.Status.Result != nil && decision.Status.Result.TargetHost != nil { - host = *decision.Status.Result.TargetHost - } - hostChain = append(hostChain, host) - } - - // Build segments with durations - segments := make([]HostSegment, 0) - if len(hostChain) > 0 { - currentHost := hostChain[0] - segmentStart := 0 - - for i := 1; i <= len(hostChain); i++ { - // Check if we've reached the end or found a different host - if i == len(hostChain) || hostChain[i] != currentHost { - // Calculate duration for this segment - startTime := decisions[segmentStart].CreationTimestamp.Time - var endTime = startTime // Default to 0 duration for last segment - if i < len(hostChain) { - endTime = decisions[i].CreationTimestamp.Time - } - - duration := endTime.Sub(startTime) - - segments = append(segments, HostSegment{ - host: currentHost, - duration: duration, - decisions: i - segmentStart, - }) - - if i < len(hostChain) { - currentHost = hostChain[i] - segmentStart = i - } - } - } - } - - return segments -} - -// detectLoop checks if there are repeated hosts in the segments. -func (e *Explainer) detectLoop(segments []HostSegment) bool { - seenHosts := make(map[string]bool) - for _, segment := range segments { - if seenHosts[segment.host] { - return true - } - seenHosts[segment.host] = true - } - return false -} - -// findWinner returns the host with the highest score. -func (e *Explainer) findWinner(scores map[string]float64) string { - winner := "" - maxScore := -999999.0 - for host, score := range scores { - if score > maxScore { - maxScore = score - winner = host - } - } - return winner -} - -// ScoreCalculationResult holds both final scores and deleted host tracking information. -type ScoreCalculationResult struct { - FinalScores map[string]float64 - DeletedHosts map[string][]string // host -> list of steps that deleted it -} - -// StepImpact represents the impact of a single pipeline step on the winning host. -type StepImpact struct { - Step string - ScoreBefore float64 - ScoreAfter float64 - ScoreDelta float64 - CompetitorsRemoved int - PromotedToFirst bool -} - -// calculateScoresFromSteps processes step results sequentially to compute final scores and track deleted hosts. -func (e *Explainer) calculateScoresFromSteps(inputWeights map[string]float64, stepResults []v1alpha1.StepResult) ScoreCalculationResult { - if len(inputWeights) == 0 { - return ScoreCalculationResult{ - FinalScores: map[string]float64{}, - DeletedHosts: map[string][]string{}, - } - } - - // Start with input values as initial scores - currentScores := make(map[string]float64) - for hostName, inputValue := range inputWeights { - currentScores[hostName] = inputValue - } - - deletedHosts := make(map[string][]string) - - // Process each step sequentially - for _, stepResult := range stepResults { - // Check which hosts will be deleted in this step - for hostName := range currentScores { - if _, exists := stepResult.Activations[hostName]; !exists { - // Host not in this step's activations - will be deleted - deletedHosts[hostName] = append(deletedHosts[hostName], stepResult.StepName) - } - } - - // Apply activations and remove hosts not in this step - newScores := make(map[string]float64) - for hostName, score := range currentScores { - if activation, exists := stepResult.Activations[hostName]; exists { - // Add activation to current score - newScores[hostName] = score + activation - } - // Hosts not in activations are removed (don't copy to newScores) - } - currentScores = newScores - } - - return ScoreCalculationResult{ - FinalScores: currentScores, - DeletedHosts: deletedHosts, - } -} - -// calculateScoresWithoutStep processes step results while skipping one specific step. -func (e *Explainer) calculateScoresWithoutStep(inputWeights map[string]float64, stepResults []v1alpha1.StepResult, skipIndex int) ScoreCalculationResult { - if len(inputWeights) == 0 || skipIndex < 0 || skipIndex >= len(stepResults) { - return e.calculateScoresFromSteps(inputWeights, stepResults) - } - - // Create reduced step results without the skipped step - reducedSteps := make([]v1alpha1.StepResult, 0, len(stepResults)-1) - reducedSteps = append(reducedSteps, stepResults[:skipIndex]...) - reducedSteps = append(reducedSteps, stepResults[skipIndex+1:]...) - - return e.calculateScoresFromSteps(inputWeights, reducedSteps) -} - -// findCriticalSteps determines which steps change the winning host using backward elimination. -func (e *Explainer) findCriticalSteps(decision *v1alpha1.Decision) []string { - result := decision.Status.Result - if result == nil || len(result.StepResults) == 0 { - return []string{} - } - - // Get input weights (prefer raw, fall back to normalized) - var inputWeights map[string]float64 - switch { - case len(result.RawInWeights) > 0: - inputWeights = result.RawInWeights - case len(result.NormalizedInWeights) > 0: - inputWeights = result.NormalizedInWeights - default: - return []string{} - } - - // Calculate baseline scores with all steps - baselineResult := e.calculateScoresFromSteps(inputWeights, result.StepResults) - baselineWinner := e.findWinner(baselineResult.FinalScores) - - if baselineWinner == "" { - return []string{} - } - - criticalSteps := make([]string, 0) - - // Try removing each step one by one - for i, stepResult := range result.StepResults { - // Calculate scores without this step - reducedResult := e.calculateScoresWithoutStep(inputWeights, result.StepResults, i) - - // Find winner without this step - reducedWinner := e.findWinner(reducedResult.FinalScores) - - // If removing this step changes the winner, it's critical - if reducedWinner != baselineWinner { - criticalSteps = append(criticalSteps, stepResult.StepName) - } - } - - return criticalSteps -} - -func (e *Explainer) calculateStepImpacts(inputWeights map[string]float64, stepResults []v1alpha1.StepResult, targetHost string) []StepImpact { - if len(inputWeights) == 0 || len(stepResults) == 0 { - return []StepImpact{} - } - - impacts := make([]StepImpact, 0, len(stepResults)) - currentScores := make(map[string]float64) - - // Start with input values as initial scores - for hostName, inputValue := range inputWeights { - currentScores[hostName] = inputValue - } - - // Track target host's score before first step - scoreBefore := currentScores[targetHost] - - // Process each pipeline step and track the target host's evolution - for _, stepResult := range stepResults { - // Count how many competitors will be removed in this step - competitorsRemoved := 0 - for hostName := range currentScores { - if hostName != targetHost { - if _, exists := stepResult.Activations[hostName]; !exists { - competitorsRemoved++ - } - } - } - - // Check if target host was #1 before this step - wasFirst := true - targetScoreBefore := currentScores[targetHost] - for host, score := range currentScores { - if host != targetHost && score > targetScoreBefore { - wasFirst = false - break - } - } - - // Apply activations and remove hosts not in this step - newScores := make(map[string]float64) - for hostName, score := range currentScores { - if activation, exists := stepResult.Activations[hostName]; exists { - newScores[hostName] = score + activation - } - // Hosts not in activations are removed (don't copy to newScores) - } - - // Get target host's score after this step - scoreAfter := newScores[targetHost] - - // Check if target host became #1 after this step - isFirstAfter := true - for host, score := range newScores { - if host != targetHost && score > scoreAfter { - isFirstAfter = false - break - } - } - - promotedToFirst := !wasFirst && isFirstAfter - - impacts = append(impacts, StepImpact{ - Step: stepResult.StepName, - ScoreBefore: scoreBefore, - ScoreAfter: scoreAfter, - ScoreDelta: scoreAfter - scoreBefore, - CompetitorsRemoved: competitorsRemoved, - PromotedToFirst: promotedToFirst, - }) - - // Update for next iteration - currentScores = newScores - scoreBefore = scoreAfter - } - - return impacts -} - -// Template data building functions - these functions extract and structure -// decision data into formats suitable for template rendering. - -// buildContextData creates context data for template rendering. -func (e *Explainer) buildContextData(decision *v1alpha1.Decision) ContextData { - resourceType := e.getResourceType(decision.Spec.SchedulingDomain) - - history := decision.Status.History - isInitial := history == nil || len(*history) == 0 - - decisionNumber := 1 - if !isInitial { - decisionNumber = len(*history) + 1 - if decision.Status.Precedence != nil { - decisionNumber = *decision.Status.Precedence + 1 - } - } - - return ContextData{ - ResourceType: resourceType, - DecisionNumber: decisionNumber, - IsInitial: isInitial, - } -} - -// buildHistoryData creates history comparison data for template rendering. -func (e *Explainer) buildHistoryData(ctx context.Context, decision *v1alpha1.Decision) (*HistoryData, error) { - history := decision.Status.History - if history == nil || len(*history) == 0 { - return nil, nil - } - - // Get the last decision - lastDecisionRef := (*history)[len(*history)-1] - lastDecision := &v1alpha1.Decision{} - if err := e.Get(ctx, client.ObjectKey{ - Namespace: lastDecisionRef.Namespace, - Name: lastDecisionRef.Name, - }, lastDecision); err != nil { - logger := log.FromContext(ctx) - if errors.IsNotFound(err) { - logger.Info("History decision not found, skipping history comparison", - "decision", lastDecisionRef.Name, - "namespace", lastDecisionRef.Namespace, - "uid", lastDecisionRef.UID) - return nil, nil // Skip history comparison instead of failing - } - // For other errors, still fail - return nil, err - } - - lastTarget := "(n/a)" - if lastDecision.Status.Result != nil && lastDecision.Status.Result.TargetHost != nil { - lastTarget = *lastDecision.Status.Result.TargetHost - } - - newTarget := "(n/a)" - if decision.Status.Result != nil && decision.Status.Result.TargetHost != nil { - newTarget = *decision.Status.Result.TargetHost - } - - return &HistoryData{ - PreviousTarget: lastTarget, - CurrentTarget: newTarget, - }, nil -} - -// buildWinnerData creates winner analysis data for template rendering. -func (e *Explainer) buildWinnerData(decision *v1alpha1.Decision) *WinnerData { - result := decision.Status.Result - if result == nil || result.TargetHost == nil { - return nil - } - - targetHost := *result.TargetHost - - // Get target host score - targetScore := 0.0 - if result.AggregatedOutWeights != nil { - if score, exists := result.AggregatedOutWeights[targetHost]; exists { - targetScore = score - } - } - - // Count hosts evaluated - hostsEvaluated := len(result.OrderedHosts) - if hostsEvaluated == 0 && result.AggregatedOutWeights != nil { - hostsEvaluated = len(result.AggregatedOutWeights) - } - - // Calculate score gap to second place - gap := e.calculateScoreGap(result.AggregatedOutWeights) - - return &WinnerData{ - HostName: targetHost, - Score: targetScore, - Gap: gap, - HostsEvaluated: hostsEvaluated, - HasGap: gap > 0, - } -} - -// buildInputData creates input comparison data for template rendering. -func (e *Explainer) buildInputData(decision *v1alpha1.Decision) *InputData { - result := decision.Status.Result - if result == nil || result.TargetHost == nil { - return nil - } - - targetHost := *result.TargetHost - - // Get input weights (prefer raw, fall back to normalized) - var inputWeights map[string]float64 - switch { - case len(result.RawInWeights) > 0: - inputWeights = result.RawInWeights - case len(result.NormalizedInWeights) > 0: - inputWeights = result.NormalizedInWeights - default: - return nil - } - - // Find input winner - inputWinner := "" - inputWinnerScore := -999999.0 - for host, score := range inputWeights { - if score > inputWinnerScore { - inputWinnerScore = score - inputWinner = host - } - } - - if inputWinner == "" { - return nil - } - - // Get target host's final score - targetFinalScore := 0.0 - if result.AggregatedOutWeights != nil { - if score, exists := result.AggregatedOutWeights[targetHost]; exists { - targetFinalScore = score - } - } - - return &InputData{ - InputWinner: inputWinner, - InputScore: inputWinnerScore, - FinalWinner: targetHost, - FinalScore: targetFinalScore, - FinalInputScore: inputWeights[targetHost], - InputConfirmed: inputWinner == targetHost, - } -} - -// buildCriticalStepsData creates critical steps data for template rendering. -func (e *Explainer) buildCriticalStepsData(decision *v1alpha1.Decision) *CriticalStepsData { - result := decision.Status.Result - if result == nil || result.TargetHost == nil || len(result.StepResults) == 0 { - return nil - } - - criticalSteps := e.findCriticalSteps(decision) - totalSteps := len(result.StepResults) - - return &CriticalStepsData{ - Steps: criticalSteps, - TotalSteps: totalSteps, - IsInputOnly: len(criticalSteps) == 0, - RequiresAll: len(criticalSteps) == totalSteps, - } -} - -// buildDeletedHostsData creates deleted hosts data for template rendering. -func (e *Explainer) buildDeletedHostsData(decision *v1alpha1.Decision) *DeletedHostsData { - result := decision.Status.Result - if result == nil || result.StepResults == nil || len(result.StepResults) == 0 { - return nil - } - - // Get input weights (prefer raw, fall back to normalized) - var inputWeights map[string]float64 - switch { - case len(result.RawInWeights) > 0: - inputWeights = result.RawInWeights - case len(result.NormalizedInWeights) > 0: - inputWeights = result.NormalizedInWeights - default: - return nil - } - - // Calculate scores and get deleted hosts information - scoreResult := e.calculateScoresFromSteps(inputWeights, result.StepResults) - - if len(scoreResult.DeletedHosts) == 0 { - return nil - } - - // Find input winner - inputWinner := "" - inputWinnerScore := -999999.0 - for host, score := range inputWeights { - if score > inputWinnerScore { - inputWinnerScore = score - inputWinner = host - } - } - - // Build list of deleted hosts - deletedHosts := make([]DeletedHostInfo, 0, len(scoreResult.DeletedHosts)) - for hostName, steps := range scoreResult.DeletedHosts { - deletedHosts = append(deletedHosts, DeletedHostInfo{ - Name: hostName, - Steps: steps, - IsInputWinner: hostName == inputWinner, - }) - } - - return &DeletedHostsData{ - DeletedHosts: deletedHosts, - } -} - -// buildChainData creates chain analysis data for template rendering. -func (e *Explainer) buildChainData(ctx context.Context, decision *v1alpha1.Decision) (*ChainData, error) { - history := decision.Status.History - if history == nil || len(*history) == 0 { - return nil, nil // No chain for initial decisions - } - - // Fetch all decisions in the chain - chainDecisions, err := e.fetchDecisionChain(ctx, decision) - if err != nil { - return nil, err - } - - if len(chainDecisions) < 2 { - return nil, nil // Need at least 2 decisions for a chain - } - - // Build segments - segments := e.buildHostSegments(chainDecisions) - if len(segments) == 0 { - return nil, nil - } - - // Convert to template data format - chainSegments := make([]ChainSegment, len(segments)) - for i, segment := range segments { - chainSegments[i] = ChainSegment{ - Host: segment.host, - Duration: segment.duration, - Decisions: segment.decisions, - } - } - - return &ChainData{ - Segments: chainSegments, - HasLoop: e.detectLoop(segments), - }, nil -} - -// ExplainWithTemplates renders an explanation using Go templates. -func (e *Explainer) ExplainWithTemplates(ctx context.Context, decision *v1alpha1.Decision) (string, error) { - // Build explanation context - explanationCtx := ExplanationContext{ - Context: e.buildContextData(decision), - } - - // Build each component's data - if historyData, err := e.buildHistoryData(ctx, decision); err != nil { - return "", err - } else if historyData != nil { - explanationCtx.History = historyData - } - - if winnerData := e.buildWinnerData(decision); winnerData != nil { - explanationCtx.Winner = winnerData - } - - if inputData := e.buildInputData(decision); inputData != nil { - explanationCtx.Input = inputData - } - - if criticalStepsData := e.buildCriticalStepsData(decision); criticalStepsData != nil { - explanationCtx.CriticalSteps = criticalStepsData - } - - if deletedHostsData := e.buildDeletedHostsData(decision); deletedHostsData != nil { - explanationCtx.DeletedHosts = deletedHostsData - } - - // Build step impacts - if result := decision.Status.Result; result != nil && result.TargetHost != nil && len(result.StepResults) > 0 { - targetHost := *result.TargetHost - var inputWeights map[string]float64 - switch { - case len(result.RawInWeights) > 0: - inputWeights = result.RawInWeights - case len(result.NormalizedInWeights) > 0: - inputWeights = result.NormalizedInWeights - } - if inputWeights != nil { - impacts := e.calculateStepImpacts(inputWeights, result.StepResults, targetHost) - if len(impacts) > 0 { - // Sort impacts by absolute delta (highest first), with promotions taking priority - sort.Slice(impacts, func(i, j int) bool { - absI := impacts[i].ScoreDelta - if absI < 0 { - absI = -absI - } - absJ := impacts[j].ScoreDelta - if absJ < 0 { - absJ = -absJ - } - - if absI != absJ { - return absI > absJ - } - if impacts[i].PromotedToFirst != impacts[j].PromotedToFirst { - return impacts[i].PromotedToFirst - } - return impacts[i].Step < impacts[j].Step - }) - explanationCtx.StepImpacts = impacts - } - } - } - - if chainData, err := e.buildChainData(ctx, decision); err != nil { - return "", err - } else if chainData != nil { - explanationCtx.Chain = chainData - } - - // Render using templates - return e.templateManager.RenderExplanation(explanationCtx) -} diff --git a/internal/scheduling/explanation/explainer_test.go b/internal/scheduling/explanation/explainer_test.go deleted file mode 100644 index ed1d52e13..000000000 --- a/internal/scheduling/explanation/explainer_test.go +++ /dev/null @@ -1,1476 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package explanation - -import ( - "context" - "sort" - "testing" - "time" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestExplainer_Explain(t *testing.T) { - tests := []struct { - name string - decision *v1alpha1.Decision - historyDecisions []*v1alpha1.Decision - expectedContains []string - expectError bool - }{ - { - name: "initial nova server placement", - decision: WithResourceID(NewTestDecision("test-decision"), "test-resource-1"), - expectedContains: []string{"Initial placement of the nova server"}, - }, - { - name: "initial cinder volume placement", - decision: WithSchedulingDomain(WithResourceID(NewTestDecision("test-decision"), "test-resource-2"), v1alpha1.SchedulingDomainCinder), - expectedContains: []string{"Initial placement of the cinder volume"}, - }, - { - name: "initial manila share placement", - decision: WithSchedulingDomain(WithResourceID(NewTestDecision("test-decision"), "test-resource-3"), v1alpha1.SchedulingDomainManila), - expectedContains: []string{"Initial placement of the manila share"}, - }, - { - name: "initial ironcore machine placement", - decision: WithSchedulingDomain(WithResourceID(NewTestDecision("test-decision"), "test-resource-4"), v1alpha1.SchedulingDomainMachines), - expectedContains: []string{"Initial placement of the ironcore machine"}, - }, - { - name: "unknown resource type falls back to generic", - decision: WithSchedulingDomain(WithResourceID(NewTestDecision("test-decision"), "test-resource-5"), "unknown-type"), - expectedContains: []string{"Initial placement of the resource"}, - }, - { - name: "empty history array", - decision: WithResourceID(NewTestDecision("test-decision"), "test-resource-6"), - expectedContains: []string{"Initial placement of the nova server"}, - }, - { - name: "subsequent decision with history", - decision: WithHistoryRef( - WithTargetHost(WithResourceID(NewTestDecision("test-decision-2"), "test-resource-7"), "host-2"), - WithUID(WithTargetHost(WithResourceID(NewTestDecision("test-decision-1"), "test-resource-7"), "host-1"), "test-uid-1")), - historyDecisions: []*v1alpha1.Decision{ - WithUID(WithTargetHost(WithResourceID(NewTestDecision("test-decision-1"), "test-resource-7"), "host-1"), "test-uid-1"), - }, - expectedContains: []string{ - "Decision #2 for this nova server", - "Previous target host was 'host-1'", - "now it's 'host-2'", - }, - }, - { - name: "subsequent decision with nil target hosts", - decision: WithHistoryRef( - WithResourceID(NewTestDecision("test-decision-4"), "test-resource-8"), - WithUID(WithResourceID(NewTestDecision("test-decision-3"), "test-resource-8"), "test-uid-3")), - historyDecisions: []*v1alpha1.Decision{ - WithUID(WithResourceID(NewTestDecision("test-decision-3"), "test-resource-8"), "test-uid-3"), - }, - expectedContains: []string{ - "Decision #2 for this nova server", - "Previous target host was '(n/a)'", - "now it's '(n/a)'", - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if len(tt.historyDecisions) > 0 { - RunExplanationTestWithHistory(t, tt.decision, tt.historyDecisions, tt.expectedContains) - } else { - RunExplanationTest(t, tt.decision, tt.expectedContains) - } - }) - } -} - -func TestExplainer_Explain_HistoryDecisionNotFound_GracefulHandling(t *testing.T) { - decision := NewDecision("test-decision"). - WithResourceID("test-resource"). - WithTargetHost("host-1"). - WithHistory([]corev1.ObjectReference{ - { - Kind: "Decision", - Namespace: "default", - Name: "non-existent-decision", - UID: "non-existent-uid", - }, - }). - Build() - - explainer := SetupExplainerTest(t, decision) - explanation, err := explainer.Explain(context.Background(), decision) - - // Should NOT error anymore - graceful handling - if err != nil { - t.Errorf("Expected no error with graceful handling, but got: %v", err) - } - - // Should contain context but not history comparison - if !contains(explanation, "Decision #2 for this nova server") { - t.Errorf("Expected explanation to contain context, but got: %s", explanation) - } - - if contains(explanation, "Previous target host") { - t.Errorf("Expected explanation to NOT contain history comparison when decision is missing, but got: %s", explanation) - } -} - -func TestExplainer_MissingHistoryDecisions_ChainAnalysis(t *testing.T) { - // Test that chain analysis works when some history decisions are missing - decision := NewDecision("current-decision"). - WithResourceID("test-resource"). - WithTargetHost("host-3"). - WithHistory([]corev1.ObjectReference{ - {Kind: "Decision", Namespace: "default", Name: "decision-1", UID: "uid-1"}, - {Kind: "Decision", Namespace: "default", Name: "missing-decision", UID: "missing-uid"}, - {Kind: "Decision", Namespace: "default", Name: "decision-3", UID: "uid-3"}, - }). - Build() - - // Only provide decision-1 and decision-3, missing decision-2 - availableDecision := NewDecision("decision-1"). - WithUID("uid-1"). - WithTargetHost("host-1"). - WithCreationTimestamp(time.Now().Add(-2 * time.Hour)). - Build() - - explainer := SetupExplainerTest(t, decision, availableDecision) - explanation, err := explainer.Explain(context.Background(), decision) - - if err != nil { - t.Errorf("Expected no error but got: %v", err) - } - - // Should contain context with full history count - if !contains(explanation, "Decision #4 for this nova server") { - t.Errorf("Expected explanation to contain context, but got: %s", explanation) - } - - // Chain analysis should work with available decisions - if !contains(explanation, "Chain:") { - t.Errorf("Expected explanation to contain chain analysis, but got: %s", explanation) - } -} - -// Helper functions -func stringPtr(s string) *string { - return &s -} - -func contains(s, substr string) bool { - return len(s) >= len(substr) && (s == substr || substr == "" || findInString(s, substr)) -} - -func findInString(s, substr string) bool { - for i := 0; i <= len(s)-len(substr); i++ { - if s[i:i+len(substr)] == substr { - return true - } - } - return false -} - -// Generic Decision Helpers - Composable functions with smart defaults -func NewTestDecision(name string) *v1alpha1.Decision { - return &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: "default", // Sensible default - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, // Most common - ResourceID: "test-resource", // Generic default - }, - Status: v1alpha1.DecisionStatus{}, - } -} - -func WithTargetHost(decision *v1alpha1.Decision, host string) *v1alpha1.Decision { - if decision.Status.Result == nil { - decision.Status.Result = &v1alpha1.DecisionResult{} - } - decision.Status.Result.TargetHost = &host - return decision -} - -func WithInputWeights(decision *v1alpha1.Decision, weights map[string]float64) *v1alpha1.Decision { - if decision.Status.Result == nil { - decision.Status.Result = &v1alpha1.DecisionResult{} - } - decision.Status.Result.RawInWeights = weights - return decision -} - -func WithOutputWeights(decision *v1alpha1.Decision, weights map[string]float64) *v1alpha1.Decision { - if decision.Status.Result == nil { - decision.Status.Result = &v1alpha1.DecisionResult{} - } - decision.Status.Result.AggregatedOutWeights = weights - - // Auto-generate ordered hosts from weights - hosts := make([]string, 0, len(weights)) - for host := range weights { - hosts = append(hosts, host) - } - sort.Slice(hosts, func(i, j int) bool { - return weights[hosts[i]] > weights[hosts[j]] - }) - decision.Status.Result.OrderedHosts = hosts - - return decision -} - -func WithSteps(decision *v1alpha1.Decision, steps ...v1alpha1.StepResult) *v1alpha1.Decision { - if decision.Status.Result == nil { - decision.Status.Result = &v1alpha1.DecisionResult{} - } - decision.Status.Result.StepResults = steps - return decision -} - -func WithSchedulingDomain(decision *v1alpha1.Decision, schedulingDomain v1alpha1.SchedulingDomain) *v1alpha1.Decision { - decision.Spec.SchedulingDomain = schedulingDomain - return decision -} - -func WithResourceID(decision *v1alpha1.Decision, resourceID string) *v1alpha1.Decision { - decision.Spec.ResourceID = resourceID - return decision -} - -func WithUID(decision *v1alpha1.Decision, uid string) *v1alpha1.Decision { - decision.UID = types.UID(uid) - return decision -} - -func WithHistory(decision *v1alpha1.Decision, refs []corev1.ObjectReference) *v1alpha1.Decision { - decision.Status.History = &refs - return decision -} - -// Helper to create a decision with history reference to another decision -func WithHistoryRef(decision, historyDecision *v1alpha1.Decision) *v1alpha1.Decision { - refs := []corev1.ObjectReference{ - { - Kind: "Decision", - Namespace: historyDecision.Namespace, - Name: historyDecision.Name, - UID: historyDecision.UID, - }, - } - decision.Status.History = &refs - return decision -} - -// Generic step creator -func Step(name string, activations map[string]float64) v1alpha1.StepResult { - return v1alpha1.StepResult{ - StepName: name, - Activations: activations, - } -} - -// Common step names as constants -const ( - AvailabilityFilter = "availability-filter" - ResourceWeigher = "resource-weigher" - PlacementPolicy = "placement-policy" -) - -// Decision Builder Pattern - Fluent interface for creating test decisions -type DecisionBuilder struct { - decision *v1alpha1.Decision -} - -func NewDecision(name string) *DecisionBuilder { - return &DecisionBuilder{ - decision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource", - }, - Status: v1alpha1.DecisionStatus{}, - }, - } -} - -func (b *DecisionBuilder) WithResourceID(resourceID string) *DecisionBuilder { - b.decision.Spec.ResourceID = resourceID - return b -} - -func (b *DecisionBuilder) WithSchedulingDomain(schedulingDomain v1alpha1.SchedulingDomain) *DecisionBuilder { - b.decision.Spec.SchedulingDomain = schedulingDomain - return b -} - -func (b *DecisionBuilder) WithTargetHost(host string) *DecisionBuilder { - if b.decision.Status.Result == nil { - b.decision.Status.Result = &v1alpha1.DecisionResult{} - } - b.decision.Status.Result.TargetHost = stringPtr(host) - return b -} - -func (b *DecisionBuilder) WithRawInputWeights(weights map[string]float64) *DecisionBuilder { - if b.decision.Status.Result == nil { - b.decision.Status.Result = &v1alpha1.DecisionResult{} - } - b.decision.Status.Result.RawInWeights = weights - return b -} - -func (b *DecisionBuilder) WithNormalizedInputWeights(weights map[string]float64) *DecisionBuilder { - if b.decision.Status.Result == nil { - b.decision.Status.Result = &v1alpha1.DecisionResult{} - } - b.decision.Status.Result.NormalizedInWeights = weights - return b -} - -func (b *DecisionBuilder) WithAggregatedOutputWeights(weights map[string]float64) *DecisionBuilder { - if b.decision.Status.Result == nil { - b.decision.Status.Result = &v1alpha1.DecisionResult{} - } - b.decision.Status.Result.AggregatedOutWeights = weights - return b -} - -func (b *DecisionBuilder) WithOrderedHosts(hosts []string) *DecisionBuilder { - if b.decision.Status.Result == nil { - b.decision.Status.Result = &v1alpha1.DecisionResult{} - } - b.decision.Status.Result.OrderedHosts = hosts - return b -} - -func (b *DecisionBuilder) WithSteps(steps ...v1alpha1.StepResult) *DecisionBuilder { - if b.decision.Status.Result == nil { - b.decision.Status.Result = &v1alpha1.DecisionResult{} - } - b.decision.Status.Result.StepResults = steps - return b -} - -func (b *DecisionBuilder) WithHistory(refs []corev1.ObjectReference) *DecisionBuilder { - b.decision.Status.History = &refs - return b -} - -func (b *DecisionBuilder) WithHistoryDecisions(decisions ...*v1alpha1.Decision) *DecisionBuilder { - refs := make([]corev1.ObjectReference, len(decisions)) - for i, decision := range decisions { - refs[i] = corev1.ObjectReference{ - Kind: "Decision", - Namespace: decision.Namespace, - Name: decision.Name, - UID: decision.UID, - } - } - b.decision.Status.History = &refs - return b -} - -func (b *DecisionBuilder) WithPrecedence(precedence int) *DecisionBuilder { - b.decision.Status.Precedence = intPtr(precedence) - return b -} - -func (b *DecisionBuilder) WithUID(uid string) *DecisionBuilder { - b.decision.UID = types.UID(uid) - return b -} - -func (b *DecisionBuilder) WithCreationTimestamp(timestamp time.Time) *DecisionBuilder { - b.decision.CreationTimestamp = metav1.Time{Time: timestamp} - return b -} - -func (b *DecisionBuilder) Build() *v1alpha1.Decision { - return b.decision -} - -// Pre-built scenario helpers for common test patterns -func DecisionWithScoring(name, winner string, scores map[string]float64) *DecisionBuilder { - orderedHosts := make([]string, 0, len(scores)) - for host := range scores { - orderedHosts = append(orderedHosts, host) - } - // Sort by score descending - sort.Slice(orderedHosts, func(i, j int) bool { - return scores[orderedHosts[i]] > scores[orderedHosts[j]] - }) - - return NewDecision(name). - WithTargetHost(winner). - WithAggregatedOutputWeights(scores). - WithOrderedHosts(orderedHosts) -} - -func DecisionWithInputComparison(name, winner string, inputWeights, finalWeights map[string]float64) *DecisionBuilder { - return NewDecision(name). - WithTargetHost(winner). - WithRawInputWeights(inputWeights). - WithAggregatedOutputWeights(finalWeights) -} - -func DecisionWithCriticalSteps(name, winner string, inputWeights map[string]float64, steps ...v1alpha1.StepResult) *DecisionBuilder { - return NewDecision(name). - WithTargetHost(winner). - WithRawInputWeights(inputWeights). - WithSteps(steps...) -} - -func DecisionWithHistory(name, winner string) *DecisionBuilder { - return NewDecision(name). - WithTargetHost(winner) -} - -// Step result builders for common pipeline steps -func ResourceWeigherStep(activations map[string]float64) v1alpha1.StepResult { - return v1alpha1.StepResult{ - StepName: "resource-weigher", - Activations: activations, - } -} - -func AvailabilityFilterStep(activations map[string]float64) v1alpha1.StepResult { - return v1alpha1.StepResult{ - StepName: "availability-filter", - Activations: activations, - } -} - -func PlacementPolicyStep(activations map[string]float64) v1alpha1.StepResult { - return v1alpha1.StepResult{ - StepName: "placement-policy", - Activations: activations, - } -} - -func WeigherStep(name string, activations map[string]float64) v1alpha1.StepResult { - return v1alpha1.StepResult{ - StepName: name, - Activations: activations, - } -} - -// Test execution helpers -func SetupExplainerTest(t *testing.T, decisions ...*v1alpha1.Decision) *Explainer { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - objects := make([]runtime.Object, len(decisions)) - for i, decision := range decisions { - objects[i] = decision - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(objects...). - Build() - - explainer, err := NewExplainer(client) - if err != nil { - t.Fatalf("Failed to create explainer: %v", err) - } - return explainer -} - -func RunExplanationTest(t *testing.T, decision *v1alpha1.Decision, expectedContains []string) { - explainer := SetupExplainerTest(t, decision) - explanation, err := explainer.Explain(context.Background(), decision) - AssertNoError(t, err) - AssertExplanationContains(t, explanation, expectedContains...) -} - -func RunExplanationTestWithHistory(t *testing.T, decision *v1alpha1.Decision, historyDecisions []*v1alpha1.Decision, expectedContains []string) { - allDecisions := make([]*v1alpha1.Decision, len(historyDecisions)+1) - copy(allDecisions, historyDecisions) - allDecisions[len(historyDecisions)] = decision - explainer := SetupExplainerTest(t, allDecisions...) - explanation, err := explainer.Explain(context.Background(), decision) - AssertNoError(t, err) - AssertExplanationContains(t, explanation, expectedContains...) -} - -func AssertNoError(t *testing.T, err error) { - if err != nil { - t.Errorf("Expected no error but got: %v", err) - } -} - -func AssertExplanationContains(t *testing.T, explanation string, expected ...string) { - for _, exp := range expected { - if !contains(explanation, exp) { - t.Errorf("Expected explanation to contain '%s', but got: %s", exp, explanation) - } - } -} - -func AssertExplanationNotContains(t *testing.T, explanation string, notExpected ...string) { - for _, notExp := range notExpected { - if contains(explanation, notExp) { - t.Errorf("Expected explanation to NOT contain '%s', but got: %s", notExp, explanation) - } - } -} - -func TestExplainer_WinnerAnalysis(t *testing.T) { - tests := []struct { - name string - decision *v1alpha1.Decision - expectedContains []string - }{ - { - name: "winner analysis with score gap", - decision: DecisionWithScoring("test-decision", "host-1", - map[string]float64{"host-1": 2.45, "host-2": 2.10, "host-3": 1.85}). - Build(), - expectedContains: []string{ - "Selected: host-1 (score: 2.45)", - "gap to 2nd: 0.35", - "3 hosts evaluated", - }, - }, - { - name: "winner analysis with single host", - decision: DecisionWithScoring("test-decision", "host-1", - map[string]float64{"host-1": 2.45}). - Build(), - expectedContains: []string{ - "Selected: host-1 (score: 2.45)", - "1 host evaluated", - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - RunExplanationTest(t, tt.decision, tt.expectedContains) - }) - } -} - -func TestExplainer_InputComparison(t *testing.T) { - tests := []struct { - name string - decision *v1alpha1.Decision - expectedContains []string - }{ - { - name: "input choice confirmed", - decision: DecisionWithInputComparison("test-decision", "host-1", - map[string]float64{"host-1": 1.20, "host-2": 1.10, "host-3": 0.95}, - map[string]float64{"host-1": 2.45, "host-2": 2.10, "host-3": 1.85}). - Build(), - expectedContains: []string{ - "Input choice confirmed: host-1 (1.20→2.45)", - }, - }, - { - name: "input choice overridden", - decision: DecisionWithInputComparison("test-decision", "host-2", - map[string]float64{"host-1": 1.50, "host-2": 1.20, "host-3": 0.95}, - map[string]float64{"host-1": 1.85, "host-2": 2.45, "host-3": 2.10}). - Build(), - expectedContains: []string{ - "Input favored host-1 (1.50), final winner: host-2 (1.20→2.45)", - }, - }, - { - name: "raw weights preferred over normalized", - decision: NewDecision("test-decision"). - WithTargetHost("host-1"). - WithRawInputWeights(map[string]float64{"host-1": 100.0, "host-2": 90.0}). - WithNormalizedInputWeights(map[string]float64{"host-1": 1.0, "host-2": 0.9}). - WithAggregatedOutputWeights(map[string]float64{"host-1": 2.45, "host-2": 2.10}). - Build(), - expectedContains: []string{ - "Input choice confirmed: host-1 (100.00→2.45)", // Should now use raw weights (100.00) - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - RunExplanationTest(t, tt.decision, tt.expectedContains) - }) - } -} - -func TestExplainer_CriticalStepsAnalysis(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - tests := []struct { - name string - decision *v1alpha1.Decision - expectedContains []string - }{ - { - name: "single critical step", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 2.0}), - Step("resource-weigher", map[string]float64{"host-1": 1.5, "host-2": 0.2}), - Step("availability-filter", map[string]float64{"host-1": 0.0, "host-2": 0.0})), - expectedContains: []string{ - "Decision driven by 1/2 pipeline step: resource-weigher", - }, - }, - { - name: "multiple critical steps", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 3.0}), - Step("resource-weigher", map[string]float64{"host-1": 1.0, "host-2": -0.5}), - Step("availability-filter", map[string]float64{"host-1": 1.0, "host-2": 0.0}), - Step("placement-policy", map[string]float64{"host-1": 0.05, "host-2": 0.05})), - expectedContains: []string{ - "Decision driven by 2/3 pipeline steps: resource-weigher, availability-filter", - }, - }, - { - name: "all steps non-critical", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 3.0, "host-2": 1.0}), - Step("step-1", map[string]float64{"host-1": 0.05, "host-2": 0.05}), - Step("step-2", map[string]float64{"host-1": 0.02, "host-2": 0.02})), - expectedContains: []string{ - "Decision driven by input only (all 2 steps are non-critical)", - }, - }, - { - name: "all steps critical", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 3.0}), - Step("step-1", map[string]float64{"host-1": 1.0, "host-2": -0.5}), - Step("step-2", map[string]float64{"host-1": 1.0, "host-2": 0.0})), - expectedContains: []string{ - "Decision requires all 2 pipeline steps", - }, - }, - { - name: "three critical steps formatting", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 4.0}), - Step("step-a", map[string]float64{"host-1": 1.0, "host-2": -0.5}), - Step("step-b", map[string]float64{"host-1": 1.0, "host-2": 0.0}), - Step("step-c", map[string]float64{"host-1": 1.0, "host-2": 0.0}), - Step("step-d", map[string]float64{"host-1": 0.05, "host-2": 0.05})), - expectedContains: []string{ - "Decision driven by 3/4 pipeline steps: step-a, step-b, step-c", - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(tt.decision). - Build() - - explainer, err := NewExplainer(client) - if err != nil { - t.Errorf("Failed to create explainer: %v", err) - return - } - - explanation, err := explainer.Explain(context.Background(), tt.decision) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - for _, expected := range tt.expectedContains { - if !contains(explanation, expected) { - t.Errorf("Expected explanation to contain '%s', but got: %s", expected, explanation) - } - } - }) - } -} - -func TestExplainer_CompleteExplanation(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - previousDecision := WithUID(WithTargetHost(NewTestDecision("test-decision-1"), "host-1"), "test-uid-1") - - decision := WithSteps( - WithOutputWeights( - WithInputWeights( - WithHistoryRef( - WithTargetHost(NewTestDecision("test-decision-2"), "host-2"), - previousDecision), - map[string]float64{"host-1": 1.50, "host-2": 1.20, "host-3": 0.95}), - map[string]float64{"host-1": 1.85, "host-2": 2.45, "host-3": 2.10}), - Step("resource-weigher", map[string]float64{"host-1": 0.15, "host-2": 0.85, "host-3": 0.75}), - Step("availability-filter", map[string]float64{"host-1": 0.20, "host-2": 0.40, "host-3": 0.40})) - - // Set precedence manually since it's not commonly used - decision.Status.Precedence = intPtr(1) - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(decision, previousDecision). - Build() - - explainer, err := NewExplainer(client) - if err != nil { - t.Errorf("Failed to create explainer: %v", err) - return - } - - explanation, err := explainer.Explain(context.Background(), decision) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - expectedParts := []string{ - "Decision #2 for this nova server", - "Previous target host was 'host-1', now it's 'host-2'", - "Selected: host-2 (score: 2.45), gap to 2nd: 0.35, 3 hosts evaluated", - "Input favored host-1 (1.50), final winner: host-2 (1.20→2.45)", - "Decision driven by 1/2 pipeline step: resource-weigher", - } - - for _, expected := range expectedParts { - if !contains(explanation, expected) { - t.Errorf("Expected explanation to contain '%s', but got: %s", expected, explanation) - } - } -} - -func TestExplainer_DeletedHostsAnalysis(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - tests := []struct { - name string - decision *v1alpha1.Decision - expectedContains []string - }{ - { - name: "single host filtered by single step", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 2.0}), - Step("availability-filter", map[string]float64{"host-1": 0.5})), - expectedContains: []string{ - "1 host filtered:", - "- host-2 (input choice) by availability-filter", - }, - }, - { - name: "multiple hosts filtered", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 3.0, "host-2": 2.0, "host-3": 1.0}), - Step("availability-filter", map[string]float64{"host-1": 0.5})), - expectedContains: []string{ - "2 hosts filtered", - }, - }, - { - name: "multiple hosts filtered including input winner", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 3.0, "host-3": 2.0}), - Step("availability-filter", map[string]float64{"host-1": 0.5})), - expectedContains: []string{ - "2 hosts filtered:", - "- host-2 (input choice) by availability-filter", - "- host-3 by availability-filter", - }, - }, - { - name: "no hosts filtered", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 2.0}), - Step("resource-weigher", map[string]float64{"host-1": 0.5, "host-2": 0.3})), - expectedContains: []string{}, // No deleted hosts analysis should be present - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(tt.decision). - Build() - - explainer, err := NewExplainer(client) - if err != nil { - t.Errorf("Failed to create explainer: %v", err) - return - } - - explanation, err := explainer.Explain(context.Background(), tt.decision) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - for _, expected := range tt.expectedContains { - if !contains(explanation, expected) { - t.Errorf("Expected explanation to contain '%s', but got: %s", expected, explanation) - } - } - - // For the "no hosts filtered" case, ensure no deleted hosts analysis is present - if len(tt.expectedContains) == 0 { - deletedHostsKeywords := []string{"filtered", "Input winner", "hosts filtered"} - for _, keyword := range deletedHostsKeywords { - if contains(explanation, keyword) { - t.Errorf("Expected explanation to NOT contain '%s' for no deleted hosts case, but got: %s", keyword, explanation) - } - } - } - }) - } -} - -func TestExplainer_GlobalChainAnalysis(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - baseTime := metav1.Now() - time1 := metav1.Time{Time: baseTime.Add(-120 * time.Minute)} // 2 hours ago - time2 := metav1.Time{Time: baseTime.Add(-60 * time.Minute)} // 1 hour ago - time3 := metav1.Time{Time: baseTime.Time} // now - - tests := []struct { - name string - currentDecision *v1alpha1.Decision - historyDecisions []v1alpha1.Decision - expectedContains []string - expectedNotContain []string - }{ - { - name: "simple chain with durations", - currentDecision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-3", - Namespace: "default", - CreationTimestamp: time3, - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource", - }, - Status: v1alpha1.DecisionStatus{ - History: &[]corev1.ObjectReference{ - {Kind: "Decision", Namespace: "default", Name: "decision-1", UID: "uid-1"}, - {Kind: "Decision", Namespace: "default", Name: "decision-2", UID: "uid-2"}, - }, - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-3"), - }, - }, - }, - historyDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-1", - Namespace: "default", - UID: "uid-1", - CreationTimestamp: time1, - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-2", - Namespace: "default", - UID: "uid-2", - CreationTimestamp: time2, - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-2"), - }, - }, - }, - }, - expectedContains: []string{ - "Chain: host-1 (1h0m0s) -> host-2 (1h0m0s) -> host-3 (0s).", - }, - }, - { - name: "chain with loop detection", - currentDecision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-3", - Namespace: "default", - CreationTimestamp: time3, - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource", - }, - Status: v1alpha1.DecisionStatus{ - History: &[]corev1.ObjectReference{ - {Kind: "Decision", Namespace: "default", Name: "decision-1", UID: "uid-1"}, - {Kind: "Decision", Namespace: "default", Name: "decision-2", UID: "uid-2"}, - }, - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), // Back to host-1 - creates loop - }, - }, - }, - historyDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-1", - Namespace: "default", - UID: "uid-1", - CreationTimestamp: time1, - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-2", - Namespace: "default", - UID: "uid-2", - CreationTimestamp: time2, - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-2"), - }, - }, - }, - }, - expectedContains: []string{ - "Chain (loop detected): host-1 (1h0m0s) -> host-2 (1h0m0s) -> host-1 (0s).", - }, - }, - { - name: "chain with multiple decisions on same host", - currentDecision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-4", - Namespace: "default", - CreationTimestamp: time3, - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource", - }, - Status: v1alpha1.DecisionStatus{ - History: &[]corev1.ObjectReference{ - {Kind: "Decision", Namespace: "default", Name: "decision-1", UID: "uid-1"}, - {Kind: "Decision", Namespace: "default", Name: "decision-2", UID: "uid-2"}, - {Kind: "Decision", Namespace: "default", Name: "decision-3", UID: "uid-3"}, - }, - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-2"), - }, - }, - }, - historyDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-1", - Namespace: "default", - UID: "uid-1", - CreationTimestamp: time1, - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-2", - Namespace: "default", - UID: "uid-2", - CreationTimestamp: time1, // Same time as decision-1 - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), // Same host as decision-1 - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-3", - Namespace: "default", - UID: "uid-3", - CreationTimestamp: time2, - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), // Still same host - }, - }, - }, - }, - expectedContains: []string{ - "Chain: host-1 (2h0m0s; 3 decisions) -> host-2 (0s).", - }, - }, - { - name: "chain with multi-day duration", - currentDecision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-2", - Namespace: "default", - CreationTimestamp: metav1.Time{Time: baseTime.Time}, - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource", - }, - Status: v1alpha1.DecisionStatus{ - History: &[]corev1.ObjectReference{ - {Kind: "Decision", Namespace: "default", Name: "decision-1", UID: "uid-1"}, - }, - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-2"), - }, - }, - }, - historyDecisions: []v1alpha1.Decision{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-1", - Namespace: "default", - UID: "uid-1", - CreationTimestamp: metav1.Time{Time: baseTime.Add(-72 * time.Hour)}, // 3 days ago - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), - }, - }, - }, - }, - expectedContains: []string{ - "Chain: host-1 (3d0h0m0s) -> host-2 (0s).", - }, - }, - { - name: "no chain for initial decision", - currentDecision: &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "decision-1", - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource", - }, - Status: v1alpha1.DecisionStatus{ - History: nil, // No history - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-1"), - }, - }, - }, - historyDecisions: []v1alpha1.Decision{}, - expectedNotContain: []string{ - "Chain:", - "chain:", - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - objects := []runtime.Object{tt.currentDecision} - for i := range tt.historyDecisions { - objects = append(objects, &tt.historyDecisions[i]) - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(objects...). - Build() - - explainer, err := NewExplainer(client) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - explanation, err := explainer.Explain(context.Background(), tt.currentDecision) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - for _, expected := range tt.expectedContains { - if !contains(explanation, expected) { - t.Errorf("Expected explanation to contain '%s', but got: %s", expected, explanation) - } - } - - for _, notExpected := range tt.expectedNotContain { - if contains(explanation, notExpected) { - t.Errorf("Expected explanation to NOT contain '%s', but got: %s", notExpected, explanation) - } - } - }) - } -} - -func intPtr(i int) *int { - return &i -} - -func TestExplainer_RawWeightsPriorityBugFix(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - tests := []struct { - name string - decision *v1alpha1.Decision - expectedContains []string - description string - }{ - { - name: "raw_weights_preserve_small_differences", - decision: func() *v1alpha1.Decision { - decision := WithOutputWeights( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-2"), - map[string]float64{"host-1": 1000.05, "host-2": 1000.10, "host-3": 1000.00}), - map[string]float64{"host-1": 1001.05, "host-2": 1002.10, "host-3": 1001.00}) - // Add normalized weights to show they would mask the difference - decision.Status.Result.NormalizedInWeights = map[string]float64{"host-1": 1.0, "host-2": 1.0, "host-3": 1.0} - return decision - }(), - expectedContains: []string{ - "Input choice confirmed: host-2 (1000.10→1002.10)", // Should use raw weights (1000.10) - }, - description: "Raw weights preserve small differences that normalized weights would mask", - }, - { - name: "raw_weights_detect_correct_input_winner", - decision: func() *v1alpha1.Decision { - decision := WithOutputWeights( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-3"), - map[string]float64{"host-1": 2000.15, "host-2": 2000.10, "host-3": 2000.05}), - map[string]float64{"host-1": 2001.15, "host-2": 2001.10, "host-3": 2002.05}) - // Add normalized weights to show they would mask the difference - decision.Status.Result.NormalizedInWeights = map[string]float64{"host-1": 1.0, "host-2": 1.0, "host-3": 1.0} - return decision - }(), - expectedContains: []string{ - "Input favored host-1 (2000.15), final winner: host-3 (2000.05→2002.05)", // Should detect host-1 as input winner using raw weights - }, - description: "Raw weights correctly identify input winner that normalized weights would miss", - }, - { - name: "critical_steps_analysis_uses_raw_weights", - decision: func() *v1alpha1.Decision { - decision := WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1000.05, "host-2": 1000.00}), - Step("resource-weigher", map[string]float64{"host-1": 0.5, "host-2": 0.0})) - // Add normalized weights to show they would mask the difference - decision.Status.Result.NormalizedInWeights = map[string]float64{"host-1": 1.0, "host-2": 1.0} - return decision - }(), - expectedContains: []string{ - "Decision driven by input only (all 1 step is non-critical)", // With small raw weight advantage, step is non-critical - "Input choice confirmed: host-1 (1000.05→0.00)", // Shows raw weights are being used - }, - description: "Critical steps analysis uses raw weights - with small raw advantage, step becomes non-critical", - }, - { - name: "deleted_hosts_analysis_uses_raw_weights", - decision: func() *v1alpha1.Decision { - decision := WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1000.00, "host-2": 1000.05, "host-3": 999.95}), - Step("availability-filter", map[string]float64{"host-1": 0.0})) - // Add normalized weights to show they would mask the difference - decision.Status.Result.NormalizedInWeights = map[string]float64{"host-1": 1.0, "host-2": 1.0, "host-3": 1.0} - return decision - }(), - expectedContains: []string{ - "2 hosts filtered:", - "- host-2 (input choice) by availability-filter", - "Input favored host-2 (1000.05), final winner: host-1 (1000.00→0.00)", - }, - description: "Deleted hosts analysis uses raw weights to correctly identify input winner", - }, - { - name: "fallback_to_normalized_when_no_raw_weights", - decision: func() *v1alpha1.Decision { - decision := WithOutputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 2.5, "host-2": 2.0, "host-3": 1.8}) - // Set normalized weights and clear raw weights to test fallback - decision.Status.Result.NormalizedInWeights = map[string]float64{"host-1": 1.5, "host-2": 1.0, "host-3": 0.8} - decision.Status.Result.RawInWeights = nil - return decision - }(), - expectedContains: []string{ - "Input choice confirmed: host-1 (1.50→2.50)", // Should use normalized weights as fallback - }, - description: "Should fall back to normalized weights when raw weights are not available", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(tt.decision). - Build() - - explainer, err := NewExplainer(client) - if err != nil { - t.Fatalf("Failed to create explainer: %v", err) - } - - explanation, err := explainer.Explain(context.Background(), tt.decision) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - for _, expected := range tt.expectedContains { - if !contains(explanation, expected) { - t.Errorf("Expected explanation to contain '%s', but got: %s", expected, explanation) - } - } - }) - } -} - -// TestExplainer_RawVsNormalizedComparison demonstrates the impact of the bug fix -func TestExplainer_RawVsNormalizedComparison(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - decision := &v1alpha1.Decision{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-decision", - Namespace: "default", - }, - Spec: v1alpha1.DecisionSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - ResourceID: "test-resource", - }, - Status: v1alpha1.DecisionStatus{ - Result: &v1alpha1.DecisionResult{ - TargetHost: stringPtr("host-2"), - RawInWeights: map[string]float64{ - "host-1": 1000.05, // Very small difference - "host-2": 1000.10, // Slightly higher - should be detected as input winner - "host-3": 1000.00, - }, - NormalizedInWeights: map[string]float64{ - "host-1": 1.0, // All normalized to same value - would mask the difference - "host-2": 1.0, - "host-3": 1.0, - }, - AggregatedOutWeights: map[string]float64{ - "host-1": 1001.05, - "host-2": 1002.10, // host-2 wins - "host-3": 1001.00, - }, - }, - }, - } - - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(decision). - Build() - - explainer, err := NewExplainer(client) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - explanation, err := explainer.Explain(context.Background(), decision) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - if !contains(explanation, "Input choice confirmed: host-2 (1000.10→1002.10)") { - t.Errorf("Expected explanation to show raw weight value (1000.10), but got: %s", explanation) - } - - if contains(explanation, "Input favored host-1") || contains(explanation, "Input favored host-3") { - t.Errorf("Expected explanation to NOT show input choice override, but got: %s", explanation) - } -} - -func TestExplainer_StepImpactAnalysis(t *testing.T) { - scheme := runtime.NewScheme() - if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add scheme: %v", err) - } - - tests := []struct { - name string - decision *v1alpha1.Decision - expectedContains []string - }{ - { - name: "step with positive impact", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 2.0}), - Step("resource-weigher", map[string]float64{"host-1": 1.5, "host-2": 0.2})), - expectedContains: []string{ - "Step impacts:", - "resource-weigher +1.50", - }, - }, - { - name: "step with promotion to first", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 2.0}), - Step("resource-weigher", map[string]float64{"host-1": 2.0, "host-2": 0.5})), - expectedContains: []string{ - "Step impacts:", - "resource-weigher +2.00→#1", - }, - }, - { - name: "step with competitor removal", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 2.0, "host-2": 1.0, "host-3": 0.5}), - Step("availability-filter", map[string]float64{"host-1": 0.0})), - expectedContains: []string{ - "Step impacts:", - "availability-filter +0.00 (removed 2)", - }, - }, - { - name: "multiple steps sorted by impact", - decision: WithSteps( - WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 1.0, "host-2": 2.0}), - Step("resource-weigher", map[string]float64{"host-1": 1.5, "host-2": 0.2}), - Step("availability-filter", map[string]float64{"host-1": 0.1, "host-2": 0.0})), - expectedContains: []string{ - "Step impacts:", - "resource-weigher +1.50", - "availability-filter +0.10", - }, - }, - { - name: "no step impacts for decision without steps", - decision: WithInputWeights( - WithTargetHost(NewTestDecision("test-decision"), "host-1"), - map[string]float64{"host-1": 2.0, "host-2": 1.0}), - expectedContains: []string{}, // No step impacts should be present - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - client := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(tt.decision). - Build() - - explainer, err := NewExplainer(client) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - explanation, err := explainer.Explain(context.Background(), tt.decision) - if err != nil { - t.Errorf("Expected no error but got: %v", err) - return - } - - for _, expected := range tt.expectedContains { - if !contains(explanation, expected) { - t.Errorf("Expected explanation to contain '%s', but got: %s", expected, explanation) - } - } - - // For the "no step impacts" case, ensure no step impacts analysis is present - if len(tt.expectedContains) == 0 { - stepImpactsKeywords := []string{"Step impacts:", "→#1", "removed"} - for _, keyword := range stepImpactsKeywords { - if contains(explanation, keyword) { - t.Errorf("Expected explanation to NOT contain '%s' for no step impacts case, but got: %s", keyword, explanation) - } - } - } - }) - } -} diff --git a/internal/scheduling/explanation/templates.go b/internal/scheduling/explanation/templates.go deleted file mode 100644 index dc7160c07..000000000 --- a/internal/scheduling/explanation/templates.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package explanation - -import ( - "bytes" - "fmt" - "strings" - "text/template" - "time" -) - -type TemplateManager struct { - templates *template.Template -} - -func NewTemplateManager() (*TemplateManager, error) { - tmpl := template.New("explanation").Funcs(template.FuncMap{ - "join": strings.Join, - "formatDuration": formatTemplateDuration, - "formatFloat": func(f float64) string { return fmt.Sprintf("%.2f", f) }, - "formatDelta": func(f float64) string { return fmt.Sprintf("%+.2f", f) }, - "add": func(a, b int) int { return a + b }, - "plural": func(n int, singular, plural string) string { - if n == 1 { - return singular - } - return plural - }, - }) - - tmpl, err := tmpl.Parse(mainTemplate) - if err != nil { - return nil, fmt.Errorf("failed to parse main template: %w", err) - } - - templates := map[string]string{ - "context": contextTemplate, - "history": historyTemplate, - "winner": winnerTemplate, - "input": inputTemplate, - "critical": criticalTemplate, - "deleted": deletedTemplate, - "impacts": impactsTemplate, - "chain": chainTemplate, - } - - for name, templateStr := range templates { - tmpl, err = tmpl.Parse(fmt.Sprintf(`{{define "%s"}}%s{{end}}`, name, templateStr)) - if err != nil { - return nil, fmt.Errorf("failed to parse %s template: %w", name, err) - } - } - - return &TemplateManager{templates: tmpl}, nil -} - -func (tm *TemplateManager) RenderExplanation(ctx ExplanationContext) (string, error) { - var buf bytes.Buffer - err := tm.templates.Execute(&buf, ctx) - if err != nil { - return "", fmt.Errorf("failed to render explanation: %w", err) - } - return strings.TrimSpace(buf.String()), nil -} - -func formatTemplateDuration(d time.Duration) string { - if d == 0 { - return "0s" - } - - // Truncate to seconds to remove sub-second precision - d = d.Truncate(time.Second) - - // For durations >= 24 hours, convert to days format - if d >= 24*time.Hour { - days := int(d.Hours()) / 24 - remainder := d - time.Duration(days)*24*time.Hour - if remainder == 0 { - return fmt.Sprintf("%dd0h0m0s", days) - } - return fmt.Sprintf("%d%s", days, remainder.String()) - } - - // For shorter durations, use Go's built-in formatting - return d.String() -} - -const mainTemplate = `{{template "context" .Context}} -{{- if .History}} {{template "history" .History}}{{end}} -{{- if .Winner}} {{template "winner" .Winner}}{{end}} -{{- if .Input}} {{template "input" .Input}}{{end}} -{{- if .CriticalSteps}} {{template "critical" .CriticalSteps}}{{end}} -{{- if .DeletedHosts}} {{template "deleted" .DeletedHosts}}{{end}} -{{- if .StepImpacts}} {{template "impacts" .StepImpacts}}{{end}} -{{- if .Chain}} {{template "chain" .Chain}}{{end}}` - -const contextTemplate = `{{if .IsInitial -}} -Initial placement of the {{.ResourceType}}. -{{- else -}} -Decision #{{.DecisionNumber}} for this {{.ResourceType}}. -{{- end}}` - -const historyTemplate = `Previous target host was '{{.PreviousTarget}}', now it's '{{.CurrentTarget}}'.` - -const winnerTemplate = `Selected: {{.HostName}} (score: {{formatFloat .Score}}) -{{- if .HasGap}}, gap to 2nd: {{formatFloat .Gap}}{{end}}, {{.HostsEvaluated}} {{plural .HostsEvaluated "host" "hosts"}} evaluated.` - -const inputTemplate = `{{if .InputConfirmed -}} -Input choice confirmed: {{.FinalWinner}} ({{formatFloat .InputScore}}→{{formatFloat .FinalScore}}). -{{- else -}} -Input favored {{.InputWinner}} ({{formatFloat .InputScore}}), final winner: {{.FinalWinner}} ({{formatFloat .FinalInputScore}}→{{formatFloat .FinalScore}}). -{{- end}}` - -const criticalTemplate = `{{if .IsInputOnly -}} -Decision driven by input only (all {{.TotalSteps}} {{plural .TotalSteps "step is" "steps are"}} non-critical). -{{- else if .RequiresAll -}} -Decision requires all {{.TotalSteps}} pipeline {{plural .TotalSteps "step" "steps"}}. -{{- else if eq (len .Steps) 1 -}} -Decision driven by 1/{{.TotalSteps}} pipeline step: {{index .Steps 0}}. -{{- else -}} -Decision driven by {{len .Steps}}/{{.TotalSteps}} pipeline {{plural .TotalSteps "step" "steps"}}: {{join .Steps ", "}}. -{{- end}}` - -const deletedTemplate = `{{len .DeletedHosts}} {{plural (len .DeletedHosts) "host" "hosts"}} filtered: -{{- range .DeletedHosts}} - - {{.Name}}{{if .IsInputWinner}} (input choice){{end}} by {{join .Steps ", "}} -{{- end}}` - -const impactsTemplate = ` Step impacts: -{{- range $i, $impact := .}} -• {{$impact.Step}} -{{- if $impact.PromotedToFirst}} {{formatDelta $impact.ScoreDelta}}→#1 -{{- else if ne $impact.ScoreDelta 0.0}} {{formatDelta $impact.ScoreDelta}} -{{- else if gt $impact.CompetitorsRemoved 0}} +0.00 (removed {{$impact.CompetitorsRemoved}}) -{{- else}} +0.00{{end}} -{{- end}}` - -const chainTemplate = `{{if .HasLoop}}Chain (loop detected): {{else}}Chain: {{end}} -{{- range $i, $segment := .Segments}}{{if gt $i 0}} -> {{end}}{{$segment.Host}} ({{formatDuration $segment.Duration}}{{if gt $segment.Decisions 1}}; {{$segment.Decisions}} decisions{{end}}){{end}}.` diff --git a/internal/scheduling/explanation/types.go b/internal/scheduling/explanation/types.go deleted file mode 100644 index 31f6d0aa1..000000000 --- a/internal/scheduling/explanation/types.go +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package explanation - -import "time" - -// ExplanationContext holds all data needed to render a complete explanation. -type ExplanationContext struct { - Context ContextData `json:"context"` - History *HistoryData `json:"history,omitempty"` - Winner *WinnerData `json:"winner,omitempty"` - Input *InputData `json:"input,omitempty"` - CriticalSteps *CriticalStepsData `json:"criticalSteps,omitempty"` - DeletedHosts *DeletedHostsData `json:"deletedHosts,omitempty"` - StepImpacts []StepImpact `json:"stepImpacts,omitempty"` - Chain *ChainData `json:"chain,omitempty"` -} - -type ContextData struct { - ResourceType string `json:"resourceType"` - DecisionNumber int `json:"decisionNumber"` - IsInitial bool `json:"isInitial"` -} - -// HistoryData contains information about the previous decision in the chain. -type HistoryData struct { - PreviousTarget string `json:"previousTarget"` - CurrentTarget string `json:"currentTarget"` -} - -type WinnerData struct { - HostName string `json:"hostName"` - Score float64 `json:"score"` - Gap float64 `json:"gap"` - HostsEvaluated int `json:"hostsEvaluated"` - HasGap bool `json:"hasGap"` -} - -// InputData contains information about input vs final winner comparison. -type InputData struct { - InputWinner string `json:"inputWinner"` - InputScore float64 `json:"inputScore"` - FinalWinner string `json:"finalWinner"` - FinalScore float64 `json:"finalScore"` - FinalInputScore float64 `json:"finalInputScore"` // Final winner's input score - InputConfirmed bool `json:"inputConfirmed"` -} - -// CriticalStepsData contains information about which pipeline steps were critical. -type CriticalStepsData struct { - Steps []string `json:"steps"` - TotalSteps int `json:"totalSteps"` - IsInputOnly bool `json:"isInputOnly"` - RequiresAll bool `json:"requiresAll"` -} - -// DeletedHostsData contains information about hosts that were filtered out. -type DeletedHostsData struct { - DeletedHosts []DeletedHostInfo `json:"deletedHosts"` -} - -// DeletedHostInfo contains details about a single deleted host. -type DeletedHostInfo struct { - Name string `json:"name"` - Steps []string `json:"steps"` - IsInputWinner bool `json:"isInputWinner"` -} - -// ChainData contains information about the decision chain over time. -type ChainData struct { - Segments []ChainSegment `json:"segments"` - HasLoop bool `json:"hasLoop"` -} - -// ChainSegment represents a period where the resource was on a specific host. -type ChainSegment struct { - Host string `json:"host"` - Duration time.Duration `json:"duration"` - // number of decisions with this as the target host - Decisions int `json:"decisions"` -} diff --git a/internal/scheduling/lib/filter_weigher_pipeline.go b/internal/scheduling/lib/filter_weigher_pipeline.go index f362c07af..655f7382b 100644 --- a/internal/scheduling/lib/filter_weigher_pipeline.go +++ b/internal/scheduling/lib/filter_weigher_pipeline.go @@ -138,7 +138,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest]( func (p *filterWeigherPipeline[RequestType]) runFilters( log *slog.Logger, request RequestType, -) (filteredRequest RequestType) { +) (filteredRequest RequestType, stepResults []v1alpha1.StepResult) { filteredRequest = request for _, filterName := range p.filtersOrder { @@ -155,11 +155,15 @@ func (p *filterWeigherPipeline[RequestType]) runFilters( continue } stepLog.Info("scheduler: finished filter") + stepResults = append(stepResults, v1alpha1.StepResult{ + StepName: filterName, + Activations: result.Activations, + }) // Mutate the request to only include the remaining hosts. // Assume the resulting request type is the same as the input type. filteredRequest = filteredRequest.FilterHosts(result.Activations).(RequestType) } - return filteredRequest + return filteredRequest, stepResults } // Execute weighers and collect their activations by step name. @@ -275,7 +279,7 @@ func (p *filterWeigherPipeline[RequestType]) Run(request RequestType) (v1alpha1. // Run filters first to reduce the number of hosts. // Any weights assigned to filtered out hosts are ignored. - filteredRequest := p.runFilters(traceLog, request) + filteredRequest, filterStepResults := p.runFilters(traceLog, request) traceLog.Info( "scheduler: finished filters", "remainingHosts", filteredRequest.GetHosts(), @@ -296,9 +300,23 @@ func (p *filterWeigherPipeline[RequestType]) Run(request RequestType) (v1alpha1. // Collect some metrics about the pipeline execution. go p.monitor.observePipelineResult(request, hosts) + // Build step results from filters and weighers. + stepResults := filterStepResults + for _, weigherName := range p.weighersOrder { + activations, ok := stepWeights[weigherName] + if !ok { + continue + } + stepResults = append(stepResults, v1alpha1.StepResult{ + StepName: weigherName, + Activations: activations, + }) + } + result := v1alpha1.DecisionResult{ RawInWeights: request.GetWeights(), NormalizedInWeights: inWeights, + StepResults: stepResults, AggregatedOutWeights: outWeights, OrderedHosts: hosts, } diff --git a/internal/scheduling/lib/filter_weigher_pipeline_test.go b/internal/scheduling/lib/filter_weigher_pipeline_test.go index 5d025eb2b..0e2775944 100644 --- a/internal/scheduling/lib/filter_weigher_pipeline_test.go +++ b/internal/scheduling/lib/filter_weigher_pipeline_test.go @@ -221,7 +221,7 @@ func TestPipeline_RunFilters(t *testing.T) { Weights: map[string]float64{"host1": 0.0, "host2": 0.0, "host3": 0.0}, } - req := p.runFilters(slog.Default(), request) + req, _ := p.runFilters(slog.Default(), request) if len(req.Hosts) != 2 { t.Fatalf("expected 2 step results, got %d", len(req.Hosts)) } diff --git a/internal/scheduling/lib/history_manager.go b/internal/scheduling/lib/history_manager.go new file mode 100644 index 000000000..ff6e3f388 --- /dev/null +++ b/internal/scheduling/lib/history_manager.go @@ -0,0 +1,263 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package lib + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/events" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func getName(schedulingDomain v1alpha1.SchedulingDomain, resourceID string) string { + return fmt.Sprintf("%s-%s", schedulingDomain, resourceID) +} + +// generateSimplifiedExplanation produces a human-readable explanation from a +// decision result. On failure it includes the error. On success it describes +// which pipeline steps filtered out which hosts. +func generateExplanation(result *v1alpha1.DecisionResult, pipelineErr error) string { + if pipelineErr != nil { + return fmt.Sprintf("Pipeline run failed: %s.", pipelineErr.Error()) + } + + if result == nil || len(result.StepResults) == 0 { + if result != nil && result.TargetHost != nil { + return fmt.Sprintf("Selected host: %s.", *result.TargetHost) + } + return "" + } + + // Get all initial hosts from input weights. + var allHosts map[string]float64 + if len(result.RawInWeights) > 0 { + allHosts = result.RawInWeights + } else if len(result.NormalizedInWeights) > 0 { + allHosts = result.NormalizedInWeights + } + if allHosts == nil { + if result.TargetHost != nil { + return fmt.Sprintf("Selected host: %s.", *result.TargetHost) + } + return "" + } + + var sb strings.Builder + fmt.Fprintf(&sb, "Started with %d host(s).\n\n", len(allHosts)) + + // Track current set of surviving hosts. + currentHosts := make(map[string]bool, len(allHosts)) + for h := range allHosts { + currentHosts[h] = true + } + + for _, step := range result.StepResults { + // Determine which hosts were removed by this step. + var removed []string + for h := range currentHosts { + if _, exists := step.Activations[h]; !exists { + removed = append(removed, h) + } + } + if len(removed) > 0 { + sort.Strings(removed) + for _, h := range removed { + delete(currentHosts, h) + } + fmt.Fprintf(&sb, "%s filtered out %s\n", + step.StepName, + strings.Join(removed, ", "), + ) + } + } + + // Summary of remaining hosts. + remaining := make([]string, 0, len(currentHosts)) + for h := range currentHosts { + remaining = append(remaining, h) + } + sort.Strings(remaining) + fmt.Fprintf(&sb, "\n%d hosts remaining (%s)\n", + len(remaining), + strings.Join(remaining, ", "), + ) + + if result.TargetHost != nil { + fmt.Fprintf(&sb, "\nSelected host: %s.", *result.TargetHost) + } + + return strings.TrimSpace(sb.String()) +} + +// HistoryManager manages History CRDs for scheduling decisions. It holds the +// Kubernetes client and event recorder so callers don't have to pass them on +// every invocation. +type HistoryManager struct { + Client client.Client + Recorder events.EventRecorder +} + +// Upsert creates or updates a History CRD for the given decision. +// It is called after every pipeline run (success and failure). The pipelineErr +// parameter is used to generate a meaningful explanation when the pipeline fails. +// If a non-nil Recorder is set, a Kubernetes Event is emitted on the History +// object to short-term persist the scheduling decision. +func (h *HistoryManager) Upsert( + ctx context.Context, + decision *v1alpha1.Decision, + intent v1alpha1.SchedulingIntent, + pipelineErr error, +) error { + + log := ctrl.LoggerFrom(ctx) + + name := getName(decision.Spec.SchedulingDomain, decision.Spec.ResourceID) + + history := &v1alpha1.History{} + err := h.Client.Get(ctx, client.ObjectKey{Name: name}, history) + + if apierrors.IsNotFound(err) { + // Create new History CRD. + history = &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: v1alpha1.HistorySpec{ + SchedulingDomain: decision.Spec.SchedulingDomain, + ResourceID: decision.Spec.ResourceID, + }, + } + if createErr := h.Client.Create(ctx, history); createErr != nil { + if apierrors.IsAlreadyExists(createErr) { + // Another controller beat us to it, re-fetch. + if getErr := h.Client.Get(ctx, client.ObjectKey{Name: name}, history); getErr != nil { + return getErr + } + } else { + log.Error(createErr, "failed to create history CRD", "name", name) + return createErr + } + } + } else if err != nil { + log.Error(err, "failed to get history CRD", "name", name) + return err + } + + successful := pipelineErr == nil && decision.Status.Result != nil && decision.Status.Result.TargetHost != nil + + // Archive the previous current decision into the history list. + if !history.Status.Current.Timestamp.IsZero() { + entry := v1alpha1.SchedulingHistoryEntry{ + Timestamp: history.Status.Current.Timestamp, + PipelineRef: history.Status.Current.PipelineRef, + Intent: history.Status.Current.Intent, + OrderedHosts: history.Status.Current.OrderedHosts, + Successful: history.Status.Current.Successful, + } + history.Status.History = append(history.Status.History, entry) + if len(history.Status.History) > 10 { + history.Status.History = history.Status.History[len(history.Status.History)-10:] + } + } + + // Build the new current decision. + current := v1alpha1.CurrentDecision{ + Timestamp: metav1.Now(), + PipelineRef: decision.Spec.PipelineRef, + Intent: intent, + Successful: successful, + Explanation: generateExplanation(decision.Status.Result, pipelineErr), + } + if decision.Status.Result != nil { + current.TargetHost = decision.Status.Result.TargetHost + hosts := decision.Status.Result.OrderedHosts + if len(hosts) > 3 { + hosts = hosts[:3] + } + current.OrderedHosts = hosts + } + history.Status.Current = current + + // Set Ready condition — True only when a host was successfully selected. + condStatus := metav1.ConditionTrue + reason := "SchedulingSucceeded" + message := "scheduling decision selected a target host" + if pipelineErr != nil { + condStatus = metav1.ConditionFalse + reason = "PipelineRunFailed" + message = "pipeline run failed: " + pipelineErr.Error() + } else if !successful { + condStatus = metav1.ConditionFalse + reason = "NoHostFound" + message = "pipeline completed but no suitable host was found" + } + meta.SetStatusCondition(&history.Status.Conditions, metav1.Condition{ + Type: "Ready", + Status: condStatus, + Reason: reason, + Message: message, + }) + + // Use Update instead of MergeFrom+Patch because JSON merge patch strips + // boolean false values, which causes CRD validation to reject the patch + // when Successful is false. + if updateErr := h.Client.Status().Update(ctx, history); updateErr != nil { + log.Error(updateErr, "failed to update history CRD status", "name", name) + return updateErr + } + + // Emit a Kubernetes Event on the History object to short-term persist the + // scheduling decision. Events auto-expire (default TTL ~1h) so this gives + // devops short-term visibility into individual scheduling runs. + if h.Recorder != nil { + eventType := corev1.EventTypeNormal + eventReason := "SchedulingSucceeded" + action := "Scheduled" + if !successful { + eventType = corev1.EventTypeWarning + eventReason = "SchedulingFailed" + action = "FailedScheduling" + } + h.Recorder.Eventf(history, nil, eventType, eventReason, action, "%s", current.Explanation) + } + + log.Info("history CRD updated", "name", name, "entries", len(history.Status.History)) + return nil +} + +// Delete deletes the History CRD associated with the given scheduling domain +// and resource ID. It is a no-op if the History CRD does not exist. +func (h *HistoryManager) Delete( + ctx context.Context, + schedulingDomain v1alpha1.SchedulingDomain, + resourceID string, +) error { + + log := ctrl.LoggerFrom(ctx) + name := getName(schedulingDomain, resourceID) + + history := &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + } + if err := h.Client.Delete(ctx, history); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + log.Error(err, "failed to delete history CRD", "name", name) + return err + } + log.Info("deleted history CRD", "name", name) + return nil +} diff --git a/internal/scheduling/lib/history_manager_test.go b/internal/scheduling/lib/history_manager_test.go new file mode 100644 index 000000000..b55b16210 --- /dev/null +++ b/internal/scheduling/lib/history_manager_test.go @@ -0,0 +1,646 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package lib + +import ( + "context" + "errors" + "fmt" + "strings" + "testing" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + testlib "github.com/cobaltcore-dev/cortex/pkg/testing" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestGetName(t *testing.T) { + tests := []struct { + domain v1alpha1.SchedulingDomain + resourceID string + expected string + }{ + {v1alpha1.SchedulingDomainNova, "uuid-1", "nova-uuid-1"}, + {v1alpha1.SchedulingDomainCinder, "vol-abc", "cinder-vol-abc"}, + {v1alpha1.SchedulingDomainManila, "share-xyz", "manila-share-xyz"}, + {v1alpha1.SchedulingDomainMachines, "machine-1", "machines-machine-1"}, + {v1alpha1.SchedulingDomainPods, "pod-ns-name", "pods-pod-ns-name"}, + } + for _, tt := range tests { + t.Run(tt.expected, func(t *testing.T) { + got := getName(tt.domain, tt.resourceID) + if got != tt.expected { + t.Errorf("getName(%q, %q) = %q, want %q", tt.domain, tt.resourceID, got, tt.expected) + } + }) + } +} + +func TestGenerateExplanation(t *testing.T) { + tests := []struct { + name string + result *v1alpha1.DecisionResult + err error + expected string + }{ + { + name: "nil result no error", + result: nil, + err: nil, + expected: "", + }, + { + name: "nil result with error", + result: nil, + err: errors.New("something broke"), + expected: "Pipeline run failed: something broke.", + }, + { + name: "result with target host only no steps", + result: &v1alpha1.DecisionResult{ + TargetHost: testlib.Ptr("host-1"), + }, + expected: "Selected host: host-1.", + }, + { + name: "result with RawInWeights and filtering steps", + result: &v1alpha1.DecisionResult{ + RawInWeights: map[string]float64{ + "host-a": 1.0, + "host-b": 0.5, + "host-c": 0.3, + }, + StepResults: []v1alpha1.StepResult{ + { + StepName: "filter_capacity", + Activations: map[string]float64{ + "host-a": 1.0, + "host-b": 0.5, + }, + }, + { + StepName: "filter_status", + Activations: map[string]float64{ + "host-a": 1.0, + }, + }, + }, + TargetHost: testlib.Ptr("host-a"), + }, + expected: "Started with 3 host(s).\n\n" + + "filter_capacity filtered out host-c\n" + + "filter_status filtered out host-b\n\n" + + "1 hosts remaining (host-a)\n\n" + + "Selected host: host-a.", + }, + { + name: "uses NormalizedInWeights when RawInWeights empty", + result: &v1alpha1.DecisionResult{ + NormalizedInWeights: map[string]float64{ + "host-x": 0.6, + "host-y": 0.4, + }, + StepResults: []v1alpha1.StepResult{ + { + StepName: "weigher_cpu", + Activations: map[string]float64{ + "host-x": 0.8, + "host-y": 0.2, + }, + }, + }, + TargetHost: testlib.Ptr("host-x"), + }, + expected: "Started with 2 host(s).\n\n\n2 hosts remaining (host-x, host-y)\n\nSelected host: host-x.", + }, + { + name: "no weights no target", + result: &v1alpha1.DecisionResult{ + StepResults: []v1alpha1.StepResult{ + {StepName: "some-step", Activations: map[string]float64{}}, + }, + }, + expected: "", + }, + { + name: "no weights with target", + result: &v1alpha1.DecisionResult{ + TargetHost: testlib.Ptr("host-1"), + StepResults: []v1alpha1.StepResult{ + {StepName: "some-step", Activations: map[string]float64{}}, + }, + }, + expected: "Selected host: host-1.", + }, + { + name: "all hosts survive all steps", + result: &v1alpha1.DecisionResult{ + RawInWeights: map[string]float64{ + "host-a": 1.0, + "host-b": 0.5, + }, + StepResults: []v1alpha1.StepResult{ + { + StepName: "noop", + Activations: map[string]float64{ + "host-a": 1.0, + "host-b": 0.5, + }, + }, + }, + TargetHost: testlib.Ptr("host-a"), + }, + expected: "Started with 2 host(s).\n\n\n2 hosts remaining (host-a, host-b)\n\nSelected host: host-a.", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := generateExplanation(tt.result, tt.err) + if got != tt.expected { + t.Errorf("generateExplanation() =\n%q\nwant:\n%q", got, tt.expected) + } + }) + } +} + +func newTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 scheme: %v", err) + } + return scheme +} + +func TestHistoryManager_Upsert(t *testing.T) { + tests := []struct { + name string + // setup returns a fake client and any pre-existing objects. + setup func(t *testing.T) client.Client + // decision to upsert. + decision *v1alpha1.Decision + intent v1alpha1.SchedulingIntent + pipelineErr error + // assertions on the history list (archived entries only). + expectHistoryLen int + // assertions on the current decision. + expectTargetHost *string + expectSuccessful bool + expectCondStatus metav1.ConditionStatus + expectReason string + checkExplanation func(t *testing.T, explanation string) + checkCurrentHosts func(t *testing.T, hosts []string) + }{ + { + name: "create new history", + setup: func(t *testing.T) client.Client { + return fake.NewClientBuilder(). + WithScheme(newTestScheme(t)). + WithStatusSubresource(&v1alpha1.History{}). + Build() + }, + decision: &v1alpha1.Decision{ + Spec: v1alpha1.DecisionSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "uuid-1", + PipelineRef: corev1.ObjectReference{Name: "nova-pipeline"}, + }, + Status: v1alpha1.DecisionStatus{ + Result: &v1alpha1.DecisionResult{ + TargetHost: testlib.Ptr("compute-1"), + }, + }, + }, + intent: v1alpha1.SchedulingIntentUnknown, + expectHistoryLen: 0, + expectTargetHost: testlib.Ptr("compute-1"), + expectSuccessful: true, + expectCondStatus: metav1.ConditionTrue, + expectReason: "SchedulingSucceeded", + checkExplanation: func(t *testing.T, explanation string) { + if !strings.Contains(explanation, "Selected host: compute-1") { + t.Errorf("expected explanation to contain selected host, got: %q", explanation) + } + }, + }, + { + name: "update existing history with pre-existing entries", + setup: func(t *testing.T) client.Client { + scheme := newTestScheme(t) + existing := &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{Name: "nova-uuid-2"}, + Spec: v1alpha1.HistorySpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "uuid-2", + }, + Status: v1alpha1.HistoryStatus{ + History: []v1alpha1.SchedulingHistoryEntry{ + {Timestamp: metav1.Now(), PipelineRef: corev1.ObjectReference{Name: "old-pipeline"}}, + }, + }, + } + return fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(existing). + WithStatusSubresource(&v1alpha1.History{}). + Build() + }, + decision: &v1alpha1.Decision{ + Spec: v1alpha1.DecisionSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "uuid-2", + PipelineRef: corev1.ObjectReference{Name: "nova-pipeline"}, + }, + Status: v1alpha1.DecisionStatus{ + Result: &v1alpha1.DecisionResult{ + TargetHost: testlib.Ptr("compute-2"), + }, + }, + }, + intent: v1alpha1.SchedulingIntentUnknown, + expectHistoryLen: 1, // pre-existing entry preserved, no current to archive + expectTargetHost: testlib.Ptr("compute-2"), + expectSuccessful: true, + expectCondStatus: metav1.ConditionTrue, + expectReason: "SchedulingSucceeded", + }, + { + name: "archive current to history on second upsert", + setup: func(t *testing.T) client.Client { + scheme := newTestScheme(t) + existing := &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{Name: "nova-uuid-3"}, + Spec: v1alpha1.HistorySpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "uuid-3", + }, + Status: v1alpha1.HistoryStatus{ + Current: v1alpha1.CurrentDecision{ + Timestamp: metav1.Now(), + PipelineRef: corev1.ObjectReference{Name: "old-pipeline"}, + Intent: v1alpha1.SchedulingIntentUnknown, + Successful: true, + TargetHost: testlib.Ptr("old-host"), + }, + }, + } + return fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(existing). + WithStatusSubresource(&v1alpha1.History{}). + Build() + }, + decision: &v1alpha1.Decision{ + Spec: v1alpha1.DecisionSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "uuid-3", + PipelineRef: corev1.ObjectReference{Name: "new-pipeline"}, + }, + Status: v1alpha1.DecisionStatus{ + Result: &v1alpha1.DecisionResult{ + TargetHost: testlib.Ptr("new-host"), + }, + }, + }, + intent: v1alpha1.SchedulingIntentUnknown, + expectHistoryLen: 1, // old current archived + expectTargetHost: testlib.Ptr("new-host"), + expectSuccessful: true, + expectCondStatus: metav1.ConditionTrue, + expectReason: "SchedulingSucceeded", + }, + { + name: "pipeline error", + setup: func(t *testing.T) client.Client { + return fake.NewClientBuilder(). + WithScheme(newTestScheme(t)). + WithStatusSubresource(&v1alpha1.History{}). + Build() + }, + decision: &v1alpha1.Decision{ + Spec: v1alpha1.DecisionSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainCinder, + ResourceID: "vol-1", + PipelineRef: corev1.ObjectReference{Name: "cinder-pipeline"}, + }, + }, + intent: v1alpha1.SchedulingIntentUnknown, + pipelineErr: errors.New("no hosts available"), + expectHistoryLen: 0, + expectTargetHost: nil, + expectSuccessful: false, + expectCondStatus: metav1.ConditionFalse, + expectReason: "PipelineRunFailed", + checkExplanation: func(t *testing.T, explanation string) { + if !strings.Contains(explanation, "no hosts available") { + t.Errorf("expected explanation to contain error text, got: %q", explanation) + } + }, + }, + { + name: "no host found", + setup: func(t *testing.T) client.Client { + return fake.NewClientBuilder(). + WithScheme(newTestScheme(t)). + WithStatusSubresource(&v1alpha1.History{}). + Build() + }, + decision: &v1alpha1.Decision{ + Spec: v1alpha1.DecisionSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "uuid-nohost", + PipelineRef: corev1.ObjectReference{Name: "nova-pipeline"}, + }, + Status: v1alpha1.DecisionStatus{ + Result: &v1alpha1.DecisionResult{ + // Pipeline succeeded but no target host selected (all filtered out). + OrderedHosts: []string{}, + }, + }, + }, + intent: v1alpha1.SchedulingIntentUnknown, + expectHistoryLen: 0, + expectTargetHost: nil, + expectSuccessful: false, + expectCondStatus: metav1.ConditionFalse, + expectReason: "NoHostFound", + }, + { + name: "history capped at 10 entries", + setup: func(t *testing.T) client.Client { + scheme := newTestScheme(t) + entries := make([]v1alpha1.SchedulingHistoryEntry, 10) + for i := range entries { + entries[i] = v1alpha1.SchedulingHistoryEntry{ + Timestamp: metav1.Now(), + PipelineRef: corev1.ObjectReference{Name: fmt.Sprintf("pipeline-%d", i)}, + } + } + existing := &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{Name: "manila-share-1"}, + Spec: v1alpha1.HistorySpec{ + SchedulingDomain: v1alpha1.SchedulingDomainManila, + ResourceID: "share-1", + }, + Status: v1alpha1.HistoryStatus{ + Current: v1alpha1.CurrentDecision{ + Timestamp: metav1.Now(), + PipelineRef: corev1.ObjectReference{Name: "prev-pipeline"}, + Successful: true, + TargetHost: testlib.Ptr("old-backend"), + }, + History: entries, + }, + } + return fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(existing). + WithStatusSubresource(&v1alpha1.History{}). + Build() + }, + decision: &v1alpha1.Decision{ + Spec: v1alpha1.DecisionSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainManila, + ResourceID: "share-1", + PipelineRef: corev1.ObjectReference{Name: "manila-pipeline"}, + }, + Status: v1alpha1.DecisionStatus{ + Result: &v1alpha1.DecisionResult{ + TargetHost: testlib.Ptr("backend-1"), + }, + }, + }, + intent: v1alpha1.SchedulingIntentUnknown, + expectHistoryLen: 10, // 10 existing + 1 archived current, capped to 10 + expectTargetHost: testlib.Ptr("backend-1"), + expectSuccessful: true, + expectCondStatus: metav1.ConditionTrue, + expectReason: "SchedulingSucceeded", + }, + { + name: "ordered hosts capped at 3", + setup: func(t *testing.T) client.Client { + return fake.NewClientBuilder(). + WithScheme(newTestScheme(t)). + WithStatusSubresource(&v1alpha1.History{}). + Build() + }, + decision: &v1alpha1.Decision{ + Spec: v1alpha1.DecisionSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "uuid-cap", + PipelineRef: corev1.ObjectReference{Name: "nova-pipeline"}, + }, + Status: v1alpha1.DecisionStatus{ + Result: &v1alpha1.DecisionResult{ + TargetHost: testlib.Ptr("h1"), + OrderedHosts: []string{"h1", "h2", "h3", "h4", "h5"}, + }, + }, + }, + intent: v1alpha1.SchedulingIntentUnknown, + expectHistoryLen: 0, + expectTargetHost: testlib.Ptr("h1"), + expectSuccessful: true, + expectCondStatus: metav1.ConditionTrue, + expectReason: "SchedulingSucceeded", + checkCurrentHosts: func(t *testing.T, hosts []string) { + if len(hosts) != 3 { + t.Errorf("ordered hosts length = %d, want 3", len(hosts)) + } + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := tt.setup(t) + hm := HistoryManager{Client: c} + + err := hm.Upsert(context.Background(), tt.decision, tt.intent, tt.pipelineErr) + if err != nil { + t.Fatalf("Upsert() returned error: %v", err) + } + + // Fetch the history CRD. + name := getName(tt.decision.Spec.SchedulingDomain, tt.decision.Spec.ResourceID) + var history v1alpha1.History + if err := c.Get(context.Background(), client.ObjectKey{Name: name}, &history); err != nil { + t.Fatalf("Failed to get history: %v", err) + } + + // Verify spec. + if history.Spec.SchedulingDomain != tt.decision.Spec.SchedulingDomain { + t.Errorf("spec.schedulingDomain = %q, want %q", history.Spec.SchedulingDomain, tt.decision.Spec.SchedulingDomain) + } + if history.Spec.ResourceID != tt.decision.Spec.ResourceID { + t.Errorf("spec.resourceID = %q, want %q", history.Spec.ResourceID, tt.decision.Spec.ResourceID) + } + + // Verify history list length. + if len(history.Status.History) != tt.expectHistoryLen { + t.Errorf("status.history has %d entries, want %d", len(history.Status.History), tt.expectHistoryLen) + } + + // Verify current decision. + current := history.Status.Current + if current.Successful != tt.expectSuccessful { + t.Errorf("current.successful = %v, want %v", current.Successful, tt.expectSuccessful) + } + if current.PipelineRef.Name != tt.decision.Spec.PipelineRef.Name { + t.Errorf("current.pipelineRef = %q, want %q", current.PipelineRef.Name, tt.decision.Spec.PipelineRef.Name) + } + + // Verify target host. + if tt.expectTargetHost == nil { + if current.TargetHost != nil { + t.Errorf("current.targetHost = %q, want nil", *current.TargetHost) + } + } else { + if current.TargetHost == nil { + t.Errorf("current.targetHost = nil, want %q", *tt.expectTargetHost) + } else if *current.TargetHost != *tt.expectTargetHost { + t.Errorf("current.targetHost = %q, want %q", *current.TargetHost, *tt.expectTargetHost) + } + } + + // Verify condition. + if len(history.Status.Conditions) == 0 { + t.Fatal("expected at least one condition") + } + readyCond := history.Status.Conditions[0] + if readyCond.Status != tt.expectCondStatus { + t.Errorf("condition status = %q, want %q", readyCond.Status, tt.expectCondStatus) + } + if readyCond.Reason != tt.expectReason { + t.Errorf("condition reason = %q, want %q", readyCond.Reason, tt.expectReason) + } + + // Verify explanation. + if tt.checkExplanation != nil { + tt.checkExplanation(t, current.Explanation) + } + + // Verify ordered hosts on current. + if tt.checkCurrentHosts != nil { + tt.checkCurrentHosts(t, current.OrderedHosts) + } + }) + } +} + +func TestHistoryManager_UpsertFromGoroutine(t *testing.T) { + c := fake.NewClientBuilder(). + WithScheme(newTestScheme(t)). + WithStatusSubresource(&v1alpha1.History{}). + Build() + hm := HistoryManager{Client: c} + + decision := &v1alpha1.Decision{ + Spec: v1alpha1.DecisionSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "async-uuid", + PipelineRef: corev1.ObjectReference{Name: "nova-pipeline"}, + }, + Status: v1alpha1.DecisionStatus{ + Result: &v1alpha1.DecisionResult{ + TargetHost: testlib.Ptr("compute-async"), + }, + }, + } + + // Mirrors the pattern used in pipeline controllers. + ctx := context.Background() + go func() { + if err := hm.Upsert(ctx, decision, v1alpha1.SchedulingIntentUnknown, nil); err != nil { + t.Errorf("Upsert() returned error: %v", err) + } + }() + + // Poll for history creation. + var histories v1alpha1.HistoryList + deadline := time.Now().Add(2 * time.Second) + for { + if err := c.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) + } + if len(histories.Items) > 0 { + break + } + if time.Now().After(deadline) { + t.Fatal("timed out waiting for async history creation") + } + time.Sleep(5 * time.Millisecond) + } + + got := histories.Items[0].Status.Current.TargetHost + if got == nil || *got != "compute-async" { + t.Errorf("target host = %v, want %q", got, "compute-async") + } +} + +func TestHistoryManager_Delete(t *testing.T) { + tests := []struct { + name string + domain v1alpha1.SchedulingDomain + resourceID string + preCreate bool + }{ + { + name: "delete existing", + domain: v1alpha1.SchedulingDomainNova, + resourceID: "uuid-del", + preCreate: true, + }, + { + name: "delete non-existing is no-op", + domain: v1alpha1.SchedulingDomainCinder, + resourceID: "does-not-exist", + preCreate: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := newTestScheme(t) + var objects []client.Object + if tt.preCreate { + objects = append(objects, &v1alpha1.History{ + ObjectMeta: metav1.ObjectMeta{ + Name: getName(tt.domain, tt.resourceID), + }, + Spec: v1alpha1.HistorySpec{ + SchedulingDomain: tt.domain, + ResourceID: tt.resourceID, + }, + }) + } + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + Build() + hm := HistoryManager{Client: c} + + err := hm.Delete(context.Background(), tt.domain, tt.resourceID) + if err != nil { + t.Fatalf("Delete() returned error: %v", err) + } + + // Verify history is gone. + var histories v1alpha1.HistoryList + if err := c.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) + } + if len(histories.Items) != 0 { + t.Errorf("expected 0 histories, got %d", len(histories.Items)) + } + }) + } +} diff --git a/internal/scheduling/lib/pipeline_controller.go b/internal/scheduling/lib/pipeline_controller.go index a435133e4..c76ca79cf 100644 --- a/internal/scheduling/lib/pipeline_controller.go +++ b/internal/scheduling/lib/pipeline_controller.go @@ -30,6 +30,8 @@ type BasePipelineController[PipelineType any] struct { client.Client // The scheduling domain to scope resources to. SchedulingDomain v1alpha1.SchedulingDomain + // Manager for creating, updating, and deleting History CRDs. + HistoryManager HistoryManager } // Handle the startup of the manager by initializing the pipeline map. diff --git a/internal/scheduling/machines/filter_weigher_pipeline_controller.go b/internal/scheduling/machines/filter_weigher_pipeline_controller.go index 2b0c44f64..33715c5f7 100644 --- a/internal/scheduling/machines/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/machines/filter_weigher_pipeline_controller.go @@ -100,12 +100,6 @@ func (c *FilterWeigherPipelineController) ProcessNewMachine(ctx context.Context, if !ok { return fmt.Errorf("pipeline %s not configured", decision.Spec.PipelineRef.Name) } - if pipelineConf.Spec.CreateDecisions { - if err := c.Create(ctx, decision); err != nil { - return err - } - } - old := decision.DeepCopy() err := c.process(ctx, decision) if err != nil { meta.SetStatusCondition(&decision.Status.Conditions, metav1.Condition{ @@ -123,10 +117,11 @@ func (c *FilterWeigherPipelineController) ProcessNewMachine(ctx context.Context, }) } if pipelineConf.Spec.CreateDecisions { - patch := client.MergeFrom(old) - if err := c.Status().Patch(ctx, decision, patch); err != nil { - return err - } + go func() { + if upsertErr := c.HistoryManager.Upsert(context.Background(), decision, v1alpha1.SchedulingIntentUnknown, err); upsertErr != nil { + ctrl.LoggerFrom(ctx).Error(upsertErr, "failed to create/update history") + } + }() } return err } @@ -216,20 +211,10 @@ func (c *FilterWeigherPipelineController) handleMachine() handler.EventHandler { } }, DeleteFunc: func(ctx context.Context, evt event.DeleteEvent, queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { - // Delete the associated decision(s). - log := ctrl.LoggerFrom(ctx) machine := evt.Object.(*ironcorev1alpha1.Machine) - var decisions v1alpha1.DecisionList - if err := c.List(ctx, &decisions); err != nil { - log.Error(err, "failed to list decisions for deleted machine") - return - } - for _, decision := range decisions.Items { - if decision.Spec.MachineRef.Name == machine.Name && decision.Spec.MachineRef.Namespace == machine.Namespace { - if err := c.Delete(ctx, &decision); err != nil { - log.Error(err, "failed to delete decision for deleted machine") - } - } + if err := c.HistoryManager.Delete(ctx, v1alpha1.SchedulingDomainMachines, machine.Name); err != nil { + log := ctrl.LoggerFrom(ctx) + log.Error(err, "failed to delete history CRD for machine", "machine", machine.Name) } }, } @@ -238,6 +223,7 @@ func (c *FilterWeigherPipelineController) handleMachine() handler.EventHandler { func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainMachines + c.HistoryManager = lib.HistoryManager{Client: mgr.GetClient(), Recorder: mgr.GetEventRecorder("cortex-machines-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/machines/filter_weigher_pipeline_controller_test.go b/internal/scheduling/machines/filter_weigher_pipeline_controller_test.go index 58c574f45..12a72226e 100644 --- a/internal/scheduling/machines/filter_weigher_pipeline_controller_test.go +++ b/internal/scheduling/machines/filter_weigher_pipeline_controller_test.go @@ -6,6 +6,7 @@ package machines import ( "context" "testing" + "time" "github.com/cobaltcore-dev/cortex/api/external/ironcore" ironcorev1alpha1 "github.com/cobaltcore-dev/cortex/api/external/ironcore/v1alpha1" @@ -291,7 +292,7 @@ func TestFilterWeigherPipelineController_ProcessNewMachine(t *testing.T) { pipelineConfig *v1alpha1.Pipeline createDecisions bool expectError bool - expectDecisionCreated bool + expectHistoryCreated bool expectMachinePoolAssigned bool expectTargetHost string }{ @@ -328,7 +329,7 @@ func TestFilterWeigherPipelineController_ProcessNewMachine(t *testing.T) { }, createDecisions: true, expectError: false, - expectDecisionCreated: true, + expectHistoryCreated: true, expectMachinePoolAssigned: true, expectTargetHost: "pool1", }, @@ -362,7 +363,7 @@ func TestFilterWeigherPipelineController_ProcessNewMachine(t *testing.T) { }, createDecisions: false, expectError: false, - expectDecisionCreated: false, + expectHistoryCreated: false, expectMachinePoolAssigned: true, expectTargetHost: "pool1", }, @@ -380,7 +381,7 @@ func TestFilterWeigherPipelineController_ProcessNewMachine(t *testing.T) { machinePools: []ironcorev1alpha1.MachinePool{}, pipelineConfig: nil, expectError: true, - expectDecisionCreated: false, + expectHistoryCreated: false, expectMachinePoolAssigned: false, }, { @@ -409,7 +410,7 @@ func TestFilterWeigherPipelineController_ProcessNewMachine(t *testing.T) { }, createDecisions: true, expectError: true, - expectDecisionCreated: true, // Decision is created but processing fails + expectHistoryCreated: true, // Decision is created but processing fails expectMachinePoolAssigned: false, }, } @@ -427,13 +428,14 @@ func TestFilterWeigherPipelineController_ProcessNewMachine(t *testing.T) { client := fake.NewClientBuilder(). WithScheme(scheme). WithRuntimeObjects(objects...). - WithStatusSubresource(&v1alpha1.Decision{}). + WithStatusSubresource(&v1alpha1.Decision{}, &v1alpha1.History{}). Build() controller := &FilterWeigherPipelineController{ BasePipelineController: lib.BasePipelineController[lib.FilterWeigherPipeline[ironcore.MachinePipelineRequest]]{ Pipelines: map[string]lib.FilterWeigherPipeline[ironcore.MachinePipelineRequest]{}, PipelineConfigs: map[string]v1alpha1.Pipeline{}, + HistoryManager: lib.HistoryManager{Client: client}, }, Monitor: lib.FilterWeigherPipelineMonitor{}, } @@ -456,70 +458,29 @@ func TestFilterWeigherPipelineController_ProcessNewMachine(t *testing.T) { return } - // Check if decision was created (if expected) - if tt.expectDecisionCreated { - var decisions v1alpha1.DecisionList - err := client.List(context.Background(), &decisions) - if err != nil { - t.Errorf("Failed to list decisions: %v", err) - return - } - - found := false - for _, decision := range decisions.Items { - if decision.Spec.MachineRef != nil && - decision.Spec.MachineRef.Name == tt.machine.Name && - decision.Spec.MachineRef.Namespace == tt.machine.Namespace { - found = true - - // Verify decision properties - if decision.Spec.SchedulingDomain != v1alpha1.SchedulingDomainMachines { - t.Errorf("expected scheduling domain %q, got %q", v1alpha1.SchedulingDomainMachines, decision.Spec.SchedulingDomain) - } - if decision.Spec.ResourceID != tt.machine.Name { - t.Errorf("expected resource ID %q, got %q", tt.machine.Name, decision.Spec.ResourceID) - } - if decision.Spec.PipelineRef.Name != "machines-scheduler" { - t.Errorf("expected pipeline ref %q, got %q", "machines-scheduler", decision.Spec.PipelineRef.Name) - } - - // Check if result was set (only for successful cases) - if !tt.expectError && tt.expectTargetHost != "" { - if decision.Status.Result == nil { - t.Error("expected decision result to be set") - return - } - if decision.Status.Result.TargetHost == nil { - t.Error("expected target host to be set") - return - } - if *decision.Status.Result.TargetHost != tt.expectTargetHost { - t.Errorf("expected target host %q, got %q", tt.expectTargetHost, *decision.Status.Result.TargetHost) - } - } + // Check if history CRD was created when expected + if tt.expectHistoryCreated { + var histories v1alpha1.HistoryList + deadline := time.Now().Add(2 * time.Second) + for { + if err := client.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) + } + if len(histories.Items) > 0 { break } - } - - if !found { - t.Error("expected decision to be created but was not found") + if time.Now().After(deadline) { + t.Fatal("timed out waiting for history CRD to be created") + } + time.Sleep(5 * time.Millisecond) } } else { - // Check that no decisions were created - var decisions v1alpha1.DecisionList - err := client.List(context.Background(), &decisions) - if err != nil { - t.Errorf("Failed to list decisions: %v", err) - return + var histories v1alpha1.HistoryList + if err := client.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) } - - for _, decision := range decisions.Items { - if decision.Spec.MachineRef != nil && - decision.Spec.MachineRef.Name == tt.machine.Name && - decision.Spec.MachineRef.Namespace == tt.machine.Namespace { - t.Error("expected no decision to be created but found one") - break - } + if len(histories.Items) != 0 { + t.Error("Expected no history CRD but found one") } } diff --git a/internal/scheduling/manila/decisions_cleanup.go b/internal/scheduling/manila/decisions_cleanup.go index ee662db4c..b350b1e7d 100644 --- a/internal/scheduling/manila/decisions_cleanup.go +++ b/internal/scheduling/manila/decisions_cleanup.go @@ -111,22 +111,22 @@ func DecisionsCleanup(ctx context.Context, client client.Client, conf DecisionsC } // List all decisions and delete those whose share no longer exists. - decisionList := &v1alpha1.DecisionList{} - if err := client.List(ctx, decisionList); err != nil { + historyList := &v1alpha1.HistoryList{} + if err := client.List(ctx, historyList); err != nil { return err } - for _, decision := range decisionList.Items { - // Skip non-manila decisions. - if decision.Spec.SchedulingDomain != v1alpha1.SchedulingDomainManila { + for _, history := range historyList.Items { + // Skip non-manila histories. + if history.Spec.SchedulingDomain != v1alpha1.SchedulingDomainManila { continue } - // Skip decisions for which the share still exists. - if _, ok := sharesByID[decision.Spec.ResourceID]; ok { + // Skip histories for which the share still exists. + if _, ok := sharesByID[history.Spec.ResourceID]; ok { continue } - // Delete the decision since the share no longer exists. - slog.Info("deleting decision for deleted share", "decision", decision.Name, "shareID", decision.Spec.ResourceID) - if err := client.Delete(ctx, &decision); err != nil { + // Delete the history since the share no longer exists. + slog.Info("deleting history for deleted share", "history", history.Name, "shareID", history.Spec.ResourceID) + if err := client.Delete(ctx, &history); err != nil { return err } } diff --git a/internal/scheduling/manila/decisions_cleanup_test.go b/internal/scheduling/manila/decisions_cleanup_test.go index f978ab2f4..c4e88c5b3 100644 --- a/internal/scheduling/manila/decisions_cleanup_test.go +++ b/internal/scheduling/manila/decisions_cleanup_test.go @@ -33,7 +33,7 @@ func TestCleanupManila(t *testing.T) { tests := []struct { name string - decisions []v1alpha1.Decision + histories []v1alpha1.History expectError bool authError bool endpointError bool @@ -44,45 +44,45 @@ func TestCleanupManila(t *testing.T) { }{ { name: "handle authentication error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, authError: true, expectError: true, }, { name: "handle endpoint error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, endpointError: true, expectError: true, }, { name: "handle server error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, mockServerError: true, expectError: true, }, { name: "handle empty shares case", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, emptySharesError: true, expectError: true, }, { name: "delete decisions for non-existent shares", - decisions: []v1alpha1.Decision{ + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-existing-share", + Name: "history-existing-share", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainManila, ResourceID: "share-exists", }, }, { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-deleted-share", + Name: "history-deleted-share", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainManila, ResourceID: "share-deleted", }, @@ -91,26 +91,26 @@ func TestCleanupManila(t *testing.T) { mockShares: []mockShare{ {ID: "share-exists"}, }, - expectedDeleted: []string{"decision-deleted-share"}, + expectedDeleted: []string{"history-deleted-share"}, expectError: false, }, { name: "keep decisions for existing shares", - decisions: []v1alpha1.Decision{ + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-share-1", + Name: "history-share-1", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainManila, ResourceID: "share-1", }, }, { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-share-2", + Name: "history-share-2", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainManila, ResourceID: "share-2", }, @@ -124,22 +124,22 @@ func TestCleanupManila(t *testing.T) { expectError: false, }, { - name: "skip non-manila decisions", - decisions: []v1alpha1.Decision{ + name: "skip non-manila histories", + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-other-type", + Name: "history-other-type", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainCinder, ResourceID: "some-resource", }, }, { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-other-operator", + Name: "history-other-operator", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: "other-operator", ResourceID: "share-1", }, @@ -153,9 +153,9 @@ func TestCleanupManila(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - objects := make([]client.Object, len(tt.decisions)) - for i := range tt.decisions { - objects[i] = &tt.decisions[i] + objects := make([]client.Object, len(tt.histories)) + for i := range tt.histories { + objects[i] = &tt.histories[i] } // Create mock Manila server @@ -357,22 +357,22 @@ func TestCleanupManila(t *testing.T) { } } - // Verify other decisions still exist - for _, originalDecision := range tt.decisions { + // Verify other histories still exist + for _, originalHistory := range tt.histories { shouldBeDeleted := false for _, expectedDeleted := range tt.expectedDeleted { - if originalDecision.Name == expectedDeleted { + if originalHistory.Name == expectedDeleted { shouldBeDeleted = true break } } if !shouldBeDeleted { - var decision v1alpha1.Decision + var history v1alpha1.History err := client.Get(context.Background(), - types.NamespacedName{Name: originalDecision.Name}, &decision) + types.NamespacedName{Name: originalHistory.Name}, &history) if err != nil { - t.Errorf("Expected decision %s to still exist but got error: %v", - originalDecision.Name, err) + t.Errorf("Expected history %s to still exist but got error: %v", + originalHistory.Name, err) } } } diff --git a/internal/scheduling/manila/filter_weigher_pipeline_controller.go b/internal/scheduling/manila/filter_weigher_pipeline_controller.go index 3b63d64e0..4d4295176 100644 --- a/internal/scheduling/manila/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/manila/filter_weigher_pipeline_controller.go @@ -79,12 +79,6 @@ func (c *FilterWeigherPipelineController) ProcessNewDecisionFromAPI(ctx context. if !ok { return fmt.Errorf("pipeline %s not configured", decision.Spec.PipelineRef.Name) } - if pipelineConf.Spec.CreateDecisions { - if err := c.Create(ctx, decision); err != nil { - return err - } - } - old := decision.DeepCopy() err := c.process(ctx, decision) if err != nil { meta.SetStatusCondition(&decision.Status.Conditions, metav1.Condition{ @@ -102,10 +96,11 @@ func (c *FilterWeigherPipelineController) ProcessNewDecisionFromAPI(ctx context. }) } if pipelineConf.Spec.CreateDecisions { - patch := client.MergeFrom(old) - if err := c.Status().Patch(ctx, decision, patch); err != nil { - return err - } + go func() { + if upsertErr := c.HistoryManager.Upsert(context.Background(), decision, v1alpha1.SchedulingIntentUnknown, err); upsertErr != nil { + ctrl.LoggerFrom(ctx).Error(upsertErr, "failed to create/update history") + } + }() } return err } @@ -156,6 +151,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainManila + c.HistoryManager = lib.HistoryManager{Client: mgr.GetClient(), Recorder: mgr.GetEventRecorder("cortex-manila-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/manila/filter_weigher_pipeline_controller_test.go b/internal/scheduling/manila/filter_weigher_pipeline_controller_test.go index a9fc2df1d..876fadc7e 100644 --- a/internal/scheduling/manila/filter_weigher_pipeline_controller_test.go +++ b/internal/scheduling/manila/filter_weigher_pipeline_controller_test.go @@ -7,6 +7,7 @@ import ( "context" "encoding/json" "testing" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" @@ -245,13 +246,13 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) } tests := []struct { - name string - decision *v1alpha1.Decision - pipelineConfig *v1alpha1.Pipeline - createDecisions bool - expectError bool - expectDecisionCreated bool - expectResult bool + name string + decision *v1alpha1.Decision + pipelineConfig *v1alpha1.Pipeline + createDecisions bool + expectError bool + expectHistoryCreated bool + expectResult bool }{ { name: "successful decision processing with creation", @@ -262,6 +263,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) }, Spec: v1alpha1.DecisionSpec{ SchedulingDomain: v1alpha1.SchedulingDomainManila, + ResourceID: "test-uuid-1", PipelineRef: corev1.ObjectReference{ Name: "test-pipeline", }, @@ -282,10 +284,10 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: true, - expectError: false, - expectDecisionCreated: true, - expectResult: true, + createDecisions: true, + expectError: false, + expectHistoryCreated: true, + expectResult: true, }, { name: "successful decision processing without creation", @@ -316,10 +318,10 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: false, - expectError: false, - expectDecisionCreated: false, - expectResult: true, + createDecisions: false, + expectError: false, + expectHistoryCreated: false, + expectResult: true, }, { name: "pipeline not configured", @@ -338,10 +340,10 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) }, }, }, - pipelineConfig: nil, - expectError: true, - expectDecisionCreated: false, - expectResult: false, + pipelineConfig: nil, + expectError: true, + expectHistoryCreated: false, + expectResult: false, }, { name: "decision without manilaRaw spec", @@ -370,10 +372,10 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: true, - expectError: true, - expectDecisionCreated: false, - expectResult: false, + createDecisions: true, + expectError: true, + expectHistoryCreated: false, + expectResult: false, }, } @@ -387,7 +389,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) client := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(objects...). - WithStatusSubresource(&v1alpha1.Decision{}). + WithStatusSubresource(&v1alpha1.Decision{}, &v1alpha1.History{}). Build() controller := &FilterWeigherPipelineController{ @@ -395,6 +397,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Client: client, Pipelines: make(map[string]lib.FilterWeigherPipeline[api.ExternalSchedulerRequest]), PipelineConfigs: make(map[string]v1alpha1.Pipeline), + HistoryManager: lib.HistoryManager{Client: client}, }, Monitor: lib.FilterWeigherPipelineMonitor{}, } @@ -417,41 +420,23 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) t.Errorf("Expected no error but got: %v", err) } - // Check if decision was created (if expected) - if tt.expectDecisionCreated { - var decisions v1alpha1.DecisionList - err := client.List(context.Background(), &decisions) - if err != nil { - t.Errorf("Failed to list decisions: %v", err) - return - } - - found := false - for _, decision := range decisions.Items { - if decision.Spec.SchedulingDomain == v1alpha1.SchedulingDomainManila { - found = true - - // Verify decision properties - if decision.Spec.PipelineRef.Name != "test-pipeline" { - t.Errorf("expected pipeline ref %q, got %q", "test-pipeline", decision.Spec.PipelineRef.Name) - } - - // Check if result was set - if tt.expectResult { - if decision.Status.Result == nil { - t.Error("expected decision result to be set") - return - } - } + // Check if history CRD was created when expected + if tt.expectHistoryCreated { + var histories v1alpha1.HistoryList + deadline := time.Now().Add(2 * time.Second) + for { + if err := client.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) + } + if len(histories.Items) > 0 { break } - } - - if !found { - t.Error("expected decision to be created but was not found") + if time.Now().After(deadline) { + t.Fatal("timed out waiting for history CRD to be created") + } + time.Sleep(5 * time.Millisecond) } } else if !tt.expectError { - // For cases without creation, check that the decision has the right status if tt.expectResult && tt.decision.Status.Result == nil { t.Error("expected decision result to be set in original decision object") } diff --git a/internal/scheduling/nova/decisions_cleanup.go b/internal/scheduling/nova/decisions_cleanup.go index 7414f8bc4..19f792fcf 100644 --- a/internal/scheduling/nova/decisions_cleanup.go +++ b/internal/scheduling/nova/decisions_cleanup.go @@ -119,27 +119,27 @@ func DecisionsCleanup(ctx context.Context, client client.Client, conf DecisionsC reservationsByName[reservation.Name] = reservation } - // List all decisions and check if the server still exists. - decisionList := &v1alpha1.DecisionList{} - if err := client.List(ctx, decisionList); err != nil { + // List all histories and check if the server still exists. + historyList := &v1alpha1.HistoryList{} + if err := client.List(ctx, historyList); err != nil { return err } - for _, decision := range decisionList.Items { - // Skip non-nova decisions. - if decision.Spec.SchedulingDomain != v1alpha1.SchedulingDomainNova { + for _, history := range historyList.Items { + // Skip non-nova histories. + if history.Spec.SchedulingDomain != v1alpha1.SchedulingDomainNova { continue } - // Skip decisions that are linked to existing reservations. - if _, ok := reservationsByName[decision.Spec.ResourceID]; ok { + // Skip histories that are linked to existing reservations. + if _, ok := reservationsByName[history.Spec.ResourceID]; ok { continue } - // Skip decisions for which the server still exists. - if _, ok := serversByID[decision.Spec.ResourceID]; ok { + // Skip histories for which the server still exists. + if _, ok := serversByID[history.Spec.ResourceID]; ok { continue } - // Delete the decision since the server no longer exists. - slog.Info("deleting decision for deleted server", "decision", decision.Name, "serverID", decision.Spec.ResourceID) - if err := client.Delete(ctx, &decision); err != nil { + // Delete the history since the server no longer exists. + slog.Info("deleting history for deleted server", "history", history.Name, "serverID", history.Spec.ResourceID) + if err := client.Delete(ctx, &history); err != nil { return err } } diff --git a/internal/scheduling/nova/decisions_cleanup_test.go b/internal/scheduling/nova/decisions_cleanup_test.go index 9e84d8dbe..8ae152b49 100644 --- a/internal/scheduling/nova/decisions_cleanup_test.go +++ b/internal/scheduling/nova/decisions_cleanup_test.go @@ -33,7 +33,7 @@ func TestCleanupNova(t *testing.T) { tests := []struct { name string - decisions []v1alpha1.Decision + histories []v1alpha1.History reservations []v1alpha1.Reservation mockServers []mockServer authError bool @@ -45,45 +45,45 @@ func TestCleanupNova(t *testing.T) { }{ { name: "authentication error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, authError: true, expectError: true, }, { name: "endpoint discovery error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, endpointError: true, expectError: true, }, { name: "nova server error", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, serverError: true, expectError: true, }, { name: "no servers found", - decisions: []v1alpha1.Decision{}, + histories: []v1alpha1.History{}, emptyServers: true, expectError: false, }, { - name: "delete decisions for non-existent servers", - decisions: []v1alpha1.Decision{ + name: "delete histories for non-existent servers", + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-existing-server", + Name: "history-existing-server", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, ResourceID: "server-exists", }, }, { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-deleted-server", + Name: "history-deleted-server", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, ResourceID: "server-deleted", }, @@ -92,26 +92,26 @@ func TestCleanupNova(t *testing.T) { mockServers: []mockServer{ {ID: "server-exists"}, }, - expectedDeleted: []string{"decision-deleted-server"}, + expectedDeleted: []string{"history-deleted-server"}, expectError: false, }, { - name: "keep decisions for existing servers", - decisions: []v1alpha1.Decision{ + name: "keep histories for existing servers", + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-server-1", + Name: "history-server-1", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, ResourceID: "server-1", }, }, { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-server-2", + Name: "history-server-2", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, ResourceID: "server-2", }, @@ -125,22 +125,22 @@ func TestCleanupNova(t *testing.T) { expectError: false, }, { - name: "skip decisions with linked reservations", - decisions: []v1alpha1.Decision{ + name: "skip histories with linked reservations", + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-reserved-server", + Name: "history-reserved-server", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, ResourceID: "server-reserved", }, }, { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-unreserved-server", + Name: "history-unreserved-server", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, ResourceID: "server-unreserved", }, @@ -154,26 +154,26 @@ func TestCleanupNova(t *testing.T) { }, }, mockServers: []mockServer{}, - expectedDeleted: []string{"decision-unreserved-server"}, + expectedDeleted: []string{"history-unreserved-server"}, expectError: false, }, { - name: "skip non-nova decisions", - decisions: []v1alpha1.Decision{ + name: "skip non-nova histories", + histories: []v1alpha1.History{ { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-cinder", + Name: "history-cinder", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: v1alpha1.SchedulingDomainCinder, ResourceID: "volume-1", }, }, { ObjectMeta: metav1.ObjectMeta{ - Name: "decision-wrong-operator", + Name: "history-wrong-operator", }, - Spec: v1alpha1.DecisionSpec{ + Spec: v1alpha1.HistorySpec{ SchedulingDomain: "other-operator", ResourceID: "server-1", }, @@ -187,9 +187,9 @@ func TestCleanupNova(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - objects := make([]client.Object, 0, len(tt.decisions)+len(tt.reservations)) - for i := range tt.decisions { - objects = append(objects, &tt.decisions[i]) + objects := make([]client.Object, 0, len(tt.histories)+len(tt.reservations)) + for i := range tt.histories { + objects = append(objects, &tt.histories[i]) } for i := range tt.reservations { objects = append(objects, &tt.reservations[i]) @@ -360,22 +360,22 @@ func TestCleanupNova(t *testing.T) { } } - // Verify other decisions still exist - for _, originalDecision := range tt.decisions { + // Verify other histories still exist + for _, originalHistory := range tt.histories { shouldBeDeleted := false for _, expectedDeleted := range tt.expectedDeleted { - if originalDecision.Name == expectedDeleted { + if originalHistory.Name == expectedDeleted { shouldBeDeleted = true break } } if !shouldBeDeleted { - var decision v1alpha1.Decision + var history v1alpha1.History err := client.Get(context.Background(), - types.NamespacedName{Name: originalDecision.Name}, &decision) + types.NamespacedName{Name: originalHistory.Name}, &history) if err != nil { - t.Errorf("Expected decision %s to still exist but got error: %v", - originalDecision.Name, err) + t.Errorf("Expected history %s to still exist but got error: %v", + originalHistory.Name, err) } } } diff --git a/internal/scheduling/nova/filter_weigher_pipeline_controller.go b/internal/scheduling/nova/filter_weigher_pipeline_controller.go index 301e6076a..47e66c4db 100644 --- a/internal/scheduling/nova/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/nova/filter_weigher_pipeline_controller.go @@ -82,12 +82,6 @@ func (c *FilterWeigherPipelineController) ProcessNewDecisionFromAPI(ctx context. if !ok { return fmt.Errorf("pipeline %s not configured", decision.Spec.PipelineRef.Name) } - if pipelineConf.Spec.CreateDecisions { - if err := c.Create(ctx, decision); err != nil { - return err - } - } - old := decision.DeepCopy() err := c.process(ctx, decision) if err != nil { meta.SetStatusCondition(&decision.Status.Conditions, metav1.Condition{ @@ -105,10 +99,11 @@ func (c *FilterWeigherPipelineController) ProcessNewDecisionFromAPI(ctx context. }) } if pipelineConf.Spec.CreateDecisions { - patch := client.MergeFrom(old) - if err := c.Status().Patch(ctx, decision, patch); err != nil { - return err - } + go func() { + if upsertErr := c.HistoryManager.Upsert(context.Background(), decision, v1alpha1.SchedulingIntentUnknown, err); upsertErr != nil { + ctrl.LoggerFrom(ctx).Error(upsertErr, "failed to create/update history") + } + }() } return err } @@ -181,6 +176,7 @@ func (c *FilterWeigherPipelineController) InitPipeline( func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainNova + c.HistoryManager = lib.HistoryManager{Client: mgr.GetClient(), Recorder: mgr.GetEventRecorder("cortex-nova-scheduler")} c.gatherer = &candidateGatherer{Client: mcl} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err diff --git a/internal/scheduling/nova/filter_weigher_pipeline_controller_test.go b/internal/scheduling/nova/filter_weigher_pipeline_controller_test.go index 287b488d9..919d1ea88 100644 --- a/internal/scheduling/nova/filter_weigher_pipeline_controller_test.go +++ b/internal/scheduling/nova/filter_weigher_pipeline_controller_test.go @@ -9,6 +9,7 @@ import ( "errors" "strings" "testing" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -393,17 +394,17 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) } tests := []struct { - name string - decision *v1alpha1.Decision - pipeline *v1alpha1.Pipeline - pipelineConf *v1alpha1.Pipeline - setupPipelineConfigs bool - createDecisions bool - expectError bool - expectResult bool - expectCreatedDecision bool - expectUpdatedStatus bool - errorContains string + name string + decision *v1alpha1.Decision + pipeline *v1alpha1.Pipeline + pipelineConf *v1alpha1.Pipeline + setupPipelineConfigs bool + createDecisions bool + expectError bool + expectResult bool + expectHistoryCreated bool + expectUpdatedStatus bool + errorContains string }{ { name: "successful processing with decision creation enabled", @@ -414,6 +415,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) }, Spec: v1alpha1.DecisionSpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "test-uuid-1", PipelineRef: corev1.ObjectReference{ Name: "test-pipeline", }, @@ -446,12 +448,12 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - setupPipelineConfigs: true, - createDecisions: true, - expectError: false, - expectResult: true, - expectCreatedDecision: true, - expectUpdatedStatus: true, + setupPipelineConfigs: true, + createDecisions: true, + expectError: false, + expectResult: true, + expectHistoryCreated: true, + expectUpdatedStatus: true, }, { name: "successful processing with decision creation disabled", @@ -462,6 +464,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) }, Spec: v1alpha1.DecisionSpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, + ResourceID: "test-uuid-2", PipelineRef: corev1.ObjectReference{ Name: "test-pipeline-no-create", }, @@ -494,12 +497,12 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - setupPipelineConfigs: true, - createDecisions: false, - expectError: false, - expectResult: true, - expectCreatedDecision: false, - expectUpdatedStatus: false, + setupPipelineConfigs: true, + createDecisions: false, + expectError: false, + expectResult: true, + expectHistoryCreated: false, + expectUpdatedStatus: false, }, { name: "pipeline not configured", @@ -518,14 +521,14 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) }, }, }, - pipeline: nil, - pipelineConf: nil, - setupPipelineConfigs: false, - expectError: true, - expectResult: false, - expectCreatedDecision: false, - expectUpdatedStatus: false, - errorContains: "pipeline nonexistent-pipeline not configured", + pipeline: nil, + pipelineConf: nil, + setupPipelineConfigs: false, + expectError: true, + expectResult: false, + expectHistoryCreated: false, + expectUpdatedStatus: false, + errorContains: "pipeline nonexistent-pipeline not configured", }, { name: "decision without novaRaw spec", @@ -566,13 +569,13 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - setupPipelineConfigs: true, - createDecisions: true, - expectError: true, - expectResult: false, - expectCreatedDecision: true, - expectUpdatedStatus: false, - errorContains: "no novaRaw spec defined", + setupPipelineConfigs: true, + createDecisions: true, + expectError: true, + expectResult: false, + expectHistoryCreated: true, + expectUpdatedStatus: false, + errorContains: "no novaRaw spec defined", }, { name: "processing fails after decision creation", @@ -604,13 +607,13 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - setupPipelineConfigs: true, - createDecisions: true, - expectError: true, - expectResult: false, - expectCreatedDecision: true, - expectUpdatedStatus: false, - errorContains: "pipeline not found or not ready", + setupPipelineConfigs: true, + createDecisions: true, + expectError: true, + expectResult: false, + expectHistoryCreated: true, + expectUpdatedStatus: false, + errorContains: "pipeline not found or not ready", }, { name: "pipeline not found in runtime map", @@ -642,13 +645,13 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Weighers: []v1alpha1.WeigherSpec{}, }, }, - setupPipelineConfigs: true, - createDecisions: true, - expectError: true, - expectResult: false, - expectCreatedDecision: true, - expectUpdatedStatus: false, - errorContains: "pipeline not found or not ready", + setupPipelineConfigs: true, + createDecisions: true, + expectError: true, + expectResult: false, + expectHistoryCreated: true, + expectUpdatedStatus: false, + errorContains: "pipeline not found or not ready", }, } @@ -662,7 +665,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) client := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(objects...). - WithStatusSubresource(&v1alpha1.Decision{}). + WithStatusSubresource(&v1alpha1.Decision{}, &v1alpha1.History{}). Build() controller := &FilterWeigherPipelineController{ @@ -670,6 +673,7 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) Client: client, Pipelines: make(map[string]lib.FilterWeigherPipeline[api.ExternalSchedulerRequest]), PipelineConfigs: make(map[string]v1alpha1.Pipeline), + HistoryManager: lib.HistoryManager{Client: client}, }, Monitor: lib.FilterWeigherPipelineMonitor{}, } @@ -707,24 +711,33 @@ func TestFilterWeigherPipelineController_ProcessNewDecisionFromAPI(t *testing.T) t.Errorf("Expected error to contain %q, got: %v", tt.errorContains, err) } - // Check if decision was created in the cluster when expected - if tt.expectCreatedDecision { - var createdDecision v1alpha1.Decision - key := types.NamespacedName{Name: tt.decision.Name, Namespace: tt.decision.Namespace} - err := client.Get(context.Background(), key, &createdDecision) - if err != nil { - t.Errorf("Expected decision to be created but got error: %v", err) + // Check if history CRD was created when expected + if tt.expectHistoryCreated { + var histories v1alpha1.HistoryList + deadline := time.Now().Add(2 * time.Second) + for { + if err := client.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) + } + if len(histories.Items) > 0 { + break + } + if time.Now().After(deadline) { + t.Fatal("timed out waiting for history CRD to be created") + } + time.Sleep(5 * time.Millisecond) } } else { - var createdDecision v1alpha1.Decision - key := types.NamespacedName{Name: tt.decision.Name, Namespace: tt.decision.Namespace} - err := client.Get(context.Background(), key, &createdDecision) - if err == nil { - t.Error("Expected decision not to be created but it was found") + var histories v1alpha1.HistoryList + if err := client.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) + } + if len(histories.Items) != 0 { + t.Error("Expected no history CRD but found one") } } - // Validate result and duration expectations + // Validate result expectations if tt.expectResult && tt.decision.Status.Result == nil { t.Error("Expected result to be set but was nil") } diff --git a/internal/scheduling/pods/filter_weigher_pipeline_controller.go b/internal/scheduling/pods/filter_weigher_pipeline_controller.go index 28e10ff88..26279aacb 100644 --- a/internal/scheduling/pods/filter_weigher_pipeline_controller.go +++ b/internal/scheduling/pods/filter_weigher_pipeline_controller.go @@ -99,12 +99,6 @@ func (c *FilterWeigherPipelineController) ProcessNewPod(ctx context.Context, pod if !ok { return fmt.Errorf("pipeline %s not configured", decision.Spec.PipelineRef.Name) } - if pipelineConf.Spec.CreateDecisions { - if err := c.Create(ctx, decision); err != nil { - return err - } - } - old := decision.DeepCopy() err := c.process(ctx, decision) if err != nil { meta.SetStatusCondition(&decision.Status.Conditions, metav1.Condition{ @@ -122,10 +116,11 @@ func (c *FilterWeigherPipelineController) ProcessNewPod(ctx context.Context, pod }) } if pipelineConf.Spec.CreateDecisions { - patch := client.MergeFrom(old) - if err := c.Status().Patch(ctx, decision, patch); err != nil { - return err - } + go func() { + if upsertErr := c.HistoryManager.Upsert(context.Background(), decision, v1alpha1.SchedulingIntentUnknown, err); upsertErr != nil { + ctrl.LoggerFrom(ctx).Error(upsertErr, "failed to create/update history") + } + }() } return err } @@ -227,20 +222,10 @@ func (c *FilterWeigherPipelineController) handlePod() handler.EventHandler { } }, DeleteFunc: func(ctx context.Context, evt event.DeleteEvent, queue workqueue.TypedRateLimitingInterface[reconcile.Request]) { - // Delete the associated decision(s). - log := ctrl.LoggerFrom(ctx) pod := evt.Object.(*corev1.Pod) - var decisions v1alpha1.DecisionList - if err := c.List(ctx, &decisions); err != nil { - log.Error(err, "failed to list decisions for deleted pod") - return - } - for _, decision := range decisions.Items { - if decision.Spec.PodRef.Name == pod.Name && decision.Spec.PodRef.Namespace == pod.Namespace { - if err := c.Delete(ctx, &decision); err != nil { - log.Error(err, "failed to delete decision for deleted pod") - } - } + if err := c.HistoryManager.Delete(ctx, v1alpha1.SchedulingDomainPods, pod.Name); err != nil { + log := ctrl.LoggerFrom(ctx) + log.Error(err, "failed to delete history CRD for pod", "pod", pod.Name) } }, } @@ -249,6 +234,7 @@ func (c *FilterWeigherPipelineController) handlePod() handler.EventHandler { func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error { c.Initializer = c c.SchedulingDomain = v1alpha1.SchedulingDomainPods + c.HistoryManager = lib.HistoryManager{Client: mgr.GetClient(), Recorder: mgr.GetEventRecorder("cortex-pods-scheduler")} if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil { return err } diff --git a/internal/scheduling/pods/filter_weigher_pipeline_controller_test.go b/internal/scheduling/pods/filter_weigher_pipeline_controller_test.go index 3d523873b..99b5698ec 100644 --- a/internal/scheduling/pods/filter_weigher_pipeline_controller_test.go +++ b/internal/scheduling/pods/filter_weigher_pipeline_controller_test.go @@ -6,6 +6,7 @@ package pods import ( "context" "testing" + "time" "github.com/cobaltcore-dev/cortex/api/external/pods" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -263,15 +264,15 @@ func TestFilterWeigherPipelineController_ProcessNewPod(t *testing.T) { } tests := []struct { - name string - pod *corev1.Pod - nodes []corev1.Node - pipelineConfig *v1alpha1.Pipeline - createDecisions bool - expectError bool - expectDecisionCreated bool - expectNodeAssigned bool - expectTargetHost string + name string + pod *corev1.Pod + nodes []corev1.Node + pipelineConfig *v1alpha1.Pipeline + createDecisions bool + expectError bool + expectHistoryCreated bool + expectNodeAssigned bool + expectTargetHost string }{ { name: "successful pod processing with decision creation", @@ -304,11 +305,11 @@ func TestFilterWeigherPipelineController_ProcessNewPod(t *testing.T) { Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: true, - expectError: false, - expectDecisionCreated: true, - expectNodeAssigned: true, - expectTargetHost: "node1", + createDecisions: true, + expectError: false, + expectHistoryCreated: true, + expectNodeAssigned: true, + expectTargetHost: "node1", }, { name: "successful pod processing without decision creation", @@ -338,11 +339,11 @@ func TestFilterWeigherPipelineController_ProcessNewPod(t *testing.T) { Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: false, - expectError: false, - expectDecisionCreated: false, - expectNodeAssigned: true, - expectTargetHost: "node1", + createDecisions: false, + expectError: false, + expectHistoryCreated: false, + expectNodeAssigned: true, + expectTargetHost: "node1", }, { name: "pipeline not configured", @@ -355,11 +356,11 @@ func TestFilterWeigherPipelineController_ProcessNewPod(t *testing.T) { SchedulerName: "", }, }, - nodes: []corev1.Node{}, - pipelineConfig: nil, - expectError: true, - expectDecisionCreated: false, - expectNodeAssigned: false, + nodes: []corev1.Node{}, + pipelineConfig: nil, + expectError: true, + expectHistoryCreated: false, + expectNodeAssigned: false, }, { name: "no nodes available", @@ -385,10 +386,10 @@ func TestFilterWeigherPipelineController_ProcessNewPod(t *testing.T) { Weighers: []v1alpha1.WeigherSpec{}, }, }, - createDecisions: true, - expectError: true, - expectDecisionCreated: true, // Decision is created but processing fails - expectNodeAssigned: false, + createDecisions: true, + expectError: true, + expectHistoryCreated: true, // Decision is created but processing fails + expectNodeAssigned: false, }, } @@ -405,13 +406,14 @@ func TestFilterWeigherPipelineController_ProcessNewPod(t *testing.T) { client := fake.NewClientBuilder(). WithScheme(scheme). WithRuntimeObjects(objects...). - WithStatusSubresource(&v1alpha1.Decision{}). + WithStatusSubresource(&v1alpha1.Decision{}, &v1alpha1.History{}). Build() controller := &FilterWeigherPipelineController{ BasePipelineController: lib.BasePipelineController[lib.FilterWeigherPipeline[pods.PodPipelineRequest]]{ Pipelines: map[string]lib.FilterWeigherPipeline[pods.PodPipelineRequest]{}, PipelineConfigs: map[string]v1alpha1.Pipeline{}, + HistoryManager: lib.HistoryManager{Client: client}, }, Monitor: lib.FilterWeigherPipelineMonitor{}, } @@ -434,70 +436,29 @@ func TestFilterWeigherPipelineController_ProcessNewPod(t *testing.T) { return } - // Check if decision was created (if expected) - if tt.expectDecisionCreated { - var decisions v1alpha1.DecisionList - err := client.List(context.Background(), &decisions) - if err != nil { - t.Errorf("Failed to list decisions: %v", err) - return - } - - found := false - for _, decision := range decisions.Items { - if decision.Spec.PodRef != nil && - decision.Spec.PodRef.Name == tt.pod.Name && - decision.Spec.PodRef.Namespace == tt.pod.Namespace { - found = true - - // Verify decision properties - if decision.Spec.SchedulingDomain != v1alpha1.SchedulingDomainPods { - t.Errorf("expected scheduling domain %q, got %q", v1alpha1.SchedulingDomainPods, decision.Spec.SchedulingDomain) - } - if decision.Spec.ResourceID != tt.pod.Name { - t.Errorf("expected resource ID %q, got %q", tt.pod.Name, decision.Spec.ResourceID) - } - if decision.Spec.PipelineRef.Name != "pods-scheduler" { - t.Errorf("expected pipeline ref %q, got %q", "pods-scheduler", decision.Spec.PipelineRef.Name) - } - - // Check if result was set (only for successful cases) - if !tt.expectError && tt.expectTargetHost != "" { - if decision.Status.Result == nil { - t.Error("expected decision result to be set") - return - } - if decision.Status.Result.TargetHost == nil { - t.Error("expected target host to be set") - return - } - if *decision.Status.Result.TargetHost != tt.expectTargetHost { - t.Errorf("expected target host %q, got %q", tt.expectTargetHost, *decision.Status.Result.TargetHost) - } - } + // Check if history CRD was created when expected + if tt.expectHistoryCreated { + var histories v1alpha1.HistoryList + deadline := time.Now().Add(2 * time.Second) + for { + if err := client.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) + } + if len(histories.Items) > 0 { break } - } - - if !found { - t.Error("expected decision to be created but was not found") + if time.Now().After(deadline) { + t.Fatal("timed out waiting for history CRD to be created") + } + time.Sleep(5 * time.Millisecond) } } else { - // Check that no decisions were created - var decisions v1alpha1.DecisionList - err := client.List(context.Background(), &decisions) - if err != nil { - t.Errorf("Failed to list decisions: %v", err) - return + var histories v1alpha1.HistoryList + if err := client.List(context.Background(), &histories); err != nil { + t.Fatalf("Failed to list histories: %v", err) } - - for _, decision := range decisions.Items { - if decision.Spec.PodRef != nil && - decision.Spec.PodRef.Name == tt.pod.Name && - decision.Spec.PodRef.Namespace == tt.pod.Namespace { - t.Error("expected no decision to be created but found one") - break - } + if len(histories.Items) != 0 { + t.Error("Expected no history CRD but found one") } }