diff --git a/CODEOWNERS b/CODEOWNERS index 23f370f22..f6e423ecb 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* arno.uhlig@sap.com julius.clausnitzer@sap.com malte.viering@sap.com marcel.bloecher@sap.com markus.wieland@sap.com p.matthes@sap.com \ No newline at end of file +* arno.uhlig@sap.com julius.clausnitzer@sap.com malte.viering@sap.com marcel.gute@sap.com markus.wieland@sap.com p.matthes@sap.com \ No newline at end of file diff --git a/api/v1alpha1/knowledge_types.go b/api/v1alpha1/knowledge_types.go index d90f76565..504b30449 100644 --- a/api/v1alpha1/knowledge_types.go +++ b/api/v1alpha1/knowledge_types.go @@ -93,6 +93,11 @@ type KnowledgeStatus struct { // +kubebuilder:validation:Optional LastExtracted metav1.Time `json:"lastExtracted"` + // When the extracted knowledge content last changed. + // Updated only when the Raw data actually changes, not on every reconcile. + // +kubebuilder:validation:Optional + LastContentChange metav1.Time `json:"lastContentChange,omitempty"` + // The raw data behind the extracted knowledge, e.g. a list of features. 
// +kubebuilder:validation:Optional Raw runtime.RawExtension `json:"raw"` @@ -111,6 +116,7 @@ type KnowledgeStatus struct { // +kubebuilder:printcolumn:name="Domain",type="string",JSONPath=".spec.schedulingDomain" // +kubebuilder:printcolumn:name="Created",type="date",JSONPath=".metadata.creationTimestamp" // +kubebuilder:printcolumn:name="Extracted",type="date",JSONPath=".status.lastExtracted" +// +kubebuilder:printcolumn:name="Changed",type="date",JSONPath=".status.lastContentChange" // +kubebuilder:printcolumn:name="Recency",type="string",JSONPath=".spec.recency" // +kubebuilder:printcolumn:name="Features",type="integer",JSONPath=".status.rawLength" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index ed8e42f43..df3ad473e 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -21,6 +21,20 @@ const ( ReservationTypeFailover ReservationType = "FailoverReservation" ) +// Label keys for Reservation metadata. +// Labels follow Kubernetes naming conventions using reverse-DNS notation +const ( + // ===== Common Reservation Labels ===== + + // LabelReservationType identifies the type of reservation. + // This label is present on all reservations to enable type-based filtering. + LabelReservationType = "reservations.cortex.sap.com/type" + + // Reservation type label values + ReservationTypeLabelCommittedResource = "committed-resource" + ReservationTypeLabelFailover = "failover" +) + // CommittedResourceAllocation represents a workload's assignment to a committed resource reservation slot. // The workload could be a VM (Nova/IronCore), Pod (Kubernetes), or other resource. 
type CommittedResourceAllocation struct { @@ -79,6 +93,10 @@ type ReservationSpec struct { // +kubebuilder:validation:Optional SchedulingDomain string `json:"schedulingDomain,omitempty"` + // AvailabilityZone specifies the availability zone for this reservation, if restricted to a specific AZ. + // +kubebuilder:validation:Optional + AvailabilityZone string `json:"availabilityZone,omitempty"` + // Resources to reserve for this instance. // +kubebuilder:validation:Optional Resources map[string]resource.Quantity `json:"resources,omitempty"` @@ -166,7 +184,7 @@ type ReservationStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster -// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".spec.type" +// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".metadata.labels['reservations\\.cortex\\.sap\\.com/type']" // +kubebuilder:printcolumn:name="Host",type="string",JSONPath=".status.host" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 5a756e045..564f30cac 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -833,6 +833,7 @@ func (in *KnowledgeSpec) DeepCopy() *KnowledgeSpec { func (in *KnowledgeStatus) DeepCopyInto(out *KnowledgeStatus) { *out = *in in.LastExtracted.DeepCopyInto(&out.LastExtracted) + in.LastContentChange.DeepCopyInto(&out.LastContentChange) in.Raw.DeepCopyInto(&out.Raw) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions diff --git a/cmd/main.go b/cmd/main.go index 4e4865567..43ae63f9c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -309,6 +309,10 @@ func main() { httpAPIConf := conf.GetConfigOrDie[nova.HTTPAPIConfig]() nova.NewAPI(httpAPIConf, filterWeigherController).Init(mux) + // Initialize commitments API for LIQUID interface + commitmentsAPI := 
commitments.NewAPI(multiclusterClient) + commitmentsAPI.Init(mux) + // Detector pipeline controller setup. novaClient := nova.NewNovaClient() novaClientConfig := conf.GetConfigOrDie[nova.NovaClientConfig]() @@ -456,11 +460,11 @@ func main() { monitor := reservationscontroller.NewControllerMonitor(multiclusterClient) metrics.Registry.MustRegister(&monitor) reservationsControllerConfig := conf.GetConfigOrDie[reservationscontroller.Config]() + if err := (&reservationscontroller.ReservationReconciler{ - Client: multiclusterClient, - Scheme: mgr.GetScheme(), - Conf: reservationsControllerConfig, - HypervisorClient: reservationscontroller.NewHypervisorClient(), + Client: multiclusterClient, + Scheme: mgr.GetScheme(), + Conf: reservationsControllerConfig, }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Reservation") os.Exit(1) diff --git a/go.mod b/go.mod index 245513cb3..f6b914718 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/go-gorp/gorp v2.2.0+incompatible github.com/gophercloud/gophercloud/v2 v2.10.0 github.com/ironcore-dev/ironcore v0.2.4 + github.com/majewsky/gg v1.5.0 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/client_model v0.6.2 github.com/sapcc/go-bits v0.0.0-20260226170120-c20f89b66c3c @@ -36,7 +37,7 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/logr v1.4.3 github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.22.1 // indirect @@ -71,7 +72,7 @@ require ( github.com/poy/onpar v0.3.5 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.17.0 // indirect - github.com/sapcc/go-api-declarations v1.20.2 // indirect + github.com/sapcc/go-api-declarations v1.20.2 
github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cobra v1.10.1 // indirect github.com/spf13/pflag v1.0.10 // indirect diff --git a/helm/bundles/cortex-cinder/values.yaml b/helm/bundles/cortex-cinder/values.yaml index f002fc58b..b01656205 100644 --- a/helm/bundles/cortex-cinder/values.yaml +++ b/helm/bundles/cortex-cinder/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-crds/values.yaml b/helm/bundles/cortex-crds/values.yaml index 2033e435c..bf072086c 100644 --- a/helm/bundles/cortex-crds/values.yaml +++ b/helm/bundles/cortex-crds/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-ironcore/values.yaml b/helm/bundles/cortex-ironcore/values.yaml index 2f885c7a5..82e490585 100644 --- a/helm/bundles/cortex-ironcore/values.yaml +++ b/helm/bundles/cortex-ironcore/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/bundles/cortex-manila/values.yaml b/helm/bundles/cortex-manila/values.yaml index cc341a112..50d16352e 100644 --- a/helm/bundles/cortex-manila/values.yaml +++ b/helm/bundles/cortex-manila/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" 
diff --git a/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml b/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml index f2181fe96..6b3d9fcbc 100644 --- a/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml @@ -2,6 +2,23 @@ --- apiVersion: cortex.cloud/v1alpha1 kind: Knowledge +metadata: + name: flavor-groups +spec: + schedulingDomain: nova + extractor: + name: flavor_groups + recency: "5m" + description: | + This knowledge extracts flavor groups from Nova flavors based on the + hw_version extra_spec. It identifies all flavors belonging to each group + and determines the largest flavor for reservation slot sizing. + dependencies: + datasources: + - name: nova-flavors +--- +apiVersion: cortex.cloud/v1alpha1 +kind: Knowledge metadata: name: kvm-libvirt-domain-cpu-steal-pct spec: diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index b2dbba788..200ba3ff3 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" @@ -114,8 +114,12 @@ cortex-scheduling-controllers: - nova-pipeline-controllers - nova-deschedulings-executor - explanation-controller + - reservations-controller enabledTasks: - nova-decisions-cleanup-task + # Endpoints configuration for reservations controller + endpoints: + novaExternalScheduler: "http://localhost:8080/scheduler/nova/external" cortex-knowledge-controllers: <<: *cortex @@ -134,7 +138,6 @@ cortex-knowledge-controllers: - datasource-controllers - knowledge-controllers - kpis-controller - - reservations-controller enabledTasks: - commitments-sync-task diff --git a/helm/bundles/cortex-pods/values.yaml b/helm/bundles/cortex-pods/values.yaml index 
b7aab8a6d..4c381f539 100644 --- a/helm/bundles/cortex-pods/values.yaml +++ b/helm/bundles/cortex-pods/values.yaml @@ -8,7 +8,7 @@ owner-info: - "arno.uhlig@sap.com" - "julius.clausnitzer@sap.com" - "malte.viering@sap.com" - - "marcel.bloecher@sap.com" + - "marcel.gute@sap.com" - "markus.wieland@sap.com" - "p.matthes@sap.com" support-group: "workload-management" diff --git a/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml b/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml index 0ac596bc2..2e3891ffa 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_knowledges.yaml @@ -24,6 +24,9 @@ spec: - jsonPath: .status.lastExtracted name: Extracted type: date + - jsonPath: .status.lastContentChange + name: Changed + type: date - jsonPath: .spec.recency name: Recency type: string @@ -248,6 +251,12 @@ spec: - type type: object type: array + lastContentChange: + description: |- + When the extracted knowledge content last changed. + Updated only when the Raw data actually changes, not on every reconcile. + format: date-time + type: string lastExtracted: description: When the knowledge was last successfully extracted. 
format: date-time diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml index 5d341cdf6..915e5677e 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml @@ -15,7 +15,7 @@ spec: scope: Cluster versions: - additionalPrinterColumns: - - jsonPath: .spec.type + - jsonPath: .metadata.labels['reservations\.cortex\.sap\.com/type'] name: Type type: string - jsonPath: .status.host @@ -49,6 +49,10 @@ spec: spec: description: spec defines the desired state of Reservation properties: + availabilityZone: + description: AvailabilityZone specifies the availability zone for + this reservation, if restricted to a specific AZ. + type: string committedResourceReservation: description: |- CommittedResourceReservation contains fields specific to committed resource reservations. diff --git a/internal/knowledge/extractor/controller.go b/internal/knowledge/extractor/controller.go index cd4f63972..3dd511b39 100644 --- a/internal/knowledge/extractor/controller.go +++ b/internal/knowledge/extractor/controller.go @@ -5,6 +5,8 @@ package extractor import ( "context" + "encoding/json" + "reflect" "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" @@ -202,9 +204,27 @@ func (r *KnowledgeReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( Reason: "KnowledgeExtracted", Message: "knowledge extracted successfully", }) + + // Check if content actually changed by comparing deserialized data structures. + // This avoids false positives from JSON serialization non-determinism (e.g., map key ordering). 
+ contentChanged := true + if len(knowledge.Status.Raw.Raw) > 0 { + var oldData, newData interface{} + if err := json.Unmarshal(knowledge.Status.Raw.Raw, &oldData); err == nil { + if err := json.Unmarshal(raw.Raw, &newData); err == nil { + contentChanged = !reflect.DeepEqual(oldData, newData) + } + } + } + knowledge.Status.Raw = raw knowledge.Status.LastExtracted = metav1.NewTime(time.Now()) knowledge.Status.RawLength = len(features) + + if contentChanged { + log.Info("content of knowledge has changed", "name", knowledge.Name) + knowledge.Status.LastContentChange = metav1.NewTime(time.Now()) + } patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, knowledge, patch); err != nil { log.Error(err, "failed to patch knowledge status") diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups.go b/internal/knowledge/extractor/plugins/compute/flavor_groups.go new file mode 100644 index 000000000..d5c47cf2a --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups.go @@ -0,0 +1,154 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + _ "embed" + "encoding/json" + "errors" + "sort" + + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins" + ctrl "sigs.k8s.io/controller-runtime" +) + +// FlavorInGroup represents a single flavor within a flavor group. +type FlavorInGroup struct { + Name string `json:"name"` + VCPUs uint64 `json:"vcpus"` + MemoryMB uint64 `json:"memoryMB"` + DiskGB uint64 `json:"diskGB"` + EphemeralGB uint64 `json:"ephemeralGB,omitempty"` + ExtraSpecs map[string]string `json:"extraSpecs,omitempty"` +} + +// FlavorGroupFeature represents a flavor group with all its member flavors. +// This is the feature that gets stored in the Knowledge CRD. 
+type FlavorGroupFeature struct { + // Name of the flavor group (from hw_version extra_spec) + Name string `json:"name"` + + // All flavors belonging to this group + Flavors []FlavorInGroup `json:"flavors"` + + // The largest flavor in the group (used for reservation slot sizing) + LargestFlavor FlavorInGroup `json:"largestFlavor"` + + // The smallest flavor in the group (used for CR size quantification) + SmallestFlavor FlavorInGroup `json:"smallestFlavor"` +} + +// flavorRow represents a row from the SQL query. +type flavorRow struct { + Name string `db:"name"` + VCPUs uint64 `db:"vcpus"` + MemoryMB uint64 `db:"memory_mb"` + DiskGB uint64 `db:"disk"` + EphemeralGB uint64 `db:"ephemeral"` + ExtraSpecs string `db:"extra_specs"` +} + +// FlavorGroupExtractor extracts flavor group information from the database. +type FlavorGroupExtractor struct { + // Common base for all extractors that provides standard functionality. + plugins.BaseExtractor[ + struct{}, // No options passed through yaml config + FlavorGroupFeature, // Feature model + ] +} + +//go:embed flavor_groups.sql +var flavorGroupsQuery string + +var flavorGroupLog = ctrl.Log.WithName("flavor_group_extractor") + +// Extract flavor groups from the database. 
+func (e *FlavorGroupExtractor) Extract() ([]plugins.Feature, error) { + if e.DB == nil { + return nil, errors.New("database connection is not initialized") + } + + // Query all flavors from database + var rows []flavorRow + if _, err := e.DB.Select(&rows, flavorGroupsQuery); err != nil { + flavorGroupLog.Error(err, "failed to query flavors") + return nil, err + } + + // Group flavors by flavorGroupIdentifierName + groupMap := make(map[string][]FlavorInGroup) + + for _, row := range rows { + // Parse extra_specs JSON + var extraSpecs map[string]string + if row.ExtraSpecs != "" { + if err := json.Unmarshal([]byte(row.ExtraSpecs), &extraSpecs); err != nil { + flavorGroupLog.Info("failed to parse extra_specs for flavor", "flavor", row.Name, "error", err) + continue + } + } + + hwVersion, exists := extraSpecs["quota:hw_version"] + if !exists || hwVersion == "" { + flavorGroupLog.Info("flavor missing hw_version extra_spec", "flavor", row.Name) + continue + } + + // Add flavor to its group + flavor := FlavorInGroup{ + Name: row.Name, + VCPUs: row.VCPUs, + MemoryMB: row.MemoryMB, + DiskGB: row.DiskGB, + EphemeralGB: row.EphemeralGB, + ExtraSpecs: extraSpecs, + } + groupMap[hwVersion] = append(groupMap[hwVersion], flavor) + } + + // Convert map to features + features := make([]FlavorGroupFeature, 0, len(groupMap)) + for groupName, flavors := range groupMap { + if len(flavors) == 0 { + continue + } + + // Sort flavors by size descending (largest first), tie break by name for consistent ordering + sort.Slice(flavors, func(i, j int) bool { + if flavors[i].MemoryMB != flavors[j].MemoryMB { + return flavors[i].MemoryMB > flavors[j].MemoryMB + } + if flavors[i].VCPUs != flavors[j].VCPUs { + return flavors[i].VCPUs > flavors[j].VCPUs + } + return flavors[i].Name < flavors[j].Name + }) + + largest := flavors[0] + smallest := flavors[len(flavors)-1] + + flavorGroupLog.Info("identified largest and smallest flavors", + "groupName", groupName, + "largestFlavor", largest.Name, + 
"largestMemoryMB", largest.MemoryMB, + "largestVCPUs", largest.VCPUs, + "smallestFlavor", smallest.Name, + "smallestMemoryMB", smallest.MemoryMB, + "smallestVCPUs", smallest.VCPUs) + + features = append(features, FlavorGroupFeature{ + Name: groupName, + Flavors: flavors, + LargestFlavor: largest, + SmallestFlavor: smallest, + }) + } + + // Sort features by group name for consistent ordering + sort.Slice(features, func(i, j int) bool { + return features[i].Name < features[j].Name + }) + + return e.Extracted(features) +} diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups.sql b/internal/knowledge/extractor/plugins/compute/flavor_groups.sql new file mode 100644 index 000000000..0905e0b7d --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups.sql @@ -0,0 +1,17 @@ +-- Copyright SAP SE +-- SPDX-License-Identifier: Apache-2.0 + +-- Query to extract flavor groups from the openstack_flavors_v2 table +-- Groups flavors by their hw_version extra_spec (or flavor name prefix as workaround) +-- Filters to only include KVM flavors (QEMU and Cloud-Hypervisor) +SELECT + name, + vcpus, + ram as memory_mb, + disk, + ephemeral, + extra_specs +FROM openstack_flavors_v2 +WHERE LOWER(extra_specs) LIKE '%"capabilities:hypervisor_type":"qemu"%' + OR LOWER(extra_specs) LIKE '%"capabilities:hypervisor_type":"ch"%' +ORDER BY name; diff --git a/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go b/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go new file mode 100644 index 000000000..becccadd0 --- /dev/null +++ b/internal/knowledge/extractor/plugins/compute/flavor_groups_test.go @@ -0,0 +1,273 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package compute + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + testlibDB 
"github.com/cobaltcore-dev/cortex/internal/knowledge/db/testing" +) + +func TestFlavorGroupExtractor_Extract(t *testing.T) { + dbEnv := testlibDB.SetupDBEnv(t) + defer dbEnv.Close() + testDB := db.DB{DbMap: dbEnv.DbMap} + + // Setup test data - create flavors table + if err := testDB.CreateTable( + testDB.AddTable(nova.Flavor{}), + ); err != nil { + t.Fatal(err) + } + + // Insert test flavors with quota:hw_version in extra_specs + // Mix of KVM flavors (should be included) and VMware flavors (should be excluded) + flavors := []any{ + &nova.Flavor{ + ID: "1", + Name: "hana_c30_m480_v2", + VCPUs: 30, + RAM: 491520, // 480GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "2", + Name: "hana_c60_m960_v2", + VCPUs: 60, + RAM: 983040, // 960GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "3", + Name: "hana_c240_m3840_v2", + VCPUs: 240, + RAM: 3932160, // 3840GB in MB + Disk: 100, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","hw:cpu_policy":"dedicated","hw:numa_nodes":"4","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "4", + Name: "gp_c8_m32_v2", + VCPUs: 8, + RAM: 32768, // 32GB in MB + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + &nova.Flavor{ + ID: "5", + Name: "gp_c16_m64_v2", + VCPUs: 16, + RAM: 65536, // 64GB in MB + Disk: 50, + Ephemeral: 20, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + // VMware flavor - should be excluded from results (filtered by SQL query) + &nova.Flavor{ + ID: "6", + Name: "vmwa_c32_m512_v1", + VCPUs: 32, + RAM: 524288, // 512GB in MB + Disk: 200, + Ephemeral: 0, + ExtraSpecs: `{"capabilities:hypervisor_type":"VMware vCenter 
Server","quota:hw_version":"v1"}`, + }, + // Cloud-Hypervisor flavor - should be included (case insensitive) + &nova.Flavor{ + ID: "7", + Name: "gp_c4_m16_ch", + VCPUs: 4, + RAM: 16384, // 16GB in MB + Disk: 25, + Ephemeral: 5, + ExtraSpecs: `{"capabilities:hypervisor_type":"CH","quota:hw_version":"ch"}`, + }, + // Corner case: Same memory as gp_c8_m32_v2 but MORE VCPUs (should come first) + &nova.Flavor{ + ID: "8", + Name: "gp_c12_m32_v2", + VCPUs: 12, + RAM: 32768, // 32GB in MB - same as gp_c8_m32_v2 + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + // Corner case: Same memory AND same VCPUs as gp_c12_m32_v2 (tests name sorting) + &nova.Flavor{ + ID: "9", + Name: "gp_c12_m32_alt", + VCPUs: 12, + RAM: 32768, // 32GB in MB + Disk: 50, + Ephemeral: 10, + ExtraSpecs: `{"capabilities:hypervisor_type":"qemu","quota:hw_version":"v2"}`, + }, + } + + if err := testDB.Insert(flavors...); err != nil { + t.Fatal(err) + } + + // Create and run extractor + extractor := &FlavorGroupExtractor{} + config := v1alpha1.KnowledgeSpec{} + if err := extractor.Init(&testDB, nil, config); err != nil { + t.Fatal(err) + } + + features, err := extractor.Extract() + if err != nil { + t.Fatal(err) + } + + // Verify results - should be 2 groups (v2 and ch based on hw_version) + // VMware flavor should be filtered out, Cloud-Hypervisor should be included + if len(features) != 2 { + t.Fatalf("expected 2 flavor groups, got %d", len(features)) + } + + // Convert to typed features for easier testing + var v2Group, chGroup *FlavorGroupFeature + for _, f := range features { + fg := f.(FlavorGroupFeature) + switch fg.Name { + case "v2": + v2Group = &fg + case "ch": + chGroup = &fg + } + } + + // Verify v2 group (contains both HANA and general purpose flavors) + if v2Group == nil { + t.Fatal("v2 group not found") + } + if len(v2Group.Flavors) != 7 { + t.Errorf("expected 7 flavors in v2 group (3 HANA + 4 general purpose), got %d", 
len(v2Group.Flavors)) + } + // Largest flavor in v2 group should be hana_c240_m3840_v2 (highest memory) + if v2Group.LargestFlavor.Name != "hana_c240_m3840_v2" { + t.Errorf("expected largest flavor to be hana_c240_m3840_v2, got %s", v2Group.LargestFlavor.Name) + } + if v2Group.LargestFlavor.VCPUs != 240 { + t.Errorf("expected largest flavor VCPUs to be 240, got %d", v2Group.LargestFlavor.VCPUs) + } + if v2Group.LargestFlavor.MemoryMB != 3932160 { + t.Errorf("expected largest flavor memory to be 3932160 MB, got %d", v2Group.LargestFlavor.MemoryMB) + } + if v2Group.LargestFlavor.DiskGB != 100 { + t.Errorf("expected largest flavor disk to be 100 GB, got %d", v2Group.LargestFlavor.DiskGB) + } + if v2Group.LargestFlavor.ExtraSpecs == nil { + t.Error("expected largest flavor to have extra_specs") + } + if v2Group.LargestFlavor.ExtraSpecs["hw:numa_nodes"] != "4" { + t.Errorf("expected largest flavor to have hw:numa_nodes=4, got %s", v2Group.LargestFlavor.ExtraSpecs["hw:numa_nodes"]) + } + if v2Group.LargestFlavor.ExtraSpecs["quota:hw_version"] != "v2" { + t.Errorf("expected largest flavor to have quota:hw_version=v2, got %s", v2Group.LargestFlavor.ExtraSpecs["quota:hw_version"]) + } + + // Verify smallest flavor in v2 group should be gp_c4_m16_ch is NOT in v2, so it's gp_c8_m32_v2 (lowest memory among v2 flavors) + if v2Group.SmallestFlavor.Name != "gp_c8_m32_v2" { + t.Errorf("expected smallest flavor to be gp_c8_m32_v2, got %s", v2Group.SmallestFlavor.Name) + } + if v2Group.SmallestFlavor.MemoryMB != 32768 { + t.Errorf("expected smallest flavor memory to be 32768 MB, got %d", v2Group.SmallestFlavor.MemoryMB) + } + if v2Group.SmallestFlavor.VCPUs != 8 { + t.Errorf("expected smallest flavor VCPUs to be 8, got %d", v2Group.SmallestFlavor.VCPUs) + } + + // Verify Cloud-Hypervisor group + if chGroup == nil { + t.Fatal("ch group not found") + } + if len(chGroup.Flavors) != 1 { + t.Errorf("expected 1 flavor in ch group, got %d", len(chGroup.Flavors)) + } + if 
chGroup.LargestFlavor.Name != "gp_c4_m16_ch" { + t.Errorf("expected largest flavor to be gp_c4_m16_ch, got %s", chGroup.LargestFlavor.Name) + } + if chGroup.LargestFlavor.ExtraSpecs["quota:hw_version"] != "ch" { + t.Errorf("expected ch flavor to have quota:hw_version=ch, got %s", chGroup.LargestFlavor.ExtraSpecs["quota:hw_version"]) + } + + // Verify smallest flavor in ch group (only has 1 flavor, so same as largest) + if chGroup.SmallestFlavor.Name != "gp_c4_m16_ch" { + t.Errorf("expected smallest flavor to be gp_c4_m16_ch, got %s", chGroup.SmallestFlavor.Name) + } + + // Generic check: Verify all flavor groups have correctly ordered flavors + // Flavors must be sorted descending by memory (largest first), with VCPUs as tiebreaker + for _, f := range features { + fg := f.(FlavorGroupFeature) + + // Check that flavors are sorted in descending order + for i := range len(fg.Flavors) - 1 { + current := fg.Flavors[i] + next := fg.Flavors[i+1] + + // Primary sort: memory descending + if current.MemoryMB < next.MemoryMB { + t.Errorf("Flavors in group %s not sorted by memory: %s (%d MB) should come after %s (%d MB)", + fg.Name, current.Name, current.MemoryMB, next.Name, next.MemoryMB) + } + + // Secondary sort: if memory equal, VCPUs descending + if current.MemoryMB == next.MemoryMB && current.VCPUs < next.VCPUs { + t.Errorf("Flavors in group %s with equal memory not sorted by VCPUs: %s (%d VCPUs) should come after %s (%d VCPUs)", + fg.Name, current.Name, current.VCPUs, next.Name, next.VCPUs) + } + } + + // Verify LargestFlavor matches the first flavor in sorted list + if len(fg.Flavors) > 0 && fg.LargestFlavor.Name != fg.Flavors[0].Name { + t.Errorf("Group %s: LargestFlavor (%s) doesn't match first flavor in sorted list (%s)", + fg.Name, fg.LargestFlavor.Name, fg.Flavors[0].Name) + } + + // Verify SmallestFlavor matches the last flavor in sorted list + if len(fg.Flavors) > 0 && fg.SmallestFlavor.Name != fg.Flavors[len(fg.Flavors)-1].Name { + t.Errorf("Group %s: 
SmallestFlavor (%s) doesn't match last flavor in sorted list (%s)", + fg.Name, fg.SmallestFlavor.Name, fg.Flavors[len(fg.Flavors)-1].Name) + } + } + + // Verify that VMware flavor was filtered out + for _, f := range features { + fg := f.(FlavorGroupFeature) + for _, flavor := range fg.Flavors { + if flavor.Name == "vmwa_c32_m512_v1" { + t.Errorf("VMware flavor should have been filtered out but was found in group %s", fg.Name) + } + } + } + + // Verify that Cloud-Hypervisor flavor was included in ch group + foundCH := false + for _, flavor := range chGroup.Flavors { + if flavor.Name == "gp_c4_m16_ch" { + foundCH = true + if flavor.ExtraSpecs["capabilities:hypervisor_type"] != "CH" { + t.Errorf("expected CH hypervisor_type, got %s", flavor.ExtraSpecs["capabilities:hypervisor_type"]) + } + if flavor.ExtraSpecs["quota:hw_version"] != "ch" { + t.Errorf("expected quota:hw_version=ch, got %s", flavor.ExtraSpecs["quota:hw_version"]) + } + } + } + if !foundCH { + t.Error("Cloud-Hypervisor flavor should have been included but was not found") + } +} diff --git a/internal/knowledge/extractor/supported_extractors.go b/internal/knowledge/extractor/supported_extractors.go index 684697928..6f1cb2fd2 100644 --- a/internal/knowledge/extractor/supported_extractors.go +++ b/internal/knowledge/extractor/supported_extractors.go @@ -23,6 +23,7 @@ var supportedExtractors = map[string]plugins.FeatureExtractor{ "host_az_extractor": &compute.HostAZExtractor{}, "host_pinned_projects_extractor": &compute.HostPinnedProjectsExtractor{}, "sap_host_details_extractor": &compute.HostDetailsExtractor{}, + "flavor_groups": &compute.FlavorGroupExtractor{}, "netapp_storage_pool_cpu_usage_extractor": &storage.StoragePoolCPUUsageExtractor{}, } diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go new file mode 100644 index 000000000..ba83e2ab8 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api.go @@ -0,0 +1,33 @@ +// 
Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "net/http" + "sync" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// HTTPAPI implements Limes LIQUID commitment validation endpoints. +type HTTPAPI struct { + client client.Client + // Mutex to serialize change-commitments requests + changeMutex sync.Mutex +} + +func NewAPI(client client.Client) *HTTPAPI { + return &HTTPAPI{ + client: client, + } +} + +func (api *HTTPAPI) Init(mux *http.ServeMux) { + mux.HandleFunc("/v1/change-commitments", api.HandleChangeCommitments) + // mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity) + mux.HandleFunc("/v1/info", api.HandleInfo) +} + +var commitmentApiLog = ctrl.Log.WithName("commitment_api") diff --git a/internal/scheduling/reservations/commitments/api_change_commitments.go b/internal/scheduling/reservations/commitments/api_change_commitments.go new file mode 100644 index 000000000..3134b3b9d --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments.go @@ -0,0 +1,353 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + . 
"github.com/majewsky/gg/option" + "github.com/sapcc/go-api-declarations/liquid" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + // watchTimeout is how long to wait for all reservations to become ready + watchTimeout = 20 * time.Second + + // pollInterval is how frequently to poll reservation status + pollInterval = 1 * time.Second +) + +// implements POST /v1/change-commitments from Limes LIQUID API: +// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +// +// This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle. +// A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back. 
+func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) { + // Serialize all change-commitments requests + api.changeMutex.Lock() + defer api.changeMutex.Unlock() + + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + } + log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/change-commitments") + + // Only accept POST method + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + // Parse request body + var req liquid.CommitmentChangeRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + log.Error(err, "invalid request body") + http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest) + return + } + + log.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ) + + // Initialize response + resp := liquid.CommitmentChangeResponse{} + + // Check for dry run -> early reject, not supported yet + if req.DryRun { + resp.RejectionReason = "Dry run not supported yet" + log.Info("rejecting dry run request") + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + return + } + return + } + + // Process commitment changes + // For now, we'll implement a simplified path that checks capacity for immediate start CRs + if err := api.processCommitmentChanges(w, log, req, &resp); err != nil { + // Error already written to response by processCommitmentChanges + return + } + + // Return response + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + return + } +} + +func (api *HTTPAPI) processCommitmentChanges(w http.ResponseWriter, log logr.Logger, req 
liquid.CommitmentChangeRequest, resp *liquid.CommitmentChangeResponse) error { + ctx := context.Background() + manager := NewReservationManager(api.client) + requireRollback := false + log.Info("processing commitment change request", "availabilityZone", req.AZ, "dryRun", req.DryRun, "affectedProjects", len(req.ByProject)) + + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + log.Info("failed to get flavor groups from knowledge extractor", "error", err) + resp.RejectionReason = "caches not ready" + retryTime := time.Now().Add(1 * time.Minute) + resp.RetryAt = Some(retryTime) + return nil + } + + // Validate InfoVersion from request matches current version (= last content change of flavor group knowledge) + var currentVersion int64 = -1 + if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { + currentVersion = knowledgeCRD.Status.LastContentChange.Unix() + } + + if req.InfoVersion != currentVersion { + log.Info("version mismatch in commitment change request", + "requestVersion", req.InfoVersion, + "currentVersion", currentVersion) + http.Error(w, fmt.Sprintf("Version mismatch: request version %d, current version %d. 
Please refresh and retry.", + req.InfoVersion, currentVersion), http.StatusConflict) + return errors.New("version mismatch") + } + + statesBefore := make(map[string]*CommitmentState) // map of commitmentID to existing state for rollback + var reservationsToWatch []v1alpha1.Reservation + + if req.DryRun { + resp.RejectionReason = "Dry run not supported yet" + return nil + } + +ProcessLoop: + for projectID, projectChanges := range req.ByProject { + for resourceName, resourceChanges := range projectChanges.ByResource { + // Validate resource name pattern (instances_group_*) + flavorGroupName, err := getFlavorGroupNameFromResource(string(resourceName)) + if err != nil { + resp.RejectionReason = fmt.Sprintf("project with unknown resource name %s: %v", projectID, err) + requireRollback = true + break ProcessLoop + } + + // Verify flavor group exists in Knowledge CRDs + flavorGroup, flavorGroupExists := flavorGroups[flavorGroupName] + if !flavorGroupExists { + resp.RejectionReason = "flavor group not found: " + flavorGroupName + requireRollback = true + break ProcessLoop + } + + for _, commitment := range resourceChanges.Commitments { + // Additional per-commitment validation if needed + log.Info("processing commitment change", "commitmentUUID", commitment.UUID, "projectID", projectID, "resourceName", resourceName, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) + + // TODO add domain + + // List all committed resource reservations, then filter by name prefix + var all_reservations v1alpha1.ReservationList + if err := api.client.List(ctx, &all_reservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + resp.RejectionReason = fmt.Sprintf("failed to list reservations for commitment %s: %v", commitment.UUID, err) + requireRollback = true + break ProcessLoop + } + + // Filter by name prefix to find reservations for this commitment + namePrefix 
:= fmt.Sprintf("commitment-%s-", string(commitment.UUID)) + var existing_reservations v1alpha1.ReservationList + for _, res := range all_reservations.Items { + if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix { + existing_reservations.Items = append(existing_reservations.Items, res) + } + } + + var stateBefore *CommitmentState + if len(existing_reservations.Items) == 0 { + stateBefore = &CommitmentState{ + CommitmentUUID: string(commitment.UUID), + ProjectID: string(projectID), + FlavorGroupName: flavorGroupName, + TotalMemoryBytes: 0, + } + } else { + stateBefore, err = FromReservations(existing_reservations.Items) + if err != nil { + resp.RejectionReason = fmt.Sprintf("failed to get existing state for commitment %s: %v", commitment.UUID, err) + requireRollback = true + break ProcessLoop + } + } + statesBefore[string(commitment.UUID)] = stateBefore + + // get desired state + stateDesired, err := FromChangeCommitmentTargetState(commitment, string(projectID), flavorGroupName, flavorGroup, string(req.AZ)) + if err != nil { + resp.RejectionReason = fmt.Sprintf("failed to get desired state for commitment %s: %v", commitment.UUID, err) + requireRollback = true + break ProcessLoop + } + + log.Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldState", stateBefore, "desiredState", stateDesired) + + touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, log, stateDesired, flavorGroups, "changeCommitmentsApi") + if err != nil { + resp.RejectionReason = fmt.Sprintf("failed to apply commitment state for commitment %s: %v", commitment.UUID, err) + requireRollback = true + break ProcessLoop + } + log.Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations)) + reservationsToWatch = append(reservationsToWatch, touchedReservations...) 
+ } + } + } + + // TODO make the rollback defer safe + if !requireRollback { + log.Info("applied commitment changes, now watching for reservation readiness", "reservationsToWatch", len(reservationsToWatch)) + + time_start := time.Now() + + if err := watchReservationsUntilReady(ctx, log, api.client, reservationsToWatch, watchTimeout); err != nil { + log.Info("reservations failed to become ready, initiating rollback", + "reason", err.Error()) + resp.RejectionReason = fmt.Sprintf("Not all reservations can be fulfilled: %v", err) + requireRollback = true + } + + log.Info("finished watching reservation", "totalSchedulingTimeSeconds", time.Since(time_start).Seconds()) + } + + if requireRollback { + log.Info("rollback of commitment changes") + for commitmentUUID, state := range statesBefore { + // Rollback to statesBefore for this commitment + log.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) + _, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, "changeCommitmentsApiRollback") + if err != nil { + log.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) + // continue with best effort rollback for other projects + } + } + + log.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason) + + // TODO improve human-readable reasoning based on actual failure, i.e. polish resp.RejectionReason + return nil + } + + log.Info("commitment changes accepted") + if resp.RejectionReason != "" { + log.Info("unexpected non-empty rejection reason without rollback", "reason", resp.RejectionReason) + resp.RejectionReason = "" + } + return nil +} + +// watchReservationsUntilReady polls until all reservations reach Ready=True or timeout. 
+func watchReservationsUntilReady( + ctx context.Context, + log logr.Logger, + k8sClient client.Client, + reservations []v1alpha1.Reservation, + timeout time.Duration, +) error { + + if len(reservations) == 0 { + return nil + } + + deadline := time.Now().Add(timeout) + + for { + if time.Now().After(deadline) { + return fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout) + } + + allReady := true + var notReadyReasons []string + + for _, res := range reservations { + // Fetch current state + var current v1alpha1.Reservation + nn := types.NamespacedName{ + Name: res.Name, + Namespace: res.Namespace, + } + + if err := k8sClient.Get(ctx, nn, ¤t); err != nil { + if apierrors.IsNotFound(err) { + // Reservation is still in process of being created + allReady = false + continue + } + return fmt.Errorf("failed to get reservation %s: %w", res.Name, err) + } + + // Check Ready condition + readyCond := meta.FindStatusCondition( + current.Status.Conditions, + v1alpha1.ReservationConditionReady, + ) + + if readyCond == nil { + // Condition not set yet, keep waiting + allReady = false + notReadyReasons = append(notReadyReasons, + res.Name+": condition not set") + continue + } + + switch readyCond.Status { + case metav1.ConditionTrue: + // This reservation is ready + continue + case metav1.ConditionFalse: + // Explicit failure - stop immediately + return fmt.Errorf("reservation %s failed: %s (reason: %s)", + res.Name, readyCond.Message, readyCond.Reason) + case metav1.ConditionUnknown: + // Still processing + allReady = false + notReadyReasons = append(notReadyReasons, + fmt.Sprintf("%s: %s", res.Name, readyCond.Message)) + } + } + + if allReady { + log.Info("all reservations are ready", + "count", len(reservations)) + return nil + } + + // Log progress + log.Info("waiting for reservations to become ready", + "notReady", len(notReadyReasons), + "total", len(reservations), + "timeRemaining", time.Until(deadline).Round(time.Second)) + + // Wait before 
next poll + select { + case <-time.After(pollInterval): + // Continue polling + case <-ctx.Done(): + return fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err()) + } + } +} diff --git a/internal/scheduling/reservations/commitments/api_change_commitments_test.go b/internal/scheduling/reservations/commitments/api_change_commitments_test.go new file mode 100644 index 000000000..c4703c4a1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_change_commitments_test.go @@ -0,0 +1,246 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/sapcc/go-api-declarations/liquid" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// TODO refactor with proper integration tests + +func TestHandleChangeCommitments_VersionMismatch(t *testing.T) { + // Create a fake Kubernetes client with a Knowledge CRD + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // Create a Knowledge CRD with a specific version timestamp and flavor groups + knowledgeTimestamp := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) + flavorGroup := createTestFlavorGroup() + + // Box the features using the Knowledge API + rawExt, err := v1alpha1.BoxFeatureList([]compute.FlavorGroupFeature{flavorGroup}) + if err != nil { + t.Fatalf("failed to box feature list: %v", err) + } + + knowledge := &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor-groups", + }, + }, + 
Status: v1alpha1.KnowledgeStatus{ + LastContentChange: metav1.Time{Time: knowledgeTimestamp}, + Raw: rawExt, + RawLength: 1, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "Ready", + }, + }, + }, + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(knowledge). + WithStatusSubresource(knowledge). + Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + // Create request JSON with mismatched version + requestJSON := `{ + "az": "az-a", + "dryRun": false, + "infoVersion": 12345, + "byProject": {} + }` + + req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) + req.Header.Set("Content-Type", "application/json") + + w := httptest.NewRecorder() + + // Call the handler + api.HandleChangeCommitments(w, req) + + // Check response + resp := w.Result() + defer resp.Body.Close() + + // Verify HTTP 409 Conflict status + if resp.StatusCode != http.StatusConflict { + t.Errorf("expected status code %d (Conflict), got %d", http.StatusConflict, resp.StatusCode) + } + + // Verify Content-Type is text/plain (set by http.Error) + contentType := resp.Header.Get("Content-Type") + if contentType != "text/plain; charset=utf-8" { + t.Errorf("expected Content-Type 'text/plain; charset=utf-8', got %q", contentType) + } + + // Verify error message contains version information + var responseBody bytes.Buffer + if _, err = responseBody.ReadFrom(resp.Body); err != nil { + t.Fatalf("failed to read response body: %v", err) + } + + bodyStr := responseBody.String() + if !bytes.Contains([]byte(bodyStr), []byte("Version mismatch")) { + t.Errorf("expected response to contain 'Version mismatch', got: %s", bodyStr) + } + if !bytes.Contains([]byte(bodyStr), []byte("12345")) { + t.Errorf("expected response to contain request version '12345', got: %s", bodyStr) + } +} +func TestHandleChangeCommitments_DryRun(t *testing.T) { + scheme := 
runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + // Create dry run request JSON + requestJSON := `{ + "az": "az-a", + "dryRun": true, + "infoVersion": 12345, + "byProject": {} + }` + + req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + api.HandleChangeCommitments(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Dry run should return 200 OK with rejection reason + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status code %d (OK), got %d", http.StatusOK, resp.StatusCode) + } + + // Verify response is JSON + contentType := resp.Header.Get("Content-Type") + if contentType != "application/json" { + t.Errorf("expected Content-Type 'application/json', got %q", contentType) + } + + // Parse response + var response liquid.CommitmentChangeResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.RejectionReason != "Dry run not supported yet" { + t.Errorf("expected rejection reason 'Dry run not supported yet', got %q", response.RejectionReason) + } +} + +func TestProcessCommitmentChanges_KnowledgeNotReady(t *testing.T) { + // Test when flavor groups knowledge is not available + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // No Knowledge CRD created - simulates knowledge not ready + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + requestJSON := `{ + "az": "az-a", + "dryRun": false, + "infoVersion": 12345, + "byProject": {} + }` + + req := httptest.NewRequest(http.MethodPost, "/v1/change-commitments", bytes.NewReader([]byte(requestJSON))) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + api.HandleChangeCommitments(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Should return 200 OK with rejection reason + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status code %d (OK), got %d", http.StatusOK, resp.StatusCode) + } + + var response liquid.CommitmentChangeResponse + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + + if response.RejectionReason != "caches not ready" { + t.Errorf("expected rejection reason 'caches not ready', got %q", response.RejectionReason) + } + + if response.RetryAt.IsNone() { + t.Error("expected RetryAt to be set") + } +} + +// Helper function to create a minimal flavor group for testing +func createTestFlavorGroup() compute.FlavorGroupFeature { + return compute.FlavorGroupFeature{ + Name: "test_group", + Flavors: []compute.FlavorInGroup{ + { + Name: "test.small", + MemoryMB: 8192, + VCPUs: 2, + DiskGB: 40, + ExtraSpecs: map[string]string{ + "quota:separate": "true", + }, + }, + }, + SmallestFlavor: compute.FlavorInGroup{ + Name: "test.small", + MemoryMB: 8192, + VCPUs: 2, + DiskGB: 40, + }, + } +} diff --git a/internal/scheduling/reservations/commitments/api_info.go b/internal/scheduling/reservations/commitments/api_info.go new file mode 100644 index 000000000..db02dd708 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_info.go @@ -0,0 +1,117 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "strings" + "time" + + 
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + liquid "github.com/sapcc/go-api-declarations/liquid" +) + +// handles GET /v1/info requests from Limes: +// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) { + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + } + log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/info") + + // Only accept GET method + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + log.V(1).Info("processing info request") + + // Build info response + info, err := api.buildServiceInfo(r.Context(), log) + if err != nil { + // Use Info level for expected conditions like knowledge not being ready yet + log.Info("service info not available yet", "error", err.Error()) + http.Error(w, "Service temporarily unavailable: "+err.Error(), + http.StatusServiceUnavailable) + return + } + + // Return response + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(info); err != nil { + log.Error(err, "failed to encode service info") + return + } +} + +// buildServiceInfo constructs the ServiceInfo response with metadata for all flavor groups. 
+func (api *HTTPAPI) buildServiceInfo(ctx context.Context, log logr.Logger) (liquid.ServiceInfo, error) { + // Get all flavor groups from Knowledge CRDs + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: api.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + // Return -1 as version when knowledge is not ready + return liquid.ServiceInfo{ + Version: -1, + Resources: make(map[liquid.ResourceName]liquid.ResourceInfo), + }, err + } + + // Build resources map + resources := make(map[liquid.ResourceName]liquid.ResourceInfo) + for groupName, groupData := range flavorGroups { + resourceName := liquid.ResourceName("ram_" + groupName) + + flavorNames := make([]string, 0, len(groupData.Flavors)) + for _, flavor := range groupData.Flavors { + flavorNames = append(flavorNames, flavor.Name) + } + displayName := fmt.Sprintf( + "multiples of %d MiB (usable by: %s)", + groupData.SmallestFlavor.MemoryMB, + strings.Join(flavorNames, ", "), + ) + + resources[resourceName] = liquid.ResourceInfo{ + DisplayName: displayName, + Unit: liquid.UnitNone, // Countable: multiples of smallest flavor instances + Topology: liquid.AZAwareTopology, // Commitments are per-AZ + NeedsResourceDemand: false, // Capacity planning out of scope for now + HasCapacity: true, // We report capacity via /v1/report-capacity + HasQuota: false, // No quota enforcement as of now + HandlesCommitments: true, // We handle commitment changes via /v1/change-commitments + } + + log.V(1).Info("registered flavor group resource", + "resourceName", resourceName, + "flavorGroup", groupName, + "displayName", displayName, + "smallestFlavor", groupData.SmallestFlavor.Name, + "smallestRamMB", groupData.SmallestFlavor.MemoryMB) + } + + // Get last content changed from flavor group knowledge and treat it as version + var version int64 = -1 + if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { + version = 
knowledgeCRD.Status.LastContentChange.Unix() + } + + log.Info("built service info", + "resourceCount", len(resources), + "version", version) + + return liquid.ServiceInfo{ + Version: version, + Resources: resources, + }, nil +} diff --git a/internal/scheduling/reservations/commitments/api_info_test.go b/internal/scheduling/reservations/commitments/api_info_test.go new file mode 100644 index 000000000..71c560c19 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_info_test.go @@ -0,0 +1,78 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestHandleInfo_KnowledgeNotReady(t *testing.T) { + // Test when flavor groups knowledge is not available + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + // No Knowledge CRD created - simulates knowledge not ready + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody) + w := httptest.NewRecorder() + + api.HandleInfo(w, req) + + resp := w.Result() + defer resp.Body.Close() + + // Should return 503 Service Unavailable when knowledge is not ready + if resp.StatusCode != http.StatusServiceUnavailable { + t.Errorf("expected status code %d (Service Unavailable), got %d", http.StatusServiceUnavailable, resp.StatusCode) + } + + // Verify Content-Type is text/plain (set by http.Error) + contentType := resp.Header.Get("Content-Type") + if contentType != "text/plain; charset=utf-8" { + t.Errorf("expected Content-Type 'text/plain; charset=utf-8', got %q", contentType) + } +} + +func TestHandleInfo_MethodNotAllowed(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + api := &HTTPAPI{ + client: k8sClient, + } + + // Use POST instead of GET + req := httptest.NewRequest(http.MethodPost, "/v1/info", http.NoBody) + w := httptest.NewRecorder() + + api.HandleInfo(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != http.StatusMethodNotAllowed { + t.Errorf("expected status code %d (Method Not Allowed), got %d", http.StatusMethodNotAllowed, resp.StatusCode) + } +} diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go new file mode 100644 index 000000000..0ec1f5e7d --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_report_capacity.go @@ -0,0 +1,61 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/sapcc/go-api-declarations/liquid" +) + +// handles POST /v1/report-capacity requests from Limes: +// See: 
https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid +// Reports available capacity across all flavor group resources. Note, unit is specified in the Info API response with multiple of the smallest memory resource unit within a flavor group. +func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) { + // Extract or generate request ID for tracing + requestID := r.Header.Get("X-Request-ID") + if requestID == "" { + requestID = fmt.Sprintf("req-%d", time.Now().UnixNano()) + } + log := commitmentApiLog.WithValues("requestID", requestID, "endpoint", "/v1/report-capacity") + + // Only accept POST method + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + log.V(1).Info("processing report capacity request") + + // Parse request body (may be empty or contain ServiceCapacityRequest) + var req liquid.ServiceCapacityRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + // Empty body is acceptable for capacity reports + req = liquid.ServiceCapacityRequest{} + } + + // Calculate capacity + calculator := NewCapacityCalculator(api.client) + report, err := calculator.CalculateCapacity(r.Context()) + if err != nil { + log.Error(err, "failed to calculate capacity") + http.Error(w, "Failed to calculate capacity: "+err.Error(), + http.StatusInternalServerError) + return + } + + log.Info("calculated capacity report", "resourceCount", len(report.Resources)) + + // Return response + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(report); err != nil { + log.Error(err, "failed to encode capacity report") + return + } +} diff --git a/internal/scheduling/reservations/commitments/api_report_capacity_test.go b/internal/scheduling/reservations/commitments/api_report_capacity_test.go new file mode 100644 index 
000000000..76140e218 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api_report_capacity_test.go @@ -0,0 +1,285 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/sapcc/go-api-declarations/liquid" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" +) + +func TestHandleReportCapacity(t *testing.T) { + // Setup fake client + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create empty flavor groups knowledge so capacity calculation doesn't fail + emptyKnowledge := createEmptyFlavorGroupKnowledge() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(emptyKnowledge). + Build() + + api := NewAPI(fakeClient) + + tests := []struct { + name string + method string + body interface{} + expectedStatus int + checkResponse func(*testing.T, *liquid.ServiceCapacityReport) + }{ + { + name: "POST request succeeds", + method: http.MethodPost, + body: liquid.ServiceCapacityRequest{}, + expectedStatus: http.StatusOK, + checkResponse: func(t *testing.T, resp *liquid.ServiceCapacityReport) { + // Resources may be nil or empty for empty capacity + if len(resp.Resources) != 0 { + t.Errorf("Expected empty or nil Resources, got %d resources", len(resp.Resources)) + } + }, + }, + { + name: "POST with empty body succeeds", + method: http.MethodPost, + body: nil, + expectedStatus: http.StatusOK, + checkResponse: func(t *testing.T, resp *liquid.ServiceCapacityReport) { + // Resources may be nil or empty for empty capacity + if len(resp.Resources) != 0 { + t.Errorf("Expected empty or nil Resources, got %d resources", len(resp.Resources)) + } + }, + }, + { + name: "GET request fails", + method: 
http.MethodGet, + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + checkResponse: nil, + }, + { + name: "PUT request fails", + method: http.MethodPut, + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + checkResponse: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create request + var req *http.Request + if tt.body != nil { + bodyBytes, err := json.Marshal(tt.body) + if err != nil { + t.Fatal(err) + } + req = httptest.NewRequest(tt.method, "/v1/report-capacity", bytes.NewReader(bodyBytes)) + } else { + req = httptest.NewRequest(tt.method, "/v1/report-capacity", http.NoBody) + } + req = req.WithContext(context.Background()) + + // Create response recorder + rr := httptest.NewRecorder() + + // Call handler + api.HandleReportCapacity(rr, req) + + // Check status code + if rr.Code != tt.expectedStatus { + t.Errorf("Expected status %d, got %d", tt.expectedStatus, rr.Code) + } + + // Check response if applicable + if tt.checkResponse != nil && rr.Code == http.StatusOK { + var resp liquid.ServiceCapacityReport + if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil { + t.Fatalf("Failed to decode response: %v", err) + } + tt.checkResponse(t, &resp) + } + }) + } +} + +func TestCapacityCalculator(t *testing.T) { + // Setup fake client with Knowledge CRD + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + t.Run("CalculateCapacity returns error when no flavor groups knowledge exists", func(t *testing.T) { + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + calculator := NewCapacityCalculator(fakeClient) + _, err := calculator.CalculateCapacity(context.Background()) + if err == nil { + t.Fatal("Expected error when flavor groups knowledge doesn't exist, got nil") + } + if !strings.Contains(err.Error(), "not found") { + t.Errorf("Expected 'not found' error, got: %v", err) + } + }) + + t.Run("CalculateCapacity returns empty report when flavor groups knowledge exists but is empty", func(t *testing.T) { + // Create empty flavor groups knowledge + emptyKnowledge := createEmptyFlavorGroupKnowledge() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(emptyKnowledge). + Build() + + calculator := NewCapacityCalculator(fakeClient) + report, err := calculator.CalculateCapacity(context.Background()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if report.Resources == nil { + t.Error("Expected Resources map to be initialized") + } + + if len(report.Resources) != 0 { + t.Errorf("Expected 0 resources, got %d", len(report.Resources)) + } + }) + + t.Run("CalculateCapacity returns empty perAZ when no HostDetails exist", func(t *testing.T) { + // Create a flavor group knowledge without host details + flavorGroupKnowledge := createTestFlavorGroupKnowledge(t, "test-group") + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(flavorGroupKnowledge). 
+ Build() + + calculator := NewCapacityCalculator(fakeClient) + report, err := calculator.CalculateCapacity(context.Background()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if len(report.Resources) != 1 { + t.Fatalf("Expected 1 resource, got %d", len(report.Resources)) + } + + resource := report.Resources[liquid.ResourceName("ram_test-group")] + if resource == nil { + t.Fatal("Expected ram_test-group resource to exist") + } + + // Should have empty perAZ map when no host details + if len(resource.PerAZ) != 0 { + t.Errorf("Expected 0 AZs, got %d", len(resource.PerAZ)) + } + }) +} + +// createEmptyFlavorGroupKnowledge creates an empty flavor groups Knowledge CRD +func createEmptyFlavorGroupKnowledge() *v1alpha1.Knowledge { + // Box empty array properly + emptyFeatures := []map[string]interface{}{} + raw, err := v1alpha1.BoxFeatureList(emptyFeatures) + if err != nil { + panic(err) // Should never happen for empty slice + } + + return &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{ + Name: "flavor-groups", + // No namespace - Knowledge is cluster-scoped + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: "True", + }, + }, + Raw: raw, + }, + } +} + +// createTestFlavorGroupKnowledge creates a test Knowledge CRD with flavor group data +func createTestFlavorGroupKnowledge(t *testing.T, groupName string) *v1alpha1.Knowledge { + t.Helper() + + features := []map[string]interface{}{ + { + "name": groupName, + "flavors": []map[string]interface{}{ + { + "name": "test_c8_m32", + "vcpus": 8, + "memoryMB": 32768, + "diskGB": 50, + }, + }, + "largestFlavor": map[string]interface{}{ + "name": "test_c8_m32", + "vcpus": 8, + "memoryMB": 32768, + "diskGB": 50, + }, + }, + } + + // Use BoxFeatureList to properly format 
the features + raw, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatal(err) + } + + return &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{ + Name: "flavor-groups", + // No namespace - Knowledge is cluster-scoped + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: "True", + }, + }, + Raw: raw, + }, + } +} diff --git a/internal/scheduling/reservations/commitments/capacity.go b/internal/scheduling/reservations/commitments/capacity.go new file mode 100644 index 000000000..04ad177e1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/capacity.go @@ -0,0 +1,124 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "fmt" + "sort" + + "github.com/sapcc/go-api-declarations/liquid" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" +) + +// CapacityCalculator computes capacity reports for Limes LIQUID API. +type CapacityCalculator struct { + client client.Client +} + +func NewCapacityCalculator(client client.Client) *CapacityCalculator { + return &CapacityCalculator{client: client} +} + +// CalculateCapacity computes per-AZ capacity for all flavor groups. 
+func (c *CapacityCalculator) CalculateCapacity(ctx context.Context) (liquid.ServiceCapacityReport, error) { + // Get all flavor groups from Knowledge CRDs + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to get flavor groups: %w", err) + } + + // Build capacity report per flavor group + report := liquid.ServiceCapacityReport{ + Resources: make(map[liquid.ResourceName]*liquid.ResourceCapacityReport), + } + + for groupName, groupData := range flavorGroups { + // Resource name follows pattern: ram_ + resourceName := liquid.ResourceName("ram_" + groupName) + + // Calculate per-AZ capacity and usage + azCapacity, err := c.calculateAZCapacity(ctx, groupName, groupData) + if err != nil { + return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to calculate capacity for %s: %w", groupName, err) + } + + report.Resources[resourceName] = &liquid.ResourceCapacityReport{ + PerAZ: azCapacity, + } + } + + return report, nil +} + +func (c *CapacityCalculator) calculateAZCapacity( + ctx context.Context, + _ string, // groupName - reserved for future use + _ compute.FlavorGroupFeature, // groupData - reserved for future use +) (map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, error) { + // Get list of availability zones from HostDetails Knowledge + azs, err := c.getAvailabilityZones(ctx) + if err != nil { + return nil, fmt.Errorf("failed to get availability zones: %w", err) + } + + // Create report entry for each AZ with empty capacity/usage + // Capacity and Usage are left unset (zero value of option.Option[uint64]) + // This signals to Limes: "These AZs exist, but capacity/usage not yet calculated" + result := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport) + for _, az := range azs { + result[liquid.AvailabilityZone(az)] = &liquid.AZResourceCapacityReport{ + // Both Capacity and 
Usage left unset (empty optional values) + // TODO: Calculate actual capacity from Reservation CRDs or host resources + // TODO: Calculate actual usage from VM allocations + } + } + + return result, nil +} + +func (c *CapacityCalculator) getAvailabilityZones(ctx context.Context) ([]string, error) { + // List all Knowledge CRDs to find host-details knowledge + var knowledgeList v1alpha1.KnowledgeList + if err := c.client.List(ctx, &knowledgeList); err != nil { + return nil, fmt.Errorf("failed to list Knowledge CRDs: %w", err) + } + + // Find host-details knowledge and extract AZs + azSet := make(map[string]struct{}) + for _, knowledge := range knowledgeList.Items { + // Look for host-details extractor + if knowledge.Spec.Extractor.Name != "host_details" { + continue + } + + // Parse features from Raw data + features, err := v1alpha1.UnboxFeatureList[compute.HostDetails](knowledge.Status.Raw) + if err != nil { + // Skip if we can't parse this knowledge + continue + } + + // Collect unique AZ names + for _, feature := range features { + if feature.AvailabilityZone != "" { + azSet[feature.AvailabilityZone] = struct{}{} + } + } + } + + // Convert set to sorted slice + azs := make([]string, 0, len(azSet)) + for az := range azSet { + azs = append(azs, az) + } + sort.Strings(azs) + + return azs, nil +} diff --git a/internal/scheduling/reservations/commitments/client.go b/internal/scheduling/reservations/commitments/client.go index 31e79c5b0..2e5585c99 100644 --- a/internal/scheduling/reservations/commitments/client.go +++ b/internal/scheduling/reservations/commitments/client.go @@ -14,11 +14,10 @@ import ( "github.com/cobaltcore-dev/cortex/pkg/keystone" "github.com/cobaltcore-dev/cortex/pkg/sso" "github.com/gophercloud/gophercloud/v2" - "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/flavors" - "github.com/gophercloud/gophercloud/v2/openstack/compute/v2/servers" "github.com/gophercloud/gophercloud/v2/openstack/identity/v3/projects" 
"github.com/sapcc/go-bits/jobloop" "github.com/sapcc/go-bits/must" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -28,13 +27,8 @@ type CommitmentsClient interface { Init(ctx context.Context, client client.Client, conf SyncerConfig) error // List all projects to resolve commitments. ListProjects(ctx context.Context) ([]Project, error) - // List all flavors by their name to resolve instance commitments. - ListFlavorsByName(ctx context.Context) (map[string]Flavor, error) // List all commitments with resolved metadata (e.g. project, flavor, ...). ListCommitmentsByID(ctx context.Context, projects ...Project) (map[string]Commitment, error) - // List all servers for the given projects from nova. - // The result is a map from project ID to the list of servers. - ListServersByProjectID(ctx context.Context, projects ...Project) (map[string][]Server, error) } // Commitments client fetching commitments from openstack services. @@ -49,14 +43,13 @@ type commitmentsClient struct { limes *gophercloud.ServiceClient } -// Create a new commitments client. -// By default, this client will fetch commitments from the limes API. func NewCommitmentsClient() CommitmentsClient { return &commitmentsClient{} } -// Init the client. 
func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf SyncerConfig) error { + log := ctrl.Log.WithName("CommitmentClient") + var authenticatedHTTP = http.DefaultClient if conf.SSOSecretRef != nil { var err error @@ -79,7 +72,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "identity", Availability: "public", })) - syncLog.Info("using identity endpoint", "url", url) + log.Info("using identity endpoint", "url", url) c.keystone = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -91,7 +84,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "compute", Availability: "public", })) - syncLog.Info("using nova endpoint", "url", url) + log.Info("using nova endpoint", "url", url) c.nova = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -104,7 +97,7 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf Type: "resources", Availability: "public", })) - syncLog.Info("using limes endpoint", "url", url) + log.Info("using limes endpoint", "url", url) c.limes = &gophercloud.ServiceClient{ ProviderClient: c.provider, Endpoint: url, @@ -113,32 +106,10 @@ func (c *commitmentsClient) Init(ctx context.Context, client client.Client, conf return nil } -// Get all Nova flavors by their name to resolve instance commitments. -func (c *commitmentsClient) ListFlavorsByName(ctx context.Context) (map[string]Flavor, error) { - syncLog.Info("fetching all flavors from nova") - flo := flavors.ListOpts{AccessType: flavors.AllAccess} - pages, err := flavors.ListDetail(c.nova, flo).AllPages(ctx) - if err != nil { - return nil, err - } - // Parse the json data into our custom model. 
- var data = &struct { - Flavors []Flavor `json:"flavors"` - }{} - if err := pages.(flavors.FlavorPage).ExtractInto(data); err != nil { - return nil, err - } - syncLog.Info("fetched flavors from nova", "count", len(data.Flavors)) - flavorsByName := make(map[string]Flavor, len(data.Flavors)) - for _, flavor := range data.Flavors { - flavorsByName[flavor.Name] = flavor - } - return flavorsByName, nil -} - -// Get all projects from Keystone to resolve commitments. func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) { - syncLog.Info("fetching projects from keystone") + log := ctrl.Log.WithName("CommitmentClient") + + log.V(1).Info("fetching projects from keystone") allPages, err := projects.List(c.keystone, nil).AllPages(ctx) if err != nil { return nil, err @@ -149,14 +120,15 @@ func (c *commitmentsClient) ListProjects(ctx context.Context) ([]Project, error) if err := allPages.(projects.ProjectPage).ExtractInto(data); err != nil { return nil, err } - syncLog.Info("fetched projects from keystone", "count", len(data.Projects)) + log.V(1).Info("fetched projects from keystone", "count", len(data.Projects)) return data.Projects, nil } -// Get all available commitments from limes + keystone + nova. -// This function fetches the commitments for each project in parallel. +// ListCommitmentsByID fetches commitments for all projects in parallel. func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects ...Project) (map[string]Commitment, error) { - syncLog.Info("fetching commitments from limes", "projects", len(projects)) + log := ctrl.Log.WithName("CommitmentClient") + + log.V(1).Info("fetching commitments from limes", "projects", len(projects)) commitmentsMutex := gosync.Mutex{} commitments := make(map[string]Commitment) var wg gosync.WaitGroup @@ -189,15 +161,14 @@ func (c *commitmentsClient) ListCommitmentsByID(ctx context.Context, projects .. // Return the first error encountered, if any. 
for err := range errChan { if err != nil { - syncLog.Error(err, "failed to resolve commitments") + log.Error(err, "failed to resolve commitments") return nil, err } } - syncLog.Info("resolved commitments from limes", "count", len(commitments)) + log.V(1).Info("resolved commitments from limes", "count", len(commitments)) return commitments, nil } -// Resolve the commitments for the given project. func (c *commitmentsClient) listCommitments(ctx context.Context, project Project) ([]Commitment, error) { url := c.limes.Endpoint + "v1" + "/domains/" + project.DomainID + @@ -232,67 +203,3 @@ func (c *commitmentsClient) listCommitments(ctx context.Context, project Project } return commitments, nil } - -// Get all servers for the given project ids from nova. -// The result is a map from project ID to the list of servers. -func (c *commitmentsClient) ListServersByProjectID(ctx context.Context, projects ...Project) (map[string][]Server, error) { - syncLog.Info("fetching servers from nova") - serversByProject := make(map[string][]Server, len(projects)) - var mu gosync.Mutex - var wg gosync.WaitGroup - ctx, cancel := context.WithCancel(ctx) - defer cancel() - // Channel to communicate errors from goroutines. - errChan := make(chan error, len(projects)) - for _, project := range projects { - wg.Go(func() { - servers, err := c.listServersForProject(ctx, project) - if err != nil { - errChan <- err - cancel() - return - } - mu.Lock() - serversByProject[project.ID] = servers - mu.Unlock() - }) - time.Sleep(jobloop.DefaultJitter(50 * time.Millisecond)) // Don't overload the API. - } - // Wait for all goroutines to finish and close the error channel. - go func() { - wg.Wait() - close(errChan) - }() - // Return the first error encountered, if any. 
- for err := range errChan { - if err != nil { - syncLog.Error(err, "failed to fetch servers") - return nil, err - } - } - syncLog.Info("fetched servers from nova", "projects", len(serversByProject)) - return serversByProject, nil -} - -// Get all servers for the given project id from nova. -func (c *commitmentsClient) listServersForProject(ctx context.Context, project Project) ([]Server, error) { - lo := servers.ListOpts{ - // AllTenants must be set to fetch servers from other projects - // than the one we are authenticated with. - AllTenants: true, - TenantID: project.ID, - } - pages, err := servers.List(c.nova, lo).AllPages(ctx) - if err != nil { - return nil, err - } - // Parse the json data into our custom model. - var data = &struct { - Servers []Server `json:"servers"` - }{} - if err := pages.(servers.ServerPage).ExtractInto(data); err != nil { - return nil, err - } - syncLog.Info("fetched servers for project", "project", project.ID, "count", len(data.Servers)) - return data.Servers, nil -} diff --git a/internal/scheduling/reservations/commitments/client_test.go b/internal/scheduling/reservations/commitments/client_test.go index f3a1d0a8f..be2d66ff9 100644 --- a/internal/scheduling/reservations/commitments/client_test.go +++ b/internal/scheduling/reservations/commitments/client_test.go @@ -8,7 +8,6 @@ import ( "encoding/json" "net/http" "net/http/httptest" - "reflect" "strings" "testing" "time" @@ -127,134 +126,6 @@ func TestCommitmentsClient_ListProjects_Error(t *testing.T) { } } -func TestCommitmentsClient_ListFlavorsByName(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if strings.Contains(r.URL.Path, "/flavors/detail") { - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - _, err := w.Write([]byte(`{ - "flavors": [ - { - "id": "flavor1", - "name": "m1.small", - "ram": 2048, - "vcpus": 1, 
- "disk": 20, - "rxtx_factor": 1.0, - "os-flavor-access:is_public": true, - "OS-FLV-EXT-DATA:ephemeral": 0, - "description": "Small flavor", - "extra_specs": {"hw:cpu_policy": "shared"} - }, - { - "id": "flavor2", - "name": "m1.medium", - "ram": 4096, - "vcpus": 2, - "disk": 40, - "rxtx_factor": 1.0, - "os-flavor-access:is_public": true, - "OS-FLV-EXT-DATA:ephemeral": 0, - "description": "Medium flavor", - "extra_specs": {"hw:cpu_policy": "dedicated"} - } - ] - }`)) - if err != nil { - t.Fatalf("failed to write response: %v", err) - } - return - } - http.NotFound(w, r) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - Microversion: "2.61", - }, - } - - ctx := context.Background() - flavorsByName, err := client.ListFlavorsByName(ctx) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - expectedFlavors := map[string]Flavor{ - "m1.small": { - ID: "flavor1", - Name: "m1.small", - RAM: 2048, - VCPUs: 1, - Disk: 20, - RxTxFactor: 1.0, - IsPublic: true, - Ephemeral: 0, - Description: "Small flavor", - ExtraSpecs: map[string]string{"hw:cpu_policy": "shared"}, - }, - "m1.medium": { - ID: "flavor2", - Name: "m1.medium", - RAM: 4096, - VCPUs: 2, - Disk: 40, - RxTxFactor: 1.0, - IsPublic: true, - Ephemeral: 0, - Description: "Medium flavor", - ExtraSpecs: map[string]string{"hw:cpu_policy": "dedicated"}, - }, - } - - if len(flavorsByName) != len(expectedFlavors) { - t.Fatalf("expected %d flavors, got %d", len(expectedFlavors), len(flavorsByName)) - } - - for name, expected := range expectedFlavors { - actual, exists := flavorsByName[name] - if !exists { - t.Errorf("expected flavor %s to exist", name) - continue - } - if !reflect.DeepEqual(actual, expected) { - t.Errorf("flavor %s: expected %+v, got %+v", name, expected, actual) - } - } -} - -func TestCommitmentsClient_ListFlavorsByName_Error(t 
*testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Service Unavailable", http.StatusServiceUnavailable) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - ctx := context.Background() - flavors, err := client.ListFlavorsByName(ctx) - if err == nil { - t.Fatal("expected error, got nil") - } - if flavors != nil { - t.Errorf("expected nil flavors, got %+v", flavors) - } -} - func TestCommitmentsClient_ListCommitmentsByID(t *testing.T) { // Mock server for Limes service server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -342,147 +213,6 @@ func TestCommitmentsClient_ListCommitmentsByID(t *testing.T) { } } -func TestCommitmentsClient_ListCommitmentsByID_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Unauthorized", http.StatusUnauthorized) - })) - defer server.Close() - - client := &commitmentsClient{ - limes: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - TokenID: "test-token", - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1", DomainID: "domain1"}, - } - - ctx := context.Background() - commitments, err := client.ListCommitmentsByID(ctx, projects...) 
- if err == nil { - t.Fatal("expected error, got nil") - } - if commitments != nil { - t.Errorf("expected nil commitments, got %+v", commitments) - } -} - -func TestCommitmentsClient_ListServersByProjectID(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if strings.Contains(r.URL.Path, "/servers/detail") { - // Parse query parameters to determine which project - tenantID := r.URL.Query().Get("tenant_id") - - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - if tenantID == "project1" { - if _, err := w.Write([]byte(`{ - "servers": [ - { - "id": "server1", - "name": "test-server-1", - "status": "ACTIVE", - "tenant_id": "project1", - "flavor": {"original_name": "m1.small"} - } - ] - }`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - } else { - if _, err := w.Write([]byte(`{"servers": []}`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - } - return - } - http.NotFound(w, r) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1", Name: "Test Project 1"}, - {ID: "project2", Name: "Test Project 2"}, - } - - ctx := context.Background() - serversByProject, err := client.ListServersByProjectID(ctx, projects...) 
- if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if len(serversByProject) != 2 { - t.Fatalf("expected 2 project entries, got %d", len(serversByProject)) - } - - // Check project1 has 1 server - servers1, exists := serversByProject["project1"] - if !exists { - t.Fatal("expected project1 to exist in results") - } - if len(servers1) != 1 { - t.Fatalf("expected 1 server for project1, got %d", len(servers1)) - } - if servers1[0].ID != "server1" { - t.Errorf("expected server ID server1, got %s", servers1[0].ID) - } - - // Check project2 has 0 servers - servers2, exists := serversByProject["project2"] - if !exists { - t.Fatal("expected project2 to exist in results") - } - if len(servers2) != 0 { - t.Fatalf("expected 0 servers for project2, got %d", len(servers2)) - } -} - -func TestCommitmentsClient_ListServersByProjectID_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Forbidden", http.StatusForbidden) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - projects := []Project{ - {ID: "project1"}, - } - - ctx := context.Background() - servers, err := client.ListServersByProjectID(ctx, projects...) 
- if err == nil { - t.Fatal("expected error, got nil") - } - if servers != nil { - t.Errorf("expected nil servers, got %+v", servers) - } -} - func TestCommitmentsClient_listCommitments(t *testing.T) { // Mock server for Limes service server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -625,136 +355,6 @@ func TestCommitmentsClient_listCommitments_JSONError(t *testing.T) { } } -func TestCommitmentsClient_listServersForProject(t *testing.T) { - // Mock server for Nova compute service - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if !strings.Contains(r.URL.Path, "/servers/detail") { - http.NotFound(w, r) - return - } - - // Verify query parameters - query := r.URL.Query() - if query.Get("all_tenants") != "true" { - t.Errorf("expected all_tenants=true, got %s", query.Get("all_tenants")) - } - if query.Get("tenant_id") != "test-project" { - t.Errorf("expected tenant_id=test-project, got %s", query.Get("tenant_id")) - } - - // Return raw JSON string as the gophercloud pages expect - w.Header().Set("Content-Type", "application/json") - if _, err := w.Write([]byte(`{ - "servers": [ - { - "id": "server1", - "name": "test-server", - "status": "ACTIVE", - "tenant_id": "test-project", - "flavor": {"original_name": "m1.small"} - }, - { - "id": "server2", - "name": "another-server", - "status": "ACTIVE", - "tenant_id": "test-project", - "flavor": {"original_name": "m1.medium"} - } - ] - }`)); err != nil { - t.Fatalf("failed to write response: %v", err) - } - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL + "/", - }, - } - - project := Project{ - ID: "test-project", - Name: "Test Project", - } - - ctx := context.Background() - servers, err := client.listServersForProject(ctx, project) - if err != nil { - t.Fatalf("unexpected 
error: %v", err) - } - - if len(servers) != 2 { - t.Fatalf("expected 2 servers, got %d", len(servers)) - } - - expectedServers := []Server{ - { - ID: "server1", - Name: "test-server", - Status: "ACTIVE", - TenantID: "test-project", - FlavorName: "m1.small", - }, - { - ID: "server2", - Name: "another-server", - Status: "ACTIVE", - TenantID: "test-project", - FlavorName: "m1.medium", - }, - } - - for i, expected := range expectedServers { - if servers[i].ID != expected.ID { - t.Errorf("server %d: expected ID %s, got %s", i, expected.ID, servers[i].ID) - } - if servers[i].Name != expected.Name { - t.Errorf("server %d: expected Name %s, got %s", i, expected.Name, servers[i].Name) - } - if servers[i].Status != expected.Status { - t.Errorf("server %d: expected Status %s, got %s", i, expected.Status, servers[i].Status) - } - if servers[i].TenantID != expected.TenantID { - t.Errorf("server %d: expected TenantID %s, got %s", i, expected.TenantID, servers[i].TenantID) - } - if servers[i].FlavorName != expected.FlavorName { - t.Errorf("server %d: expected FlavorName %s, got %s", i, expected.FlavorName, servers[i].FlavorName) - } - } -} - -func TestCommitmentsClient_listServersForProject_Error(t *testing.T) { - // Mock server that returns an error - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Error(w, "Internal Server Error", http.StatusInternalServerError) - })) - defer server.Close() - - client := &commitmentsClient{ - nova: &gophercloud.ServiceClient{ - ProviderClient: &gophercloud.ProviderClient{ - HTTPClient: *http.DefaultClient, - }, - Endpoint: server.URL, - }, - } - - project := Project{ID: "test-project"} - - ctx := context.Background() - servers, err := client.listServersForProject(ctx, project) - if err == nil { - t.Fatal("expected error, got nil") - } - if servers != nil { - t.Errorf("expected nil servers, got %+v", servers) - } -} - func TestCommitmentsClient_ContextCancellation(t *testing.T) { // Test 
context cancellation handling slowServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go new file mode 100644 index 000000000..350de7e8c --- /dev/null +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -0,0 +1,310 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "fmt" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/go-logr/logr" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ReservationManager handles CRUD operations for Reservation CRDs. +type ReservationManager struct { + client.Client +} + +func NewReservationManager(k8sClient client.Client) *ReservationManager { + return &ReservationManager{ + Client: k8sClient, + } +} + +// ApplyCommitmentState synchronizes Reservation CRDs to match the desired commitment state. +// This function performs CRUD operations (create/update/delete) on reservation slots to align +// with the capacity specified in desiredState. +// +// Entry points: +// - from Syncer - periodic sync with Limes state +// - from API ChangeCommitmentsHandler - batch processing of commitment changes +// +// The function is idempotent and handles: +// - Repairing inconsistent slots (wrong flavor group/project) +// - Creating new reservation slots when capacity increases +// - Deleting unused/excess slots when capacity decreases +// - Syncing reservation metadata for all remaining slots +// +// Returns touched reservations (created/updated) and removed reservations for caller tracking. 
+func (m *ReservationManager) ApplyCommitmentState( + ctx context.Context, + log logr.Logger, + desiredState *CommitmentState, + flavorGroups map[string]compute.FlavorGroupFeature, + creator string, +) (touchedReservations, removedReservations []v1alpha1.Reservation, err error) { + + log = log.WithName("ReservationManager") + + // Phase 1: List and filter existing reservations for this commitment + var allReservations v1alpha1.ReservationList + if err := m.List(ctx, &allReservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + return nil, nil, fmt.Errorf("failed to list reservations: %w", err) + } + + // Filter by name prefix to find reservations for this commitment + namePrefix := fmt.Sprintf("commitment-%s-", desiredState.CommitmentUUID) + var existing []v1alpha1.Reservation + for _, res := range allReservations.Items { + if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix { + existing = append(existing, res) + } + } + + // Phase 2: Calculate memory delta (desired - current) + flavorGroup, exists := flavorGroups[desiredState.FlavorGroupName] + + if !exists { + return nil, nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) + } + deltaMemoryBytes := desiredState.TotalMemoryBytes + for _, res := range existing { + memoryQuantity := res.Spec.Resources["memory"] + deltaMemoryBytes -= memoryQuantity.Value() + } + + log.Info("applying commitment state", + "commitmentUUID", desiredState.CommitmentUUID, + "desiredMemoryBytes", desiredState.TotalMemoryBytes, + "deltaMemoryBytes", deltaMemoryBytes, + "existingSlots", len(existing), + ) + + nextSlotIndex := GetNextSlotIndex(existing) + + // Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project) + // They will be recreated with correct metadata in subsequent phases. 
+ var validReservations []v1alpha1.Reservation + for _, res := range existing { + if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName || + res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID { + log.Info("Found a reservation with wrong flavor group or project, delete and recreate afterward", + "commitmentUUID", desiredState.CommitmentUUID, + "name", res.Name, + "expectedFlavorGroup", desiredState.FlavorGroupName, + "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, + "expectedProjectID", desiredState.ProjectID, + "actualProjectID", res.Spec.CommittedResourceReservation.ProjectID) + removedReservations = append(removedReservations, res) + memValue := res.Spec.Resources["memory"] + deltaMemoryBytes += memValue.Value() + + if err := m.Delete(ctx, &res); err != nil { + return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) + } + } else { + validReservations = append(validReservations, res) + } + } + existing = validReservations + + // Phase 4 (DELETE): Remove reservations (capacity decreased) + for deltaMemoryBytes < 0 && len(existing) > 0 { + // prefer unused reservation slot or simply remove last one + var reservationToDelete *v1alpha1.Reservation + for i, res := range existing { + if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { + reservationToDelete = &res + existing = append(existing[:i], existing[i+1:]...) 
// remove from existing list + break + } + } + if reservationToDelete == nil { + reservationToDelete = &existing[len(existing)-1] + existing = existing[:len(existing)-1] // remove from existing list + } + removedReservations = append(removedReservations, *reservationToDelete) + memValue := reservationToDelete.Spec.Resources["memory"] + deltaMemoryBytes += memValue.Value() + + log.Info("deleting reservation", + "commitmentUUID", desiredState.CommitmentUUID, + "deltaMemoryBytes", deltaMemoryBytes, + "name", reservationToDelete.Name, + "numAllocations", len(reservationToDelete.Spec.CommittedResourceReservation.Allocations), + "memoryBytes", memValue.Value()) + + if err := m.Delete(ctx, reservationToDelete); err != nil { + return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err) + } + } + + // Phase 5 (CREATE): Create new reservations (capacity increased) + for deltaMemoryBytes > 0 { + // Need to create new reservation slots, always prefer largest flavor within the group + // TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio + reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator) + touchedReservations = append(touchedReservations, *reservation) + memValue := reservation.Spec.Resources["memory"] + deltaMemoryBytes -= memValue.Value() + + log.Info("creating reservation", + "commitmentUUID", desiredState.CommitmentUUID, + "deltaMemoryBytes", deltaMemoryBytes, + "name", reservation.Name, + "memoryBytes", memValue.Value()) + + if err := m.Create(ctx, reservation); err != nil { + if apierrors.IsAlreadyExists(err) { + return touchedReservations, removedReservations, fmt.Errorf( + "reservation %s already exists (collision detected): %w", + reservation.Name, err) + } + return touchedReservations, removedReservations, fmt.Errorf( + "failed to create reservation slot %d: %w", + nextSlotIndex, err) + } + + 
nextSlotIndex++
+	}
+
+	// Phase 6 (UPDATE): Sync metadata for remaining reservations
+	for i := range existing {
+		updated, err := m.syncReservationMetadata(ctx, log, &existing[i], desiredState)
+		if err != nil {
+			return touchedReservations, removedReservations, err
+		}
+		if updated != nil {
+			touchedReservations = append(touchedReservations, *updated)
+		}
+	}
+
+	log.Info("completed commitment state sync",
+		"commitmentUUID", desiredState.CommitmentUUID,
+		"totalReservations", len(existing),
+		"touched", len(touchedReservations),
+		"deleted", len(removedReservations))
+
+	return touchedReservations, removedReservations, nil
+}
+
+// syncReservationMetadata updates reservation metadata if it differs from desired state.
+func (m *ReservationManager) syncReservationMetadata(
+	ctx context.Context,
+	log logr.Logger,
+	reservation *v1alpha1.Reservation,
+	state *CommitmentState,
+) (*v1alpha1.Reservation, error) {
+
+	// if any of AZ, StartTime, EndTime differ from desired state, need to patch
+	if (state.AvailabilityZone != "" && reservation.Spec.AvailabilityZone != state.AvailabilityZone) ||
+		(state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) ||
+		(state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) {
+		// Apply patch
+		log.Info("syncing reservation metadata",
+			"reservation", reservation.Name,
+			"availabilityZone", state.AvailabilityZone,
+			"startTime", state.StartTime,
+			"endTime", state.EndTime)
+
+		patch := client.MergeFrom(reservation.DeepCopy())
+
+		if state.AvailabilityZone != "" {
+			reservation.Spec.AvailabilityZone = state.AvailabilityZone
+		}
+		if state.StartTime != nil {
+			reservation.Spec.StartTime = &metav1.Time{Time: *state.StartTime}
+		}
+		if state.EndTime != nil {
+			reservation.Spec.EndTime = &metav1.Time{Time: *state.EndTime}
+		}
+
+		if err := m.Patch(ctx, reservation, patch); err != nil {
+			return
nil, fmt.Errorf("failed to patch reservation %s: %w", + reservation.Name, err) + } + + return reservation, nil + } else { + return nil, nil // No changes needed + } +} + +func (m *ReservationManager) newReservation( + state *CommitmentState, + slotIndex int, + deltaMemoryBytes int64, + flavorGroup compute.FlavorGroupFeature, + creator string, +) *v1alpha1.Reservation { + + name := fmt.Sprintf("commitment-%s-%d", state.CommitmentUUID, slotIndex) + + // Select first flavor that fits remaining memory (flavors sorted descending by size) + flavorInGroup := flavorGroup.Flavors[len(flavorGroup.Flavors)-1] // default to smallest + memoryBytes := deltaMemoryBytes + cpus := int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded + + for _, flavor := range flavorGroup.Flavors { + flavorMemoryBytes := int64(flavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + if flavorMemoryBytes <= deltaMemoryBytes { + flavorInGroup = flavor + memoryBytes = flavorMemoryBytes + cpus = int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs from flavor specs, realistically bounded + break + } + } + + spec := v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity( + memoryBytes, + resource.BinarySI, + ), + "cpu": *resource.NewQuantity( + cpus, + resource.DecimalSI, + ), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: state.ProjectID, + DomainID: state.DomainID, + ResourceGroup: state.FlavorGroupName, + ResourceName: flavorInGroup.Name, + Creator: creator, + Allocations: nil, + }, + } + + // Set AvailabilityZone if specified + if state.AvailabilityZone != "" { + spec.AvailabilityZone = state.AvailabilityZone + } + + // Set validity times if specified + if state.StartTime != nil { + spec.StartTime = &metav1.Time{Time: *state.StartTime} + } + if state.EndTime != nil { + 
spec.EndTime = &metav1.Time{Time: *state.EndTime} + } + + return &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: spec, + } +} diff --git a/internal/scheduling/reservations/commitments/reservation_manager_test.go b/internal/scheduling/reservations/commitments/reservation_manager_test.go new file mode 100644 index 000000000..d8cf9c267 --- /dev/null +++ b/internal/scheduling/reservations/commitments/reservation_manager_test.go @@ -0,0 +1,540 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "context" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestApplyCommitmentState_CreatesNewReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: 3 multiples of smallest flavor (24 GiB) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 3 * 8192 * 1024 * 1024, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(removed) != 0 { + t.Errorf("expected 0 removed reservations, got %d", len(removed)) + } + + // Should create reservations to fulfill the commitment + if len(touched) == 0 { + t.Fatal("expected at least one reservation to be created") + } + + // Verify created reservations sum to desired state + totalMemory := int64(0) + for _, res := range touched { + memQuantity := res.Spec.Resources["memory"] + totalMemory += memQuantity.Value() + } + + if totalMemory != desiredState.TotalMemoryBytes { + t.Errorf("expected total memory %d, got %d", desiredState.TotalMemoryBytes, totalMemory) + } +} + +func TestApplyCommitmentState_DeletesExcessReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create existing reservations (32 GiB total) + existingReservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: 
"syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservations[0], &existingReservations[1]). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: only 8 GiB (need to reduce) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + _, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Note: May create a new 8GiB reservation while removing the two 16GiB ones + // This is expected behavior based on the slot sizing algorithm + + // Should remove excess reservations + if len(removed) == 0 { + t.Fatal("expected reservations to be removed") + } + + // Verify remaining capacity matches desired state + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + totalMemory := int64(0) + for _, res := range remainingList.Items { + memQuantity := res.Spec.Resources["memory"] + 
totalMemory += memQuantity.Value() + } + + if totalMemory != desiredState.TotalMemoryBytes { + t.Errorf("expected remaining memory %d, got %d", desiredState.TotalMemoryBytes, totalMemory) + } +} + +func TestApplyCommitmentState_PreservesAllocatedReservations(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create reservations: one with allocation, one without + existingReservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + "vm-123": {}, // Has allocation + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, // No allocation + }, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservations[0], &existingReservations[1]). 
+ Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: only 16 GiB (need to reduce by one slot) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 16 * 1024 * 1024 * 1024, + } + + _, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should remove the unallocated reservation, not the allocated one + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + // Verify the removed one had no allocations + if len(removed[0].Spec.CommittedResourceReservation.Allocations) != 0 { + t.Error("expected unallocated reservation to be removed first") + } + + // Verify the allocated reservation still exists + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + if len(remainingList.Items) != 1 { + t.Fatalf("expected 1 remaining reservation, got %d", len(remainingList.Items)) + } + + // Verify the remaining one has the allocation + if len(remainingList.Items[0].Spec.CommittedResourceReservation.Allocations) == 0 { + t.Error("expected allocated reservation to be preserved") + } +} + +func TestApplyCommitmentState_HandlesZeroCapacity(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create existing reservation + existingReservation := v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: 
map[string]resource.Quantity{ + "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservation). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state: zero capacity (commitment expired or canceled) + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 0, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if len(touched) != 0 { + t.Errorf("expected 0 new reservations, got %d", len(touched)) + } + + // Should remove all reservations + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + // Verify no reservations remain + var remainingList v1alpha1.ReservationList + if err := client.List(context.Background(), &remainingList); err != nil { + t.Fatal(err) + } + + if len(remainingList.Items) != 0 { + t.Errorf("expected 0 remaining reservations, got %d", len(remainingList.Items)) + } +} + +func TestApplyCommitmentState_FixesWrongFlavorGroup(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + // Create reservation with wrong flavor group + existingReservation := v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: 
v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "wrong-group", // Wrong flavor group + Creator: "syncer", + Allocations: map[string]v1alpha1.CommittedResourceAllocation{}, + }, + }, + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(&existingReservation). + Build() + + manager := NewReservationManager(client) + flavorGroup := testFlavorGroup() + flavorGroups := map[string]compute.FlavorGroupFeature{ + "test-group": flavorGroup, + } + + // Desired state with correct flavor group + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + touched, removed, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should remove wrong reservation and create new one + if len(removed) != 1 { + t.Fatalf("expected 1 removed reservation, got %d", len(removed)) + } + + if len(touched) != 1 { + t.Fatalf("expected 1 new reservation, got %d", len(touched)) + } + + // Verify new reservation has correct flavor group + if touched[0].Spec.CommittedResourceReservation.ResourceGroup != "test-group" { + t.Errorf("expected flavor group test-group, got %s", + touched[0].Spec.CommittedResourceReservation.ResourceGroup) + } +} + +func TestApplyCommitmentState_UnknownFlavorGroup(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } + + client := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() + + manager := NewReservationManager(client) + flavorGroups := map[string]compute.FlavorGroupFeature{} // Empty + + desiredState := &CommitmentState{ + CommitmentUUID: "abc123", + ProjectID: "project-1", + FlavorGroupName: "unknown-group", + TotalMemoryBytes: 8 * 1024 * 1024 * 1024, + } + + _, _, err := manager.ApplyCommitmentState( + context.Background(), + logr.Discard(), + desiredState, + flavorGroups, + "syncer", + ) + + if err == nil { + t.Fatal("expected error for unknown flavor group, got nil") + } +} + +func TestNewReservation_SelectsAppropriateFlavor(t *testing.T) { + manager := &ReservationManager{} + flavorGroup := testFlavorGroup() + + tests := []struct { + name string + deltaMemory int64 + expectedName string + expectedCores int64 + }{ + { + name: "fits large flavor", + deltaMemory: 32768 * 1024 * 1024, // 32 GiB + expectedName: "large", + expectedCores: 16, + }, + { + name: "fits medium flavor", + deltaMemory: 16384 * 1024 * 1024, // 16 GiB + expectedName: "medium", + expectedCores: 8, + }, + { + name: "fits small flavor", + deltaMemory: 8192 * 1024 * 1024, // 8 GiB + expectedName: "small", + expectedCores: 4, + }, + { + name: "oversized uses largest available flavor", + deltaMemory: 100 * 1024 * 1024 * 1024, // 100 GiB (larger than any flavor) + expectedName: "large", // Will use largest available + expectedCores: 16, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + state := &CommitmentState{ + CommitmentUUID: "test-uuid", + ProjectID: "project-1", + FlavorGroupName: "test-group", + TotalMemoryBytes: tt.deltaMemory, + } + + reservation := manager.newReservation( + state, + 0, + tt.deltaMemory, + flavorGroup, + "syncer", + ) + + // Verify flavor selection + if reservation.Spec.CommittedResourceReservation.ResourceName != tt.expectedName { + t.Errorf("expected flavor %s, got %s", + tt.expectedName, + reservation.Spec.CommittedResourceReservation.ResourceName) + } + + // Verify CPU allocation + cpuQuantity := 
reservation.Spec.Resources["cpu"] + if cpuQuantity.Value() != tt.expectedCores { + t.Errorf("expected %d cores, got %d", + tt.expectedCores, cpuQuantity.Value()) + } + }) + } +} diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go new file mode 100644 index 000000000..996efff8e --- /dev/null +++ b/internal/scheduling/reservations/commitments/state.go @@ -0,0 +1,202 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "errors" + "fmt" + "strings" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/sapcc/go-api-declarations/liquid" + ctrl "sigs.k8s.io/controller-runtime" +) + +var stateLog = ctrl.Log.WithName("commitment_state") + +// Limes LIQUID resource naming convention: ram_ +const commitmentResourceNamePrefix = "ram_" + +func getFlavorGroupNameFromResource(resourceName string) (string, error) { + if !strings.HasPrefix(resourceName, commitmentResourceNamePrefix) { + return "", fmt.Errorf("invalid resource name: %s", resourceName) + } + return strings.TrimPrefix(resourceName, commitmentResourceNamePrefix), nil +} + +// CommitmentState represents desired or current commitment resource allocation. 
+type CommitmentState struct { + // CommitmentUUID uniquely identifies this commitment + CommitmentUUID string + // ProjectID is the OpenStack project this commitment belongs to + ProjectID string + // DomainID is the OpenStack domain this commitment belongs to + DomainID string + // FlavorGroupName identifies the flavor group (e.g., "hana_medium_v2") + FlavorGroupName string + // the total memory in bytes across all reservation slots + TotalMemoryBytes int64 + // AvailabilityZone specifies the availability zone for this commitment + AvailabilityZone string + // StartTime is when the commitment becomes active + StartTime *time.Time + // EndTime is when the commitment expires + EndTime *time.Time +} + +// FromCommitment converts Limes commitment to CommitmentState. +func FromCommitment( + commitment Commitment, + flavorGroup compute.FlavorGroupFeature, +) (*CommitmentState, error) { + + flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) + if err != nil { + return nil, err + } + + // Calculate total memory from commitment amount (amount = multiples of smallest flavor) + smallestFlavorMemoryBytes := int64(flavorGroup.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded + totalMemoryBytes := int64(commitment.Amount) * smallestFlavorMemoryBytes //nolint:gosec // commitment amount from Limes API, bounded by quota limits + + // Set start time: use ConfirmedAt if available, otherwise CreatedAt + var startTime *time.Time + if commitment.ConfirmedAt != nil { + t := time.Unix(int64(*commitment.ConfirmedAt), 0) //nolint:gosec // timestamp from Limes API, realistically bounded + startTime = &t + } else { + t := time.Unix(int64(commitment.CreatedAt), 0) //nolint:gosec // timestamp from Limes API, realistically bounded + startTime = &t + } + + // Set end time from ExpiresAt + var endTime *time.Time + if commitment.ExpiresAt > 0 { + t := time.Unix(int64(commitment.ExpiresAt), 0) //nolint:gosec // 
timestamp from Limes API, realistically bounded
+		endTime = &t
+	}
+
+	return &CommitmentState{
+		CommitmentUUID:   commitment.UUID,
+		ProjectID:        commitment.ProjectID,
+		DomainID:         commitment.DomainID,
+		FlavorGroupName:  flavorGroupName,
+		TotalMemoryBytes: totalMemoryBytes,
+		AvailabilityZone: commitment.AvailabilityZone,
+		StartTime:        startTime,
+		EndTime:          endTime,
+	}, nil
+}
+
+// FromChangeCommitmentTargetState converts LIQUID API request to CommitmentState.
+func FromChangeCommitmentTargetState(
+	commitment liquid.Commitment,
+	projectID string,
+	flavorGroupName string,
+	flavorGroup compute.FlavorGroupFeature,
+	az string,
+) (*CommitmentState, error) {
+
+	amountMultiple := uint64(0)
+	var startTime *time.Time
+	var endTime *time.Time
+
+	switch commitment.NewStatus.UnwrapOr("none") {
+	// guaranteed and confirmed commitments are honored with start time now
+	case liquid.CommitmentStatusGuaranteed, liquid.CommitmentStatusConfirmed:
+		amountMultiple = commitment.Amount
+		// Set start time to now for active commitments
+		now := time.Now()
+		startTime = &now
+	}
+
+	// ConfirmBy is ignored for now
+	// TODO do more sophisticated handling of guaranteed commitments
+
+	// Set end time if not zero (commitments can have no expiry)
+	if !commitment.ExpiresAt.IsZero() {
+		endTime = &commitment.ExpiresAt
+		// check expiry time
+		if commitment.ExpiresAt.Before(time.Now()) || commitment.ExpiresAt.Equal(time.Now()) {
+			// commitment is already expired, ignore capacity
+			amountMultiple = 0
+		}
+	}
+
+	// Capacity is sized from the group's smallest flavor, read directly from the feature.
+	smallestFlavor := flavorGroup.SmallestFlavor
+	smallestFlavorMemoryBytes := int64(smallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory from specs, realistically bounded
+
+	// Amount represents multiples of the smallest flavor in the group
+	totalMemoryBytes := int64(amountMultiple) * smallestFlavorMemoryBytes //nolint:gosec // commitment amount from Limes API, bounded by quota
limits + + return &CommitmentState{ + CommitmentUUID: string(commitment.UUID), + ProjectID: projectID, + FlavorGroupName: flavorGroupName, + TotalMemoryBytes: totalMemoryBytes, + AvailabilityZone: az, + StartTime: startTime, + EndTime: endTime, + }, nil +} + +// FromReservations reconstructs CommitmentState from existing Reservation CRDs. +func FromReservations(reservations []v1alpha1.Reservation) (*CommitmentState, error) { + if len(reservations) == 0 { + return nil, errors.New("no reservations provided") + } + + // Extract commitment metadata from first reservation + first := reservations[0] + if first.Spec.CommittedResourceReservation == nil { + return nil, errors.New("not a committed resource reservation") + } + + state := &CommitmentState{ + CommitmentUUID: extractCommitmentUUID(first.Name), + ProjectID: first.Spec.CommittedResourceReservation.ProjectID, + DomainID: first.Spec.CommittedResourceReservation.DomainID, + FlavorGroupName: first.Spec.CommittedResourceReservation.ResourceGroup, + TotalMemoryBytes: 0, + AvailabilityZone: first.Spec.AvailabilityZone, + } + + if first.Spec.StartTime != nil { + state.StartTime = &first.Spec.StartTime.Time + } + if first.Spec.EndTime != nil { + state.EndTime = &first.Spec.EndTime.Time + } + + // Sum memory across all reservations + for _, res := range reservations { + if res.Spec.CommittedResourceReservation == nil { + return nil, errors.New("unexpected reservation type of reservation " + res.Name) + } + // check if it belongs to the same commitment + if extractCommitmentUUID(res.Name) != state.CommitmentUUID { + return nil, errors.New("reservation " + res.Name + " does not belong to commitment " + state.CommitmentUUID) + } + // check flavor group consistency, ignore if not matching to repair corrupted state in k8s + if res.Spec.CommittedResourceReservation.ResourceGroup != state.FlavorGroupName { + // log message + stateLog.Error(errors.New("inconsistent flavor group in reservation"), + "reservation belongs to same 
commitment but has different flavor group - ignoring reservation for capacity calculation", + "reservationName", res.Name, + "expectedFlavorGroup", state.FlavorGroupName, + "actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup, + ) + continue + } + + memoryQuantity := res.Spec.Resources["memory"] + memoryBytes := memoryQuantity.Value() + state.TotalMemoryBytes += memoryBytes + } + + return state, nil +} diff --git a/internal/scheduling/reservations/commitments/state_test.go b/internal/scheduling/reservations/commitments/state_test.go new file mode 100644 index 000000000..d8581cec1 --- /dev/null +++ b/internal/scheduling/reservations/commitments/state_test.go @@ -0,0 +1,252 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Test helper: creates a minimal flavor group for testing +func testFlavorGroup() compute.FlavorGroupFeature { + return compute.FlavorGroupFeature{ + Name: "test-group", + Flavors: []compute.FlavorInGroup{ + {Name: "large", VCPUs: 16, MemoryMB: 32768, DiskGB: 100}, + {Name: "medium", VCPUs: 8, MemoryMB: 16384, DiskGB: 50}, + {Name: "small", VCPUs: 4, MemoryMB: 8192, DiskGB: 25}, + }, + SmallestFlavor: compute.FlavorInGroup{ + Name: "small", VCPUs: 4, MemoryMB: 8192, DiskGB: 25, + }, + LargestFlavor: compute.FlavorInGroup{ + Name: "large", VCPUs: 16, MemoryMB: 32768, DiskGB: 100, + }, + } +} + +func TestFromCommitment_CalculatesMemoryCorrectly(t *testing.T) { + flavorGroup := testFlavorGroup() + commitment := Commitment{ + UUID: "test-uuid", + ProjectID: "project-1", + ResourceName: "ram_test-group", + Amount: 5, // 5 multiples of smallest flavor + } + + state, err := FromCommitment(commitment, flavorGroup) + if err != nil { + 
t.Fatalf("unexpected error: %v", err) + } + + // Verify basic fields + if state.CommitmentUUID != "test-uuid" { + t.Errorf("expected UUID test-uuid, got %s", state.CommitmentUUID) + } + if state.ProjectID != "project-1" { + t.Errorf("expected ProjectID project-1, got %s", state.ProjectID) + } + if state.FlavorGroupName != "test-group" { + t.Errorf("expected FlavorGroupName test-group, got %s", state.FlavorGroupName) + } + + // Verify memory calculation: 5 * 8192 MB = 40960 MB = 42949672960 bytes + expectedMemory := int64(5 * 8192 * 1024 * 1024) + if state.TotalMemoryBytes != expectedMemory { + t.Errorf("expected memory %d, got %d", expectedMemory, state.TotalMemoryBytes) + } +} + +func TestFromCommitment_InvalidResourceName(t *testing.T) { + flavorGroup := testFlavorGroup() + commitment := Commitment{ + UUID: "test-uuid", + ProjectID: "project-1", + ResourceName: "invalid_resource_name", // missing "ram_" prefix + Amount: 1, + } + + _, err := FromCommitment(commitment, flavorGroup) + if err == nil { + t.Fatal("expected error for invalid resource name, got nil") + } +} + +func TestFromReservations_SumsMemoryCorrectly(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), // 8 GiB + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), // 16 GiB + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + } + + state, err := 
FromReservations(reservations) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify fields extracted from first reservation + if state.CommitmentUUID != "abc123" { + t.Errorf("expected UUID abc123, got %s", state.CommitmentUUID) + } + if state.ProjectID != "project-1" { + t.Errorf("expected ProjectID project-1, got %s", state.ProjectID) + } + if state.FlavorGroupName != "test-group" { + t.Errorf("expected FlavorGroupName test-group, got %s", state.FlavorGroupName) + } + + // Verify memory is summed correctly: 8 GiB + 16 GiB = 24 GiB + expectedMemory := int64(24 * 1024 * 1024 * 1024) + if state.TotalMemoryBytes != expectedMemory { + t.Errorf("expected memory %d, got %d", expectedMemory, state.TotalMemoryBytes) + } +} + +func TestFromReservations_EmptyList(t *testing.T) { + _, err := FromReservations([]v1alpha1.Reservation{}) + if err == nil { + t.Fatal("expected error for empty reservation list, got nil") + } +} + +func TestFromReservations_SkipsInconsistentFlavorGroup(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(8*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-1", + }, + Spec: v1alpha1.ReservationSpec{ + Resources: map[string]resource.Quantity{ + "memory": *resource.NewQuantity(16*1024*1024*1024, resource.BinarySI), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "wrong-group", // Different flavor group + }, + }, + }, + } + + state, err := FromReservations(reservations) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Should only count first reservation 
with matching flavor group + expectedMemory := int64(8 * 1024 * 1024 * 1024) + if state.TotalMemoryBytes != expectedMemory { + t.Errorf("expected memory %d (ignoring inconsistent reservation), got %d", + expectedMemory, state.TotalMemoryBytes) + } +} + +func TestFromReservations_MixedCommitmentUUIDs(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-xyz789-0", // Different commitment UUID + }, + Spec: v1alpha1.ReservationSpec{ + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "project-1", + ResourceGroup: "test-group", + }, + }, + }, + } + + _, err := FromReservations(reservations) + if err == nil { + t.Fatal("expected error for mixed commitment UUIDs, got nil") + } +} + +func TestFromReservations_NonCommittedResourceType(t *testing.T) { + reservations := []v1alpha1.Reservation{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-abc123-0", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, // Wrong type + }, + }, + } + + _, err := FromReservations(reservations) + if err == nil { + t.Fatal("expected error for non-CR reservation type, got nil") + } +} + +func TestGetFlavorGroupNameFromResource_Valid(t *testing.T) { + name, err := getFlavorGroupNameFromResource("ram_hana_medium_v2") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if name != "hana_medium_v2" { + t.Errorf("expected hana_medium_v2, got %s", name) + } +} + +func TestGetFlavorGroupNameFromResource_Invalid(t *testing.T) { + _, err := getFlavorGroupNameFromResource("invalid_resource") + if err == nil { + t.Fatal("expected error for invalid resource name, got nil") + } +} diff --git 
a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index 970a44b26..b9e6fe3b4 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -5,24 +5,20 @@ package commitments import ( "context" - "errors" "fmt" - "slices" - "sort" "strings" - - ctrl "sigs.k8s.io/controller-runtime" + "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" - k8serrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/resource" - "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) var ( - syncLog = ctrl.Log.WithName("sync") // CreatorValue identifies reservations created by this syncer. CreatorValue = "commitments-syncer" ) @@ -35,13 +31,12 @@ type SyncerConfig struct { } type Syncer struct { - // Client to fetch commitments. + // Client to fetch commitments from Limes CommitmentsClient - // Client for the kubernetes API. + // Kubernetes client for CRD operations client.Client } -// Create a new compute reservation syncer. func NewSyncer(k8sClient client.Client) *Syncer { return &Syncer{ CommitmentsClient: NewCommitmentsClient(), @@ -49,233 +44,175 @@ func NewSyncer(k8sClient client.Client) *Syncer { } } -// Initialize the syncer. func (s *Syncer) Init(ctx context.Context, config SyncerConfig) error { - // Initialize the syncer. if err := s.CommitmentsClient.Init(ctx, s.Client, config); err != nil { return err } return nil } -// Helper struct to unify the commitment with metadata needed for reservation creation. -type resolvedCommitment struct { - Commitment - Flavor Flavor -} - -// Get all compute commitments that should be converted to reservations. 
-func (s *Syncer) resolveUnusedCommitments(ctx context.Context) ([]resolvedCommitment, error) { - // Get all data we need from the openstack services. +func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavorGroups map[string]compute.FlavorGroupFeature) ([]*CommitmentState, error) { allProjects, err := s.ListProjects(ctx) if err != nil { return nil, err } - flavors, err := s.ListFlavorsByName(ctx) - if err != nil { - return nil, err - } commitments, err := s.ListCommitmentsByID(ctx, allProjects...) if err != nil { return nil, err } - // Remove non-compute/non-instance commitments or commitments we can't resolve. - var resolvedCommitments []resolvedCommitment + // Filter for compute commitments with RAM flavor group resources + var commitmentStates []*CommitmentState for id, commitment := range commitments { if commitment.ServiceType != "compute" { - delete(commitments, id) - syncLog.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) - continue - } - if !strings.HasPrefix(commitment.ResourceName, "instances_") { - syncLog.Info("skipping non-instance commitment", "id", id, "resourceName", commitment.ResourceName) - delete(commitments, id) + log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) continue } - flavorName := strings.TrimPrefix(commitment.ResourceName, "instances_") - flavor, ok := flavors[flavorName] - if !ok { - syncLog.Info("skipping commitment without known flavor", "id", id, "flavorName", flavorName) - delete(commitments, id) + if !strings.HasPrefix(commitment.ResourceName, commitmentResourceNamePrefix) { + log.Info("skipping non-RAM-flavor-group commitment", "id", id, "resourceName", commitment.ResourceName) continue } - // We only support cloud-hypervisor and qemu hypervisors for commitments. 
- hvType, ok := flavor.ExtraSpecs["capabilities:hypervisor_type"] - if !ok || !slices.Contains([]string{"ch", "qemu"}, strings.ToLower(hvType)) { - syncLog.Info("skipping commitment with unsupported hv type", "commitmentID", commitment.UUID, "hypervisorType", hvType) - delete(commitments, id) + + // Extract flavor group name from resource name + flavorGroupName, err := getFlavorGroupNameFromResource(commitment.ResourceName) + if err != nil { + log.Info("skipping commitment with invalid resource name", + "id", id, + "resourceName", commitment.ResourceName, + "error", err) continue } - resolvedCommitments = append(resolvedCommitments, resolvedCommitment{ - Commitment: commitment, - Flavor: flavor, - }) - } - // Remove all commitments which are currently actively in use by a vm. - projectsWithCommitments := make([]Project, 0, len(resolvedCommitments)) - projectIDs := make(map[string]bool) - for _, commitment := range resolvedCommitments { - projectIDs[commitment.ProjectID] = true - } - for _, project := range allProjects { - if _, exists := projectIDs[project.ID]; exists { - projectsWithCommitments = append(projectsWithCommitments, project) + // Validate flavor group exists in Knowledge + flavorGroup, exists := flavorGroups[flavorGroupName] + if !exists { + log.Info("skipping commitment with unknown flavor group", + "id", id, + "flavorGroup", flavorGroupName) + continue } - } - // List all servers, not only the active ones, like limes when it calculates - // subresource usage: https://github.com/sapcc/limes/blob/c146c82/internal/liquids/nova/subresources.go#L94 - servers, err := s.ListServersByProjectID(ctx, projectsWithCommitments...) 
- if err != nil { - return nil, err - } - sort.Slice(resolvedCommitments, func(i, j int) bool { - return resolvedCommitments[i].ID < resolvedCommitments[j].ID - }) - mappedServers := map[string]struct{}{} // Servers subtracted from a commitment - var unusedCommitments []resolvedCommitment - for _, commitment := range resolvedCommitments { - matchingServerCount := uint64(0) - activeServers, ok := servers[commitment.ProjectID] - if !ok || len(activeServers) == 0 { - // No active servers in this project, keep the commitment. - unusedCommitments = append(unusedCommitments, commitment) + // Skip commitments with empty UUID + if commitment.UUID == "" { + log.Info("skipping commitment with empty UUID", + "id", id) continue } - // Some active servers, subtract them from the commitment amount. - sort.Slice(activeServers, func(i, j int) bool { - return activeServers[i].ID < activeServers[j].ID - }) - for _, server := range activeServers { - if _, exists := mappedServers[server.ID]; exists { - // This server is already subtracted from another commitment. - continue - } - if server.FlavorName != commitment.Flavor.Name { - // This server is of a different flavor, skip it. 
- continue - } - mappedServers[server.ID] = struct{}{} - matchingServerCount++ - syncLog.Info("subtracting server from commitment", "commitmentID", commitment.UUID, "serverID", server.ID, "remainingAmount", commitment.Amount) - } - if matchingServerCount >= commitment.Amount { - syncLog.Info("skipping commitment that is fully used by active servers", "id", commitment.UUID, "project", commitment.ProjectID) + + // Convert commitment to state using FromCommitment + state, err := FromCommitment(commitment, flavorGroup) + if err != nil { + log.Error(err, "failed to convert commitment to state", + "id", id, + "uuid", commitment.UUID) continue } - commitment.Amount -= matchingServerCount - unusedCommitments = append(unusedCommitments, commitment) + + log.Info("resolved commitment to state", + "commitmentID", commitment.UUID, + "flavorGroup", flavorGroupName, + "amount", commitment.Amount, + "totalMemoryBytes", state.TotalMemoryBytes) + + commitmentStates = append(commitmentStates, state) } - return unusedCommitments, nil + return commitmentStates, nil } -// Fetch commitments and update/create reservations for each of them. +// SyncReservations fetches commitments from Limes and synchronizes Reservation CRDs. func (s *Syncer) SyncReservations(ctx context.Context) error { - // Get all commitments that should be converted to reservations. 
- // TODO keep all commitments, not only the unused ones, propagate allocation correctly - commitments, err := s.resolveUnusedCommitments(ctx) + // TODO handle concurrency with change API: consider creation time of reservations and status ready + + // Create logger with run ID for this sync execution + runID := fmt.Sprintf("sync-%d", time.Now().Unix()) + log := ctrl.Log.WithName("CommitmentSyncer").WithValues("runID", runID) + + log.Info("starting commitment sync") + + // Check if flavor group knowledge is ready + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client} + knowledgeCRD, err := knowledge.Get(ctx) if err != nil { - syncLog.Error(err, "failed to get compute commitments") + log.Error(err, "failed to check flavor group knowledge readiness") return err } - // Map commitments to reservations. - var reservationsByName = make(map[string]v1alpha1.Reservation) - for _, commitment := range commitments { - // Get only the 5 first characters from the uuid. This should be safe enough. - if len(commitment.UUID) < 5 { - err := errors.New("commitment UUID is too short") - syncLog.Error(err, "uuid is less than 5 characters", "uuid", commitment.UUID) - continue - } - commitmentUUIDShort := commitment.UUID[:5] - spec := v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - Resources: map[string]resource.Quantity{ - "memory": *resource.NewQuantity(int64(commitment.Flavor.RAM)*1024*1024, resource.BinarySI), - "cpu": *resource.NewQuantity(int64(commitment.Flavor.VCPUs), resource.DecimalSI), - // Disk is currently not considered. 
- }, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: commitment.ProjectID, - DomainID: commitment.DomainID, - ResourceName: commitment.Flavor.Name, - ResourceGroup: commitment.Flavor.ExtraSpecs["hw_version"], - Allocations: make(map[string]v1alpha1.CommittedResourceAllocation), - Creator: CreatorValue, - }, - } - for n := range commitment.Amount { // N instances - meta := ctrl.ObjectMeta{ - Name: fmt.Sprintf("commitment-%s-%d", commitmentUUIDShort, n), - } - if _, exists := reservationsByName[meta.Name]; exists { - syncLog.Error(errors.New("duplicate reservation name"), - "reservation name already exists", - "name", meta.Name, - "commitmentUUID", commitment.UUID, - ) - continue - } - reservationsByName[meta.Name] = v1alpha1.Reservation{ - ObjectMeta: meta, - Spec: spec, - } - } + if knowledgeCRD == nil { + log.Info("skipping commitment sync - flavor group knowledge not ready yet") + return nil } - // Create new reservations or update existing ones. - for _, res := range reservationsByName { - // Check if the reservation already exists. - nn := types.NamespacedName{Name: res.Name, Namespace: res.Namespace} - var existing v1alpha1.Reservation - if err := s.Get(ctx, nn, &existing); err != nil { - if !k8serrors.IsNotFound(err) { - syncLog.Error(err, "failed to get reservation", "name", nn.Name) - return err - } - // Reservation does not exist, create it. 
- if err := s.Create(ctx, &res); err != nil { - return err - } - syncLog.Info("created reservation", "name", nn.Name) + // Get flavor groups using the CRD we already fetched + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, knowledgeCRD) + if err != nil { + log.Error(err, "failed to get flavor groups from knowledge") + return err + } + + // Get all commitments as states + commitmentStates, err := s.getCommitmentStates(ctx, log, flavorGroups) + if err != nil { + log.Error(err, "failed to get compute commitments") + return err + } + + // Create ReservationManager to handle state application + manager := NewReservationManager(s.Client) + + // Apply each commitment state using the manager + for _, state := range commitmentStates { + log.Info("applying commitment state", + "commitmentUUID", state.CommitmentUUID, + "projectID", state.ProjectID, + "flavorGroup", state.FlavorGroupName, + "totalMemoryBytes", state.TotalMemoryBytes) + + _, _, err := manager.ApplyCommitmentState(ctx, log, state, flavorGroups, CreatorValue) + if err != nil { + log.Error(err, "failed to apply commitment state", + "commitmentUUID", state.CommitmentUUID) + // Continue with other commitments even if one fails continue } - // Reservation exists, update it. - old := existing.DeepCopy() - existing.Spec = res.Spec - patch := client.MergeFrom(old) - if err := s.Patch(ctx, &existing, patch); err != nil { - syncLog.Error(err, "failed to patch reservation", "name", nn.Name) - return err - } - syncLog.Info("updated reservation", "name", nn.Name) } - // Delete reservations that are not in the commitments anymore. 
+ // Delete reservations that are no longer in commitments + // Only query committed resource reservations using labels for efficiency var existingReservations v1alpha1.ReservationList - if err := s.List(ctx, &existingReservations); err != nil { - syncLog.Error(err, "failed to list existing reservations") + if err := s.List(ctx, &existingReservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + log.Error(err, "failed to list existing committed resource reservations") return err } + + // Build set of commitment UUIDs we should have + activeCommitments := make(map[string]bool) + for _, state := range commitmentStates { + activeCommitments[state.CommitmentUUID] = true + } + + // Delete reservations for commitments that no longer exist for _, existing := range existingReservations.Items { - // Only manage reservations created by this syncer (identified by Creator field). - if existing.Spec.CommittedResourceReservation == nil || - existing.Spec.CommittedResourceReservation.Creator != CreatorValue { + // Extract commitment UUID from reservation name + commitmentUUID := extractCommitmentUUID(existing.Name) + if commitmentUUID == "" { + log.Info("skipping reservation with unparseable name", "name", existing.Name) continue } - if _, found := reservationsByName[existing.Name]; !found { - // Reservation not found in commitments, delete it. 
+ + if !activeCommitments[commitmentUUID] { + // This commitment no longer exists, delete the reservation if err := s.Delete(ctx, &existing); err != nil { - syncLog.Error(err, "failed to delete reservation", "name", existing.Name) + log.Error(err, "failed to delete reservation", "name", existing.Name) return err } - syncLog.Info("deleted reservation", "name", existing.Name) + log.Info("deleted reservation for expired commitment", + "name", existing.Name, + "commitmentUUID", commitmentUUID) } } - syncLog.Info("synced reservations", "count", len(reservationsByName)) + log.Info("synced reservations", "commitmentCount", len(commitmentStates)) return nil } diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index 4db74801b..0790545e8 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -7,15 +7,89 @@ import ( "context" "testing" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - - "github.com/cobaltcore-dev/cortex/api/v1alpha1" ) +// FlavorGroupData holds test data for creating a flavor group +type FlavorGroupData struct { + LargestFlavorName string + LargestFlavorVCPUs uint64 + LargestFlavorMemoryMB uint64 + SmallestFlavorName string + SmallestFlavorVCPUs uint64 + SmallestFlavorMemoryMB uint64 +} + +// createFlavorGroupKnowledge creates a Knowledge CRD with flavor group data for testing +func createFlavorGroupKnowledge(t *testing.T, groups map[string]FlavorGroupData) *v1alpha1.Knowledge { + t.Helper() + + // Build flavor group features + features := 
make([]compute.FlavorGroupFeature, 0, len(groups)) + for groupName, data := range groups { + features = append(features, compute.FlavorGroupFeature{ + Name: groupName, + Flavors: []compute.FlavorInGroup{ + { + Name: data.LargestFlavorName, + VCPUs: data.LargestFlavorVCPUs, + MemoryMB: data.LargestFlavorMemoryMB, + }, + { + Name: data.SmallestFlavorName, + VCPUs: data.SmallestFlavorVCPUs, + MemoryMB: data.SmallestFlavorMemoryMB, + }, + }, + LargestFlavor: compute.FlavorInGroup{ + Name: data.LargestFlavorName, + VCPUs: data.LargestFlavorVCPUs, + MemoryMB: data.LargestFlavorMemoryMB, + }, + SmallestFlavor: compute.FlavorInGroup{ + Name: data.SmallestFlavorName, + VCPUs: data.SmallestFlavorVCPUs, + MemoryMB: data.SmallestFlavorMemoryMB, + }, + }) + } + + // Box the features + rawFeatures, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatalf("Failed to box flavor group features: %v", err) + } + + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", + }, + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: rawFeatures, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "ExtractorSucceeded", + }, + }, + }, + } +} + // Mock CommitmentsClient for testing type mockCommitmentsClient struct { initFunc func(ctx context.Context, client client.Client, conf SyncerConfig) error @@ -123,19 +197,32 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { t.Fatalf("Failed to add scheme: %v", err) } + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor", + LargestFlavorVCPUs: 2, + LargestFlavorMemoryMB: 1024, + SmallestFlavorName: "test-flavor", + SmallestFlavorVCPUs: 2, + 
SmallestFlavorMemoryMB: 1024, + }, + }) + k8sClient := fake.NewClientBuilder(). WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). Build() - // Create mock commitments with instance flavors + // Create mock commitments with flavor group resources (using ram_ prefix) mockCommitments := []Commitment{ { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "instances_test-flavor", + ResourceName: "ram_test_group_v1", AvailabilityZone: "az1", - Amount: 2, // 2 instances + Amount: 2, // 2 multiples of smallest flavor (2 * 1024MB = 2048MB total) Unit: "", ProjectID: "test-project-1", DomainID: "test-domain-1", @@ -150,23 +237,6 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "test-flavor": { - ID: "flavor-1", - Name: "test-flavor", - RAM: 1024, // 1GB in MB - VCPUs: 2, - Disk: 20, // 20GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "dedicated", - "hw:numa_nodes": "1", - "aggregate_instance_extra_specs:pinned": "true", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "test-project-1", DomainID: "test-domain-1", Name: "Test Project 1"}, @@ -200,7 +270,7 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { return } - // Should have 2 reservations (Amount = 2) + // Should have 2 reservations (Amount = 2, each for smallest flavor) if len(reservations.Items) != 2 { t.Errorf("Expected 2 reservations, got %d", len(reservations.Items)) return @@ -216,11 +286,12 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { t.Errorf("Expected project ID test-project-1, got %v", res.Spec.CommittedResourceReservation.ProjectID) } - if res.Spec.CommittedResourceReservation.ResourceName != "test-flavor" { - t.Errorf("Expected flavor test-flavor, got %v", 
res.Spec.CommittedResourceReservation.ResourceName) + if res.Spec.CommittedResourceReservation.ResourceGroup != "test_group_v1" { + t.Errorf("Expected resource group test_group_v1, got %v", res.Spec.CommittedResourceReservation.ResourceGroup) } - // Check resource values + // Check resource values - should be sized for the flavor that fits + // With 2048MB total capacity, we can fit 2x 1024MB flavors expectedMemory := resource.MustParse("1073741824") // 1024MB in bytes if !res.Spec.Resources["memory"].Equal(expectedMemory) { t.Errorf("Expected memory %v, got %v", expectedMemory, res.Spec.Resources["memory"]) @@ -238,17 +309,34 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { t.Fatalf("Failed to add scheme: %v", err) } - // Create an existing reservation + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "new_group_v1": { + LargestFlavorName: "new-flavor", + LargestFlavorVCPUs: 4, + LargestFlavorMemoryMB: 2048, + SmallestFlavorName: "new-flavor-small", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + + // Create an existing reservation with mismatched project/flavor group + // The ReservationManager will delete this and create a new one existingReservation := &v1alpha1.Reservation{ ObjectMeta: ctrl.ObjectMeta{ - Name: "commitment-12345-0", // Instance commitments have -0 suffix + Name: "commitment-12345-67890-abcdef-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, }, Spec: v1alpha1.ReservationSpec{ Type: v1alpha1.ReservationTypeCommittedResource, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: "old-project", - ResourceName: "old-flavor", - Creator: CreatorValue, + ProjectID: "old-project", + ResourceName: "old-flavor", + ResourceGroup: "old_group", + Creator: CreatorValue, }, Resources: map[string]resource.Quantity{ "memory": 
resource.MustParse("512Mi"), @@ -259,16 +347,16 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { k8sClient := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(existingReservation). + WithObjects(existingReservation, flavorGroupsKnowledge). Build() - // Create mock commitment that should update the existing reservation + // Create mock commitment that will replace the existing reservation mockCommitments := []Commitment{ { ID: 1, UUID: "12345-67890-abcdef", ServiceType: "compute", - ResourceName: "instances_new-flavor", + ResourceName: "ram_new_group_v1", AvailabilityZone: "az1", Amount: 1, Unit: "", @@ -285,23 +373,6 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "new-flavor": { - ID: "flavor-2", - Name: "new-flavor", - RAM: 2048, // 2GB in MB - VCPUs: 4, - Disk: 40, // 40GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "shared", - "hw:numa_nodes": "2", - "aggregate_instance_extra_specs:pinned": "false", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "new-project", DomainID: "new-domain", Name: "New Project"}, @@ -327,45 +398,66 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { return } - // Verify that the reservation was updated - var updatedReservation v1alpha1.Reservation - err = k8sClient.Get(context.Background(), client.ObjectKey{Name: "commitment-12345-0"}, &updatedReservation) + // Verify that reservations were updated (old one deleted, new one created) + // The new reservation will be at index 0 since the old one was deleted first + var reservations v1alpha1.ReservationList + err = k8sClient.List(context.Background(), &reservations) if err != nil { - t.Errorf("Failed to get updated reservation: %v", err) + t.Errorf("Failed to list reservations: %v", 
err) + return + } + + if len(reservations.Items) != 1 { + t.Errorf("Expected 1 reservation, got %d", len(reservations.Items)) return } - // Verify the reservation was updated with new values - if updatedReservation.Spec.CommittedResourceReservation == nil { + newReservation := reservations.Items[0] + + // Verify the new reservation has correct values + if newReservation.Spec.CommittedResourceReservation == nil { t.Errorf("Expected CommittedResourceReservation to be set") return } - if updatedReservation.Spec.CommittedResourceReservation.ProjectID != "new-project" { - t.Errorf("Expected project ID new-project, got %v", updatedReservation.Spec.CommittedResourceReservation.ProjectID) + if newReservation.Spec.CommittedResourceReservation.ProjectID != "new-project" { + t.Errorf("Expected project ID new-project, got %v", newReservation.Spec.CommittedResourceReservation.ProjectID) } - if updatedReservation.Spec.CommittedResourceReservation.ResourceName != "new-flavor" { - t.Errorf("Expected flavor new-flavor, got %v", updatedReservation.Spec.CommittedResourceReservation.ResourceName) + if newReservation.Spec.CommittedResourceReservation.ResourceGroup != "new_group_v1" { + t.Errorf("Expected resource group new_group_v1, got %v", newReservation.Spec.CommittedResourceReservation.ResourceGroup) } } -func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { +func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) { scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatalf("Failed to add scheme: %v", err) } + // Create flavor group knowledge CRD + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor", + LargestFlavorVCPUs: 2, + LargestFlavorMemoryMB: 1024, + SmallestFlavorName: "test-flavor", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + k8sClient := fake.NewClientBuilder(). WithScheme(scheme). + WithObjects(flavorGroupsKnowledge). 
Build() - // Create mock commitment with short UUID (should be skipped) + // Create mock commitment with empty UUID (should be skipped) mockCommitments := []Commitment{ { ID: 1, - UUID: "123", // Too short + UUID: "", // Empty UUID ServiceType: "compute", - ResourceName: "instances_test-flavor", + ResourceName: "ram_test_group_v1", AvailabilityZone: "az1", Amount: 1, Unit: "", @@ -382,23 +474,6 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { } return result, nil }, - listFlavorsByNameFunc: func(ctx context.Context) (map[string]Flavor, error) { - return map[string]Flavor{ - "test-flavor": { - ID: "flavor-1", - Name: "test-flavor", - RAM: 1024, // 1GB in MB - VCPUs: 2, - Disk: 20, // 20GB - ExtraSpecs: map[string]string{ - "hw:cpu_policy": "dedicated", - "hw:numa_nodes": "1", - "aggregate_instance_extra_specs:pinned": "true", - "capabilities:hypervisor_type": "qemu", - }, - }, - }, nil - }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { return []Project{ {ID: "test-project", DomainID: "test-domain", Name: "Test Project"}, @@ -424,7 +499,7 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { return } - // Verify that no reservations were created due to short UUID + // Verify that no reservations were created due to empty UUID var reservations v1alpha1.ReservationList err = k8sClient.List(context.Background(), &reservations) if err != nil { @@ -433,6 +508,6 @@ func TestSyncer_SyncReservations_ShortUUID(t *testing.T) { } if len(reservations.Items) != 0 { - t.Errorf("Expected 0 reservations due to short UUID, got %d", len(reservations.Items)) + t.Errorf("Expected 0 reservations due to empty UUID, got %d", len(reservations.Items)) } } diff --git a/internal/scheduling/reservations/commitments/utils.go b/internal/scheduling/reservations/commitments/utils.go new file mode 100644 index 000000000..0afb3ab67 --- /dev/null +++ b/internal/scheduling/reservations/commitments/utils.go @@ -0,0 +1,46 @@ +// Copyright SAP SE +// 
SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "strconv" + "strings" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" +) + +func GetMaxSlotIndex(reservations []v1alpha1.Reservation) int { + maxIndex := -1 + for _, res := range reservations { + // Parse slot index from name: "commitment-<uuid>-<slotIndex>" + parts := strings.Split(res.Name, "-") + if len(parts) >= 3 { + if index, err := strconv.Atoi(parts[len(parts)-1]); err == nil { + if index > maxIndex { + maxIndex = index + } + } + } + } + return maxIndex +} + +// Always continue counting slots from max, instead of filling gaps. +func GetNextSlotIndex(reservations []v1alpha1.Reservation) int { + maxIndex := GetMaxSlotIndex(reservations) + return maxIndex + 1 +} + +// extractCommitmentUUID parses UUID from reservation name (commitment-<uuid>-<slotIndex>). +func extractCommitmentUUID(name string) string { + // Remove "commitment-" prefix + withoutPrefix := strings.TrimPrefix(name, "commitment-") + // Split by "-" and take all but the last part (which is the slot index) + parts := strings.Split(withoutPrefix, "-") + if len(parts) > 1 { + // Rejoin all parts except the last one (slot index) + return strings.Join(parts[:len(parts)-1], "-") + } + return withoutPrefix +} diff --git a/internal/scheduling/reservations/commitments/utils_test.go b/internal/scheduling/reservations/commitments/utils_test.go new file mode 100644 index 000000000..b16268b2f --- /dev/null +++ b/internal/scheduling/reservations/commitments/utils_test.go @@ -0,0 +1,84 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +import ( + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestGetMaxSlotIndex_FindsHighestIndex(t *testing.T) { + reservations := []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-0"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-5"}}, + {ObjectMeta: metav1.ObjectMeta{Name: 
"commitment-abc123-2"}}, + } + + maxIndex := GetMaxSlotIndex(reservations) + if maxIndex != 5 { + t.Errorf("expected max index 5, got %d", maxIndex) + } +} + +func TestGetMaxSlotIndex_EmptyList(t *testing.T) { + maxIndex := GetMaxSlotIndex([]v1alpha1.Reservation{}) + if maxIndex != -1 { + t.Errorf("expected -1 for empty list, got %d", maxIndex) + } +} + +func TestGetMaxSlotIndex_InvalidNames(t *testing.T) { + reservations := []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "invalid-name"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123"}}, // Missing slot index + } + + maxIndex := GetMaxSlotIndex(reservations) + if maxIndex != -1 { + t.Errorf("expected -1 when no valid indices found, got %d", maxIndex) + } +} + +func TestGetNextSlotIndex_IncrementsByOne(t *testing.T) { + reservations := []v1alpha1.Reservation{ + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-0"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "commitment-abc123-3"}}, + } + + nextIndex := GetNextSlotIndex(reservations) + if nextIndex != 4 { + t.Errorf("expected next index 4, got %d", nextIndex) + } +} + +func TestGetNextSlotIndex_EmptyList(t *testing.T) { + nextIndex := GetNextSlotIndex([]v1alpha1.Reservation{}) + if nextIndex != 0 { + t.Errorf("expected 0 for empty list, got %d", nextIndex) + } +} + +func TestExtractCommitmentUUID_SimpleUUID(t *testing.T) { + uuid := extractCommitmentUUID("commitment-abc123-0") + if uuid != "abc123" { + t.Errorf("expected abc123, got %s", uuid) + } +} + +func TestExtractCommitmentUUID_ComplexUUID(t *testing.T) { + // UUID with dashes (like standard UUID format) + uuid := extractCommitmentUUID("commitment-550e8400-e29b-41d4-a716-446655440000-5") + if uuid != "550e8400-e29b-41d4-a716-446655440000" { + t.Errorf("expected full UUID, got %s", uuid) + } +} + +func TestExtractCommitmentUUID_NoSlotIndex(t *testing.T) { + uuid := extractCommitmentUUID("commitment-abc123") + if uuid != "abc123" { + t.Errorf("expected abc123, got %s", uuid) + } +} 
diff --git a/internal/scheduling/reservations/controller/client.go b/internal/scheduling/reservations/controller/client.go deleted file mode 100644 index a57428dc9..000000000 --- a/internal/scheduling/reservations/controller/client.go +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - "encoding/json" - "fmt" - "net/http" - - "github.com/cobaltcore-dev/cortex/pkg/keystone" - "github.com/cobaltcore-dev/cortex/pkg/sso" - "github.com/gophercloud/gophercloud/v2" - "github.com/sapcc/go-bits/must" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -var ( - syncLog = ctrl.Log.WithName("sync") -) - -// OpenStack hypervisor model as returned by the Nova API under /os-hypervisors/detail. -// See: https://docs.openstack.org/api-ref/compute/#list-hypervisors-details -type Hypervisor struct { - ID string `json:"id"` - Hostname string `json:"hypervisor_hostname"` - Service struct { - Host string `json:"host"` - } `json:"service"` - Type string `json:"hypervisor_type"` -} - -// Client to fetch hypervisor data. -type HypervisorClient interface { - // Init the client. - Init(ctx context.Context, client client.Client, conf Config) error - // List all hypervisors. - ListHypervisors(ctx context.Context) ([]Hypervisor, error) -} - -// Hypervisor client fetching commitments from openstack services. -type hypervisorClient struct { - // Providerclient authenticated against openstack. - provider *gophercloud.ProviderClient - // Nova service client for OpenStack. - nova *gophercloud.ServiceClient -} - -// Create a new hypervisor client. -// By default, this client will fetch hypervisors from the nova API. -func NewHypervisorClient() HypervisorClient { - return &hypervisorClient{} -} - -// Init the client. 
-func (c *hypervisorClient) Init(ctx context.Context, client client.Client, conf Config) error { - var authenticatedHTTP = http.DefaultClient - if conf.SSOSecretRef != nil { - var err error - authenticatedHTTP, err = sso.Connector{Client: client}. - FromSecretRef(ctx, *conf.SSOSecretRef) - if err != nil { - return err - } - } - authenticatedKeystone, err := keystone. - Connector{Client: client, HTTPClient: authenticatedHTTP}. - FromSecretRef(ctx, conf.KeystoneSecretRef) - if err != nil { - return err - } - // Automatically fetch the nova endpoint from the keystone service catalog. - c.provider = authenticatedKeystone.Client() - - // Get the nova endpoint. - url := must.Return(c.provider.EndpointLocator(gophercloud.EndpointOpts{ - Type: "compute", - Availability: "public", - })) - syncLog.Info("using nova endpoint", "url", url) - c.nova = &gophercloud.ServiceClient{ - ProviderClient: c.provider, - Endpoint: url, - Type: "compute", - Microversion: "2.61", - } - return nil -} - -func (c *hypervisorClient) ListHypervisors(ctx context.Context) ([]Hypervisor, error) { - // Note: currently we need to fetch this without gophercloud. - // Gophercloud will just assume the request is a single page even when - // the response is paginated, returning only the first page. 
- initialURL := c.nova.Endpoint + "os-hypervisors/detail" - var nextURL = &initialURL - var hypervisors []Hypervisor - for nextURL != nil { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, *nextURL, http.NoBody) - if err != nil { - return nil, err - } - req.Header.Set("X-Auth-Token", c.provider.Token()) - req.Header.Set("X-OpenStack-Nova-API-Version", c.nova.Microversion) - resp, err := c.nova.HTTPClient.Do(req) - if err != nil { - return nil, err - } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) - } - var list struct { - Hypervisors []Hypervisor `json:"hypervisors"` - Links []struct { - Rel string `json:"rel"` - Href string `json:"href"` - } `json:"hypervisors_links"` - } - err = json.NewDecoder(resp.Body).Decode(&list) - if err != nil { - return nil, err - } - hypervisors = append(hypervisors, list.Hypervisors...) - nextURL = nil - for _, link := range list.Links { - if link.Rel == "next" { - nextURL = &link.Href - break - } - } - } - return hypervisors, nil -} diff --git a/internal/scheduling/reservations/controller/client_test.go b/internal/scheduling/reservations/controller/client_test.go deleted file mode 100644 index f2b5582bc..000000000 --- a/internal/scheduling/reservations/controller/client_test.go +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type mockHypervisorClient struct { - hypervisorsToReturn []Hypervisor - errToReturn error -} - -func (m *mockHypervisorClient) Init(ctx context.Context, client client.Client, conf Config) error { - return nil -} - -func (m *mockHypervisorClient) ListHypervisors(ctx context.Context) ([]Hypervisor, error) { - return m.hypervisorsToReturn, m.errToReturn -} diff --git a/internal/scheduling/reservations/controller/controller.go 
b/internal/scheduling/reservations/controller/controller.go index 4eae4cfc2..17177770c 100644 --- a/internal/scheduling/reservations/controller/controller.go +++ b/internal/scheduling/reservations/controller/controller.go @@ -10,6 +10,7 @@ import ( "fmt" "net/http" "strings" + "time" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" @@ -23,10 +24,21 @@ import ( schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" + "github.com/cobaltcore-dev/cortex/internal/knowledge/db" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/cobaltcore-dev/cortex/pkg/multicluster" corev1 "k8s.io/api/core/v1" ) +const ( + // RequeueIntervalActive is the interval for requeueing active reservations for verification. + RequeueIntervalActive = 5 * time.Minute + // RequeueIntervalRetry is the interval for requeueing when retrying after knowledge is not ready. + RequeueIntervalRetry = 1 * time.Minute +) + // Endpoints for the reservations operator. type EndpointsConfig struct { // The nova external scheduler endpoint. @@ -42,18 +54,21 @@ type Config struct { // Secret ref to keystone credentials stored in a k8s secret. KeystoneSecretRef corev1.SecretReference `json:"keystoneSecretRef"` + + // Secret ref to the database credentials for querying VM state. + DatabaseSecretRef *corev1.SecretReference `json:"databaseSecretRef,omitempty"` } // ReservationReconciler reconciles a Reservation object type ReservationReconciler struct { - // Client to fetch hypervisors. - HypervisorClient // Client for the kubernetes API. client.Client // Kubernetes scheme to use for the reservations. Scheme *runtime.Scheme // Configuration for the controller. 
Conf Config + // Database connection for querying VM state from Knowledge cache. + DB *db.DB } // Reconcile is part of the main kubernetes reconciliation loop which aims to @@ -63,16 +78,60 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Fetch the reservation object. var res v1alpha1.Reservation if err := r.Get(ctx, req.NamespacedName, &res); err != nil { - // Can happen when the resource was just deleted. - return ctrl.Result{}, err + // Ignore not-found errors, since they can't be fixed by an immediate requeue + return ctrl.Result{}, client.IgnoreNotFound(err) } - // If the reservation is already active (Ready=True), skip it. + if meta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) { - log.Info("reservation is already active, skipping", "reservation", req.Name) - return ctrl.Result{}, nil // Don't need to requeue. + log.Info("reservation is active, verifying allocations", "reservation", req.Name) + + // Verify all allocations in Spec against actual VM state from database + if err := r.reconcileAllocations(ctx, &res); err != nil { + log.Error(err, "failed to reconcile allocations") + return ctrl.Result{}, err + } + + // Requeue periodically to keep verifying allocations + return ctrl.Result{RequeueAfter: RequeueIntervalActive}, nil } - // Sync Spec values to Status fields + // TODO trigger re-placement of unused reservations over time + + // Check if this is a pre-allocated reservation with allocations + if res.Spec.CommittedResourceReservation != nil && + len(res.Spec.CommittedResourceReservation.Allocations) > 0 && + res.Spec.TargetHost != "" { + // mark as ready without calling the placement API + log.Info("detected pre-allocated reservation", + "reservation", req.Name, + "targetHost", res.Spec.TargetHost, + "allocatedVMs", len(res.Spec.CommittedResourceReservation.Allocations)) + + old := res.DeepCopy() + res.Status.Host = res.Spec.TargetHost + 
meta.SetStatusCondition(&res.Status.Conditions, metav1.Condition{ + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "PreAllocated", + Message: "reservation pre-allocated with VM allocations", + }) + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, &res, patch); err != nil { + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch pre-allocated reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil + } + + log.Info("marked pre-allocated reservation as ready", "reservation", req.Name, "host", res.Status.Host) + // Requeue immediately to run verification in next reconcile loop + return ctrl.Result{Requeue: true}, nil + } + + // Sync Spec values to Status fields for non-pre-allocated reservations // This ensures the observed state reflects the desired state from Spec needsStatusUpdate := false if res.Spec.TargetHost != "" && res.Status.Host != res.Spec.TargetHost { @@ -83,13 +142,18 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) old := res.DeepCopy() patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to sync spec to status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to sync spec to status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } log.Info("synced spec to status", "reservation", req.Name, "host", res.Status.Host) } - // Currently we can only reconcile nova CommittedResourceReservations (those with ResourceName set). 
+ // filter for CR reservations resourceName := "" if res.Spec.CommittedResourceReservation != nil { resourceName = res.Spec.CommittedResourceReservation.ResourceName @@ -105,8 +169,13 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) }) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // Don't need to requeue. } @@ -130,49 +199,67 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) cpu = uint64(cpuValue) } - // Get all hosts and assign zero-weights to them. - hypervisors, err := r.ListHypervisors(ctx) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to list hypervisors: %w", err) + // Get project ID from CommittedResourceReservation spec if available. 
+ projectID := "" + if res.Spec.CommittedResourceReservation != nil { + projectID = res.Spec.CommittedResourceReservation.ProjectID } - var eligibleHosts []schedulerdelegationapi.ExternalSchedulerHost - for _, hv := range hypervisors { - eligibleHosts = append(eligibleHosts, schedulerdelegationapi.ExternalSchedulerHost{ - ComputeHost: hv.Service.Host, - HypervisorHostname: hv.Hostname, - }) + + // Get AvailabilityZone from reservation if available + availabilityZone := "" + if res.Spec.AvailabilityZone != "" { + availabilityZone = res.Spec.AvailabilityZone } - if len(eligibleHosts) == 0 { - log.Info("no eligible hosts found for reservation", "reservation", req.Name) - return ctrl.Result{}, errors.New("no eligible hosts found for reservation") + + // Get flavor details from flavor group knowledge CRD + knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client} + flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) + if err != nil { + log.Info("flavor knowledge not ready, requeueing", + "resourceName", resourceName, + "error", err) + return ctrl.Result{RequeueAfter: RequeueIntervalRetry}, nil } - weights := make(map[string]float64, len(eligibleHosts)) - for _, host := range eligibleHosts { - weights[host.ComputeHost] = 0.0 + + // Search for the flavor across all flavor groups + var flavorDetails *compute.FlavorInGroup + for _, fg := range flavorGroups { + for _, flavor := range fg.Flavors { + if flavor.Name == resourceName { + flavorDetails = &flavor + break + } + } + if flavorDetails != nil { + break + } } - // Get project ID from CommittedResourceReservation spec if available. 
- projectID := "" - if res.Spec.CommittedResourceReservation != nil { - projectID = res.Spec.CommittedResourceReservation.ProjectID + // Check if flavor was found + if flavorDetails == nil { + log.Error(errors.New("flavor not found"), "flavor not found in any flavor group", + "resourceName", resourceName) + return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil } // Call the external scheduler delegation API to get a host for the reservation. + // Cortex will fetch candidate hosts and weights itself from its knowledge state. externalSchedulerRequest := schedulerdelegationapi.ExternalSchedulerRequest{ Reservation: true, - Hosts: eligibleHosts, - Weights: weights, Spec: schedulerdelegationapi.NovaObject[schedulerdelegationapi.NovaSpec]{ Data: schedulerdelegationapi.NovaSpec{ - InstanceUUID: res.Name, - NumInstances: 1, // One for each reservation. - ProjectID: projectID, + InstanceUUID: res.Name, + NumInstances: 1, // One for each reservation. + ProjectID: projectID, + AvailabilityZone: availabilityZone, Flavor: schedulerdelegationapi.NovaObject[schedulerdelegationapi.NovaFlavor]{ Data: schedulerdelegationapi.NovaFlavor{ - Name: resourceName, - MemoryMB: memoryMB, - VCPUs: cpu, + Name: flavorDetails.Name, + MemoryMB: memoryMB, // take the memory from the reservation spec, not from the flavor - reservation might be bigger + VCPUs: cpu, // take the cpu from the reservation spec, not from the flavor - reservation might be bigger + ExtraSpecs: flavorDetails.ExtraSpecs, // Disk is currently not considered. 
+ }, }, }, @@ -187,13 +274,26 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) } response, err := httpClient.Post(url, "application/json", strings.NewReader(string(reqBody))) if err != nil { - log.Error(err, "failed to send external scheduler request") + log.Error(err, "failed to send external scheduler request", "url", url) return ctrl.Result{}, err } defer response.Body.Close() + + // Check HTTP status code before attempting to decode JSON + if response.StatusCode != http.StatusOK { + err := fmt.Errorf("unexpected HTTP status code: %d", response.StatusCode) + log.Error(err, "external scheduler returned non-OK status", + "url", url, + "statusCode", response.StatusCode, + "status", response.Status) + return ctrl.Result{}, err + } + var externalSchedulerResponse schedulerdelegationapi.ExternalSchedulerResponse if err := json.NewDecoder(response.Body).Decode(&externalSchedulerResponse); err != nil { - log.Error(err, "failed to decode external scheduler response") + log.Error(err, "failed to decode external scheduler response", + "url", url, + "statusCode", response.StatusCode) return ctrl.Result{}, err } if len(externalSchedulerResponse.Hosts) == 0 { @@ -207,8 +307,13 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) }) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // No need to requeue, we didn't find a host. 
} @@ -226,12 +331,141 @@ func (r *ReservationReconciler) Reconcile(ctx context.Context, req ctrl.Request) res.Status.Host = host patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { - log.Error(err, "failed to patch reservation status") - return ctrl.Result{}, err + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) != nil { + log.Error(err, "failed to patch reservation status") + return ctrl.Result{}, err + } + // Object was deleted, no need to continue + return ctrl.Result{}, nil } return ctrl.Result{}, nil // No need to requeue, the reservation is now active. } +// reconcileAllocations verifies all allocations in Spec against actual Nova VM state. +// It updates Status.Allocations based on the actual host location of each VM. +func (r *ReservationReconciler) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) error { + log := logf.FromContext(ctx) + + // Skip if no CommittedResourceReservation + if res.Spec.CommittedResourceReservation == nil { + return nil + } + + // TODO trigger migrations of unused reservations (to PAYG VMs) + + // Skip if no allocations to verify + if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { + log.Info("no allocations to verify", "reservation", res.Name) + return nil + } + + // Query all VMs for this project from the database + projectID := res.Spec.CommittedResourceReservation.ProjectID + serverMap, err := r.listServersByProjectID(ctx, projectID) + if err != nil { + return fmt.Errorf("failed to list servers for project %s: %w", projectID, err) + } + + // initialize + if res.Status.CommittedResourceReservation == nil { + res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{} + } + + // Build new Status.Allocations map based on actual VM locations + newStatusAllocations := make(map[string]string) + + for vmUUID := range res.Spec.CommittedResourceReservation.Allocations { + server, exists := 
serverMap[vmUUID] + if exists { + // VM found - record its actual host location + actualHost := server.OSEXTSRVATTRHost + newStatusAllocations[vmUUID] = actualHost + + log.Info("verified VM allocation", + "vm", vmUUID, + "reservation", res.Name, + "actualHost", actualHost, + "expectedHost", res.Status.Host) + } else { + // VM not found in database + log.Info("VM not found in database", + "vm", vmUUID, + "reservation", res.Name, + "projectID", projectID) + + // TODO handle entering and leave event + } + } + + // Patch the reservation status + old := res.DeepCopy() + + // Update Status.Allocations + res.Status.CommittedResourceReservation.Allocations = newStatusAllocations + + patch := client.MergeFrom(old) + if err := r.Status().Patch(ctx, res, patch); err != nil { + // Ignore not-found errors during background deletion + if client.IgnoreNotFound(err) == nil { + // Object was deleted, no need to continue + return nil + } + return fmt.Errorf("failed to patch reservation status: %w", err) + } + + log.Info("reconciled allocations", + "reservation", res.Name, + "specAllocations", len(res.Spec.CommittedResourceReservation.Allocations), + "statusAllocations", len(newStatusAllocations)) + + return nil +} + +// Init initializes the reconciler with required clients and DB connection. +func (r *ReservationReconciler) Init(ctx context.Context, client client.Client, conf Config) error { + // Initialize database connection if DatabaseSecretRef is provided. 
+ if conf.DatabaseSecretRef != nil { + var err error + r.DB, err = db.Connector{Client: client}.FromSecretRef(ctx, *conf.DatabaseSecretRef) + if err != nil { + return fmt.Errorf("failed to initialize database connection: %w", err) + } + logf.FromContext(ctx).Info("database connection initialized for reservation controller") + } + + return nil +} + +func (r *ReservationReconciler) listServersByProjectID(ctx context.Context, projectID string) (map[string]*nova.Server, error) { + if r.DB == nil { + return nil, errors.New("database connection not initialized") + } + + log := logf.FromContext(ctx) + + // Query servers from the database cache. + var servers []nova.Server + _, err := r.DB.Select(&servers, + "SELECT * FROM openstack_servers WHERE tenant_id = $1", + projectID) + if err != nil { + return nil, fmt.Errorf("failed to query servers from database: %w", err) + } + + log.V(1).Info("queried servers from database", + "projectID", projectID, + "serverCount", len(servers)) + + // Build lookup map for O(1) access by VM UUID. + serverMap := make(map[string]*nova.Server, len(servers)) + for i := range servers { + serverMap[servers[i].ID] = &servers[i] + } + + return serverMap, nil +} + // SetupWithManager sets up the controller with the Manager. 
func (r *ReservationReconciler) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { diff --git a/internal/scheduling/reservations/controller/controller_test.go b/internal/scheduling/reservations/controller/controller_test.go index d716c0b63..548857d3a 100644 --- a/internal/scheduling/reservations/controller/controller_test.go +++ b/internal/scheduling/reservations/controller/controller_test.go @@ -36,7 +36,7 @@ func TestReservationReconciler_Reconcile(t *testing.T) { shouldRequeue bool }{ { - name: "skip already active reservation", + name: "expect already active reservation", reservation: &v1alpha1.Reservation{ ObjectMeta: ctrl.ObjectMeta{ Name: "test-reservation", @@ -59,7 +59,7 @@ func TestReservationReconciler_Reconcile(t *testing.T) { }, }, expectedReady: true, - shouldRequeue: false, + shouldRequeue: true, }, { name: "skip reservation without resource name", @@ -155,10 +155,71 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T }, } + // Create flavor group knowledge CRD for the test + // Need to import compute package for FlavorGroupFeature + flavorGroups := []struct { + Name string `json:"name"` + Flavors []struct { + Name string `json:"name"` + MemoryMB uint64 `json:"memoryMB"` + VCPUs uint64 `json:"vcpus"` + ExtraSpecs map[string]string `json:"extraSpecs"` + } `json:"flavors"` + }{ + { + Name: "test-group", + Flavors: []struct { + Name string `json:"name"` + MemoryMB uint64 `json:"memoryMB"` + VCPUs uint64 `json:"vcpus"` + ExtraSpecs map[string]string `json:"extraSpecs"` + }{ + { + Name: "test-flavor", + MemoryMB: 1024, + VCPUs: 2, + ExtraSpecs: map[string]string{}, + }, + }, + }, + } + + // Marshal flavor groups into runtime.RawExtension + flavorGroupsJSON, err := json.Marshal(map[string]interface{}{ + "features": flavorGroups, + }) + if err != nil { + t.Fatalf("Failed to marshal flavor groups: %v", err) + } + + flavorGroupKnowledge 
:= &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{ + Name: "flavor_groups", // Note: underscore not hyphen + }, + Recency: metav1.Duration{Duration: 0}, + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: runtime.RawExtension{Raw: flavorGroupsJSON}, + RawLength: 1, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + Reason: "TestReady", + }, + }, + }, + } + client := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(reservation). - WithStatusSubresource(&v1alpha1.Reservation{}). + WithObjects(reservation, flavorGroupKnowledge). + WithStatusSubresource(&v1alpha1.Reservation{}, &v1alpha1.Knowledge{}). Build() // Create a mock server that returns a successful response @@ -196,28 +257,6 @@ func TestReservationReconciler_reconcileInstanceReservation_Success(t *testing.T Client: client, Scheme: scheme, Conf: config, - HypervisorClient: &mockHypervisorClient{ - hypervisorsToReturn: []Hypervisor{ - { - Hostname: "test-host-1", - Type: "qemu", - Service: struct { - Host string `json:"host"` - }{ - Host: "compute1", - }, - }, - { - Hostname: "test-host-2", - Type: "qemu", - Service: struct { - Host string `json:"host"` - }{ - Host: "compute2", - }, - }, - }, - }, } req := ctrl.Request{ diff --git a/internal/scheduling/reservations/flavor_groups.go b/internal/scheduling/reservations/flavor_groups.go new file mode 100644 index 000000000..197406eac --- /dev/null +++ b/internal/scheduling/reservations/flavor_groups.go @@ -0,0 +1,74 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package reservations + +import ( + "context" + "errors" + "fmt" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "k8s.io/apimachinery/pkg/api/meta" + 
"k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// FlavorGroupKnowledgeClient accesses flavor group data from Knowledge CRDs. +type FlavorGroupKnowledgeClient struct { + client.Client +} + +// Get retrieves the flavor groups Knowledge CRD and returns it if ready. +// Returns nil, nil if not ready yet. +func (c *FlavorGroupKnowledgeClient) Get(ctx context.Context) (*v1alpha1.Knowledge, error) { + knowledge := &v1alpha1.Knowledge{} + err := c.Client.Get(ctx, types.NamespacedName{ + Name: "flavor-groups", + // Namespace is empty as Knowledge is cluster-scoped + }, knowledge) + + if err != nil { + return nil, fmt.Errorf("failed to get flavor groups knowledge: %w", err) + } + + if meta.IsStatusConditionTrue(knowledge.Status.Conditions, v1alpha1.KnowledgeConditionReady) { + return knowledge, nil + } + + // Found but not ready yet + return nil, nil +} + +// GetAllFlavorGroups returns all flavor groups as a map. +// If knowledgeCRD is provided, uses it directly. Otherwise fetches the Knowledge CRD. +func (c *FlavorGroupKnowledgeClient) GetAllFlavorGroups(ctx context.Context, knowledgeCRD *v1alpha1.Knowledge) (map[string]compute.FlavorGroupFeature, error) { + // If no CRD provided, fetch it + if knowledgeCRD == nil { + var err error + knowledgeCRD, err = c.Get(ctx) + if err != nil { + return nil, err + } + if knowledgeCRD == nil { + return nil, errors.New("flavor groups knowledge is not ready") + } + } + + // Unbox the features from the raw extension + features, err := v1alpha1.UnboxFeatureList[compute.FlavorGroupFeature]( + knowledgeCRD.Status.Raw, + ) + if err != nil { + return nil, fmt.Errorf("failed to unbox flavor group features: %w", err) + } + + // Build map for efficient lookups + flavorGroupMap := make(map[string]compute.FlavorGroupFeature, len(features)) + for _, feature := range features { + flavorGroupMap[feature.Name] = feature + } + + return flavorGroupMap, nil +}