From 955ba2d89e88b524345ecbd8d9e33811e7c05593 Mon Sep 17 00:00:00 2001 From: Rajath Agasthya Date: Fri, 19 Dec 2025 15:21:22 -0600 Subject: [PATCH 1/2] Add support for dynamic MIG config generation Removes static, handcrafted MIG config file and uses config generated dynamically by mig-parted CLI, if a custom config is not specified. Signed-off-by: Rajath Agasthya --- api/nvidia/v1/clusterpolicy_types.go | 3 +- assets/state-mig-manager/0200_role.yaml | 10 + assets/state-mig-manager/0400_configmap.yaml | 625 ------------------ assets/state-mig-manager/0600_daemonset.yaml | 15 +- .../manifests/nvidia.com_clusterpolicies.yaml | 4 +- .../crd/bases/nvidia.com_clusterpolicies.yaml | 4 +- controllers/object_controls.go | 39 +- .../crds/nvidia.com_clusterpolicies.yaml | 4 +- deployments/gpu-operator/values.yaml | 21 +- 9 files changed, 60 insertions(+), 665 deletions(-) delete mode 100644 assets/state-mig-manager/0400_configmap.yaml diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 121493c7d..1a2c448e0 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1429,9 +1429,8 @@ type GDRCopySpec struct { // MIGPartedConfigSpec defines custom mig-parted config for NVIDIA MIG Manager container type MIGPartedConfigSpec struct { - // ConfigMap name + // ConfigMap name. If not specified, MIG configuration will be dynamically generated from hardware. // +kubebuilder:validation:Optional - // +kubebuilder:default=default-mig-parted-config // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ConfigMap Name" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text" diff --git a/assets/state-mig-manager/0200_role.yaml b/assets/state-mig-manager/0200_role.yaml index 5396cbeaa..c9fd933ad 100644 --- a/assets/state-mig-manager/0200_role.yaml +++ b/assets/state-mig-manager/0200_role.yaml @@ -21,3 +21,13 @@ rules: - list - watch - delete +- apiGroups: + - "" + resources: + - configmaps + verbs: + - create + - get + - list + - update + - patch diff --git a/assets/state-mig-manager/0400_configmap.yaml b/assets/state-mig-manager/0400_configmap.yaml deleted file mode 100644 index 90cdde095..000000000 --- a/assets/state-mig-manager/0400_configmap.yaml +++ /dev/null @@ -1,625 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: default-mig-parted-config - namespace: "FILLED BY THE OPERATOR" -data: - config.yaml: | - version: v1 - mig-configs: - all-disabled: - - devices: all - mig-enabled: false - - all-enabled: - - devices: all - mig-enabled: true - mig-devices: {} - - # A100-40GB, A800-40GB - all-1g.5gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.5gb": 7 - - all-1g.5gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.5gb+me": 1 - - all-2g.10gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.10gb": 3 - - all-3g.20gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.20gb": 2 - - all-4g.20gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.20gb": 1 - - all-7g.40gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.40gb": 1 - - # RTX-PRO-6000-96GB - all-1g.24gb.gfx: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb+gfx": 4 - - all-1g.24gb.me.all: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb+me.all": 1 - - all-1g.24gb-me: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb-me": 4 - - all-2g.48gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.48gb": 2 - - all-2g.48gb.gfx: - - devices: all - mig-enabled: true - mig-devices: - "2g.48gb+gfx": 2 - - all-2g.48gb.me.all: - - devices: all - mig-enabled: true - mig-devices: - "2g.48gb+me.all": 1 - - all-2g.48gb-me: - - devices: all - mig-enabled: true - mig-devices: - "2g.48gb-me": 2 - - all-4g.96gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.96gb": 1 - - all-4g.96gb.gfx: - - devices: all - mig-enabled: true - mig-devices: - "4g.96gb+gfx": 1 - - # H100-80GB, H800-80GB, A100-80GB, A800-80GB, A100-40GB, A800-40GB - all-1g.10gb: - # H100-80GB, H800-80GB, A100-80GB, A800-80GB - - device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.10gb": 7 - - # A100-40GB, A800-40GB - - device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.10gb": 4 - - # H100-80GB, H800-80GB, A100-80GB, A800-80GB - all-1g.10gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.10gb+me": 1 - - # H100-80GB, H800-80GB, A100-80GB, A800-80GB - all-1g.20gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.20gb": 4 - - # GB200, B200 - all-1g.23gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.23gb": 7 - - # GB200, B200 - all-1g.23gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.23gb+me": 1 - - all-1g.24gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb+me": 1 - - all-2g.20gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.20gb": 3 - - all-3g.40gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.40gb": 2 - - all-4g.40gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.40gb": 1 - - all-7g.80gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.80gb": 1 - - # A30-24GB - all-1g.6gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.6gb": 4 - - all-1g.6gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.6gb+me": 1 - - all-2g.12gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.12gb": 2 - - all-2g.12gb.me: - - devices: all - mig-enabled: true - mig-devices: - "2g.12gb+me": 1 - - all-4g.24gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.24gb": 1 - - # H100 NVL, H800 NVL, GH200 - all-1g.12gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.12gb": 7 - - all-1g.12gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.12gb+me": 1 - - all-1g.24gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.24gb": 4 - - all-1g.45gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.45gb": 4 - - all-1g.47gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.47gb": 4 - - all-2g.24gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.24gb": 3 - - all-2g.45gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.45gb": 3 - - all-2g.47gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.47gb": 3 - - # H100 NVL, H800 NVL - all-3g.47gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.47gb": 2 - - all-4g.47gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.47gb": 1 - - all-7g.94gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.94gb": 1 - - # H100-96GB, PG506-96GB, GH200 - all-3g.48gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.48gb": 2 - - all-3g.90gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.90gb": 2 - - all-3g.93gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.93gb": 2 - - all-3g.95gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.95gb": 2 - - all-4g.48gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.48gb": 1 - - all-4g.90gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.90gb": 1 - - all-4g.93gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.93gb": 1 - - all-4g.95gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.95gb": 1 - - all-7g.96gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.96gb": 1 - - all-7g.180gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.180gb": 1 - - all-7g.186gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.186gb": 1 - - all-7g.189gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.189gb": 1 - - # GB200 HGX, B200, GH200 144G HBM3e, H200-141GB, H200 NVL, H100-96GB, GH200, H100 NVL, H800 NVL, H100-80GB, H800-80GB, A800-40GB, A800-80GB, A100-40GB, A100-80GB, A30-24GB, PG506-96GB - all-balanced: - # GB200 HGX - - device-filter: ["0x294110DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.23gb": 2 - "2g.47gb": 1 - "3g.93gb": 1 - - # RTX-PRO-6000-96GB - - device-filter: ["0x2BB510DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.24gb": 2 - "2g.48gb": 1 - - # B200 - - device-filter: ["0x290110DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.23gb": 2 - "2g.45gb": 1 - "3g.90gb": 1 - - # GH200 144G HBM3e - - device-filter: ["0x234810DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.18gb": 2 - "2g.36gb": 1 - "3g.72gb": 1 - - # H200 141GB, H200 NVL - - device-filter: ["0x233510DE", "0x233B10DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.18gb": 2 - "2g.35gb": 1 - "3g.71gb": 1 - - # H100 NVL, H800 NVL - - device-filter: ["0x232110DE", "0x233A10DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.12gb": 2 - "2g.24gb": 1 - "3g.47gb": 1 - - # H100-80GB, H800-80GB, A100-80GB, A800-80GB - - device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.10gb": 2 - "2g.20gb": 1 - "3g.40gb": 1 - - # A100-40GB, A800-40GB - - device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.5gb": 2 - "2g.10gb": 1 - "3g.20gb": 1 - - # A30-24GB - - device-filter: "0x20B710DE" - devices: all - mig-enabled: true - mig-devices: - "1g.6gb": 2 - "2g.12gb": 1 - - # H100-96GB, PG506-96GB, GH200, H20 - - device-filter: ["0x234210DE", "0x233D10DE", "0x20B610DE", "0x232910DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.12gb": 2 - "2g.24gb": 1 - "3g.48gb": 1 - - # B300 - - device-filter: ["0x318210DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.34gb": 2 - "2g.67gb": 1 - "3g.135gb": 1 - - # GB300 - - device-filter: ["0x31C210DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.35gb": 2 - "2g.70gb": 1 - "3g.139gb": 1 - - # H200-141GB, GH200 144G HBM3e - all-1g.18gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.18gb": 7 - - all-1g.18gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.18gb+me": 1 - - all-1g.35gb: - # H200-141GB - - device-filter: ["0x233510DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.35gb": 4 - # GB300 - - device-filter: ["0x31C210DE"] - devices: all - mig-enabled: true - mig-devices: - "1g.35gb": 7 - - all-2g.35gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.35gb": 3 - - all-3g.71gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.71gb": 2 - - all-4g.71gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.71gb": 1 - - all-7g.141gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.141gb": 1 - - # GH200 144G HBM3e - all-1g.36gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.36gb": 4 - - all-2g.36gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.36gb": 3 - - all-3g.72gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.72gb": 2 - - all-4g.72gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.72gb": 1 - - all-7g.144gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.144gb": 1 - - # B300 - all-1g.34gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.34gb": 7 - - all-1g.34gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.34gb+me": 1 - - all-1g.67gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.67gb": 4 - - all-2g.67gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.67gb": 3 - - all-3g.135gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.135gb": 2 - - all-4g.135gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.135gb": 1 - - all-7g.269gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.269gb": 1 - - # GB300 - all-1g.35gb.me: - - devices: all - mig-enabled: true - mig-devices: - "1g.35gb+me": 1 - - all-1g.70gb: - - devices: all - mig-enabled: true - mig-devices: - "1g.70gb": 4 - - all-2g.70gb: - - devices: all - mig-enabled: true - mig-devices: - "2g.70gb": 3 - - all-3g.139gb: - - devices: all - mig-enabled: true - mig-devices: - "3g.139gb": 2 - - all-4g.139gb: - - devices: all - mig-enabled: true - mig-devices: - "4g.139gb": 1 - - all-7g.278gb: - - devices: all - mig-enabled: true - mig-devices: - "7g.278gb": 1 diff --git a/assets/state-mig-manager/0600_daemonset.yaml b/assets/state-mig-manager/0600_daemonset.yaml index 1a9076169..c5ec8e283 100644 --- a/assets/state-mig-manager/0600_daemonset.yaml +++ b/assets/state-mig-manager/0600_daemonset.yaml @@ -47,8 +47,14 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName - - name: CONFIG_FILE - value: "/mig-parted-config/config.yaml" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace - name: GPU_CLIENTS_FILE value: "/gpu-clients/clients.yaml" - name: DEFAULT_GPU_CLIENTS_NAMESPACE @@ -66,8 +72,6 @@ spec: mountPath: /run/nvidia/validations - mountPath: /sys name: host-sys - - mountPath: /mig-parted-config - name: mig-parted-config - mountPath: /host name: host-root mountPropagation: HostToContainer @@ -87,9 +91,6 @@ spec: hostPath: path: /sys type: Directory - - name: mig-parted-config - configMap: - name: "FILLED_BY_OPERATOR" - name: run-nvidia-validations hostPath: path: "/run/nvidia/validations" diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index afaffbbe6..c688533a0 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -1410,8 +1410,8 @@ spec: - "" type: string name: - default: default-mig-parted-config - description: ConfigMap name + description: ConfigMap name. If not specified, MIG configuration + will be dynamically generated from hardware. type: string type: object enabled: diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index afaffbbe6..c688533a0 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -1410,8 +1410,8 @@ spec: - "" type: string name: - default: default-mig-parted-config - description: ConfigMap name + description: ConfigMap name. If not specified, MIG configuration + will be dynamically generated from hardware. type: string type: object enabled: diff --git a/controllers/object_controls.go b/controllers/object_controls.go index e84adb6c2..1aaf9a842 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -23,13 +23,12 @@ import ( "fmt" "os" "path" + "path/filepath" "regexp" "sort" "strconv" "strings" - "path/filepath" - apiconfigv1 "github.com/openshift/api/config/v1" apiimagev1 "github.com/openshift/api/image/v1" secv1 "github.com/openshift/api/security/v1" @@ -102,8 +101,6 @@ const ( ValidatorRuntimeClassEnvName = "VALIDATOR_RUNTIME_CLASS" // MigStrategyEnvName indicates env name for passing MIG strategy MigStrategyEnvName = "MIG_STRATEGY" - // MigPartedDefaultConfigMapName indicates name of ConfigMap containing default mig-parted config - MigPartedDefaultConfigMapName = "default-mig-parted-config" // MigDefaultGPUClientsConfigMapName indicates name of ConfigMap containing default gpu-clients MigDefaultGPUClientsConfigMapName = "default-gpu-clients" // DCGMRemoteEngineEnvName indicates env name to specify remote DCGM host engine ip:port @@ -536,14 +533,6 @@ func createConfigMap(n ClusterPolicyController, configMapIdx int) (gpuv1.State, return gpuv1.Disabled, nil } - // avoid creating default 'mig-parted-config' ConfigMap if custom one is provided - if obj.Name == MigPartedDefaultConfigMapName { - if name, isCustom := gpuv1.GetConfigMapName(config.MIGManager.Config, MigPartedDefaultConfigMapName); isCustom { - logger.Info("Not creating resource, custom ConfigMap provided", "Name", name) - return gpuv1.Ready, nil - } - } - // avoid creating default 'gpu-clients' ConfigMap if custom one is provided if obj.Name == MigDefaultGPUClientsConfigMapName { if name, isCustom := gpuv1.GetConfigMapName(config.MIGManager.GPUClientsConfig, MigDefaultGPUClientsConfigMapName); isCustom { @@ -1982,15 +1971,25 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) setNRIPluginAnnotation(&obj.Spec.Template.ObjectMeta, &config.CDI, obj.Spec.Template.Spec.Containers[0].Name) - // set ConfigMap name for "mig-parted-config" Volume - for i, vol := range obj.Spec.Template.Spec.Volumes { - if !strings.Contains(vol.Name, "mig-parted-config") { - continue - } + // mount custom mig-parted config if provided + hasCustomConfig := config.MIGManager.Config != nil && config.MIGManager.Config.Name != "" + if hasCustomConfig { + migConfigVolume := createConfigMapVolume(config.MIGManager.Config.Name, nil) + migConfigVolume.Name = "mig-parted-config" + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, migConfigVolume) + + migConfigMount := corev1.VolumeMount{ + Name: "mig-parted-config", + MountPath: "/mig-parted-config", + ReadOnly: true, + } + obj.Spec.Template.Spec.Containers[0].VolumeMounts = append( + obj.Spec.Template.Spec.Containers[0].VolumeMounts, + migConfigMount, + ) - name, _ := gpuv1.GetConfigMapName(config.MIGManager.Config, MigPartedDefaultConfigMapName) - obj.Spec.Template.Spec.Volumes[i].ConfigMap.Name = name - break + // NOTE: assumes ConfigMap has key "config.yaml" + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "CONFIG_FILE", "/mig-parted-config/config.yaml") } // set ConfigMap name for "gpu-clients" Volume diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index afaffbbe6..c688533a0 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -1410,8 +1410,8 @@ spec: - "" type: string name: - default: default-mig-parted-config - description: ConfigMap name + description: ConfigMap name. If not specified, MIG configuration + will be dynamically generated from hardware. type: string type: object enabled: diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 73903bc70..0943cd620 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -344,9 +344,18 @@ migManager: env: [] resources: {} # MIG configuration - # Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations(i.e with create=true). - # Use "data" to build an integrated ConfigMap from a set of configurations as - # part of this helm chart. An example of setting "data" might be: + # NOTE: MIG manager automatically generates configuration from hardware on each node. + # Only provide a custom config if you need settings that differ from hardware discovery. + # + # To use an existing ConfigMap: + # - Set name="your-configmap-name" with create=false + # - ConfigMap MUST have a key named "config.yaml" + # + # To create a new ConfigMap via Helm: + # - Set create=true, name="your-configmap-name", and provide data below + # - If create=true but data is empty, ConfigMap creation is skipped + # + # Example of creating a custom ConfigMap: # config: # name: custom-mig-parted-configs # create: true @@ -378,9 +387,11 @@ migManager: default: "all-disabled" # Create a ConfigMap (default: false) create: false - # ConfigMap name (either existing or to create a new one with create=true above) + # ConfigMap name (either existing or to create with create=true) + # If name is provided, mig-manager will use this config instead of auto-generated one. + # REQUIREMENT: Custom ConfigMaps must contain a key named "config.yaml" name: "" - # Data section for the ConfigMap to create (i.e only applies when create=true) + # Data section for the ConfigMap (required only if create=true) data: {} gpuClientsConfig: name: "" From f7b2f8f06f54fc295248c931a449a03f28e1d426 Mon Sep 17 00:00:00 2001 From: Rajath Agasthya Date: Thu, 22 Jan 2026 12:04:43 -0600 Subject: [PATCH 2/2] Add tests for MIG Manager dynamic config Test the conditional ConfigMap mounting logic for custom MIG config: * Unit tests verify volume, volumeMount, and CONFIG_FILE env var * Integration tests verify end-to-end daemonset creation Signed-off-by: Rajath Agasthya --- controllers/object_controls_test.go | 152 ++++++++++++++++++++++++++++ controllers/transforms_test.go | 92 +++++++++++++++++ 2 files changed, 244 insertions(+) diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index 2d84b2fc5..340ce818f 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -58,6 +58,7 @@ const ( sandboxDevicePluginAssetsPath = "assets/state-sandbox-device-plugin" devicePluginAssetsPath = "assets/state-device-plugin/" dcgmExporterAssetsPath = "assets/state-dcgm-exporter/" + migManagerAssetsPath = "assets/state-mig-manager/" nfdNvidiaPCILabelKey = "feature.node.kubernetes.io/pci-10de.present" upgradedKernel = "5.4.135-generic" ) @@ -423,6 +424,24 @@ func testDaemonsetCommon(t *testing.T, cp *gpuv1.ClusterPolicy, component string if err != nil { return nil, fmt.Errorf("unable to get mainCtrImage for dcgm-exporter: %v", err) } + case "MIGManager": + spec = commonDaemonsetSpec{ + repository: cp.Spec.MIGManager.Repository, + image: cp.Spec.MIGManager.Image, + version: cp.Spec.MIGManager.Version, + imagePullPolicy: cp.Spec.MIGManager.ImagePullPolicy, + imagePullSecrets: getImagePullSecrets(cp.Spec.MIGManager.ImagePullSecrets), + args: cp.Spec.MIGManager.Args, + env: cp.Spec.MIGManager.Env, + resources: cp.Spec.MIGManager.Resources, + } + dsLabel = "nvidia-mig-manager" + mainCtrName = "nvidia-mig-manager" + manifestFile = filepath.Join(cfg.root, migManagerAssetsPath) + mainCtrImage, err = gpuv1.ImagePath(&cp.Spec.MIGManager) + if err != nil { + return nil, fmt.Errorf("unable to get mainCtrImage for mig-manager: %v", err) + } default: return nil, fmt.Errorf("invalid component for testDaemonsetCommon(): %s", component) } @@ -1479,3 +1498,136 @@ func TestCertConfigPathMap(t *testing.T) { require.Equal(t, expectedPath, path, "Incorrect path for OS %s", os) } } + +// getMIGManagerTestInput returns a ClusterPolicy instance for a particular +// MIG Manager test case. This function will grow as new test cases are added +func getMIGManagerTestInput(testCase string) *gpuv1.ClusterPolicy { + cp := clusterPolicy.DeepCopy() + + // Set default values for MIG Manager + cp.Spec.MIGManager.Repository = "nvcr.io/nvidia/cloud-native" + cp.Spec.MIGManager.Image = "k8s-mig-manager" + cp.Spec.MIGManager.Version = "v0.5.0" + cp.Spec.MIGManager.ImagePullSecrets = []string{"ngc-secret"} + + // Validator is required for all daemonset tests + cp.Spec.Validator.Repository = "nvcr.io/nvidia/cloud-native" + cp.Spec.Validator.Image = "gpu-operator-validator" + cp.Spec.Validator.Version = "v1.11.0" + cp.Spec.Validator.ImagePullSecrets = []string{"ngc-secret"} + + switch testCase { + case "default": + // No custom config + case "custom-config": + cp.Spec.MIGManager.Config = &gpuv1.MIGPartedConfigSpec{Name: "custom-mig-config"} + default: + return nil + } + + return cp +} + +// getMIGManagerTestOutput returns a map containing expected output for +// MIG Manager test case. This function will grow as new test cases are added +func getMIGManagerTestOutput(testCase string) map[string]interface{} { + // default output + output := map[string]interface{}{ + "numDaemonsets": 1, + "migManagerImage": "nvcr.io/nvidia/cloud-native/k8s-mig-manager:v0.5.0", + "imagePullSecret": "ngc-secret", + "migConfigVolumePresent": false, + "env": map[string]string{}, + } + + switch testCase { + case "default": + // No config volume + case "custom-config": + output["migConfigVolumePresent"] = true + output["env"] = map[string]string{ + "CONFIG_FILE": "/mig-parted-config/config.yaml", + } + default: + return nil + } + + return output +} + +// TestMIGManager tests that the GPU Operator correctly deploys the mig-manager daemonset +// under various scenarios/config options +func TestMIGManager(t *testing.T) { + testCases := []struct { + description string + clusterPolicy *gpuv1.ClusterPolicy + output map[string]interface{} + }{ + { + "Default", + getMIGManagerTestInput("default"), + getMIGManagerTestOutput("default"), + }, + { + "CustomConfig", + getMIGManagerTestInput("custom-config"), + getMIGManagerTestOutput("custom-config"), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + ds, err := testDaemonsetCommon(t, tc.clusterPolicy, "MIGManager", tc.output["numDaemonsets"].(int)) + if err != nil { + t.Fatalf("error in testDaemonsetCommon(): %v", err) + } + if ds == nil { + return + } + + migManagerImage := "" + mainCtrIdx := 0 + migConfigVolumePresent := false + + // Find nvidia-mig-manager container and check image + for i, container := range ds.Spec.Template.Spec.Containers { + if container.Name == "nvidia-mig-manager" { + migManagerImage = container.Image + mainCtrIdx = i + break + } + } + + // Check for mig-parted-config volume + for _, vol := range ds.Spec.Template.Spec.Volumes { + if vol.Name == "mig-parted-config" { + migConfigVolumePresent = true + break + } + } + + require.Equal(t, tc.output["migManagerImage"], migManagerImage, "Unexpected configuration for mig-manager image") + require.Equal(t, tc.output["migConfigVolumePresent"], migConfigVolumePresent, "Unexpected configuration for mig-parted-config volume") + + // Check expected env vars + for key, value := range tc.output["env"].(map[string]string) { + envFound := false + for _, envVar := range ds.Spec.Template.Spec.Containers[mainCtrIdx].Env { + if envVar.Name == key && envVar.Value == value { + envFound = true + } + } + if !envFound { + t.Fatalf("Expected env is not set for daemonset mig-manager %s->%s", key, value) + } + } + + // cleanup by deleting all kubernetes objects + err = removeState(&clusterPolicyController, clusterPolicyController.idx-1) + if err != nil { + t.Fatalf("error removing state %v:", err) + } + clusterPolicyController.idx-- + }) + } +} diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 9787a4556..1b86fa7a4 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1656,6 +1656,98 @@ func TestTransformMigManager(t *testing.T) { }, }).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), }, + { + description: "mig manager with custom config", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "mig-manager"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + MIGManager: gpuv1.MIGManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "mig-manager", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Config: &gpuv1.MIGPartedConfigSpec{Name: "custom-mig-config"}, + }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "mig-manager", + Image: "nvcr.io/nvidia/cloud-native/mig-manager:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: "CONFIG_FILE", Value: "/mig-parted-config/config.yaml"}, + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "mig-parted-config", MountPath: "/mig-parted-config", ReadOnly: true}, + }, + }).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithVolume(corev1.Volume{ + Name: "mig-parted-config", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: "custom-mig-config"}, + }, + }, + }), + }, + { + description: "mig manager without config (nil)", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "mig-manager"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + MIGManager: gpuv1.MIGManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "mig-manager", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Config: nil, + }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "mig-manager", + Image: "nvcr.io/nvidia/cloud-native/mig-manager:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, + }, + }).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), + }, + { + description: "mig manager with empty config name", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "mig-manager"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + MIGManager: gpuv1.MIGManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "mig-manager", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Config: &gpuv1.MIGPartedConfigSpec{Name: ""}, + }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "mig-manager", + Image: "nvcr.io/nvidia/cloud-native/mig-manager:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, + }, + }).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), + }, } for _, tc := range testCases {