diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index ea4b21d86..4a1fb7290 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -94,6 +94,8 @@ type ClusterPolicySpec struct { CCManager CCManagerSpec `json:"ccManager,omitempty"` // HostPaths defines various paths on the host needed by GPU Operator components HostPaths HostPathsSpec `json:"hostPaths,omitempty"` + // FabricManager component spec + FabricManager FabricManagerSpec `json:"fabricManager,omitempty"` } // Runtime defines container runtime type @@ -1724,6 +1726,38 @@ type CDIConfigSpec struct { Default *bool `json:"default,omitempty"` } +// FabricMode defines the Fabric Manager mode +type FabricMode string + +const ( + // FabricModeFullPassthrough indicates Full-passthrough mode (FABRIC_MODE=0) + FabricModeFullPassthrough FabricMode = "full-passthrough" + // FabricModeSharedNVSwitch indicates Shared NVSwitch Virtualization mode (FABRIC_MODE=1) + FabricModeSharedNVSwitch FabricMode = "shared-nvswitch" +) + +func (f FabricMode) String() string { + switch f { + case FabricModeFullPassthrough: + return "full-passthrough" + case FabricModeSharedNVSwitch: + return "shared-nvswitch" + default: + return "" + } +} + +// FabricManagerSpec defines the properties for NVIDIA Fabric Manager configuration +type FabricManagerSpec struct { + // Mode indicates the Fabric Manager mode + // +kubebuilder:validation:Enum=full-passthrough;shared-nvswitch + // +kubebuilder:default=full-passthrough + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Fabric Manager Mode" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:full-passthrough,urn:alm:descriptor:com.tectonic.ui:select:shared-nvswitch" + Mode FabricMode `json:"mode,omitempty"` +} + // MIGStrategy indicates MIG mode 
type MIGStrategy string @@ -2218,3 +2252,18 @@ func (c *MIGPartedConfigSpec) GetName() string { func (c *VGPUDevicesConfigSpec) GetName() string { return ptr.Deref(c, VGPUDevicesConfigSpec{}).Name } + +// IsSharedNVSwitchMode returns true if Fabric Manager is configured for Shared NVSwitch mode; a nil receiver reports false +func (f *FabricManagerSpec) IsSharedNVSwitchMode() bool { + return ptr.Deref(f, FabricManagerSpec{}).Mode == FabricModeSharedNVSwitch +} + +// ValidateFabricManagerConfig validates the Fabric Manager configuration +func (c *ClusterPolicySpec) ValidateFabricManagerConfig() error { + if c.SandboxWorkloads.DefaultWorkload == "vm-passthrough" && + c.FabricManager.IsSharedNVSwitchMode() && + !c.Driver.IsEnabled() { + return fmt.Errorf("driver must be enabled when using vm-passthrough with Fabric Manager Shared NVSwitch mode") + } + return nil +} diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 9e68fdb37..5b86cd8e2 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -209,6 +209,7 @@ func (in *ClusterPolicySpec) DeepCopyInto(out *ClusterPolicySpec) { in.KataManager.DeepCopyInto(&out.KataManager) in.CCManager.DeepCopyInto(&out.CCManager) out.HostPaths = in.HostPaths + out.FabricManager = in.FabricManager } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterPolicySpec. @@ -788,6 +789,21 @@ func (in *EnvVar) DeepCopy() *EnvVar { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FabricManagerSpec) DeepCopyInto(out *FabricManagerSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FabricManagerSpec. 
+func (in *FabricManagerSpec) DeepCopy() *FabricManagerSpec { + if in == nil { + return nil + } + out := new(FabricManagerSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GDRCopySpec) DeepCopyInto(out *GDRCopySpec) { *out = *in diff --git a/assets/state-driver/0400_configmap.yaml b/assets/state-driver/0400_configmap.yaml index 67aa1e2ca..3ab40437c 100644 --- a/assets/state-driver/0400_configmap.yaml +++ b/assets/state-driver/0400_configmap.yaml @@ -22,8 +22,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml index 853cf6fc9..c6023e9b3 100644 --- a/assets/state-driver/0500_daemonset.yaml +++ b/assets/state-driver/0500_daemonset.yaml @@ -50,29 +50,29 @@ spec: command: ["driver-manager"] args: ["uninstall_driver"] env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - # always use runc for driver containers - - name: NVIDIA_VISIBLE_DEVICES - value: void - - name: ENABLE_GPU_POD_EVICTION - value: "true" - - name: ENABLE_AUTO_DRAIN - value: "false" - - name: DRAIN_USE_FORCE - value: "false" - - name: DRAIN_POD_SELECTOR_LABEL - value: "" - - name: DRAIN_TIMEOUT_SECONDS - value: "0s" - - name: DRAIN_DELETE_EMPTYDIR_DATA - value: "false" - - name: OPERATOR_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace + - name: NODE_NAME + 
valueFrom: + fieldRef: + fieldPath: spec.nodeName + # always use runc for driver containers + - name: NVIDIA_VISIBLE_DEVICES + value: void + - name: ENABLE_GPU_POD_EVICTION + value: "true" + - name: ENABLE_AUTO_DRAIN + value: "false" + - name: DRAIN_USE_FORCE + value: "false" + - name: DRAIN_POD_SELECTOR_LABEL + value: "" + - name: DRAIN_TIMEOUT_SECONDS + value: "0s" + - name: DRAIN_DELETE_EMPTYDIR_DATA + value: "false" + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace securityContext: privileged: true volumeMounts: @@ -89,193 +89,203 @@ spec: mountPath: /run/mellanox/drivers mountPropagation: HostToContainer containers: - - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: nvidia-driver-ctr - command: ["nvidia-driver"] - args: ["init"] - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: run-nvidia - mountPath: /run/nvidia - mountPropagation: Bidirectional - - name: run-nvidia-fabricmanager - mountPath: /run/nvidia-fabricmanager - - name: run-nvidia-topologyd - mountPath: /run/nvidia-topologyd - - name: var-log - mountPath: /var/log - - name: dev-log - mountPath: /dev/log - - name: host-os-release - mountPath: "/host-etc/os-release" - readOnly: true - - name: mlnx-ofed-usr-src - mountPath: /run/mellanox/drivers/usr/src - mountPropagation: HostToContainer - - name: run-mellanox-drivers - mountPath: /run/mellanox/drivers - mountPropagation: HostToContainer - - name: sysfs-memory-online - mountPath: /sys/devices/system/memory/auto_online_blocks - - name: firmware-search-path - mountPath: /sys/module/firmware_class/parameters/path - - name: nv-firmware - mountPath: /lib/firmware - - name: driver-startup-probe-script - mountPath: /usr/local/bin/startup-probe.sh - subPath: startup-probe.sh - startupProbe: - exec: - command: 
- - sh - - /usr/local/bin/startup-probe.sh - initialDelaySeconds: 60 - failureThreshold: 120 - successThreshold: 1 - periodSeconds: 10 - timeoutSeconds: 60 - lifecycle: - preStop: + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: nvidia-driver-ctr + command: ["nvidia-driver"] + args: ["init"] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: run-nvidia + mountPath: /run/nvidia + mountPropagation: Bidirectional + - name: run-nvidia-fabricmanager + mountPath: /run/nvidia-fabricmanager + - name: run-nvidia-topologyd + mountPath: /run/nvidia-topologyd + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true + - name: mlnx-ofed-usr-src + mountPath: /run/mellanox/drivers/usr/src + mountPropagation: HostToContainer + - name: run-mellanox-drivers + mountPath: /run/mellanox/drivers + mountPropagation: HostToContainer + - name: sysfs-memory-online + mountPath: /sys/devices/system/memory/auto_online_blocks + - name: firmware-search-path + mountPath: /sys/module/firmware_class/parameters/path + - name: nv-firmware + mountPath: /lib/firmware + - name: driver-startup-probe-script + mountPath: /usr/local/bin/startup-probe.sh + subPath: startup-probe.sh + startupProbe: + exec: + command: + - sh + - /usr/local/bin/startup-probe.sh + initialDelaySeconds: 60 + failureThreshold: 120 + successThreshold: 1 + periodSeconds: 10 + timeoutSeconds: 60 + lifecycle: + preStop: + exec: + command: + [ + "/bin/sh", + "-c", + "rm -f /run/nvidia/validations/.driver-ctr-ready", + ] + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: nvidia-peermem-ctr + command: ["nvidia-driver"] + # takes care of loading nvidia_peermem whenever it gets dynamically unloaded 
during MOFED driver re-install/update + args: ["reload_nvidia_peermem"] + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: run-nvidia + mountPath: /run/nvidia + mountPropagation: Bidirectional + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log + readOnly: true + - name: run-mellanox-drivers + mountPath: /run/mellanox/drivers + mountPropagation: HostToContainer + startupProbe: + exec: + command: [sh, -c, "nvidia-driver probe_nvidia_peermem"] + initialDelaySeconds: 10 + failureThreshold: 120 + successThreshold: 1 + periodSeconds: 10 + timeoutSeconds: 10 + livenessProbe: exec: - command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/.driver-ctr-ready"] - - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: nvidia-peermem-ctr - command: ["nvidia-driver"] - # takes care of loading nvidia_peermem whenever it gets dynamically unloaded during MOFED driver re-install/update - args: ["reload_nvidia_peermem"] - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: run-nvidia - mountPath: /run/nvidia - mountPropagation: Bidirectional - - name: var-log - mountPath: /var/log - - name: dev-log - mountPath: /dev/log - readOnly: true - - name: run-mellanox-drivers - mountPath: /run/mellanox/drivers - mountPropagation: HostToContainer - startupProbe: - exec: - command: - [sh, -c, 'nvidia-driver probe_nvidia_peermem'] - initialDelaySeconds: 10 - failureThreshold: 120 - successThreshold: 1 - periodSeconds: 10 - timeoutSeconds: 10 - livenessProbe: - exec: - command: - [sh, -c, 'nvidia-driver probe_nvidia_peermem'] - periodSeconds: 30 - initialDelaySeconds: 30 - failureThreshold: 1 - successThreshold: 1 - timeoutSeconds: 10 - - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: nvidia-fs-ctr - command: [bash, -xc] - args: ["until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be 
installed...; sleep 10; done; exec nvidia-gds-driver install"] - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: run-nvidia - mountPath: /run/nvidia - mountPropagation: HostToContainer - - name: var-log - mountPath: /var/log - - name: dev-log - mountPath: /dev/log - readOnly: true - startupProbe: - exec: - command: - [sh, -c, 'lsmod | grep nvidia_fs'] - initialDelaySeconds: 10 - failureThreshold: 120 - successThreshold: 1 - periodSeconds: 10 - timeoutSeconds: 10 - - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: nvidia-gdrcopy-ctr - command: [bash, -xc] - args: ["until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gdrcopy-driver install"] - securityContext: - privileged: true - seLinuxOptions: - level: "s0" - volumeMounts: - - name: run-nvidia - mountPath: /run/nvidia - mountPropagation: HostToContainer - - name: var-log - mountPath: /var/log - - name: dev-log - mountPath: /dev/log - readOnly: true - startupProbe: - exec: - command: - [sh, -c, 'lsmod | grep gdrdrv'] - initialDelaySeconds: 10 - failureThreshold: 120 - successThreshold: 1 - periodSeconds: 10 - timeoutSeconds: 10 - # Only kept when OpenShift DriverToolkit side-car is enabled. 
- - image: "FILLED BY THE OPERATOR" - imagePullPolicy: IfNotPresent - name: openshift-driver-toolkit-ctr - command: [bash, -xc] - args: ["until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ]; do echo Waiting for nvidia-driver-ctr container to prepare the shared directory ...; sleep 10; done; exec /mnt/shared-nvidia-driver-toolkit/ocp_dtk_entrypoint dtk-build-driver"] - securityContext: - # currently mandatory as 'nvidia-installer' loads (and - # unloads) the kernel module as part of the build process - privileged: true - seLinuxOptions: - level: "s0" - env: - - name: RHCOS_VERSION - value: "FILLED BY THE OPERATOR" - # always use runc for driver containers - - name: NVIDIA_VISIBLE_DEVICES - value: void - volumeMounts: - # corresponding volumes are dynamically injected by the - # operator when the OCP DriverToolkit side-car is enabled - - name: shared-nvidia-driver-toolkit - mountPath: /mnt/shared-nvidia-driver-toolkit - - name: var-log - mountPath: /var/log - - name: mlnx-ofed-usr-src - mountPath: /run/mellanox/drivers/usr/src - mountPropagation: HostToContainer - - name: host-os-release - mountPath: /host-etc/os-release - readOnly: true + command: [sh, -c, "nvidia-driver probe_nvidia_peermem"] + periodSeconds: 30 + initialDelaySeconds: 30 + failureThreshold: 1 + successThreshold: 1 + timeoutSeconds: 10 + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: nvidia-fs-ctr + command: [bash, -xc] + args: + [ + "until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gds-driver install", + ] + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: run-nvidia + mountPath: /run/nvidia + mountPropagation: HostToContainer + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log + readOnly: true + startupProbe: + exec: + command: [sh, -c, "lsmod | grep nvidia_fs"] + initialDelaySeconds: 10 + 
failureThreshold: 120 + successThreshold: 1 + periodSeconds: 10 + timeoutSeconds: 10 + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: nvidia-gdrcopy-ctr + command: [bash, -xc] + args: + [ + "until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gdrcopy-driver install", + ] + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: run-nvidia + mountPath: /run/nvidia + mountPropagation: HostToContainer + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log + readOnly: true + startupProbe: + exec: + command: [sh, -c, "lsmod | grep gdrdrv"] + initialDelaySeconds: 10 + failureThreshold: 120 + successThreshold: 1 + periodSeconds: 10 + timeoutSeconds: 10 + # Only kept when OpenShift DriverToolkit side-car is enabled. + - image: "FILLED BY THE OPERATOR" + imagePullPolicy: IfNotPresent + name: openshift-driver-toolkit-ctr + command: [bash, -xc] + args: + [ + "until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ]; do echo Waiting for nvidia-driver-ctr container to prepare the shared directory ...; sleep 10; done; exec /mnt/shared-nvidia-driver-toolkit/ocp_dtk_entrypoint dtk-build-driver", + ] + securityContext: + # currently mandatory as 'nvidia-installer' loads (and + # unloads) the kernel module as part of the build process + privileged: true + seLinuxOptions: + level: "s0" + env: + - name: RHCOS_VERSION + value: "FILLED BY THE OPERATOR" + # always use runc for driver containers + - name: NVIDIA_VISIBLE_DEVICES + value: void + volumeMounts: + # corresponding volumes are dynamically injected by the + # operator when the OCP DriverToolkit side-car is enabled + - name: shared-nvidia-driver-toolkit + mountPath: /mnt/shared-nvidia-driver-toolkit + - name: var-log + mountPath: /var/log + - name: mlnx-ofed-usr-src + mountPath: /run/mellanox/drivers/usr/src + mountPropagation: HostToContainer + - 
name: host-os-release + mountPath: /host-etc/os-release + readOnly: true volumes: - name: run-nvidia hostPath: diff --git a/assets/state-sandbox-validation/0200_role.yaml b/assets/state-sandbox-validation/0200_role.yaml index 79da66ff7..e1f616acb 100644 --- a/assets/state-sandbox-validation/0200_role.yaml +++ b/assets/state-sandbox-validation/0200_role.yaml @@ -12,3 +12,10 @@ rules: - use resourceNames: - privileged +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list diff --git a/assets/state-sandbox-validation/0500_daemonset.yaml b/assets/state-sandbox-validation/0500_daemonset.yaml index fcc2aa12a..982f64b53 100644 --- a/assets/state-sandbox-validation/0500_daemonset.yaml +++ b/assets/state-sandbox-validation/0500_daemonset.yaml @@ -26,6 +26,36 @@ spec: priorityClassName: system-node-critical serviceAccountName: nvidia-sandbox-validator initContainers: + - name: driver-validation + image: "FILLED BY THE OPERATOR" + command: ["sh", "-c"] + args: ["nvidia-validator"] + env: + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: driver + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + - name: driver-install-path + mountPath: /run/nvidia/driver + mountPropagation: HostToContainer + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + - name: host-dev-char + mountPath: /host-dev-char - name: cc-manager-validation image: "FILLED BY THE OPERATOR" command: ['sh', '-c'] @@ -145,3 +175,6 @@ spec: - name: host-root hostPath: path: / + - name: host-dev-char + hostPath: + path: /dev/char diff --git a/assets/state-vfio-manager/0500_daemonset.yaml b/assets/state-vfio-manager/0500_daemonset.yaml index 1039cc874..ed867a70a 100644 --- a/assets/state-vfio-manager/0500_daemonset.yaml 
+++ b/assets/state-vfio-manager/0500_daemonset.yaml @@ -80,6 +80,9 @@ spec: readOnly: true - name: host-root mountPath: /host + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional securityContext: privileged: true seLinuxOptions: @@ -102,6 +105,10 @@ spec: hostPath: path: /run/nvidia type: DirectoryOrCreate + - name: run-nvidia-validations + hostPath: + path: /run/nvidia/validations + type: DirectoryOrCreate - name: host-root hostPath: path: "/" diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 379e98d87..b8a6ad74a 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -1057,6 +1057,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: diff --git a/cmd/nvidia-validator/main.go b/cmd/nvidia-validator/main.go index ea6cb5d5d..cb235ae72 100644 --- a/cmd/nvidia-validator/main.go +++ b/cmd/nvidia-validator/main.go @@ -1624,18 +1624,22 @@ func (v *VfioPCI) validate() error { return err } - err = v.runValidation() - if err != nil { - return err - } - log.Info("Validation completed successfully - all devices are bound to vfio-pci") + for { + log.Info("Attempting to validate that all devices are bound to vfio-pci") + err := v.runValidation() + if err != nil { + if !withWaitFlag { + return fmt.Errorf("error validating vfio-pci: %w", err) + } + log.Warningf("failed to validate vfio-pci, retrying after %d seconds\n", sleepIntervalSecondsFlag) + time.Sleep(time.Duration(sleepIntervalSecondsFlag) * time.Second) + continue + } - // delete status file is already present - err = createStatusFile(outputDirFlag 
+ "/" + vfioPCIStatusFile) - if err != nil { - return err + log.Info("Validation completed successfully - all devices are bound to vfio-pci") + + return createStatusFile(outputDirFlag + "/" + vfioPCIStatusFile) } - return nil } func (v *VfioPCI) runValidation() error { diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 379e98d87..b8a6ad74a 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -1057,6 +1057,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: diff --git a/controllers/object_controls.go b/controllers/object_controls.go index a2a862bb0..782e46e9a 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -23,13 +23,12 @@ import ( "fmt" "os" "path" + "path/filepath" "regexp" "sort" "strconv" "strings" - "path/filepath" - apiconfigv1 "github.com/openshift/api/config/v1" apiimagev1 "github.com/openshift/api/image/v1" secv1 "github.com/openshift/api/security/v1" @@ -2013,10 +2012,70 @@ func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec // TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { - // update k8s-driver-manager initContainer - err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil) - if err != nil { - return fmt.Errorf("failed to transform k8s-driver-manager initContainer for VFIO Manager: %v", err) + // Check if we're in shared-nvswitch mode + if 
config.FabricManager.IsSharedNVSwitchMode() { + // In shared-nvswitch mode, replace driver uninstall with device unbind + // Find the k8s-driver-manager init container and replace it with vfio-manage unbind + for i := range obj.Spec.Template.Spec.InitContainers { + if obj.Spec.Template.Spec.InitContainers[i].Name == "k8s-driver-manager" { + initContainer := &obj.Spec.Template.Spec.InitContainers[i] + + // Get the main container image for consistency + mainImage, err := gpuv1.ImagePath(&config.VFIOManager) + if err != nil { + return err + } + + // Replace with synchronized vfio-manage unbind init container + initContainer.Name = "vfio-device-unbind" + initContainer.Image = mainImage + initContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.VFIOManager.ImagePullPolicy) + initContainer.Command = []string{"/bin/sh"} + initContainer.Args = []string{"-c", ` +# For shared-nvswitch mode, wait for driver to be ready before unbinding +echo "Shared NVSwitch mode detected, waiting for driver readiness..." +until [ -f /run/nvidia/validations/driver-ready ] +do + echo "waiting for the driver validations to be ready..." + sleep 5 +done + +set -o allexport +cat /run/nvidia/validations/driver-ready +. 
/run/nvidia/validations/driver-ready + +echo "Driver is ready, proceeding with device unbind" +exec vfio-manage unbind --all`} + + // Add HOST_ROOT env var needed by vfio-manage + setContainerEnv(initContainer, "HOST_ROOT", "/host") + + // Add nvidia-validations volume mount for driver-ready file + initContainer.VolumeMounts = append(initContainer.VolumeMounts, corev1.VolumeMount{ + Name: "nvidia-validations", + MountPath: "/run/nvidia/validations", + ReadOnly: true, + }) + break + } + } + + // Add nvidia-validations volume + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, corev1.Volume{ + Name: "nvidia-validations", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/run/nvidia/validations", + Type: &[]corev1.HostPathType{corev1.HostPathDirectoryOrCreate}[0], + }, + }, + }) + } else { + // Default behavior: update k8s-driver-manager initContainer + err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil) + if err != nil { + return fmt.Errorf("failed to transform k8s-driver-manager initContainer for VFIO Manager: %v", err) + } } // update image @@ -2232,12 +2291,27 @@ func TransformSandboxValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic "vgpu-devices", } + // Add driver validation when FabricManager.Mode is shared-nvswitch + if config.FabricManager.IsSharedNVSwitchMode() { + components = append(components, "driver") + } + for _, component := range components { if err := TransformValidatorComponent(config, &obj.Spec.Template.Spec, component); err != nil { validatorErr = errors.Join(validatorErr, err) } } + // Remove driver validation init container if NOT in shared-nvswitch mode + if !config.FabricManager.IsSharedNVSwitchMode() { + for i, initContainer := range obj.Spec.Template.Spec.InitContainers { + if initContainer.Name == "driver-validation" { + obj.Spec.Template.Spec.InitContainers = append(obj.Spec.Template.Spec.InitContainers[:i], 
obj.Spec.Template.Spec.InitContainers[i+1:]...) + break + } + } + } + if validatorErr != nil { n.logger.Info("WARN: errors transforming the validator containers: %v", validatorErr) } @@ -3492,6 +3566,13 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy } } + // Set Fabric Manager environment variable if configured + if config.FabricManager.IsSharedNVSwitchMode() { + setContainerEnv(driverContainer, "FABRIC_MANAGER_FABRIC_MODE", "1") + } else if config.FabricManager.Mode == gpuv1.FabricModeFullPassthrough { + setContainerEnv(driverContainer, "FABRIC_MANAGER_FABRIC_MODE", "0") + } + // no further repo configuration required when using pre-compiled drivers, return here. if config.Driver.UsePrecompiledDrivers() { return nil diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 4ea634ebe..badd54afc 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -42,6 +42,7 @@ const ( commonGPULabelValue = "true" commonOperandsLabelKey = "nvidia.com/gpu.deploy.operands" commonOperandsLabelValue = "true" + driverLabelKey = "nvidia.com/gpu.deploy.driver" migManagerLabelKey = "nvidia.com/gpu.deploy.mig-manager" migManagerLabelValue = "true" migCapableLabelKey = "nvidia.com/mig.capable" @@ -116,9 +117,10 @@ var gpuNodeLabels = map[string]string{ } type gpuWorkloadConfiguration struct { - config string - node string - log logr.Logger + config string + node string + log logr.Logger + clusterPolicy *gpuv1.ClusterPolicy } // OpenShiftDriverToolkit contains the values required to deploy @@ -322,6 +324,15 @@ func isValidWorkloadConfig(workloadConfig string) bool { return ok } +// shouldDeployDriverForVMPassthrough returns true if driver should be deployed for vm-passthrough workload +// based on Fabric Manager configuration +func (w *gpuWorkloadConfiguration) shouldDeployDriverForVMPassthrough() bool { + if w.config != gpuWorkloadConfigVMPassthrough || w.clusterPolicy == nil { + return false + } + return 
w.clusterPolicy.Spec.FabricManager.IsSharedNVSwitchMode() +} + // getWorkloadConfig returns the GPU workload configured for the node. // If an error occurs when searching for the workload config, // return defaultGPUWorkloadConfig. @@ -382,6 +393,16 @@ func (w *gpuWorkloadConfiguration) addGPUStateLabels(labels map[string]string) b modified = true } } + + // Add conditional driver deployment for vm-passthrough workload + if w.shouldDeployDriverForVMPassthrough() { + if _, ok := labels[driverLabelKey]; !ok { + w.log.Info("Setting node label for driver deployment in vm-passthrough with Fabric Manager shared-nvswitch mode", "NodeName", w.node, "Label", driverLabelKey, "Value", "true") + labels[driverLabelKey] = "true" + modified = true + } + } + if w.config == gpuWorkloadConfigContainer && hasMIGCapableGPU(labels) && !hasMIGManagerLabel(labels) { w.log.Info("Setting node label", "NodeName", w.node, "Label", migManagerLabelKey, "Value", migManagerLabelValue) labels[migManagerLabelKey] = migManagerLabelValue @@ -506,7 +527,7 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) { "Error", err, "defaultGPUWorkloadConfig", defaultGPUWorkloadConfig) } n.logger.Info("GPU workload configuration", "NodeName", node.Name, "GpuWorkloadConfig", config) - gpuWorkloadConfig := &gpuWorkloadConfiguration{config, node.Name, n.logger} + gpuWorkloadConfig := &gpuWorkloadConfiguration{config, node.Name, n.logger, n.singleton} if !hasCommonGPULabel(labels) && hasGPULabels(labels) { n.logger.Info("Node has GPU(s)", "NodeName", node.Name) // label the node with common Nvidia GPU label diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go index bd1641e94..584e51902 100644 --- a/controllers/state_manager_test.go +++ b/controllers/state_manager_test.go @@ -19,6 +19,8 @@ package controllers import ( "testing" + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" gpuv1 
"github.com/NVIDIA/gpu-operator/api/nvidia/v1" @@ -186,3 +188,323 @@ func TestHasMIGCapableGPU(t *testing.T) { } } } + +func TestGpuWorkloadConfiguration_ShouldDeployDriverForVMPassthrough(t *testing.T) { + tests := []struct { + name string + config string + clusterPolicy *gpuv1.ClusterPolicy + expected bool + }{ + { + name: "non-vm-passthrough workload", + config: gpuWorkloadConfigContainer, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + expected: false, + }, + { + name: "vm-passthrough with nil cluster policy", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: nil, + expected: false, + }, + { + name: "vm-passthrough with shared-nvswitch mode", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + expected: true, + }, + { + name: "vm-passthrough with full-passthrough mode", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + }, + expected: false, + }, + { + name: "vm-passthrough with default (empty) fabric manager mode", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: "", // empty defaults to full-passthrough + }, + }, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + workloadConfig := &gpuWorkloadConfiguration{ + config: tt.config, + node: "test-node", + log: logr.Discard(), + clusterPolicy: tt.clusterPolicy, + } + + result := workloadConfig.shouldDeployDriverForVMPassthrough() + assert.Equal(t, tt.expected, result) + }) + } +} + +func 
TestGpuWorkloadConfiguration_AddGPUStateLabels(t *testing.T) { + tests := []struct { + name string + config string + clusterPolicy *gpuv1.ClusterPolicy + inputLabels map[string]string + expectedLabels map[string]string + expectModified bool + }{ + { + name: "vm-passthrough with shared-nvswitch adds driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectModified: true, + }, + { + name: "vm-passthrough with full-passthrough does not add driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + }, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + }, + expectModified: true, + }, + { + name: "container workload is not affected", + config: gpuWorkloadConfigContainer, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{ + "existing-label": "value", + }, + expectedLabels: map[string]string{ + "existing-label": "value", + 
"nvidia.com/gpu.deploy.driver": "true", + "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", + "nvidia.com/gpu.deploy.container-toolkit": "true", + "nvidia.com/gpu.deploy.device-plugin": "true", + "nvidia.com/gpu.deploy.dcgm": "true", + "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "nvidia.com/gpu.deploy.node-status-exporter": "true", + "nvidia.com/gpu.deploy.operator-validator": "true", + }, + expectModified: true, + }, + { + name: "vm-passthrough with nil cluster policy does not add driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: nil, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + }, + expectModified: true, + }, + { + name: "driver label already exists - no modification", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectModified: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + workloadConfig := 
&gpuWorkloadConfiguration{ + config: tt.config, + node: "test-node", + log: logr.Discard(), + clusterPolicy: tt.clusterPolicy, + } + + // Make a copy of input labels to avoid modifying the test data + labels := make(map[string]string) + for k, v := range tt.inputLabels { + labels[k] = v + } + + modified := workloadConfig.addGPUStateLabels(labels) + + assert.Equal(t, tt.expectModified, modified) + assert.Equal(t, tt.expectedLabels, labels) + }) + } +} + +func TestClusterPolicyValidateFabricManagerConfig(t *testing.T) { + tests := []struct { + name string + clusterPolicy *gpuv1.ClusterPolicySpec + expectError bool + errorMessage string + }{ + { + name: "valid configuration - vm-passthrough with shared-nvswitch and driver enabled", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "vm-passthrough", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + Driver: gpuv1.DriverSpec{ + Enabled: newBoolPtr(true), + }, + }, + expectError: false, + }, + { + name: "valid configuration - vm-passthrough with full-passthrough mode", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "vm-passthrough", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + Driver: gpuv1.DriverSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectError: false, + }, + { + name: "valid configuration - container workload with any fabric manager mode", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "container", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + Driver: gpuv1.DriverSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectError: false, + }, + { + name: "invalid configuration - vm-passthrough with shared-nvswitch but driver disabled", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + 
SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "vm-passthrough", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + Driver: gpuv1.DriverSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectError: true, + errorMessage: "driver must be enabled when using vm-passthrough with Fabric Manager Shared NVSwitch mode", + }, + { + name: "valid configuration - vm-passthrough with shared-nvswitch and driver not specified (defaults to enabled)", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "vm-passthrough", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + Driver: gpuv1.DriverSpec{ + // Enabled not specified, defaults to true + }, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.clusterPolicy.ValidateFabricManagerConfig() + + if tt.expectError { + assert.Error(t, err) + assert.Contains(t, err.Error(), tt.errorMessage) + } else { + assert.NoError(t, err) + } + }) + } +} diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index cfab7da49..23ead1db0 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1810,7 +1810,7 @@ func TestTransformVFIOManager(t *testing.T) { expectedDaemonset Daemonset }{ { - description: "transform vfio manager", + description: "transform vfio manager - normal mode", daemonset: NewDaemonset(). WithContainer(corev1.Container{Name: "nvidia-vfio-manager"}). WithContainer(corev1.Container{Name: "sidecar"}). @@ -1833,6 +1833,9 @@ func TestTransformVFIOManager(t *testing.T) { Env: mockEnv, }, }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, }, expectedDaemonset: NewDaemonset(). WithContainer(corev1.Container{ @@ -1855,6 +1858,80 @@ func TestTransformVFIOManager(t *testing.T) { }). 
WithPullSecret(secret), }, + { + description: "transform vfio manager - shared-nvswitch mode", + daemonset: NewDaemonset(). + WithContainer(corev1.Container{Name: "nvidia-vfio-manager"}). + WithContainer(corev1.Container{Name: "sidecar"}). + WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}), + clusterPolicySpec: &gpuv1.ClusterPolicySpec{ + VFIOManager: gpuv1.VFIOManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "vfio-pci-manager", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{secret}, + Resources: &gpuv1.ResourceRequirements{Limits: resources.Limits, Requests: resources.Requests}, + Args: []string{"--test-flag"}, + Env: mockEnv, + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + expectedDaemonset: NewDaemonset(). + WithContainer(corev1.Container{ + Name: "nvidia-vfio-manager", + Image: "nvcr.io/nvidia/cloud-native/vfio-pci-manager:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--test-flag"}, + Env: mockEnvCore, + Resources: resources, + }). + WithContainer(corev1.Container{ + Name: "sidecar", + Resources: resources, + }). + WithInitContainer(corev1.Container{ + Name: "vfio-device-unbind", + Image: "nvcr.io/nvidia/cloud-native/vfio-pci-manager:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"/bin/sh"}, + Args: []string{"-c", ` +# For shared-nvswitch mode, wait for driver to be ready before unbinding +echo "Shared NVSwitch mode detected, waiting for driver readiness..." +until [ -f /run/nvidia/validations/driver-ready ] +do + echo "waiting for the driver validations to be ready..." + sleep 5 +done + +set -o allexport +cat /run/nvidia/validations/driver-ready +. 
/run/nvidia/validations/driver-ready + +echo "Driver is ready, proceeding with device unbind" +exec vfio-manage unbind --all`}, + Env: []corev1.EnvVar{{Name: "HOST_ROOT", Value: "/host"}}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "nvidia-validations", + MountPath: "/run/nvidia/validations", + ReadOnly: true, + }, + }, + }). + WithVolume(corev1.Volume{ + Name: "nvidia-validations", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/run/nvidia/validations", + Type: &[]corev1.HostPathType{corev1.HostPathDirectoryOrCreate}[0], + }, + }, + }). + WithPullSecret(secret), + }, } for _, tc := range testCases { @@ -2612,6 +2689,104 @@ func TestTransformSandboxValidator(t *testing.T) { WithPullSecret("pull-secret"). WithRuntimeClassName("nvidia"), }, + { + description: "fabric manager shared-nvswitch mode - driver validation should be preserved", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "driver-validation", Image: "old-image"}). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "old-image", + }), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + expectedDs: NewDaemonset(). + WithInitContainer(corev1.Container{ + Name: "driver-validation", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }), + }, + { + description: "fabric manager full-passthrough mode - driver validation should be removed", + ds: NewDaemonset(). 
+ WithInitContainer(corev1.Container{Name: "driver-validation", Image: "old-image"}). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "old-image", + }), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + expectedDs: func() Daemonset { + ds := NewDaemonset(). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }) + // Set an empty InitContainers slice to match what happens after removal + ds.Spec.Template.Spec.InitContainers = []corev1.Container{} + return ds + }(), + }, + { + description: "no fabric manager mode specified - driver validation should be removed", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "driver-validation", Image: "old-image"}). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "old-image", + }), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + }, + }, + expectedDs: func() Daemonset { + ds := NewDaemonset(). 
+ WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }) + // Set an empty InitContainers slice to match what happens after removal + ds.Spec.Template.Spec.InitContainers = []corev1.Container{} + return ds + }(), + }, } for _, tc := range testCases { @@ -2771,6 +2946,78 @@ func TestTransformDriver(t *testing.T) { }), errorExpected: false, }, + { + description: "driver with fabric manager shared-nvswitch mode", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-driver-ctr"}). + WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "driver", + Version: "570.172.08", + Manager: gpuv1.DriverManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "k8s-driver-manager", + Version: "v0.8.0", + }, + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + client: mockClientMap["secret-env-client"], + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "nvidia-driver-ctr", + Image: "nvcr.io/nvidia/driver:570.172.08-", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + { + Name: "FABRIC_MANAGER_FABRIC_MODE", + Value: "1", + }, + }, + }).WithInitContainer(corev1.Container{ + Name: "k8s-driver-manager", + Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + }), + errorExpected: false, + }, + { + description: "driver with fabric manager full-passthrough mode", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-driver-ctr"}). 
+ WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "driver", + Version: "570.172.08", + Manager: gpuv1.DriverManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "k8s-driver-manager", + Version: "v0.8.0", + }, + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + client: mockClientMap["secret-env-client"], + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "nvidia-driver-ctr", + Image: "nvcr.io/nvidia/driver:570.172.08-", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + { + Name: "FABRIC_MANAGER_FABRIC_MODE", + Value: "0", + }, + }, + }).WithInitContainer(corev1.Container{ + Name: "k8s-driver-manager", + Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + }), + errorExpected: false, + }, } for _, tc := range testCases { diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 379e98d87..b8a6ad74a 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -1057,6 +1057,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: diff --git a/internal/state/testdata/golden/driver-additional-configs.yaml b/internal/state/testdata/golden/driver-additional-configs.yaml index 88d0e7a09..f150861d2 100644 --- a/internal/state/testdata/golden/driver-additional-configs.yaml +++ b/internal/state/testdata/golden/driver-additional-configs.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-full-spec.yaml b/internal/state/testdata/golden/driver-full-spec.yaml index 2397f42fb..0e9a01b81 100644 --- a/internal/state/testdata/golden/driver-full-spec.yaml +++ b/internal/state/testdata/golden/driver-full-spec.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml index 4b1a6f85b..fda787c54 100644 --- a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml +++ b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-gdrcopy.yaml b/internal/state/testdata/golden/driver-gdrcopy.yaml index cd56e8a93..0e1232f16 100644 --- a/internal/state/testdata/golden/driver-gdrcopy.yaml +++ b/internal/state/testdata/golden/driver-gdrcopy.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-gds.yaml b/internal/state/testdata/golden/driver-gds.yaml index b14b03af3..6d7b523b6 100644 --- a/internal/state/testdata/golden/driver-gds.yaml +++ b/internal/state/testdata/golden/driver-gds.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-minimal.yaml b/internal/state/testdata/golden/driver-minimal.yaml index 890a40cee..265ce4d4b 100644 --- a/internal/state/testdata/golden/driver-minimal.yaml +++ b/internal/state/testdata/golden/driver-minimal.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml index f979ac36e..1009c10d9 100644 --- a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml +++ b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-precompiled.yaml b/internal/state/testdata/golden/driver-precompiled.yaml index e6a37c48d..85c126f7e 100644 --- a/internal/state/testdata/golden/driver-precompiled.yaml +++ b/internal/state/testdata/golden/driver-precompiled.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml index e29367438..292808d84 100644 --- a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml +++ b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-rdma.yaml b/internal/state/testdata/golden/driver-rdma.yaml index 2efe95107..36ebfb36c 100644 --- a/internal/state/testdata/golden/driver-rdma.yaml +++ b/internal/state/testdata/golden/driver-rdma.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-secret-env.yaml b/internal/state/testdata/golden/driver-secret-env.yaml index 8b2c277a5..ab767d741 100644 --- a/internal/state/testdata/golden/driver-secret-env.yaml +++ b/internal/state/testdata/golden/driver-secret-env.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml index 7e61f189f..1bc4ccd25 100644 --- a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml +++ b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml index 6df18ad31..072052627 100644 --- a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml +++ b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml b/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml index 66e04a502..ec0e41a5b 100644 --- a/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml +++ b/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-vgpu-licensing.yaml b/internal/state/testdata/golden/driver-vgpu-licensing.yaml index 6d95d1c09..dbb26b457 100644 --- a/internal/state/testdata/golden/driver-vgpu-licensing.yaml +++ b/internal/state/testdata/golden/driver-vgpu-licensing.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/manifests/state-driver/0400_configmap.yaml b/manifests/state-driver/0400_configmap.yaml index 55ba3df55..34802a6d5 100644 --- a/manifests/state-driver/0400_configmap.yaml +++ b/manifests/state-driver/0400_configmap.yaml @@ -26,8 +26,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"