From 77b559fac655757ca0bb80971402bc03f053d589 Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 21:43:12 -0400
Subject: [PATCH 01/12] minor

---
 .gitignore                                   |  3 ++
 conf/common/system/badger_slurm_cluster.toml | 30 ++++++++++++++++
 conf/common/test_scenario/sleep.toml         | 38 ++++++++++----------
 experiment/ray_test/ray_test_job.py          |  6 ++++
 experiment/ray_test/stderr.txt               |  2 ++
 experiment/ray_test/stdout.txt               | 36 +++++++++++++++++++
 experiment/ray_test/test.sh                  | 31 ++++++++++++++++
 7 files changed, 127 insertions(+), 19 deletions(-)
 create mode 100644 conf/common/system/badger_slurm_cluster.toml
 create mode 100644 experiment/ray_test/ray_test_job.py
 create mode 100644 experiment/ray_test/stderr.txt
 create mode 100644 experiment/ray_test/stdout.txt
 create mode 100644 experiment/ray_test/test.sh

diff --git a/.gitignore b/.gitignore
index 092d3741e..ee5d8e77b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -89,3 +89,6 @@ Thumbs.db
 install/
 results/
 .*
+
+# conda
+env/
\ No newline at end of file
diff --git a/conf/common/system/badger_slurm_cluster.toml b/conf/common/system/badger_slurm_cluster.toml
new file mode 100644
index 000000000..ef289cbc0
--- /dev/null
+++ b/conf/common/system/badger_slurm_cluster.toml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "badger"
+scheduler = "slurm"
+
+install_path = "./install_dir"
+output_path = "./results"
+default_partition = "cuda"
+
+mpi = "pmi2"
+gpus_per_node = 4
+ntasks_per_node = 4
+monitor_interval = 2
+
+[[partitions]]
+name = "cuda"
diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml
index daca397c4..3e25c9a02 100644
--- a/conf/common/test_scenario/sleep.toml
+++ b/conf/common/test_scenario/sleep.toml
@@ -18,25 +18,25 @@ name = "test_scenario_example"
 
 [[Tests]]
 id = "Tests.1"
-test_name = "sleep_10"
-
-[[Tests]]
-id = "Tests.2"
 test_name = "sleep_5"
-  [[Tests.dependencies]]
-  type = "start_post_init"
-  id = "Tests.1"
 
-[[Tests]]
-id = "Tests.3"
-test_name = "sleep_5"
-  [[Tests.dependencies]]
-  type = "start_post_comp"
-  id = "Tests.1"
+# [[Tests]]
+# id = "Tests.2"
+# test_name = "sleep_5"
+#   [[Tests.dependencies]]
+#   type = "start_post_init"
+#   id = "Tests.1"
 
-[[Tests]]
-id = "Tests.4"
-test_name = "sleep_20"
-  [[Tests.dependencies]]
-  type = "end_post_comp"
-  id = "Tests.1"
+# [[Tests]]
+# id = "Tests.3"
+# test_name = "sleep_5"
+#   [[Tests.dependencies]]
+#   type = "start_post_comp"
+#   id = "Tests.1"
+
+# [[Tests]]
+# id = "Tests.4"
+# test_name = "sleep_20"
+#   [[Tests.dependencies]]
+#   type = "end_post_comp"
+#   id = "Tests.1"
diff --git a/experiment/ray_test/ray_test_job.py b/experiment/ray_test/ray_test_job.py
new file mode 100644
index 000000000..f58034824
--- /dev/null
+++ b/experiment/ray_test/ray_test_job.py
@@ -0,0 +1,6 @@
+import sys
+import ray
+
+ray.init(address="auto")
+
+print(f"Available resources: {ray.available_resources()}", flush=True)
diff --git a/experiment/ray_test/stderr.txt b/experiment/ray_test/stderr.txt
new file mode 100644
index 000000000..f4c8e3362
--- /dev/null
+++ b/experiment/ray_test/stderr.txt
@@ -0,0 +1,2 @@
+2025-03-11 21:38:54,163	INFO worker.py:1654 -- Connecting to existing Ray cluster at address: 130.207.125.88:6379...
+2025-03-11 21:38:54,174	INFO worker.py:1841 -- Connected to Ray cluster.
diff --git a/experiment/ray_test/stdout.txt b/experiment/ray_test/stdout.txt
new file mode 100644
index 000000000..ce5d599b4
--- /dev/null
+++ b/experiment/ray_test/stdout.txt
@@ -0,0 +1,36 @@
+Starting Ray head node on badger
+2025-03-11 21:38:23,038	INFO usage_lib.py:467 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.
+2025-03-11 21:38:23,038	INFO scripts.py:865 -- Local node IP: 130.207.125.88
+2025-03-11 21:38:23,826	SUCC scripts.py:902 -- --------------------
+2025-03-11 21:38:23,826	SUCC scripts.py:903 -- Ray runtime started.
+2025-03-11 21:38:23,826	SUCC scripts.py:904 -- --------------------
+2025-03-11 21:38:23,826	INFO scripts.py:906 -- Next steps
+2025-03-11 21:38:23,826	INFO scripts.py:909 -- To add another node to this Ray cluster, run
+2025-03-11 21:38:23,826	INFO scripts.py:912 --   ray start --address='130.207.125.88:6379'
+2025-03-11 21:38:23,826	INFO scripts.py:921 -- To connect to this Ray cluster:
+2025-03-11 21:38:23,826	INFO scripts.py:923 -- import ray
+2025-03-11 21:38:23,826	INFO scripts.py:924 -- ray.init()
+2025-03-11 21:38:23,826	INFO scripts.py:955 -- To terminate the Ray runtime, run
+2025-03-11 21:38:23,826	INFO scripts.py:956 --   ray stop
+2025-03-11 21:38:23,826	INFO scripts.py:959 -- To view the status of the cluster, use
+2025-03-11 21:38:23,826	INFO scripts.py:960 --   ray status
+Available resources: {'memory': 591011351552.0, 'CPU': 8.0, 'object_store_memory': 200000000000.0, 'node:__internal_head__': 1.0, 'node:130.207.125.88': 1.0}
+======== Autoscaler status: 2025-03-11 21:38:53.521190 ========
+Node status
+---------------------------------------------------------------
+Active:
+ 1 node_940a89a547ed955042d438039d8e0d126d8baedc5444f007a4d1bfc8
+Pending:
+ (no pending nodes)
+Recent failures:
+ (no failures)
+
+Resources
+---------------------------------------------------------------
+Usage:
+ 0.0/8.0 CPU
+ 0B/550.42GiB memory
+ 0B/186.26GiB object_store_memory
+
+Demands:
+ (no resource demands)
diff --git a/experiment/ray_test/test.sh b/experiment/ray_test/test.sh
new file mode 100644
index 000000000..79bf37509
--- /dev/null
+++ b/experiment/ray_test/test.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#SBATCH --job-name=ray_docker
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --output=stdout.txt
+#SBATCH --error=stderr.txt
+
+# Get node information
+nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST)
+nodes_array=($nodes)
+head_node=${nodes_array[0]}
+port=6379
+
+
+source /nethome/aagrawal360/mambaforge/etc/profile.d/conda.sh
+source /nethome/aagrawal360/mambaforge/etc/profile.d/mamba.sh
+mamba activate /home/aagrawal360/repos/cloudai/env
+
+if [ "$SLURMD_NODENAME" == "$head_node" ]; then
+    echo "Starting Ray head node on $head_node"
+    ray start --head --port=$port --num-cpus=$SLURM_CPUS_PER_TASK
+    sleep 30
+    python /home/aagrawal360/repos/cloudai/experiment/ray_test/ray_test_job.py
+    ray status
+else
+    echo "Starting Ray worker on $SLURMD_NODENAME"
+    ray start --address=${head_node}:${port} --num-cpus=$SLURM_CPUS_PER_TASK
+fi
+
+sleep 30
\ No newline at end of file

From 2c352e83621b2f8fc14bf7fb6503a2562728146b Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 22:55:48 -0400
Subject: [PATCH 02/12] minor

---
 .../test/slurm_ray_container_hello_world.toml | 26 +++++++
 conf/common/test_scenario/sleep.toml          | 36 +++++-----
 .../test_scenario/slurm_ray_container.toml    | 22 ++++++
 src/cloudai/__init__.py                       |  5 ++
 src/cloudai/_core/test_scenario_parser.py     |  2 +
 .../strategy/slurm_command_gen_strategy.py    | 67 +++++++++++++------
 .../workloads/slurm_ray_container/__init__.py | 26 +++++++
 .../report_generation_strategy.py             | 29 ++++++++
 .../slurm_command_gen_strategy.py             | 62 +++++++++++++++++
 .../slurm_ray_container.py                    | 52 ++++++++++++++
 .../slurm_ray_container_template.sh.jinja     | 29 ++++++++
 11 files changed, 317 insertions(+), 39 deletions(-)
 create mode 100644 conf/common/test/slurm_ray_container_hello_world.toml
 create mode 100644 conf/common/test_scenario/slurm_ray_container.toml
 create mode 100644 src/cloudai/workloads/slurm_ray_container/__init__.py
 create mode 100644 src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py
 create mode 100644 src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
 create mode 100644 src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py
 create mode 100644 src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja

diff --git a/conf/common/test/slurm_ray_container_hello_world.toml b/conf/common/test/slurm_ray_container_hello_world.toml
new file mode 100644
index 000000000..2a995c04d
--- /dev/null
+++ b/conf/common/test/slurm_ray_container_hello_world.toml
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "slurm_ray_container_hello_world"
+description = "Hello World"
+test_template_name = "SlurmRayContainer"
+
+[cmd_args]
+docker_image_url = "nvcr.io/nvidia/nemo:24.12.rc3"
+conda_env = "nemo"
+cmd = "bash -c 'echo Hello World'"
+
+
diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml
index 3e25c9a02..94e850850 100644
--- a/conf/common/test_scenario/sleep.toml
+++ b/conf/common/test_scenario/sleep.toml
@@ -20,23 +20,23 @@ name = "test_scenario_example"
 id = "Tests.1"
 test_name = "sleep_5"
 
-# [[Tests]]
-# id = "Tests.2"
-# test_name = "sleep_5"
-#   [[Tests.dependencies]]
-#   type = "start_post_init"
-#   id = "Tests.1"
+[[Tests]]
+id = "Tests.2"
+test_name = "sleep_5"
+  [[Tests.dependencies]]
+  type = "start_post_init"
+  id = "Tests.1"
 
-# [[Tests]]
-# id = "Tests.3"
-# test_name = "sleep_5"
-#   [[Tests.dependencies]]
-#   type = "start_post_comp"
-#   id = "Tests.1"
+[[Tests]]
+id = "Tests.3"
+test_name = "sleep_5"
+  [[Tests.dependencies]]
+  type = "start_post_comp"
+  id = "Tests.1"
 
-# [[Tests]]
-# id = "Tests.4"
-# test_name = "sleep_20"
-#   [[Tests.dependencies]]
-#   type = "end_post_comp"
-#   id = "Tests.1"
+[[Tests]]
+id = "Tests.4"
+test_name = "sleep_20"
+  [[Tests.dependencies]]
+  type = "end_post_comp"
+  id = "Tests.1"
diff --git a/conf/common/test_scenario/slurm_ray_container.toml b/conf/common/test_scenario/slurm_ray_container.toml
new file mode 100644
index 000000000..b58510998
--- /dev/null
+++ b/conf/common/test_scenario/slurm_ray_container.toml
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "slurm_ray_container_example"
+
+[[Tests]]
+id = "Tests.1"
+test_name = "slurm_ray_container_hello_world"
+num_nodes = "2"
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
index 4c6d1210c..1e6f6f13c 100644
--- a/src/cloudai/__init__.py
+++ b/src/cloudai/__init__.py
@@ -97,6 +97,7 @@
     SleepTestDefinition,
 )
 from .workloads.slurm_container import SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition
+from .workloads.slurm_ray_container import SlurmRayContainerCommandGenStrategy, SlurmRayContainerTestDefinition
 from .workloads.ucc_test import (
     UCCTestDefinition,
     UCCTestGradingStrategy,
@@ -207,6 +208,9 @@
 Registry().add_strategy(
     CommandGenStrategy, [SlurmSystem], [SlurmContainerTestDefinition], SlurmContainerCommandGenStrategy
 )
+Registry().add_strategy(
+    CommandGenStrategy, [SlurmSystem], [SlurmRayContainerTestDefinition], SlurmRayContainerCommandGenStrategy
+)
 
 Registry().add_installer("slurm", SlurmInstaller)
 Registry().add_installer("standalone", StandaloneInstaller)
@@ -226,6 +230,7 @@
 Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition)
 Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
 Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
+Registry().add_test_definition("SlurmRayContainer", SlurmRayContainerTestDefinition)
 Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)
 
 Registry().add_agent("grid_search", GridSearchAgent)
diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py
index 76c1d1dc1..cffac9b29 100644
--- a/src/cloudai/_core/test_scenario_parser.py
+++ b/src/cloudai/_core/test_scenario_parser.py
@@ -36,6 +36,7 @@
 from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition
 from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
 from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
+from cloudai.workloads.slurm_ray_container import SlurmRayContainerReportGenerationStrategy, SlurmRayContainerTestDefinition
 from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy
 
 from .exceptions import TestScenarioParsingError, format_validation_error
@@ -54,6 +55,7 @@
     NemotronTestDefinition: {JaxToolboxReportGenerationStrategy},
     SleepTestDefinition: {SleepReportGenerationStrategy},
     SlurmContainerTestDefinition: {SlurmContainerReportGenerationStrategy},
+    SlurmRayContainerTestDefinition: {SlurmRayContainerReportGenerationStrategy},
     UCCTestDefinition: {UCCTestReportGenerationStrategy},
 }
 
diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
index 2f57ab802..62135b42b 100644
--- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
+++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
@@ -310,6 +310,46 @@ def _write_sbatch_script(
 
         return f"sbatch {batch_script_path}"
 
+    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
+        """
+        Collect the sbatch options for a job as a mapping of long-option name to value.
+
+        Keys use sbatch's hyphenated long-option spelling because the caller emits ``#SBATCH --<key>[=<value>]``.
+
+        Args:
+            args (Dict[str, Any]): Slurm arguments: requires "node_list_str"; honours "output", "error", "time_limit".
+            output_path (Path): Directory that receives the default stdout.txt and stderr.txt.
+
+        Returns:
+            Dict[str, str]: Option name (no leading ``--``) to value; an empty value marks a bare flag.
+        """
+        sbatch_directives: Dict[str, str] = {}
+        if "output" not in args:
+            sbatch_directives["output"] = f"{output_path / 'stdout.txt'}"
+        if "error" not in args:
+            sbatch_directives["error"] = f"{output_path / 'stderr.txt'}"
+        sbatch_directives["partition"] = self.system.default_partition
+        if args["node_list_str"]:
+            sbatch_directives["nodelist"] = args["node_list_str"]
+        if self.system.account:
+            sbatch_directives["account"] = self.system.account
+        if self.system.distribution:
+            sbatch_directives["distribution"] = self.system.distribution
+        if self.system.gpus_per_node:
+            sbatch_directives["gpus-per-node"] = str(self.system.gpus_per_node)
+            sbatch_directives["gres"] = f"gpu:{self.system.gpus_per_node}"
+        if self.system.ntasks_per_node:
+            sbatch_directives["ntasks-per-node"] = str(self.system.ntasks_per_node)
+        if "time_limit" in args:
+            sbatch_directives["time"] = args["time_limit"]
+
+        for arg in self.system.extra_sbatch_args:
+            # Split on the first "=" only, so a value such as "ALL,FOO=bar" stays intact.
+            key, _, value = arg.strip().partition("=")
+            sbatch_directives[key[2:] if key.startswith("--") else key] = value
+
+        return sbatch_directives
+
     def _append_sbatch_directives(
         self, batch_script_content: List[str], args: Dict[str, Any], output_path: Path
     ) -> None:
@@ -322,28 +362,13 @@ def _append_sbatch_directives(
             output_path (Path): Output directory for script and logs.
         """
         batch_script_content = self._add_reservation(batch_script_content)
+        sbatch_directives = self._get_sbatch_directives(args, output_path)
 
-        if "output" not in args:
-            batch_script_content.append(f"#SBATCH --output={output_path / 'stdout.txt'}")
-        if "error" not in args:
-            batch_script_content.append(f"#SBATCH --error={output_path / 'stderr.txt'}")
-        batch_script_content.append(f"#SBATCH --partition={self.system.default_partition}")
-        if args["node_list_str"]:
-            batch_script_content.append(f"#SBATCH --nodelist={args['node_list_str']}")
-        if self.system.account:
-            batch_script_content.append(f"#SBATCH --account={self.system.account}")
-        if self.system.distribution:
-            batch_script_content.append(f"#SBATCH --distribution={self.system.distribution}")
-        if self.system.gpus_per_node:
-            batch_script_content.append(f"#SBATCH --gpus-per-node={self.system.gpus_per_node}")
-            batch_script_content.append(f"#SBATCH --gres=gpu:{self.system.gpus_per_node}")
-        if self.system.ntasks_per_node:
-            batch_script_content.append(f"#SBATCH --ntasks-per-node={self.system.ntasks_per_node}")
-        if "time_limit" in args:
-            batch_script_content.append(f"#SBATCH --time={args['time_limit']}")
-
-        for arg in self.system.extra_sbatch_args:
-            batch_script_content.append(f"#SBATCH {arg}")
+        for key, value in sbatch_directives.items():
+            if value:
+                batch_script_content.append(f"#SBATCH --{key}={value}")
+            else:
+                batch_script_content.append(f"#SBATCH --{key}")
 
         batch_script_content.append(
             "\nexport SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)"
diff --git a/src/cloudai/workloads/slurm_ray_container/__init__.py b/src/cloudai/workloads/slurm_ray_container/__init__.py
new file mode 100644
index 000000000..875c47bb0
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/__init__.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .report_generation_strategy import SlurmRayContainerReportGenerationStrategy
+from .slurm_command_gen_strategy import SlurmRayContainerCommandGenStrategy
+from .slurm_ray_container import SlurmRayContainerCmdArgs, SlurmRayContainerTestDefinition
+
+__all__ = [
+    "SlurmRayContainerCmdArgs",
+    "SlurmRayContainerCommandGenStrategy",
+    "SlurmRayContainerReportGenerationStrategy",
+    "SlurmRayContainerTestDefinition",
+]
diff --git a/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py
new file mode 100644
index 000000000..1646f1434
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+from cloudai import ReportGenerationStrategy
+
+
+class SlurmRayContainerReportGenerationStrategy(ReportGenerationStrategy):
+    """Placeholder report strategy for Slurm Ray container tests: claims no directory and writes no report."""
+
+    def can_handle_directory(self) -> bool:
+        return False
+
+    def generate_report(self) -> None:
+        return None
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
new file mode 100644
index 000000000..fee895389
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Union, cast
+
+from jinja2 import Template
+
+from cloudai import TestRun
+from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy
+
+from .slurm_ray_container import SlurmRayContainerTestDefinition
+
+
+class SlurmRayContainerCommandGenStrategy(SlurmContainerCommandGenStrategy):
+    """Command generation strategy for Ray workloads running in containers on Slurm."""
+
+    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
+        """
+        Return the base sbatch directives, forced to one task per node with exclusive node access.
+
+        The launch template starts Ray through its own per-node ``srun`` calls, so the inherited
+        tasks-per-node value is replaced instead of being emitted next to a second, conflicting directive.
+        """
+        sbatch_directives = super()._get_sbatch_directives(args, output_path)
+        # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu
+        sbatch_directives.pop("ntasks_per_node", None)  # drop an underscore-spelled inherited key, if any
+        sbatch_directives["ntasks-per-node"] = "1"
+        sbatch_directives["exclusive"] = ""
+        return sbatch_directives
+
+    def generate_test_command(
+        self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
+    ) -> list[str]:
+        """Render the Ray launch template around the test command and return it as a single-element list."""
+        tdef: SlurmRayContainerTestDefinition = cast(SlurmRayContainerTestDefinition, tr.test.test_definition)
+        srun_command_parts: list[str] = [*super().gen_nsys_command(tr), tdef.cmd_args.cmd]
+        if tr.test.extra_cmd_args:
+            srun_command_parts.append(tr.test.extra_cmd_args)
+
+        # The template file lives in the same directory as this module.
+        template_path = Path(__file__).parent / "slurm_ray_container_template.sh.jinja"
+        rendered_template = Template(template_path.read_text()).render(
+            conda_env=tdef.cmd_args.conda_env, command=" ".join(srun_command_parts)
+        )
+
+        return [rendered_template]
+
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py
new file mode 100644
index 000000000..24f15176a
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py
@@ -0,0 +1,52 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from cloudai import CmdArgs, DockerImage, Installable, TestDefinition
+
+
+class SlurmRayContainerCmdArgs(CmdArgs):
+    """Command line arguments for a Slurm Ray container test."""
+
+    docker_image_url: str  # URL of the container image; wrapped in a DockerImage by the test definition
+    conda_env: str  # conda environment activated at the top of the launch template
+    cmd: str  # command rendered into the launch template after the Ray head/worker start-up
+
+
+class SlurmRayContainerTestDefinition(TestDefinition):
+    """Test definition for a Ray workload launched inside a container on Slurm."""
+
+    cmd_args: SlurmRayContainerCmdArgs
+
+    _docker_image: Optional[DockerImage] = None
+
+    @property
+    def docker_image(self) -> DockerImage:
+        """Return the image for `cmd_args.docker_image_url`, building it on first access and reusing it after."""
+        image = self._docker_image or DockerImage(url=self.cmd_args.docker_image_url)
+        self._docker_image = image
+        return image
+
+    @property
+    def installables(self) -> list[Installable]:
+        """Return the container image followed by every entry of `git_repos`."""
+        return [self.docker_image, *self.git_repos]
+
+    @property
+    def extra_args_str(self) -> str:
+        """Join `extra_cmd_args` into "key value" pairs separated by spaces; a falsy value yields the bare key."""
+        return " ".join(f"{k} {v}" if v else k for k, v in self.extra_cmd_args.items())
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
new file mode 100644
index 000000000..eeeb08fec
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
@@ -0,0 +1,29 @@
+conda activate {{ conda_env }}
+# NOTE(review): head_node, head_node_ip and nodes_array are used below but never set in this template; presumably the enclosing sbatch script defines them - confirm.
+port=6379
+ip_head=$head_node_ip:$port
+export ip_head
+echo "IP Head: $ip_head"
+# NOTE(review): SLURM_CPUS_PER_TASK / SLURM_GPUS_PER_TASK are presumably only set when the job requests --cpus-per-task / --gpus-per-task; confirm the generated sbatch header does.
+echo "Starting HEAD at $head_node"
+srun --nodes=1 --ntasks=1 -w "$head_node" \
+    ray start --head --node-ip-address="$head_node_ip" --port=$port \
+    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
+
+# optional, though may be useful in certain versions of Ray < 1.0.
+sleep 10
+
+# number of nodes other than the head node
+worker_num=$((SLURM_JOB_NUM_NODES - 1))
+
+for ((i = 1; i <= worker_num; i++)); do
+    node_i=${nodes_array[$i]}
+    echo "Starting WORKER $i at $node_i"
+    srun --nodes=1 --ntasks=1 -w "$node_i" \
+        ray start --address "$ip_head" \
+        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
+    sleep 5
+done
+
+{{ command }}
+

From 0978f5edf14b7f880533ec812f76a032ba444e7a Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 22:57:33 -0400
Subject: [PATCH 03/12] minor

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index ddaf06e25..973fef73c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ tbparse==0.0.8
 toml==0.10.2
 kubernetes==30.1.0
 pydantic==2.8.2
+jinja2==3.1.3

From a41de4eee175ad5607523db4507c843a76f0bb87 Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 22:57:47 -0400
Subject: [PATCH 04/12] minor

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 973fef73c..c585ee021 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ tbparse==0.0.8
 toml==0.10.2
 kubernetes==30.1.0
 pydantic==2.8.2
-jinja2==3.1.3
+jinja2==3.1.6

From a2321aa3281ceca6c84125b3a1aa903d386b1c28 Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 22:59:58 -0400
Subject: [PATCH 05/12] minor

---
 experiment/ray_test/ray_test_job.py |  6 -----
 experiment/ray_test/stderr.txt      |  2 --
 experiment/ray_test/stdout.txt      | 36 -----------------------------
 experiment/ray_test/test.sh         | 31 -------------------------
 4 files changed, 75 deletions(-)
 delete mode 100644 experiment/ray_test/ray_test_job.py
 delete mode 100644 experiment/ray_test/stderr.txt
 delete mode 100644 experiment/ray_test/stdout.txt
 delete mode 100644 experiment/ray_test/test.sh

diff --git a/experiment/ray_test/ray_test_job.py b/experiment/ray_test/ray_test_job.py
deleted file mode 100644
index f58034824..000000000
--- a/experiment/ray_test/ray_test_job.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import sys
-import ray
-
-ray.init(address="auto")
-
-print(f"Available resources: {ray.available_resources()}", flush=True)
diff --git a/experiment/ray_test/stderr.txt b/experiment/ray_test/stderr.txt
deleted file mode 100644
index f4c8e3362..000000000
--- a/experiment/ray_test/stderr.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-2025-03-11 21:38:54,163	INFO worker.py:1654 -- Connecting to existing Ray cluster at address: 130.207.125.88:6379...
-2025-03-11 21:38:54,174	INFO worker.py:1841 -- Connected to Ray cluster.
diff --git a/experiment/ray_test/stdout.txt b/experiment/ray_test/stdout.txt
deleted file mode 100644
index ce5d599b4..000000000
--- a/experiment/ray_test/stdout.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-Starting Ray head node on badger
-2025-03-11 21:38:23,038	INFO usage_lib.py:467 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.
-2025-03-11 21:38:23,038	INFO scripts.py:865 -- Local node IP: 130.207.125.88
-2025-03-11 21:38:23,826	SUCC scripts.py:902 -- --------------------
-2025-03-11 21:38:23,826	SUCC scripts.py:903 -- Ray runtime started.
-2025-03-11 21:38:23,826	SUCC scripts.py:904 -- --------------------
-2025-03-11 21:38:23,826	INFO scripts.py:906 -- Next steps
-2025-03-11 21:38:23,826	INFO scripts.py:909 -- To add another node to this Ray cluster, run
-2025-03-11 21:38:23,826	INFO scripts.py:912 --   ray start --address='130.207.125.88:6379'
-2025-03-11 21:38:23,826	INFO scripts.py:921 -- To connect to this Ray cluster:
-2025-03-11 21:38:23,826	INFO scripts.py:923 -- import ray
-2025-03-11 21:38:23,826	INFO scripts.py:924 -- ray.init()
-2025-03-11 21:38:23,826	INFO scripts.py:955 -- To terminate the Ray runtime, run
-2025-03-11 21:38:23,826	INFO scripts.py:956 --   ray stop
-2025-03-11 21:38:23,826	INFO scripts.py:959 -- To view the status of the cluster, use
-2025-03-11 21:38:23,826	INFO scripts.py:960 --   ray status
-Available resources: {'memory': 591011351552.0, 'CPU': 8.0, 'object_store_memory': 200000000000.0, 'node:__internal_head__': 1.0, 'node:130.207.125.88': 1.0}
-======== Autoscaler status: 2025-03-11 21:38:53.521190 ========
-Node status
----------------------------------------------------------------
-Active:
- 1 node_940a89a547ed955042d438039d8e0d126d8baedc5444f007a4d1bfc8
-Pending:
- (no pending nodes)
-Recent failures:
- (no failures)
-
-Resources
----------------------------------------------------------------
-Usage:
- 0.0/8.0 CPU
- 0B/550.42GiB memory
- 0B/186.26GiB object_store_memory
-
-Demands:
- (no resource demands)
diff --git a/experiment/ray_test/test.sh b/experiment/ray_test/test.sh
deleted file mode 100644
index 79bf37509..000000000
--- a/experiment/ray_test/test.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=ray_docker
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=4
-#SBATCH --cpus-per-task=8
-#SBATCH --output=stdout.txt
-#SBATCH --error=stderr.txt
-
-# Get node information
-nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST)
-nodes_array=($nodes)
-head_node=${nodes_array[0]}
-port=6379
-
-
-source /nethome/aagrawal360/mambaforge/etc/profile.d/conda.sh
-source /nethome/aagrawal360/mambaforge/etc/profile.d/mamba.sh
-mamba activate /home/aagrawal360/repos/cloudai/env
-
-if [ "$SLURMD_NODENAME" == "$head_node" ]; then
-    echo "Starting Ray head node on $head_node"
-    ray start --head --port=$port --num-cpus=$SLURM_CPUS_PER_TASK
-    sleep 30
-    python /home/aagrawal360/repos/cloudai/experiment/ray_test/ray_test_job.py
-    ray status
-else
-    echo "Starting Ray worker on $SLURMD_NODENAME"
-    ray start --address=${head_node}:${port} --num-cpus=$SLURM_CPUS_PER_TASK
-fi
-
-sleep 30
\ No newline at end of file

From 7d13c4cfdc9b5f1fcd75affce72224d1ca38b821 Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 23:01:58 -0400
Subject: [PATCH 06/12] Remove badger Slurm config; use sleep_10 in sleep scenario

---
 conf/common/system/badger_slurm_cluster.toml | 30 --------------------
 conf/common/test_scenario/sleep.toml         |  2 +-
 2 files changed, 1 insertion(+), 31 deletions(-)
 delete mode 100644 conf/common/system/badger_slurm_cluster.toml

diff --git a/conf/common/system/badger_slurm_cluster.toml b/conf/common/system/badger_slurm_cluster.toml
deleted file mode 100644
index ef289cbc0..000000000
--- a/conf/common/system/badger_slurm_cluster.toml
+++ /dev/null
@@ -1,30 +0,0 @@
-# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name = "badger"
-scheduler = "slurm"
-
-install_path = "./install_dir"
-output_path = "./results"
-default_partition = "cuda"
-
-mpi = "pmi2"
-gpus_per_node = 4
-ntasks_per_node = 4
-monitor_interval = 2
-
-[[partitions]]
-name = "cuda"
diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml
index 94e850850..daca397c4 100644
--- a/conf/common/test_scenario/sleep.toml
+++ b/conf/common/test_scenario/sleep.toml
@@ -18,7 +18,7 @@ name = "test_scenario_example"
 
 [[Tests]]
 id = "Tests.1"
-test_name = "sleep_5"
+test_name = "sleep_10"
 
 [[Tests]]
 id = "Tests.2"

From 9185db91e79549d61550734ce728e2530162b778 Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 23:13:17 -0400
Subject: [PATCH 07/12] Add SlurmRayContainer to README and tests

---
 README.md                   |  1 +
 tests/test_acceptance.py    | 17 +++++++++++++++++
 tests/test_init.py          |  8 ++++++++
 tests/test_test_scenario.py |  2 ++
 4 files changed, 28 insertions(+)

diff --git a/README.md b/README.md
index 324ac70b8..8c0b30a0c 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ These schemas enable CloudAI to be flexible and compatible with different system
 |Sleep|✅|✅|✅|
 |UCC|✅|❌|❌|
 |SlurmContainer|✅|❌|❌|
+|SlurmRayContainer|✅|❌|❌|
 |MegatronRun (experimental)|✅|❌|❌|
 
 
diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py
index cd7d0648d..b570708ad 100644
--- a/tests/test_acceptance.py
+++ b/tests/test_acceptance.py
@@ -51,6 +51,11 @@
     SlurmContainerCommandGenStrategy,
     SlurmContainerTestDefinition,
 )
+from cloudai.workloads.slurm_ray_container import (
+    SlurmRayContainerCmdArgs,
+    SlurmRayContainerCommandGenStrategy,
+    SlurmRayContainerTestDefinition,
+)
 from cloudai.workloads.ucc_test import UCCCmdArgs, UCCTestDefinition, UCCTestSlurmCommandGenStrategy
 
 SLURM_TEST_SCENARIOS = [
@@ -261,6 +266,18 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
             ),
             SlurmContainerCommandGenStrategy,
         ),
+        "slurm_ray_container": lambda: create_test_run(
+            partial_tr,
+            slurm_system,
+            "slurm_ray_container",
+            SlurmRayContainerTestDefinition(
+                name="slurm_ray_container",
+                description="slurm_ray_container",
+                test_template_name="slurm_ray_container",
+                cmd_args=SlurmRayContainerCmdArgs(docker_image_url="https://docker/url", cmd="pwd ; ls", conda_env="test"),
+            ),
+            SlurmRayContainerCommandGenStrategy,
+        ),
         "megatron-run": lambda: create_test_run(
             partial_tr,
             slurm_system,
diff --git a/tests/test_init.py b/tests/test_init.py
index bb3dd95d0..b0ae7ea01 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -66,6 +66,10 @@
     SleepTestDefinition,
 )
 from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition
+from cloudai.workloads.slurm_ray_container import (
+    SlurmRayContainerCommandGenStrategy,
+    SlurmRayContainerTestDefinition,
+)
 from cloudai.workloads.ucc_test import (
     UCCTestDefinition,
     UCCTestGradingStrategy,
@@ -99,6 +103,7 @@ def test_runners():
     (CommandGenStrategy, SlurmSystem, NemotronTestDefinition): JaxToolboxSlurmCommandGenStrategy,
     (CommandGenStrategy, SlurmSystem, SleepTestDefinition): SleepSlurmCommandGenStrategy,
     (CommandGenStrategy, SlurmSystem, SlurmContainerTestDefinition): SlurmContainerCommandGenStrategy,
+    (CommandGenStrategy, SlurmSystem, SlurmRayContainerTestDefinition): SlurmRayContainerCommandGenStrategy,
     (CommandGenStrategy, SlurmSystem, UCCTestDefinition): UCCTestSlurmCommandGenStrategy,
     (CommandGenStrategy, SlurmSystem, MegatronRunTestDefinition): MegatronRunSlurmCommandGenStrategy,
     (CommandGenStrategy, StandaloneSystem, SleepTestDefinition): SleepStandaloneCommandGenStrategy,
@@ -119,6 +124,7 @@ def test_runners():
     (JobIdRetrievalStrategy, SlurmSystem, NemotronTestDefinition): SlurmJobIdRetrievalStrategy,
     (JobIdRetrievalStrategy, SlurmSystem, SleepTestDefinition): SlurmJobIdRetrievalStrategy,
     (JobIdRetrievalStrategy, SlurmSystem, SlurmContainerTestDefinition): SlurmJobIdRetrievalStrategy,
+    (JobIdRetrievalStrategy, SlurmSystem, SlurmRayContainerTestDefinition): SlurmJobIdRetrievalStrategy,
     (JobIdRetrievalStrategy, SlurmSystem, UCCTestDefinition): SlurmJobIdRetrievalStrategy,
     (JobIdRetrievalStrategy, SlurmSystem, MegatronRunTestDefinition): SlurmJobIdRetrievalStrategy,
     (JobIdRetrievalStrategy, StandaloneSystem, SleepTestDefinition): StandaloneJobIdRetrievalStrategy,
@@ -133,6 +139,7 @@ def test_runners():
     (JobStatusRetrievalStrategy, SlurmSystem, NemotronTestDefinition): JaxToolboxJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, SlurmSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, SlurmSystem, SlurmContainerTestDefinition): DefaultJobStatusRetrievalStrategy,
+    (JobStatusRetrievalStrategy, SlurmSystem, SlurmRayContainerTestDefinition): DefaultJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, SlurmSystem, UCCTestDefinition): DefaultJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, SlurmSystem, MegatronRunTestDefinition): DefaultJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, StandaloneSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy,
@@ -177,6 +184,7 @@ def test_definitions():
         ("JaxToolboxGrok", GrokTestDefinition),
         ("JaxToolboxNemotron", NemotronTestDefinition),
         ("SlurmContainer", SlurmContainerTestDefinition),
+        ("SlurmRayContainer", SlurmRayContainerTestDefinition),
         ("MegatronRun", MegatronRunTestDefinition),
     ]:
         assert test_defs[tdef[0]] == tdef[1]
diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py
index aeab2e0c3..6bd0c51df 100644
--- a/tests/test_test_scenario.py
+++ b/tests/test_test_scenario.py
@@ -44,6 +44,7 @@
 from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition
 from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
 from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
+from cloudai.workloads.slurm_ray_container import SlurmRayContainerReportGenerationStrategy, SlurmRayContainerTestDefinition
 from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy
 from tests.conftest import MyTestDefinition
 
@@ -293,6 +294,7 @@ def test_default_reporters_size(self):
             (NemotronTestDefinition, {JaxToolboxReportGenerationStrategy}),
             (SleepTestDefinition, {SleepReportGenerationStrategy}),
             (SlurmContainerTestDefinition, {SlurmContainerReportGenerationStrategy}),
+            (SlurmRayContainerTestDefinition, {SlurmRayContainerReportGenerationStrategy}),
             (UCCTestDefinition, {UCCTestReportGenerationStrategy}),
         ],
     )

From a01f053e2c928870ece5bccf80224d34695cfea4 Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 23:34:02 -0400
Subject: [PATCH 08/12] Fix CI: lint/format, register SlurmRayContainer, add jinja2 dependency

---
 .../test/slurm_ray_container_hello_world.toml |  2 +-
 conf/common/test_scenario/sleep.toml          |  2 +-
 .../test_scenario/slurm_ray_container.toml    |  2 +-
 pyproject.toml                                |  1 +
 src/cloudai/__init__.py                       |  2 ++
 src/cloudai/_core/test_scenario_parser.py     |  5 +++-
 .../strategy/slurm_command_gen_strategy.py    | 29 +++++++++----------
 .../report_generation_strategy.py             |  1 -
 .../slurm_command_gen_strategy.py             | 14 ++++-----
 tests/test_acceptance.py                      |  4 ++-
 tests/test_init.py                            |  2 +-
 tests/test_test_scenario.py                   |  7 +++--
 12 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/conf/common/test/slurm_ray_container_hello_world.toml b/conf/common/test/slurm_ray_container_hello_world.toml
index 2a995c04d..3e159dd48 100644
--- a/conf/common/test/slurm_ray_container_hello_world.toml
+++ b/conf/common/test/slurm_ray_container_hello_world.toml
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml
index daca397c4..2d3b06597 100644
--- a/conf/common/test_scenario/sleep.toml
+++ b/conf/common/test_scenario/sleep.toml
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/conf/common/test_scenario/slurm_ray_container.toml b/conf/common/test_scenario/slurm_ray_container.toml
index b58510998..1a30864bd 100644
--- a/conf/common/test_scenario/slurm_ray_container.toml
+++ b/conf/common/test_scenario/slurm_ray_container.toml
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/pyproject.toml b/pyproject.toml
index 12b5d2f18..4bb5dfcd8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
   "toml==0.10.2",
   "kubernetes==30.1.0",
   "pydantic==2.8.2",
+  "jinja2==3.1.6",
 ]
   [project.scripts]
   cloudai = "cloudai.__main__:main"
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
index 1e6f6f13c..f33d077fc 100644
--- a/src/cloudai/__init__.py
+++ b/src/cloudai/__init__.py
@@ -157,6 +157,7 @@
         SleepTestDefinition,
         NeMoRunTestDefinition,
         SlurmContainerTestDefinition,
+        SlurmRayContainerTestDefinition,
         MegatronRunTestDefinition,
     ],
     SlurmJobIdRetrievalStrategy,
@@ -192,6 +193,7 @@
         SleepTestDefinition,
         NeMoRunTestDefinition,
         SlurmContainerTestDefinition,
+        SlurmRayContainerTestDefinition,
         MegatronRunTestDefinition,
     ],
     DefaultJobStatusRetrievalStrategy,
diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py
index cffac9b29..ce41debb9 100644
--- a/src/cloudai/_core/test_scenario_parser.py
+++ b/src/cloudai/_core/test_scenario_parser.py
@@ -36,7 +36,10 @@
 from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition
 from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
 from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
-from cloudai.workloads.slurm_ray_container import SlurmRayContainerReportGenerationStrategy, SlurmRayContainerTestDefinition
+from cloudai.workloads.slurm_ray_container import (
+    SlurmRayContainerReportGenerationStrategy,
+    SlurmRayContainerTestDefinition,
+)
 from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy
 
 from .exceptions import TestScenarioParsingError, format_validation_error
diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
index 62135b42b..416401178 100644
--- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
+++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
@@ -310,9 +310,17 @@ def _write_sbatch_script(
 
         return f"sbatch {batch_script_path}"
 
-    def _get_sbatch_directives(
-        self, args: Dict[str, Any], output_path: Path
-    ) -> Dict[str, str]:
+    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
+        """
+        Get the Slurm batch script directives.
+
+        Args:
+            args (Dict[str, Any]): Slurm-specific arguments.
+            output_path (Path): Output directory for script and logs.
+
+        Returns:
+            Dict[str, str]: Dictionary of Slurm batch script directives.
+        """
         sbatch_directives: Dict[str, str] = {}
 
         if "output" not in args:
@@ -336,18 +344,6 @@ def _get_sbatch_directives(
         if "time_limit" in args:
             sbatch_directives["time_limit"] = args["time_limit"]
 
-        for arg in self.system.extra_sbatch_args:
-            arg = arg.strip()
-            # remove -- from the start of the string if present
-            if arg.startswith("--"):
-                arg = arg[2:]
-            # split the string into key and value
-            arg_split = arg.split("=")
-            if len(arg_split) == 2:
-                sbatch_directives[arg_split[0]] = arg_split[1].join("=")
-            else:
-                sbatch_directives[arg] = ""
-
         return sbatch_directives
 
     def _append_sbatch_directives(
@@ -370,6 +366,9 @@ def _append_sbatch_directives(
             else:
                 batch_script_content.append(f"#SBATCH --{key}")
 
+        for arg in self.system.extra_sbatch_args:
+            batch_script_content.append(f"#SBATCH {arg}")
+
         batch_script_content.append(
             "\nexport SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)"
         )
diff --git a/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py
index 1646f1434..d8978e8da 100644
--- a/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py
+++ b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import re
 
 from cloudai import ReportGenerationStrategy
 
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
index fee895389..6887b064b 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 from pathlib import Path
 from typing import Any, Dict, List, Union, cast
 
@@ -28,9 +27,8 @@
 
 class SlurmRayContainerCommandGenStrategy(SlurmContainerCommandGenStrategy):
     """Command generation strategy for generic Slurm container tests."""
-    def _get_sbatch_directives(
-        self, args: Dict[str, Any], output_path: Path
-    ) -> Dict[str, str]:
+
+    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
         sbatch_directives = super()._get_sbatch_directives(args, output_path)
         # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu
         # override tasks per node
@@ -53,10 +51,8 @@ def generate_test_command(
         template = Template(template_path.read_text())
 
         # render the template
-        rendered_template = template.render({
-            "conda_env": tdef.cmd_args.conda_env,
-            "command": " ".join(srun_command_parts)
-        })
+        rendered_template = template.render(
+            {"conda_env": tdef.cmd_args.conda_env, "command": " ".join(srun_command_parts)}
+        )
 
         return [rendered_template]
-
diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py
index b570708ad..31978cb0f 100644
--- a/tests/test_acceptance.py
+++ b/tests/test_acceptance.py
@@ -274,7 +274,9 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
                 name="slurm_ray_container",
                 description="slurm_ray_container",
                 test_template_name="slurm_ray_container",
-                cmd_args=SlurmRayContainerCmdArgs(docker_image_url="https://docker/url", cmd="pwd ; ls", conda_env="test"),
+                cmd_args=SlurmRayContainerCmdArgs(
+                    docker_image_url="https://docker/url", cmd="pwd ; ls", conda_env="test"
+                ),
             ),
             SlurmRayContainerCommandGenStrategy,
         ),
diff --git a/tests/test_init.py b/tests/test_init.py
index b0ae7ea01..0ffcd1afe 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -172,7 +172,7 @@ def test_installers():
 
 def test_definitions():
     test_defs = Registry().test_definitions_map
-    assert len(test_defs) == 11
+    assert len(test_defs) == 12
     for tdef in [
         ("UCCTest", UCCTestDefinition),
         ("NcclTest", NCCLTestDefinition),
diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py
index 6bd0c51df..c2208068f 100644
--- a/tests/test_test_scenario.py
+++ b/tests/test_test_scenario.py
@@ -44,7 +44,10 @@
 from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition
 from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
 from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
-from cloudai.workloads.slurm_ray_container import SlurmRayContainerReportGenerationStrategy, SlurmRayContainerTestDefinition
+from cloudai.workloads.slurm_ray_container import (
+    SlurmRayContainerReportGenerationStrategy,
+    SlurmRayContainerTestDefinition,
+)
 from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy
 from tests.conftest import MyTestDefinition
 
@@ -279,7 +282,7 @@ def test_default(self):
         assert len(reporters) == 0
 
     def test_default_reporters_size(self):
-        assert len(DEFAULT_REPORTERS) == 11
+        assert len(DEFAULT_REPORTERS) == 12
 
     @pytest.mark.parametrize(
         "tdef,expected_reporters",

From e408026c7ea7bba0fec81cac985e1f79d8663563 Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 23:46:21 -0400
Subject: [PATCH 09/12] Pass srun prefix into SlurmRayContainer template

---
 .../slurm_command_gen_strategy.py             | 24 ++++++++++++++++---
 .../slurm_ray_container_template.sh.jinja     |  8 +++----
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
index 6887b064b..2abcc1b64 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
@@ -37,13 +37,27 @@ def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dic
 
         return sbatch_directives
 
+    def _gen_srun_command(
+        self,
+        slurm_args: Dict[str, Any],
+        env_vars: Dict[str, str],
+        cmd_args: Dict[str, Union[str, List[str]]],
+        tr: TestRun,
+    ) -> str:
+        srun_command_parts = self.gen_srun_prefix(slurm_args, tr)
+        nsys_command_parts = super().gen_nsys_command(tr)
+        cmd_args["srun_command_prefix"] = " ".join(srun_command_parts + nsys_command_parts)
+        test_command_parts = self.generate_test_command(env_vars, cmd_args, tr)
+        return " ".join(test_command_parts)
+
     def generate_test_command(
         self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
     ) -> list[str]:
         tdef: SlurmRayContainerTestDefinition = cast(SlurmRayContainerTestDefinition, tr.test.test_definition)
-        srun_command_parts: list[str] = [*super().gen_nsys_command(tr), tdef.cmd_args.cmd]
+
+        command_parts: list[str] = [tdef.cmd_args.cmd]
         if tr.test.extra_cmd_args:
-            srun_command_parts.append(tr.test.extra_cmd_args)
+            command_parts.append(tr.test.extra_cmd_args)
 
         # load the jinja template file which is placed at the same directory as this file
         script_dir = Path(__file__).parent
@@ -52,7 +66,11 @@ def generate_test_command(
 
         # render the template
         rendered_template = template.render(
-            {"conda_env": tdef.cmd_args.conda_env, "command": " ".join(srun_command_parts)}
+            {
+                "conda_env": tdef.cmd_args.conda_env,
+                "command": " ".join(command_parts),
+                "srun_command_prefix": cmd_args["srun_command_prefix"],
+            }
         )
 
         return [rendered_template]
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
index eeeb08fec..530130816 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
@@ -1,12 +1,11 @@
-conda activate {{ conda_env }}
-
 port=6379
 ip_head=$head_node_ip:$port
 export ip_head
 echo "IP Head: $ip_head"
 
 echo "Starting HEAD at $head_node"
-srun --nodes=1 --ntasks=1 -w "$head_node" \
+{{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$head_node" \
+    conda activate {{ conda_env }} && \
     ray start --head --node-ip-address="$head_node_ip" --port=$port \
     --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
 
@@ -19,7 +18,8 @@ worker_num=$((SLURM_JOB_NUM_NODES - 1))
 for ((i = 1; i <= worker_num; i++)); do
     node_i=${nodes_array[$i]}
     echo "Starting WORKER $i at $node_i"
-    srun --nodes=1 --ntasks=1 -w "$node_i" \
+    {{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$node_i" \
+        conda activate {{ conda_env }} && \
         ray start --address "$ip_head" \
         --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
     sleep 5

From f13a597d6f7e5135be6c1a6f801a9e67ce38c06a Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Tue, 11 Mar 2025 23:51:50 -0400
Subject: [PATCH 10/12] Use 2 tasks per node; launch test command via srun on head node

---
 .../slurm_ray_container/slurm_command_gen_strategy.py       | 2 +-
 .../slurm_ray_container_template.sh.jinja                   | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
index 2abcc1b64..31bcfb591 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
@@ -32,7 +32,7 @@ def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dic
         sbatch_directives = super()._get_sbatch_directives(args, output_path)
         # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu
         # override tasks per node
-        sbatch_directives["tasks-per-node"] = "1"
+        sbatch_directives["tasks-per-node"] = "2"
         sbatch_directives["exclusive"] = ""
 
         return sbatch_directives
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
index 530130816..57692f6fb 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
@@ -25,5 +25,7 @@ for ((i = 1; i <= worker_num; i++)); do
     sleep 5
 done
 
-{{ command }}
-
+{{ srun_command_prefix }} --nodes=1 --ntasks=1 \
+  -w "$head_node" --gpus-per-node=0 \
+  conda activate {{ conda_env }} && \
+  {{ command }}

From 7b982ad9d97f328fb418a9a9958ae12f0d2a212f Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Wed, 12 Mar 2025 01:31:04 -0400
Subject: [PATCH 11/12] Make conda_env optional; switch example test to vLLM

---
 ...hello_world.toml => slurm_ray_container_vllm.toml} | 11 ++++-------
 conf/common/test_scenario/slurm_ray_container.toml    |  2 +-
 .../slurm_ray_container/slurm_command_gen_strategy.py |  7 ++++++-
 .../slurm_ray_container/slurm_ray_container.py        |  2 +-
 .../slurm_ray_container_template.sh.jinja             |  6 +++---
 5 files changed, 15 insertions(+), 13 deletions(-)
 rename conf/common/test/{slurm_ray_container_hello_world.toml => slurm_ray_container_vllm.toml} (79%)

diff --git a/conf/common/test/slurm_ray_container_hello_world.toml b/conf/common/test/slurm_ray_container_vllm.toml
similarity index 79%
rename from conf/common/test/slurm_ray_container_hello_world.toml
rename to conf/common/test/slurm_ray_container_vllm.toml
index 3e159dd48..ff3835393 100644
--- a/conf/common/test/slurm_ray_container_hello_world.toml
+++ b/conf/common/test/slurm_ray_container_vllm.toml
@@ -14,13 +14,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name = "slurm_ray_container_hello_world"
-description = "Hello World"
+name = "slurm_ray_container_vllm"
+description = "Run example script with vLLM"
 test_template_name = "SlurmRayContainer"
 
 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/nemo:24.12.rc3"
-conda_env = "nemo"
-cmd = "bash -c 'echo Hello World'"
-
-
+docker_image_url = "vllm/vllm-openai:latest"
+cmd = "python3 examples/offline_inference/llm_engine_example.py -tp 8 -pp 2"
diff --git a/conf/common/test_scenario/slurm_ray_container.toml b/conf/common/test_scenario/slurm_ray_container.toml
index 1a30864bd..73e4506ff 100644
--- a/conf/common/test_scenario/slurm_ray_container.toml
+++ b/conf/common/test_scenario/slurm_ray_container.toml
@@ -18,5 +18,5 @@ name = "slurm_ray_container_example"
 
 [[Tests]]
 id = "Tests.1"
-test_name = "slurm_ray_container_hello_world"
+test_name = "slurm_ray_container_vllm"
 num_nodes = "2"
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
index 31bcfb591..0fa05ddfb 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
@@ -64,10 +64,15 @@ def generate_test_command(
         template_path = script_dir / "slurm_ray_container_template.sh.jinja"
         template = Template(template_path.read_text())
 
+        if tdef.cmd_args.conda_env:
+            conda_activate_command = f"conda activate {tdef.cmd_args.conda_env} && "
+        else:
+            conda_activate_command = ""
+
         # render the template
         rendered_template = template.render(
             {
-                "conda_env": tdef.cmd_args.conda_env,
+                "conda_activate_command": conda_activate_command,
                 "command": " ".join(command_parts),
                 "srun_command_prefix": cmd_args["srun_command_prefix"],
             }
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py
index 24f15176a..742aeb6b2 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py
@@ -23,8 +23,8 @@ class SlurmRayContainerCmdArgs(CmdArgs):
     """Command line arguments for a generic Slurm container test."""
 
     docker_image_url: str
-    conda_env: str
     cmd: str
+    conda_env: Optional[str] = None
 
 
 class SlurmRayContainerTestDefinition(TestDefinition):
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
index 57692f6fb..f22094f46 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
@@ -5,7 +5,7 @@ echo "IP Head: $ip_head"
 
 echo "Starting HEAD at $head_node"
 {{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$head_node" \
-    conda activate {{ conda_env }} && \
+    {{ conda_activate_command }} \
     ray start --head --node-ip-address="$head_node_ip" --port=$port \
     --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
 
@@ -19,7 +19,7 @@ for ((i = 1; i <= worker_num; i++)); do
     node_i=${nodes_array[$i]}
     echo "Starting WORKER $i at $node_i"
     {{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$node_i" \
-        conda activate {{ conda_env }} && \
+        {{ conda_activate_command }} \
         ray start --address "$ip_head" \
         --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
     sleep 5
@@ -27,5 +27,5 @@ done
 
 {{ srun_command_prefix }} --nodes=1 --ntasks=1 \
   -w "$head_node" --gpus-per-node=0 \
-  conda activate {{ conda_env }} && \
+  {{ conda_activate_command }} \
   {{ command }}

From 6c5e9dd9a5b1ea1d0d05291d1b6adaa7a963050a Mon Sep 17 00:00:00 2001
From: Rayyan Shahid <asmit18025@iiitd.ac.in>
Date: Wed, 12 Mar 2025 01:31:41 -0400
Subject: [PATCH 12/12] Collapse conda_activate_command if/else into a conditional expression

---
 .../slurm_ray_container/slurm_command_gen_strategy.py        | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
index 0fa05ddfb..e6419d0f4 100644
--- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
@@ -64,10 +64,7 @@ def generate_test_command(
         template_path = script_dir / "slurm_ray_container_template.sh.jinja"
         template = Template(template_path.read_text())
 
-        if tdef.cmd_args.conda_env:
-            conda_activate_command = f"conda activate {tdef.cmd_args.conda_env} && "
-        else:
-            conda_activate_command = ""
+        conda_activate_command = f"conda activate {tdef.cmd_args.conda_env} && " if tdef.cmd_args.conda_env else ""
 
         # render the template
         rendered_template = template.render(