From 77b559fac655757ca0bb80971402bc03f053d589 Mon Sep 17 00:00:00 2001 From: Rayyan Shahid Date: Tue, 11 Mar 2025 21:43:12 -0400 Subject: [PATCH 01/12] minor --- .gitignore | 3 ++ conf/common/system/badger_slurm_cluster.toml | 30 ++++++++++++++++ conf/common/test_scenario/sleep.toml | 38 ++++++++++---------- experiment/ray_test/ray_test_job.py | 6 ++++ experiment/ray_test/stderr.txt | 2 ++ experiment/ray_test/stdout.txt | 36 +++++++++++++++++++ experiment/ray_test/test.sh | 31 ++++++++++++++++ 7 files changed, 127 insertions(+), 19 deletions(-) create mode 100644 conf/common/system/badger_slurm_cluster.toml create mode 100644 experiment/ray_test/ray_test_job.py create mode 100644 experiment/ray_test/stderr.txt create mode 100644 experiment/ray_test/stdout.txt create mode 100644 experiment/ray_test/test.sh diff --git a/.gitignore b/.gitignore index 092d3741e..ee5d8e77b 100644 --- a/.gitignore +++ b/.gitignore @@ -89,3 +89,6 @@ Thumbs.db install/ results/ .* + +# conda +env/ \ No newline at end of file diff --git a/conf/common/system/badger_slurm_cluster.toml b/conf/common/system/badger_slurm_cluster.toml new file mode 100644 index 000000000..ef289cbc0 --- /dev/null +++ b/conf/common/system/badger_slurm_cluster.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "badger" +scheduler = "slurm" + +install_path = "./install_dir" +output_path = "./results" +default_partition = "cuda" + +mpi = "pmi2" +gpus_per_node = 4 +ntasks_per_node = 4 +monitor_interval = 2 + +[[partitions]] +name = "cuda" diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml index daca397c4..3e25c9a02 100644 --- a/conf/common/test_scenario/sleep.toml +++ b/conf/common/test_scenario/sleep.toml @@ -18,25 +18,25 @@ name = "test_scenario_example" [[Tests]] id = "Tests.1" -test_name = "sleep_10" - -[[Tests]] -id = "Tests.2" test_name = "sleep_5" - [[Tests.dependencies]] - type = "start_post_init" - id = "Tests.1" -[[Tests]] -id = "Tests.3" -test_name = "sleep_5" - [[Tests.dependencies]] - type = "start_post_comp" - id = "Tests.1" +# [[Tests]] +# id = "Tests.2" +# test_name = "sleep_5" +# [[Tests.dependencies]] +# type = "start_post_init" +# id = "Tests.1" -[[Tests]] -id = "Tests.4" -test_name = "sleep_20" - [[Tests.dependencies]] - type = "end_post_comp" - id = "Tests.1" +# [[Tests]] +# id = "Tests.3" +# test_name = "sleep_5" +# [[Tests.dependencies]] +# type = "start_post_comp" +# id = "Tests.1" + +# [[Tests]] +# id = "Tests.4" +# test_name = "sleep_20" +# [[Tests.dependencies]] +# type = "end_post_comp" +# id = "Tests.1" diff --git a/experiment/ray_test/ray_test_job.py b/experiment/ray_test/ray_test_job.py new file mode 100644 index 000000000..f58034824 --- /dev/null +++ b/experiment/ray_test/ray_test_job.py @@ -0,0 +1,6 @@ +import sys +import ray + +ray.init(address="auto") + +print(f"Available resources: {ray.available_resources()}", flush=True) diff --git a/experiment/ray_test/stderr.txt b/experiment/ray_test/stderr.txt new file mode 100644 index 000000000..f4c8e3362 --- /dev/null +++ b/experiment/ray_test/stderr.txt @@ -0,0 +1,2 @@ +2025-03-11 21:38:54,163 INFO worker.py:1654 -- Connecting to existing Ray cluster at address: 130.207.125.88:6379... 
+2025-03-11 21:38:54,174 INFO worker.py:1841 -- Connected to Ray cluster. diff --git a/experiment/ray_test/stdout.txt b/experiment/ray_test/stdout.txt new file mode 100644 index 000000000..ce5d599b4 --- /dev/null +++ b/experiment/ray_test/stdout.txt @@ -0,0 +1,36 @@ +Starting Ray head node on badger +2025-03-11 21:38:23,038 INFO usage_lib.py:467 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details. +2025-03-11 21:38:23,038 INFO scripts.py:865 -- Local node IP: 130.207.125.88 +2025-03-11 21:38:23,826 SUCC scripts.py:902 -- -------------------- +2025-03-11 21:38:23,826 SUCC scripts.py:903 -- Ray runtime started. +2025-03-11 21:38:23,826 SUCC scripts.py:904 -- -------------------- +2025-03-11 21:38:23,826 INFO scripts.py:906 -- Next steps +2025-03-11 21:38:23,826 INFO scripts.py:909 -- To add another node to this Ray cluster, run +2025-03-11 21:38:23,826 INFO scripts.py:912 -- ray start --address='130.207.125.88:6379' +2025-03-11 21:38:23,826 INFO scripts.py:921 -- To connect to this Ray cluster: +2025-03-11 21:38:23,826 INFO scripts.py:923 -- import ray +2025-03-11 21:38:23,826 INFO scripts.py:924 -- ray.init() +2025-03-11 21:38:23,826 INFO scripts.py:955 -- To terminate the Ray runtime, run +2025-03-11 21:38:23,826 INFO scripts.py:956 -- ray stop +2025-03-11 21:38:23,826 INFO scripts.py:959 -- To view the status of the cluster, use +2025-03-11 21:38:23,826 INFO scripts.py:960 -- ray status +Available resources: {'memory': 591011351552.0, 'CPU': 8.0, 'object_store_memory': 200000000000.0, 'node:__internal_head__': 1.0, 'node:130.207.125.88': 1.0} +======== Autoscaler status: 2025-03-11 21:38:53.521190 ======== +Node status 
+--------------------------------------------------------------- +Active: + 1 node_940a89a547ed955042d438039d8e0d126d8baedc5444f007a4d1bfc8 +Pending: + (no pending nodes) +Recent failures: + (no failures) + +Resources +--------------------------------------------------------------- +Usage: + 0.0/8.0 CPU + 0B/550.42GiB memory + 0B/186.26GiB object_store_memory + +Demands: + (no resource demands) diff --git a/experiment/ray_test/test.sh b/experiment/ray_test/test.sh new file mode 100644 index 000000000..79bf37509 --- /dev/null +++ b/experiment/ray_test/test.sh @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --job-name=ray_docker +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=8 +#SBATCH --output=stdout.txt +#SBATCH --error=stderr.txt + +# Get node information +nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) +nodes_array=($nodes) +head_node=${nodes_array[0]} +port=6379 + + +source /nethome/aagrawal360/mambaforge/etc/profile.d/conda.sh +source /nethome/aagrawal360/mambaforge/etc/profile.d/mamba.sh +mamba activate /home/aagrawal360/repos/cloudai/env + +if [ "$SLURMD_NODENAME" == "$head_node" ]; then + echo "Starting Ray head node on $head_node" + ray start --head --port=$port --num-cpus=$SLURM_CPUS_PER_TASK + sleep 30 + python /home/aagrawal360/repos/cloudai/experiment/ray_test/ray_test_job.py + ray status +else + echo "Starting Ray worker on $SLURMD_NODENAME" + ray start --address=${head_node}:${port} --num-cpus=$SLURM_CPUS_PER_TASK +fi + +sleep 30 \ No newline at end of file From 2c352e83621b2f8fc14bf7fb6503a2562728146b Mon Sep 17 00:00:00 2001 From: Rayyan Shahid Date: Tue, 11 Mar 2025 22:55:48 -0400 Subject: [PATCH 02/12] minor --- .../test/slurm_ray_container_hello_world.toml | 26 +++++++ conf/common/test_scenario/sleep.toml | 36 +++++----- .../test_scenario/slurm_ray_container.toml | 22 ++++++ src/cloudai/__init__.py | 5 ++ src/cloudai/_core/test_scenario_parser.py | 2 + .../strategy/slurm_command_gen_strategy.py | 67 +++++++++++++------ 
.../workloads/slurm_ray_container/__init__.py | 26 +++++++ .../report_generation_strategy.py | 29 ++++++++ .../slurm_command_gen_strategy.py | 62 +++++++++++++++++ .../slurm_ray_container.py | 52 ++++++++++++++ .../slurm_ray_container_template.sh.jinja | 29 ++++++++ 11 files changed, 317 insertions(+), 39 deletions(-) create mode 100644 conf/common/test/slurm_ray_container_hello_world.toml create mode 100644 conf/common/test_scenario/slurm_ray_container.toml create mode 100644 src/cloudai/workloads/slurm_ray_container/__init__.py create mode 100644 src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py create mode 100644 src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py create mode 100644 src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py create mode 100644 src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja diff --git a/conf/common/test/slurm_ray_container_hello_world.toml b/conf/common/test/slurm_ray_container_hello_world.toml new file mode 100644 index 000000000..2a995c04d --- /dev/null +++ b/conf/common/test/slurm_ray_container_hello_world.toml @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "slurm_ray_container_hello_world" +description = "Hello World" +test_template_name = "SlurmRayContainer" + +[cmd_args] +docker_image_url = "nvcr.io/nvidia/nemo:24.12.rc3" +conda_env = "nemo" +cmd = "bash -c 'echo Hello World'" + + diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml index 3e25c9a02..94e850850 100644 --- a/conf/common/test_scenario/sleep.toml +++ b/conf/common/test_scenario/sleep.toml @@ -20,23 +20,23 @@ name = "test_scenario_example" id = "Tests.1" test_name = "sleep_5" -# [[Tests]] -# id = "Tests.2" -# test_name = "sleep_5" -# [[Tests.dependencies]] -# type = "start_post_init" -# id = "Tests.1" +[[Tests]] +id = "Tests.2" +test_name = "sleep_5" + [[Tests.dependencies]] + type = "start_post_init" + id = "Tests.1" -# [[Tests]] -# id = "Tests.3" -# test_name = "sleep_5" -# [[Tests.dependencies]] -# type = "start_post_comp" -# id = "Tests.1" +[[Tests]] +id = "Tests.3" +test_name = "sleep_5" + [[Tests.dependencies]] + type = "start_post_comp" + id = "Tests.1" -# [[Tests]] -# id = "Tests.4" -# test_name = "sleep_20" -# [[Tests.dependencies]] -# type = "end_post_comp" -# id = "Tests.1" +[[Tests]] +id = "Tests.4" +test_name = "sleep_20" + [[Tests.dependencies]] + type = "end_post_comp" + id = "Tests.1" diff --git a/conf/common/test_scenario/slurm_ray_container.toml b/conf/common/test_scenario/slurm_ray_container.toml new file mode 100644 index 000000000..b58510998 --- /dev/null +++ b/conf/common/test_scenario/slurm_ray_container.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "slurm_ray_container_example" + +[[Tests]] +id = "Tests.1" +test_name = "slurm_ray_container_hello_world" +num_nodes = "2" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index 4c6d1210c..1e6f6f13c 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -97,6 +97,7 @@ SleepTestDefinition, ) from .workloads.slurm_container import SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition +from .workloads.slurm_ray_container import SlurmRayContainerCommandGenStrategy, SlurmRayContainerTestDefinition from .workloads.ucc_test import ( UCCTestDefinition, UCCTestGradingStrategy, @@ -207,6 +208,9 @@ Registry().add_strategy( CommandGenStrategy, [SlurmSystem], [SlurmContainerTestDefinition], SlurmContainerCommandGenStrategy ) +Registry().add_strategy( + CommandGenStrategy, [SlurmSystem], [SlurmRayContainerTestDefinition], SlurmRayContainerCommandGenStrategy +) Registry().add_installer("slurm", SlurmInstaller) Registry().add_installer("standalone", StandaloneInstaller) @@ -226,6 +230,7 @@ Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition) Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition) Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition) +Registry().add_test_definition("SlurmRayContainer", SlurmRayContainerTestDefinition) Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition) Registry().add_agent("grid_search", GridSearchAgent) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 
def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
    """
    Build the ``#SBATCH`` directives for a job as an option-name -> value mapping.

    Keys are sbatch long-option names without the leading ``--``. They are rendered
    verbatim as ``--{key}`` by ``_append_sbatch_directives``, so they must use sbatch's
    own hyphenated spelling. An empty value marks a value-less flag.

    Args:
        args (Dict[str, Any]): Slurm arguments for the job. ``node_list_str`` and
            ``time_limit`` are used when present; the presence of ``output`` / ``error``
            suppresses the default log file directives.
        output_path (Path): Directory that receives the default ``stdout.txt`` and
            ``stderr.txt`` log files.

    Returns:
        Dict[str, str]: Directives in insertion order; subclasses may add or override
            entries before they are rendered.
    """
    directives: Dict[str, str] = {}

    if "output" not in args:
        directives["output"] = str(output_path / "stdout.txt")
    if "error" not in args:
        directives["error"] = str(output_path / "stderr.txt")

    directives["partition"] = self.system.default_partition

    # .get(): a missing node list means "let Slurm choose", not an error.
    if args.get("node_list_str"):
        directives["nodelist"] = args["node_list_str"]
    if self.system.account:
        directives["account"] = self.system.account
    if self.system.distribution:
        directives["distribution"] = self.system.distribution
    if self.system.gpus_per_node:
        directives["gpus-per-node"] = str(self.system.gpus_per_node)
        directives["gres"] = f"gpu:{self.system.gpus_per_node}"
    if self.system.ntasks_per_node:
        directives["ntasks-per-node"] = str(self.system.ntasks_per_node)
    if "time_limit" in args:
        # The sbatch option is --time; "time_limit" is only the name of the args key.
        directives["time"] = str(args["time_limit"])

    for raw_arg in self.system.extra_sbatch_args:
        option = raw_arg.strip()
        if option.startswith("--"):
            option = option[2:]
        if not option:
            # Nothing left to emit; a bare "#SBATCH --" line would be invalid.
            continue
        # Split on the FIRST "=" only so values that themselves contain "="
        # (e.g. --export=ALL,FOO=bar) stay intact. No "=" means a value-less flag.
        # NOTE(review): short options such as "-x node1" cannot be represented as a
        # long-option key and would be rendered with an extra "--" prefix.
        key, _, value = option.partition("=")
        directives[key] = value

    return directives
""" batch_script_content = self._add_reservation(batch_script_content) + sbatch_directives = self._get_sbatch_directives(args, output_path) - if "output" not in args: - batch_script_content.append(f"#SBATCH --output={output_path / 'stdout.txt'}") - if "error" not in args: - batch_script_content.append(f"#SBATCH --error={output_path / 'stderr.txt'}") - batch_script_content.append(f"#SBATCH --partition={self.system.default_partition}") - if args["node_list_str"]: - batch_script_content.append(f"#SBATCH --nodelist={args['node_list_str']}") - if self.system.account: - batch_script_content.append(f"#SBATCH --account={self.system.account}") - if self.system.distribution: - batch_script_content.append(f"#SBATCH --distribution={self.system.distribution}") - if self.system.gpus_per_node: - batch_script_content.append(f"#SBATCH --gpus-per-node={self.system.gpus_per_node}") - batch_script_content.append(f"#SBATCH --gres=gpu:{self.system.gpus_per_node}") - if self.system.ntasks_per_node: - batch_script_content.append(f"#SBATCH --ntasks-per-node={self.system.ntasks_per_node}") - if "time_limit" in args: - batch_script_content.append(f"#SBATCH --time={args['time_limit']}") - - for arg in self.system.extra_sbatch_args: - batch_script_content.append(f"#SBATCH {arg}") + for key, value in sbatch_directives.items(): + if value: + batch_script_content.append(f"#SBATCH --{key}={value}") + else: + batch_script_content.append(f"#SBATCH --{key}") batch_script_content.append( "\nexport SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)" diff --git a/src/cloudai/workloads/slurm_ray_container/__init__.py b/src/cloudai/workloads/slurm_ray_container/__init__.py new file mode 100644 index 000000000..875c47bb0 --- /dev/null +++ b/src/cloudai/workloads/slurm_ray_container/__init__.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .report_generation_strategy import SlurmRayContainerReportGenerationStrategy +from .slurm_command_gen_strategy import SlurmRayContainerCommandGenStrategy +from .slurm_ray_container import SlurmRayContainerCmdArgs, SlurmRayContainerTestDefinition + +__all__ = [ + "SlurmRayContainerCmdArgs", + "SlurmRayContainerCommandGenStrategy", + "SlurmRayContainerReportGenerationStrategy", + "SlurmRayContainerTestDefinition", +] diff --git a/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py new file mode 100644 index 000000000..1646f1434 --- /dev/null +++ b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class SlurmRayContainerReportGenerationStrategy(ReportGenerationStrategy):
    """Placeholder report generation strategy for the Slurm Ray container workload."""

    def can_handle_directory(self) -> bool:
        """Report that this strategy handles no directory (always ``False``)."""
        return False

    def generate_report(self) -> None:
        """Do nothing; this strategy has no report to generate."""
        return None
class SlurmRayContainerCommandGenStrategy(SlurmContainerCommandGenStrategy):
    """Command generation strategy for Slurm container tests that start a Ray cluster."""

    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
        """
        Return the parent's sbatch directives, forced to one task per node on exclusive nodes.

        Args:
            args (Dict[str, Any]): Slurm arguments for the job, forwarded to the parent.
            output_path (Path): Output directory for script and logs, forwarded to the parent.

        Returns:
            Dict[str, str]: Directive name -> value; an empty value marks a value-less flag.
        """
        sbatch_directives = super()._get_sbatch_directives(args, output_path)
        # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu
        # Override tasks per node. Drop every other spelling of the option first so the
        # generated script carries exactly one tasks-per-node directive instead of the
        # parent's value plus a conflicting second one.
        for stale_key in ("ntasks_per_node", "tasks-per-node"):
            sbatch_directives.pop(stale_key, None)
        sbatch_directives["ntasks-per-node"] = "1"
        sbatch_directives["exclusive"] = ""

        return sbatch_directives

    def generate_test_command(
        self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
    ) -> list[str]:
        """
        Render the Ray launch script template around the user command.

        Args:
            env_vars (dict[str, str]): Environment variables for the run (not used here).
            cmd_args (Dict[str, Union[str, List[str]]]): Flattened command arguments (not used
                here; values are read from the test definition instead).
            tr (TestRun): Test run whose definition supplies ``cmd`` and ``conda_env``.

        Returns:
            list[str]: A single element holding the rendered script text.
        """
        tdef: SlurmRayContainerTestDefinition = cast(SlurmRayContainerTestDefinition, tr.test.test_definition)
        command_parts: list[str] = [*super().gen_nsys_command(tr), tdef.cmd_args.cmd]
        if tr.test.extra_cmd_args:
            command_parts.append(tr.test.extra_cmd_args)

        # The template ships next to this module.
        template_path = Path(__file__).parent / "slurm_ray_container_template.sh.jinja"
        # Explicit encoding so rendering does not depend on the host locale.
        template = Template(template_path.read_text(encoding="utf-8"))

        rendered_template = template.render(
            conda_env=tdef.cmd_args.conda_env,
            command=" ".join(command_parts),
        )

        return [rendered_template]
class SlurmRayContainerCmdArgs(CmdArgs):
    """Command line arguments for a Slurm Ray container test."""

    # URL of the container image; wrapped in a DockerImage by the test definition.
    docker_image_url: str
    # Name of the conda environment for the workload.
    conda_env: str
    # Command to run.
    cmd: str


class SlurmRayContainerTestDefinition(TestDefinition):
    """Test definition for a Slurm Ray container test."""

    cmd_args: SlurmRayContainerCmdArgs

    # Cache for the DockerImage built from cmd_args.docker_image_url.
    _docker_image: Optional[DockerImage] = None

    @property
    def docker_image(self) -> DockerImage:
        """Return the DockerImage for ``cmd_args.docker_image_url``, creating it on first use."""
        image = self._docker_image
        if image:
            return image
        image = DockerImage(url=self.cmd_args.docker_image_url)
        self._docker_image = image
        return image

    @property
    def installables(self) -> list[Installable]:
        """Return the docker image followed by every entry of ``git_repos``."""
        items: list[Installable] = [self.docker_image]
        items.extend(self.git_repos)
        return items

    @property
    def extra_args_str(self) -> str:
        """Join ``extra_cmd_args`` into one string: ``key value`` pairs, or the bare key when the value is falsy."""
        return " ".join(f"{k} {v}" if v else k for k, v in self.extra_cmd_args.items())
{#
  Launch script template for a Ray cluster inside a Slurm allocation.

  Template variables:
    conda_env - conda environment to activate before anything else runs.
    command   - command line executed once the head and worker launches have been issued.

  Flow: start a Ray head with srun on $head_node, start one Ray worker with srun on each
  remaining entry of $nodes_array, then run the command.

  NOTE(review): head_node, head_node_ip and nodes_array are read but never assigned in this
  template - presumably they are set by the surrounding batch script; confirm.
  NOTE(review): SLURM_CPUS_PER_TASK / SLURM_GPUS_PER_TASK are passed to ray as-is -
  presumably the job requests cpus-per-task / gpus-per-task so they are non-empty; confirm.
-#}
conda activate {{ conda_env }}

# Address the workers use to reach the head node.
port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

# Head: one task on the head node, left running in the background (--block ... &).
echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" --port=$port \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &

# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

# Workers: index 0 of nodes_array is skipped, so the loop starts at 1.
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$ip_head" \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
    sleep 5
done

# User workload, run after the head and worker launches above.
{{ command }}

experiment/ray_test/stdout.txt | 36 ----------------------------- experiment/ray_test/test.sh | 31 ------------------------- 4 files changed, 75 deletions(-) delete mode 100644 experiment/ray_test/ray_test_job.py delete mode 100644 experiment/ray_test/stderr.txt delete mode 100644 experiment/ray_test/stdout.txt delete mode 100644 experiment/ray_test/test.sh diff --git a/experiment/ray_test/ray_test_job.py b/experiment/ray_test/ray_test_job.py deleted file mode 100644 index f58034824..000000000 --- a/experiment/ray_test/ray_test_job.py +++ /dev/null @@ -1,6 +0,0 @@ -import sys -import ray - -ray.init(address="auto") - -print(f"Available resources: {ray.available_resources()}", flush=True) diff --git a/experiment/ray_test/stderr.txt b/experiment/ray_test/stderr.txt deleted file mode 100644 index f4c8e3362..000000000 --- a/experiment/ray_test/stderr.txt +++ /dev/null @@ -1,2 +0,0 @@ -2025-03-11 21:38:54,163 INFO worker.py:1654 -- Connecting to existing Ray cluster at address: 130.207.125.88:6379... -2025-03-11 21:38:54,174 INFO worker.py:1841 -- Connected to Ray cluster. diff --git a/experiment/ray_test/stdout.txt b/experiment/ray_test/stdout.txt deleted file mode 100644 index ce5d599b4..000000000 --- a/experiment/ray_test/stdout.txt +++ /dev/null @@ -1,36 +0,0 @@ -Starting Ray head node on badger -2025-03-11 21:38:23,038 INFO usage_lib.py:467 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details. -2025-03-11 21:38:23,038 INFO scripts.py:865 -- Local node IP: 130.207.125.88 -2025-03-11 21:38:23,826 SUCC scripts.py:902 -- -------------------- -2025-03-11 21:38:23,826 SUCC scripts.py:903 -- Ray runtime started. 
-2025-03-11 21:38:23,826 SUCC scripts.py:904 -- -------------------- -2025-03-11 21:38:23,826 INFO scripts.py:906 -- Next steps -2025-03-11 21:38:23,826 INFO scripts.py:909 -- To add another node to this Ray cluster, run -2025-03-11 21:38:23,826 INFO scripts.py:912 -- ray start --address='130.207.125.88:6379' -2025-03-11 21:38:23,826 INFO scripts.py:921 -- To connect to this Ray cluster: -2025-03-11 21:38:23,826 INFO scripts.py:923 -- import ray -2025-03-11 21:38:23,826 INFO scripts.py:924 -- ray.init() -2025-03-11 21:38:23,826 INFO scripts.py:955 -- To terminate the Ray runtime, run -2025-03-11 21:38:23,826 INFO scripts.py:956 -- ray stop -2025-03-11 21:38:23,826 INFO scripts.py:959 -- To view the status of the cluster, use -2025-03-11 21:38:23,826 INFO scripts.py:960 -- ray status -Available resources: {'memory': 591011351552.0, 'CPU': 8.0, 'object_store_memory': 200000000000.0, 'node:__internal_head__': 1.0, 'node:130.207.125.88': 1.0} -======== Autoscaler status: 2025-03-11 21:38:53.521190 ======== -Node status ---------------------------------------------------------------- -Active: - 1 node_940a89a547ed955042d438039d8e0d126d8baedc5444f007a4d1bfc8 -Pending: - (no pending nodes) -Recent failures: - (no failures) - -Resources ---------------------------------------------------------------- -Usage: - 0.0/8.0 CPU - 0B/550.42GiB memory - 0B/186.26GiB object_store_memory - -Demands: - (no resource demands) diff --git a/experiment/ray_test/test.sh b/experiment/ray_test/test.sh deleted file mode 100644 index 79bf37509..000000000 --- a/experiment/ray_test/test.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=ray_docker -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=8 -#SBATCH --output=stdout.txt -#SBATCH --error=stderr.txt - -# Get node information -nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) -nodes_array=($nodes) -head_node=${nodes_array[0]} -port=6379 - - -source 
/nethome/aagrawal360/mambaforge/etc/profile.d/conda.sh -source /nethome/aagrawal360/mambaforge/etc/profile.d/mamba.sh -mamba activate /home/aagrawal360/repos/cloudai/env - -if [ "$SLURMD_NODENAME" == "$head_node" ]; then - echo "Starting Ray head node on $head_node" - ray start --head --port=$port --num-cpus=$SLURM_CPUS_PER_TASK - sleep 30 - python /home/aagrawal360/repos/cloudai/experiment/ray_test/ray_test_job.py - ray status -else - echo "Starting Ray worker on $SLURMD_NODENAME" - ray start --address=${head_node}:${port} --num-cpus=$SLURM_CPUS_PER_TASK -fi - -sleep 30 \ No newline at end of file From 7d13c4cfdc9b5f1fcd75affce72224d1ca38b821 Mon Sep 17 00:00:00 2001 From: Rayyan Shahid Date: Tue, 11 Mar 2025 23:01:58 -0400 Subject: [PATCH 06/12] minor --- conf/common/system/badger_slurm_cluster.toml | 30 -------------------- conf/common/test_scenario/sleep.toml | 2 +- 2 files changed, 1 insertion(+), 31 deletions(-) delete mode 100644 conf/common/system/badger_slurm_cluster.toml diff --git a/conf/common/system/badger_slurm_cluster.toml b/conf/common/system/badger_slurm_cluster.toml deleted file mode 100644 index ef289cbc0..000000000 --- a/conf/common/system/badger_slurm_cluster.toml +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "badger" -scheduler = "slurm" - -install_path = "./install_dir" -output_path = "./results" -default_partition = "cuda" - -mpi = "pmi2" -gpus_per_node = 4 -ntasks_per_node = 4 -monitor_interval = 2 - -[[partitions]] -name = "cuda" diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml index 94e850850..daca397c4 100644 --- a/conf/common/test_scenario/sleep.toml +++ b/conf/common/test_scenario/sleep.toml @@ -18,7 +18,7 @@ name = "test_scenario_example" [[Tests]] id = "Tests.1" -test_name = "sleep_5" +test_name = "sleep_10" [[Tests]] id = "Tests.2" From 9185db91e79549d61550734ce728e2530162b778 Mon Sep 17 00:00:00 2001 From: Rayyan Shahid Date: Tue, 11 Mar 2025 23:13:17 -0400 Subject: [PATCH 07/12] minor --- README.md | 1 + tests/test_acceptance.py | 17 +++++++++++++++++ tests/test_init.py | 8 ++++++++ tests/test_test_scenario.py | 2 ++ 4 files changed, 28 insertions(+) diff --git a/README.md b/README.md index 324ac70b8..8c0b30a0c 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ These schemas enable CloudAI to be flexible and compatible with different system |Sleep|✅|✅|✅| |UCC|✅|❌|❌| |SlurmContainer|✅|❌|❌| +|SlurmRayContainer|✅|❌|❌| |MegatronRun (experimental)|✅|❌|❌| diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index cd7d0648d..b570708ad 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -51,6 +51,11 @@ SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition, ) +from cloudai.workloads.slurm_ray_container import ( + SlurmRayContainerCmdArgs, + SlurmRayContainerCommandGenStrategy, + SlurmRayContainerTestDefinition, +) from cloudai.workloads.ucc_test import UCCCmdArgs, UCCTestDefinition, UCCTestSlurmCommandGenStrategy SLURM_TEST_SCENARIOS = [ @@ -261,6 +266,18 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - ), SlurmContainerCommandGenStrategy, ), + "slurm_ray_container": lambda: create_test_run( + partial_tr, + slurm_system, + 
"slurm_ray_container", + SlurmRayContainerTestDefinition( + name="slurm_ray_container", + description="slurm_ray_container", + test_template_name="slurm_ray_container", + cmd_args=SlurmRayContainerCmdArgs(docker_image_url="https://docker/url", cmd="pwd ; ls", conda_env="test"), + ), + SlurmRayContainerCommandGenStrategy, + ), "megatron-run": lambda: create_test_run( partial_tr, slurm_system, diff --git a/tests/test_init.py b/tests/test_init.py index bb3dd95d0..b0ae7ea01 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -66,6 +66,10 @@ SleepTestDefinition, ) from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition +from cloudai.workloads.slurm_ray_container import ( + SlurmRayContainerCommandGenStrategy, + SlurmRayContainerTestDefinition, +) from cloudai.workloads.ucc_test import ( UCCTestDefinition, UCCTestGradingStrategy, @@ -99,6 +103,7 @@ def test_runners(): (CommandGenStrategy, SlurmSystem, NemotronTestDefinition): JaxToolboxSlurmCommandGenStrategy, (CommandGenStrategy, SlurmSystem, SleepTestDefinition): SleepSlurmCommandGenStrategy, (CommandGenStrategy, SlurmSystem, SlurmContainerTestDefinition): SlurmContainerCommandGenStrategy, + (CommandGenStrategy, SlurmSystem, SlurmRayContainerTestDefinition): SlurmRayContainerCommandGenStrategy, (CommandGenStrategy, SlurmSystem, UCCTestDefinition): UCCTestSlurmCommandGenStrategy, (CommandGenStrategy, SlurmSystem, MegatronRunTestDefinition): MegatronRunSlurmCommandGenStrategy, (CommandGenStrategy, StandaloneSystem, SleepTestDefinition): SleepStandaloneCommandGenStrategy, @@ -119,6 +124,7 @@ def test_runners(): (JobIdRetrievalStrategy, SlurmSystem, NemotronTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, SlurmSystem, SleepTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, SlurmSystem, SlurmContainerTestDefinition): SlurmJobIdRetrievalStrategy, + (JobIdRetrievalStrategy, SlurmSystem, SlurmRayContainerTestDefinition): 
SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, SlurmSystem, UCCTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, SlurmSystem, MegatronRunTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, StandaloneSystem, SleepTestDefinition): StandaloneJobIdRetrievalStrategy, @@ -133,6 +139,7 @@ def test_runners(): (JobStatusRetrievalStrategy, SlurmSystem, NemotronTestDefinition): JaxToolboxJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, SlurmSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, SlurmSystem, SlurmContainerTestDefinition): DefaultJobStatusRetrievalStrategy, + (JobStatusRetrievalStrategy, SlurmSystem, SlurmRayContainerTestDefinition): DefaultJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, SlurmSystem, UCCTestDefinition): DefaultJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, SlurmSystem, MegatronRunTestDefinition): DefaultJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, StandaloneSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy, @@ -177,6 +184,7 @@ def test_definitions(): ("JaxToolboxGrok", GrokTestDefinition), ("JaxToolboxNemotron", NemotronTestDefinition), ("SlurmContainer", SlurmContainerTestDefinition), + ("SlurmRayContainer", SlurmRayContainerTestDefinition), ("MegatronRun", MegatronRunTestDefinition), ]: assert test_defs[tdef[0]] == tdef[1] diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index aeab2e0c3..6bd0c51df 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -44,6 +44,7 @@ from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition +from cloudai.workloads.slurm_ray_container import SlurmRayContainerReportGenerationStrategy, 
SlurmRayContainerTestDefinition from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy from tests.conftest import MyTestDefinition @@ -293,6 +294,7 @@ def test_default_reporters_size(self): (NemotronTestDefinition, {JaxToolboxReportGenerationStrategy}), (SleepTestDefinition, {SleepReportGenerationStrategy}), (SlurmContainerTestDefinition, {SlurmContainerReportGenerationStrategy}), + (SlurmRayContainerTestDefinition, {SlurmRayContainerReportGenerationStrategy}), (UCCTestDefinition, {UCCTestReportGenerationStrategy}), ], ) From a01f053e2c928870ece5bccf80224d34695cfea4 Mon Sep 17 00:00:00 2001 From: Rayyan Shahid Date: Tue, 11 Mar 2025 23:34:02 -0400 Subject: [PATCH 08/12] fix ci --- .../test/slurm_ray_container_hello_world.toml | 2 +- conf/common/test_scenario/sleep.toml | 2 +- .../test_scenario/slurm_ray_container.toml | 2 +- pyproject.toml | 1 + src/cloudai/__init__.py | 2 ++ src/cloudai/_core/test_scenario_parser.py | 5 +++- .../strategy/slurm_command_gen_strategy.py | 29 +++++++++---------- .../report_generation_strategy.py | 1 - .../slurm_command_gen_strategy.py | 14 ++++----- tests/test_acceptance.py | 4 ++- tests/test_init.py | 2 +- tests/test_test_scenario.py | 7 +++-- 12 files changed, 38 insertions(+), 33 deletions(-) diff --git a/conf/common/test/slurm_ray_container_hello_world.toml b/conf/common/test/slurm_ray_container_hello_world.toml index 2a995c04d..3e159dd48 100644 --- a/conf/common/test/slurm_ray_container_hello_world.toml +++ b/conf/common/test/slurm_ray_container_hello_world.toml @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml index daca397c4..2d3b06597 100644 --- a/conf/common/test_scenario/sleep.toml +++ b/conf/common/test_scenario/sleep.toml @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/conf/common/test_scenario/slurm_ray_container.toml b/conf/common/test_scenario/slurm_ray_container.toml index b58510998..1a30864bd 100644 --- a/conf/common/test_scenario/slurm_ray_container.toml +++ b/conf/common/test_scenario/slurm_ray_container.toml @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/pyproject.toml b/pyproject.toml index 12b5d2f18..4bb5dfcd8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "toml==0.10.2", "kubernetes==30.1.0", "pydantic==2.8.2", + "jinja2==3.1.6", ] [project.scripts] cloudai = "cloudai.__main__:main" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index 1e6f6f13c..f33d077fc 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -157,6 +157,7 @@ SleepTestDefinition, NeMoRunTestDefinition, SlurmContainerTestDefinition, + SlurmRayContainerTestDefinition, MegatronRunTestDefinition, ], SlurmJobIdRetrievalStrategy, @@ -192,6 +193,7 @@ SleepTestDefinition, NeMoRunTestDefinition, SlurmContainerTestDefinition, + SlurmRayContainerTestDefinition, MegatronRunTestDefinition, ], DefaultJobStatusRetrievalStrategy, diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index cffac9b29..ce41debb9 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -36,7 +36,10 @@ from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition -from cloudai.workloads.slurm_ray_container import SlurmRayContainerReportGenerationStrategy, SlurmRayContainerTestDefinition +from cloudai.workloads.slurm_ray_container import ( + SlurmRayContainerReportGenerationStrategy, + SlurmRayContainerTestDefinition, +) from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy from .exceptions import TestScenarioParsingError, format_validation_error diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py 
b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 62135b42b..416401178 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -310,9 +310,17 @@ def _write_sbatch_script( return f"sbatch {batch_script_path}" - def _get_sbatch_directives( - self, args: Dict[str, Any], output_path: Path - ) -> Dict[str, str]: + def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]: + """ + Get the Slurm batch script directives. + + Args: + args (Dict[str, Any]): Slurm-specific arguments. + output_path (Path): Output directory for script and logs. + + Returns: + Dict[str, str]: Dictionary of Slurm batch script directives. + """ sbatch_directives: Dict[str, str] = {} if "output" not in args: @@ -336,18 +344,6 @@ def _get_sbatch_directives( if "time_limit" in args: sbatch_directives["time_limit"] = args["time_limit"] - for arg in self.system.extra_sbatch_args: - arg = arg.strip() - # remove -- from the start of the string if present - if arg.startswith("--"): - arg = arg[2:] - # split the string into key and value - arg_split = arg.split("=") - if len(arg_split) == 2: - sbatch_directives[arg_split[0]] = arg_split[1].join("=") - else: - sbatch_directives[arg] = "" - return sbatch_directives def _append_sbatch_directives( @@ -370,6 +366,9 @@ def _append_sbatch_directives( else: batch_script_content.append(f"#SBATCH --{key}") + for arg in self.system.extra_sbatch_args: + batch_script_content.append(f"#SBATCH {arg}") + batch_script_content.append( "\nexport SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)" ) diff --git a/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py index 1646f1434..d8978e8da 100644 --- a/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py +++ 
b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re from cloudai import ReportGenerationStrategy diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py index fee895389..6887b064b 100644 --- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys from pathlib import Path from typing import Any, Dict, List, Union, cast @@ -28,9 +27,8 @@ class SlurmRayContainerCommandGenStrategy(SlurmContainerCommandGenStrategy): """Command generation strategy for generic Slurm container tests.""" - def _get_sbatch_directives( - self, args: Dict[str, Any], output_path: Path - ) -> Dict[str, str]: + + def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]: sbatch_directives = super()._get_sbatch_directives(args, output_path) # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu # override tasks per node @@ -53,10 +51,8 @@ def generate_test_command( template = Template(template_path.read_text()) # render the template - rendered_template = template.render({ - "conda_env": tdef.cmd_args.conda_env, - "command": " ".join(srun_command_parts) - }) + rendered_template = template.render( + {"conda_env": tdef.cmd_args.conda_env, "command": " ".join(srun_command_parts)} + ) return [rendered_template] - diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index b570708ad..31978cb0f 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -274,7 +274,9 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: 
partial[TestRun]) - name="slurm_ray_container", description="slurm_ray_container", test_template_name="slurm_ray_container", - cmd_args=SlurmRayContainerCmdArgs(docker_image_url="https://docker/url", cmd="pwd ; ls", conda_env="test"), + cmd_args=SlurmRayContainerCmdArgs( + docker_image_url="https://docker/url", cmd="pwd ; ls", conda_env="test" + ), ), SlurmRayContainerCommandGenStrategy, ), diff --git a/tests/test_init.py b/tests/test_init.py index b0ae7ea01..0ffcd1afe 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -172,7 +172,7 @@ def test_installers(): def test_definitions(): test_defs = Registry().test_definitions_map - assert len(test_defs) == 11 + assert len(test_defs) == 12 for tdef in [ ("UCCTest", UCCTestDefinition), ("NcclTest", NCCLTestDefinition), diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index 6bd0c51df..c2208068f 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -44,7 +44,10 @@ from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition -from cloudai.workloads.slurm_ray_container import SlurmRayContainerReportGenerationStrategy, SlurmRayContainerTestDefinition +from cloudai.workloads.slurm_ray_container import ( + SlurmRayContainerReportGenerationStrategy, + SlurmRayContainerTestDefinition, +) from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy from tests.conftest import MyTestDefinition @@ -279,7 +282,7 @@ def test_default(self): assert len(reporters) == 0 def test_default_reporters_size(self): - assert len(DEFAULT_REPORTERS) == 11 + assert len(DEFAULT_REPORTERS) == 12 @pytest.mark.parametrize( "tdef,expected_reporters", From e408026c7ea7bba0fec81cac985e1f79d8663563 Mon Sep 17 00:00:00 2001 From: Rayyan Shahid 
Date: Tue, 11 Mar 2025 23:46:21 -0400 Subject: [PATCH 09/12] minor --- .../slurm_command_gen_strategy.py | 24 ++++++++++++++++--- .../slurm_ray_container_template.sh.jinja | 8 +++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py index 6887b064b..2abcc1b64 100644 --- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py @@ -37,13 +37,27 @@ def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dic return sbatch_directives + def _gen_srun_command( + self, + slurm_args: Dict[str, Any], + env_vars: Dict[str, str], + cmd_args: Dict[str, Union[str, List[str]]], + tr: TestRun, + ) -> str: + srun_command_parts = self.gen_srun_prefix(slurm_args, tr) + nsys_command_parts = super().gen_nsys_command(tr) + cmd_args["srun_command_prefix"] = " ".join(srun_command_parts + nsys_command_parts) + test_command_parts = self.generate_test_command(env_vars, cmd_args, tr) + return " ".join(test_command_parts) + def generate_test_command( self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun ) -> list[str]: tdef: SlurmRayContainerTestDefinition = cast(SlurmRayContainerTestDefinition, tr.test.test_definition) - srun_command_parts: list[str] = [*super().gen_nsys_command(tr), tdef.cmd_args.cmd] + + command_parts: list[str] = [tdef.cmd_args.cmd] if tr.test.extra_cmd_args: - srun_command_parts.append(tr.test.extra_cmd_args) + command_parts.append(tr.test.extra_cmd_args) # load the jinja template file which is placed at the same directory as this file script_dir = Path(__file__).parent @@ -52,7 +66,11 @@ def generate_test_command( # render the template rendered_template = template.render( - {"conda_env": tdef.cmd_args.conda_env, "command": " ".join(srun_command_parts)} + { + 
"conda_env": tdef.cmd_args.conda_env, + "command": " ".join(command_parts), + "srun_command_prefix": cmd_args["srun_command_prefix"], + } ) return [rendered_template] diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja index eeeb08fec..530130816 100644 --- a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja +++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja @@ -1,12 +1,11 @@ -conda activate {{ conda_env }} - port=6379 ip_head=$head_node_ip:$port export ip_head echo "IP Head: $ip_head" echo "Starting HEAD at $head_node" -srun --nodes=1 --ntasks=1 -w "$head_node" \ +{{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$head_node" \ + conda activate {{ conda_env }} && \ ray start --head --node-ip-address="$head_node_ip" --port=$port \ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block & @@ -19,7 +18,8 @@ worker_num=$((SLURM_JOB_NUM_NODES - 1)) for ((i = 1; i <= worker_num; i++)); do node_i=${nodes_array[$i]} echo "Starting WORKER $i at $node_i" - srun --nodes=1 --ntasks=1 -w "$node_i" \ + {{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$node_i" \ + conda activate {{ conda_env }} && \ ray start --address "$ip_head" \ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block & sleep 5 From f13a597d6f7e5135be6c1a6f801a9e67ce38c06a Mon Sep 17 00:00:00 2001 From: Rayyan Shahid Date: Tue, 11 Mar 2025 23:51:50 -0400 Subject: [PATCH 10/12] minor --- .../slurm_ray_container/slurm_command_gen_strategy.py | 2 +- .../slurm_ray_container_template.sh.jinja | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py index 2abcc1b64..31bcfb591 100644 --- 
a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py @@ -32,7 +32,7 @@ def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dic sbatch_directives = super()._get_sbatch_directives(args, output_path) # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu # override tasks per node - sbatch_directives["tasks-per-node"] = "1" + sbatch_directives["tasks-per-node"] = "2" sbatch_directives["exclusive"] = "" return sbatch_directives diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja index 530130816..57692f6fb 100644 --- a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja +++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja @@ -25,5 +25,7 @@ for ((i = 1; i <= worker_num; i++)); do sleep 5 done -{{ command }} - +{{ srun_command_prefix }} --nodes=1 --ntasks=1 \ + -w "$head_node" --gpus-per-node=0 \ + conda activate {{ conda_env }} && \ + {{ command }} From 7b982ad9d97f328fb418a9a9958ae12f0d2a212f Mon Sep 17 00:00:00 2001 From: Rayyan Shahid Date: Wed, 12 Mar 2025 01:31:04 -0400 Subject: [PATCH 11/12] minor --- ...hello_world.toml => slurm_ray_container_vllm.toml} | 11 ++++------- conf/common/test_scenario/slurm_ray_container.toml | 2 +- .../slurm_ray_container/slurm_command_gen_strategy.py | 7 ++++++- .../slurm_ray_container/slurm_ray_container.py | 2 +- .../slurm_ray_container_template.sh.jinja | 6 +++--- 5 files changed, 15 insertions(+), 13 deletions(-) rename conf/common/test/{slurm_ray_container_hello_world.toml => slurm_ray_container_vllm.toml} (79%) diff --git a/conf/common/test/slurm_ray_container_hello_world.toml b/conf/common/test/slurm_ray_container_vllm.toml similarity index 79% rename from 
conf/common/test/slurm_ray_container_hello_world.toml rename to conf/common/test/slurm_ray_container_vllm.toml index 3e159dd48..ff3835393 100644 --- a/conf/common/test/slurm_ray_container_hello_world.toml +++ b/conf/common/test/slurm_ray_container_vllm.toml @@ -14,13 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -name = "slurm_ray_container_hello_world" -description = "Hello World" +name = "slurm_ray_container_vllm" +description = "Run example script with vLLM" test_template_name = "SlurmRayContainer" [cmd_args] -docker_image_url = "nvcr.io/nvidia/nemo:24.12.rc3" -conda_env = "nemo" -cmd = "bash -c 'echo Hello World'" - - +docker_image_url = "vllm/vllm-openai:latest" +cmd = "python3 examples/offline_inference/llm_engine_example.py -tp 8 -pp 2" diff --git a/conf/common/test_scenario/slurm_ray_container.toml b/conf/common/test_scenario/slurm_ray_container.toml index 1a30864bd..73e4506ff 100644 --- a/conf/common/test_scenario/slurm_ray_container.toml +++ b/conf/common/test_scenario/slurm_ray_container.toml @@ -18,5 +18,5 @@ name = "slurm_ray_container_example" [[Tests]] id = "Tests.1" -test_name = "slurm_ray_container_hello_world" +test_name = "slurm_ray_container_vllm" num_nodes = "2" diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py index 31bcfb591..0fa05ddfb 100644 --- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py @@ -64,10 +64,15 @@ def generate_test_command( template_path = script_dir / "slurm_ray_container_template.sh.jinja" template = Template(template_path.read_text()) + if tdef.cmd_args.conda_env: + conda_activate_command = f"conda activate {tdef.cmd_args.conda_env} && " + else: + conda_activate_command = "" + # render the template rendered_template = template.render( { - 
"conda_env": tdef.cmd_args.conda_env, + "conda_activate_command": conda_activate_command, "command": " ".join(command_parts), "srun_command_prefix": cmd_args["srun_command_prefix"], } diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py index 24f15176a..742aeb6b2 100644 --- a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py +++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py @@ -23,8 +23,8 @@ class SlurmRayContainerCmdArgs(CmdArgs): """Command line arguments for a generic Slurm container test.""" docker_image_url: str - conda_env: str cmd: str + conda_env: Optional[str] = None class SlurmRayContainerTestDefinition(TestDefinition): diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja index 57692f6fb..f22094f46 100644 --- a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja +++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja @@ -5,7 +5,7 @@ echo "IP Head: $ip_head" echo "Starting HEAD at $head_node" {{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$head_node" \ - conda activate {{ conda_env }} && \ + {{ conda_activate_command }} \ ray start --head --node-ip-address="$head_node_ip" --port=$port \ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block & @@ -19,7 +19,7 @@ for ((i = 1; i <= worker_num; i++)); do node_i=${nodes_array[$i]} echo "Starting WORKER $i at $node_i" {{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$node_i" \ - conda activate {{ conda_env }} && \ + {{ conda_activate_command }} \ ray start --address "$ip_head" \ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block & sleep 5 @@ -27,5 +27,5 @@ done {{ srun_command_prefix }} --nodes=1 --ntasks=1 \ -w "$head_node" --gpus-per-node=0 \ - conda 
activate {{ conda_env }} && \ + {{ conda_activate_command }} \ {{ command }} From 6c5e9dd9a5b1ea1d0d05291d1b6adaa7a963050a Mon Sep 17 00:00:00 2001 From: Rayyan Shahid Date: Wed, 12 Mar 2025 01:31:41 -0400 Subject: [PATCH 12/12] minor --- .../slurm_ray_container/slurm_command_gen_strategy.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py index 0fa05ddfb..e6419d0f4 100644 --- a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py @@ -64,10 +64,7 @@ def generate_test_command( template_path = script_dir / "slurm_ray_container_template.sh.jinja" template = Template(template_path.read_text()) - if tdef.cmd_args.conda_env: - conda_activate_command = f"conda activate {tdef.cmd_args.conda_env} && " - else: - conda_activate_command = "" + conda_activate_command = f"conda activate {tdef.cmd_args.conda_env} && " if tdef.cmd_args.conda_env else "" # render the template rendered_template = template.render(