diff --git a/README.md b/README.md index 455419c..e449ec1 100644 --- a/README.md +++ b/README.md @@ -105,9 +105,8 @@ pip3 install https://github.com/NVIDIA/Bobber/releases/download/v6.1.1/nvidia_bo ## Build Bobber container (includes OSU Tests, NCCL Tests, fio, mdtest, DALI RN50 Pipeline, and the base NGC TensorFlow container) The Bobber application includes a built-in mechanism to build the Docker -container where all tests will be run. This command should be run on a single -system in the cluster as it will be copied in a future step. For single-node -tests, run the command on the node to be tested. +container where all tests will be run. Run the following command on all nodes +that will be tested. ```bash $ bobber build @@ -123,13 +122,13 @@ $ docker images | grep nvidia/bobber nvidia/bobber 6.1.1 c697a75ee482 36 minutes ago 12.4GB ``` -## Save container +## Synchronize container keys Bobber relies on shared SSH keys to communicate between containers via MPI. This -is done by generating an SSH key in the image during build time and using that -same container on all hosts. This requires saving the image to a local tarball -and transferring the image to all other nodes. The `export` command saves the -image as a local tarball. Run the command on the node from the previous step -where the Docker image is located. +is done by generating an SSH key on a single node and copying that key to all +other containers in the cluster. Bobber includes a tool to automatically +synchronize keys amongst all containers on all nodes. To make the process more +seamless, it is highly recommended to create passwordless SSH keys to +communicate between the hosts in order to automatically copy the keys. If running on a single node, this step is not required. 
#!/bin/bash
# Generate a fresh SSH key pair and install it in the "bobber" container on
# every requested host so the containers can reach each other over SSH.
#
# Usage: sync-keys.sh [HOSTS] [USER]
#   HOSTS  Comma-separated list of hosts. When empty, the key is installed in
#          the local container only. A "localhost" entry is handled with a
#          plain local "docker cp" instead of going through scp/ssh.
#   USER   Optional user name used to log in to the remote hosts.
set -e

# Pass the list of hosts in as a string
hosts=$1
# Optionally pass a username to login to remote nodes
user=$2

# Remove the generated key material from this machine however the script
# ends. With "set -e" a failing scp/ssh aborts the script immediately, which
# would otherwise leave the private key behind in /tmp.
trap 'rm -f /tmp/bobber/id_rsa*' EXIT

# Commands executed on a remote host after the key pair has been copied there:
# install the keys in the container, then delete the copied files.
remote_install='docker cp /tmp/bobber/id_rsa bobber:/root/.ssh/id_rsa && docker cp /tmp/bobber/id_rsa.pub bobber:/root/.ssh/authorized_keys && rm /tmp/bobber/id_rsa*'

# Install the key pair in the container running on this machine.
install_local() {
    docker cp /tmp/bobber/id_rsa bobber:/root/.ssh/id_rsa
    docker cp /tmp/bobber/id_rsa.pub bobber:/root/.ssh/authorized_keys
}

# Generate a new RSA key locally for SSH to share across the cluster
mkdir -p /tmp/bobber
rm -f /tmp/bobber/*
ssh-keygen -t rsa -b 4096 -f /tmp/bobber/id_rsa -N ""

echo "Copying keys to containers on all hosts"
echo "For remote hosts, if passwordless-ssh is not configured, you will be prompted for the password for all nodes"

# No hosts given: only the local container needs the key.
if [[ -z "$hosts" ]]; then
    install_local
fi

# Copy the key to the container on every listed host
for host in ${hosts//,/ }; do
    # The comparison needs spaces around "==": written as $host=="localhost"
    # the test is a single non-empty word and therefore always true.
    if [[ "$host" == "localhost" ]]; then
        # Going through scp/ssh to ourselves is unnecessary, and the remote
        # cleanup would delete the local key before later hosts are served.
        install_local
        continue
    fi

    if [[ -n "$user" ]]; then
        target="$user@$host"
    else
        target="$host"
    fi

    scp -r /tmp/bobber "$target":/tmp/
    ssh "$target" "$remote_install"
done
def running(self) -> bool:
    """
    Determine if the Bobber container is running on the system.

    Look up the container named 'bobber' on the local Docker daemon and
    report whether it is currently in the 'running' state. This method can
    be used to determine whether or not to run a command that depends on
    the container being launched.

    Returns
    -------
    boolean
        Returns `True` when the container exists and its status is
        'running'. Returns `False` when the container does not exist or is
        in any other state (for example 'created' or 'exited').
    """
    try:
        container = self.client.containers.get('bobber')
    except docker.errors.NotFound:
        return False
    # containers.get() also returns stopped containers, so the lookup
    # succeeding is not enough - the reported state has to be checked too.
    return container.status == 'running'
def _sync_script_path() -> str:
    """Return the path of the sync-keys.sh script to execute."""
    # Imported locally so the module's top-level imports stay untouched.
    from pathlib import Path

    # This module lives in bobber/lib/system/, the script in bobber/bin/, so
    # the package root is three directories up from this file.
    package_root = Path(__file__).resolve().parent.parent.parent
    packaged = package_root / 'bin' / 'sync-keys.sh'
    if packaged.is_file():
        return str(packaged)
    # Fall back to the path relative to the current working directory, which
    # works when the command is launched from the repository root.
    return 'bobber/bin/sync-keys.sh'


def copy_keys(hosts: str, user: str) -> NoReturn:
    """
    Generate and copy SSH keys to all hosts.

    Launch a shell script included with the package which generates a local SSH
    key that is copied to all Bobber containers on all nodes to allow
    passwordless communication for MPI. The process exits with a non-zero
    status if the local Bobber container is not running, if the script
    cannot be launched, or if the script itself fails.

    Parameters
    ----------
    hosts : string
        A comma-separated list as a ``string`` representing all hosts, such as
        'host1,host2,host3,...'.
    user : string
        A ``string`` of the user to use to login to remote hosts as, if
        necessary.
    """
    if not manager.running():
        print('Bobber container is not running. Please ensure Bobber is '
              'running on all nodes using the "bobber cast" command before '
              'running "bobber sync".')
        sys.exit(-1)
    try:
        subprocess.run([_sync_script_path(), hosts, user], check=True)
    except subprocess.CalledProcessError:
        print('Error synchronizing keys. See output from the sync script '
              'above.')
        sys.exit(-1)
    except OSError as error:
        # Raised when the script is missing or not executable; report it
        # the same way as a script failure instead of showing a traceback.
        print('Unable to launch the key synchronization script: '
              '{}'.format(error))
        sys.exit(-1)