From 8eb2549894762693aaba2a15daeaee45733394a9 Mon Sep 17 00:00:00 2001
From: Robert Clark
Date: Wed, 10 Feb 2021 08:42:18 -0600
Subject: [PATCH] Synchronize SSH keys

Bobber relies on SSH keys that are baked into the images to enable
multi-node communication. This forces users to build the image on one
machine, save the image locally, copy it to all remote nodes, and load
the copied image on those hosts. This process is long and tedious.
Replacing it with a key synchronization method makes it possible to run
the build on each host without copying images between nodes.

Signed-Off-By: Robert Clark
---
 README.md                       | 17 ++++++-------
 bobber/bin/sync-keys.sh         | 35 ++++++++++++++++++++++++++
 bobber/bobber.py                | 19 ++++++++++++++
 bobber/lib/constants.py         |  1 +
 bobber/lib/docker/management.py | 21 ++++++++++++++++
 bobber/lib/system/shell.py      | 44 +++++++++++++++++++++++++++++++++
 6 files changed, 128 insertions(+), 9 deletions(-)
 create mode 100755 bobber/bin/sync-keys.sh
 create mode 100644 bobber/lib/system/shell.py

diff --git a/README.md b/README.md
index 455419c..e449ec1 100644
--- a/README.md
+++ b/README.md
@@ -105,9 +105,8 @@ pip3 install https://github.com/NVIDIA/Bobber/releases/download/v6.1.1/nvidia_bo
 
 ## Build Bobber container (includes OSU Tests, NCCL Tests, fio, mdtest, DALI RN50 Pipeline, and the base NGC TensorFlow container)
 The Bobber application includes a built-in mechanism to build the Docker
-container where all tests will be run. This command should be run on a single
-system in the cluster as it will be copied in a future step. For single-node
-tests, run the command on the node to be tested.
+container where all tests will be run. Run the following command on all nodes
+that will be tested.
 
 ```bash
 $ bobber build
@@ -123,13 +122,13 @@ $ docker images | grep nvidia/bobber
 nvidia/bobber 6.1.1 c697a75ee482 36 minutes ago 12.4GB
 ```
 
-## Save container
+## Synchronize container keys
 Bobber relies on shared SSH keys to communicate between containers via MPI. This
-is done by generating an SSH key in the image during build time and using that
-same container on all hosts. This requires saving the image to a local tarball
-and transferring the image to all other nodes. The `export` command saves the
-image as a local tarball. Run the command on the node from the previous step
-where the Docker image is located.
+is done by generating an SSH key on a single node and copying that key to all
+other containers in the cluster. Bobber includes a tool to automatically
+synchronize keys amongst all containers on all nodes. To make the process more
+seamless, it is highly recommended to create passwordless SSH keys to
+communicate between the hosts in order to automatically copy the keys.
 
 If running on a single node, this step is not required.
 
diff --git a/bobber/bin/sync-keys.sh b/bobber/bin/sync-keys.sh
new file mode 100755
index 0000000..47fc568
--- /dev/null
+++ b/bobber/bin/sync-keys.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
+# Pass the list of hosts in as a string
+hosts=$1
+# Optionally pass a username to login to remote nodes
+user=$2
+
+# Generate a new RSA key locally for SSH to share across the cluster
+mkdir -p /tmp/bobber
+rm -f /tmp/bobber/*
+# Remove the local key on exit, even when a copy step fails under "set -e"
+trap 'rm -f /tmp/bobber/id_rsa*' EXIT
+ssh-keygen -t rsa -b 4096 -f /tmp/bobber/id_rsa -N ""
+
+echo "Copying keys to containers on all hosts"
+echo "For remote hosts, if passwordless-ssh is not configured, you will be prompted for the password for all nodes"
+
+# Always copy the key to the container on the local node
+docker cp /tmp/bobber/id_rsa bobber:/root/.ssh/id_rsa
+docker cp /tmp/bobber/id_rsa.pub bobber:/root/.ssh/authorized_keys
+
+# Copy the key to the container on every remote host
+for host in ${hosts//,/ }; do
+    # The local container was populated above; do not scp a directory onto itself
+    if [[ "$host" == "localhost" ]]; then
+        continue
+    fi
+    if [ -n "$user" ]; then
+        target="$user@$host"
+    else
+        target="$host"
+    fi
+    scp -r /tmp/bobber "$target":/tmp/
+    ssh "$target" 'docker cp /tmp/bobber/id_rsa bobber:/root/.ssh/id_rsa && docker cp /tmp/bobber/id_rsa.pub bobber:/root/.ssh/authorized_keys && rm /tmp/bobber/id_rsa*'
+done
diff --git a/bobber/bobber.py b/bobber/bobber.py
index fdf3c48..eb1ec0e 100644
--- a/bobber/bobber.py
+++ b/bobber/bobber.py
@@ -20,10 +20,12 @@
     RUN_STG_BW,
     RUN_STG_IOPS,
     RUN_STG_META,
+    SYNC,
     SYSTEMS
 )
 from bobber.lib.analysis import parse_results
 from bobber.lib.system.file_handler import create_directory
+from bobber.lib.system.shell import copy_keys
 from bobber.lib.tests import run_tests
 from typing import NoReturn
 
@@ -227,6 +229,21 @@ def parse_args(version: str) -> Namespace:
                                'binary')
     load.add_argument('filename', help='Filename of local *.tar file of '
                       'the image to load')
+
+    # Options specific to synchronizing SSH keys in containers
+    sync = commands.add_parser(SYNC, help='Create SSH keys and add them to all'
+                               ' Bobber containers in a cluster. Requires the '
+                               'container to be running on all nodes using '
+                               '"bobber cast".')
+    sync.add_argument('--hosts', help='A comma-separated list of hostnames or '
+                      'IP address of the nodes to add SSH keys to. Required '
+                      'for multi-node tests. If left empty, it is assumed '
+                      'that keys should only be copied to the container on '
+                      'the local node.', type=str, default='')
+    sync.add_argument('--user', help='Optionally specify a user to use to '
+                      'login to remote hosts to copy keys to containers. If '
+                      'left blank, will use the currently logged-in user.',
+                      type=str, default='')
     return parser.parse_args()
 
 
@@ -360,6 +377,8 @@ def execute_command(args: Namespace, version: str) -> NoReturn:
         bobber.lib.docker.cast(args.storage_path, args.ignore_gpu, version)
     elif args.command == LOAD:
         bobber.lib.docker.load(args.filename)
+    elif args.command == SYNC:
+        copy_keys(args.hosts, args.user)
     else:
         # Update the version to be used in filenames
         version_underscore = version.replace('.', '_')
diff --git a/bobber/lib/constants.py b/bobber/lib/constants.py
index 7793e8b..b7458b7 100644
--- a/bobber/lib/constants.py
+++ b/bobber/lib/constants.py
@@ -10,6 +10,7 @@
 RUN_STG_BW = 'run-stg-bw'
 RUN_STG_IOPS = 'run-stg-iops'
 RUN_STG_META = 'run-stg-meta'
+SYNC = 'sync'
 
 DGX_A100_SINGLE = {
     'gpus': 8,
diff --git a/bobber/lib/docker/management.py b/bobber/lib/docker/management.py
index c8ad269..d333538 100644
--- a/bobber/lib/docker/management.py
+++ b/bobber/lib/docker/management.py
@@ -234,3 +234,24 @@ def execute(self, command: str, environment: Optional[dict] = None,
                     print(result.output)
                 except StopIteration:
                     break
+
+    def running(self) -> bool:
+        """
+        Determine if the Bobber container is running on the system.
+
+        Look up the container named 'bobber' through the Docker client and
+        report whether it is currently running. This method can be used to
+        determine whether or not to run a command that depends on the container
+        being launched.
+
+        Returns
+        -------
+        boolean
+            Returns `True` when the container is running and `False` when not.
+        """
+        try:
+            container = self.client.containers.get('bobber')
+        except docker.errors.NotFound:
+            return False
+        # A stopped or exited container is still found, so check its state
+        return container.status == 'running'
diff --git a/bobber/lib/system/shell.py b/bobber/lib/system/shell.py
new file mode 100644
index 0000000..f00c307
--- /dev/null
+++ b/bobber/lib/system/shell.py
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: MIT
+import subprocess
+import sys
+from bobber.lib.docker import manager
+from pathlib import Path
+
+# Resolve the bundled script relative to this file (bobber/lib/system) so
+# "bobber sync" does not depend on the current working directory
+SYNC_SCRIPT = Path(__file__).resolve().parents[2] / 'bin' / 'sync-keys.sh'
+
+
+def copy_keys(hosts: str, user: str) -> None:
+    """
+    Generate and copy SSH keys to all hosts.
+
+    Launch a shell script included with the package which generates a local SSH
+    key that is copied to all Bobber containers on all nodes to allow
+    passwordless communication for MPI. The process exits with a non-zero
+    status if the local container is not running or the script fails.
+
+    Parameters
+    ----------
+    hosts : string
+        A comma-separated list as a ``string`` representing all hosts, such as
+        'host1,host2,host3,...'.
+    user : string
+        A ``string`` of the user to use to login to remote hosts as, if
+        necessary.
+    """
+    if not manager.running():
+        print('Bobber container is not running. Please ensure Bobber is '
+              'running on all nodes using the "bobber cast" command before '
+              'running "bobber sync".')
+        sys.exit(-1)
+    try:
+        subprocess.run([str(SYNC_SCRIPT), hosts, user], check=True)
+    except FileNotFoundError:
+        print('Unable to find the key synchronization script at',
+              SYNC_SCRIPT)
+        sys.exit(-1)
+    except subprocess.CalledProcessError:
+        print('Error synchronizing keys. See output from the sync script '
+              'above.')
+        sys.exit(-1)