From 6e33fc6d2e8856468b6b7ff397025f9d34c6e488 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 11:11:48 -0800 Subject: [PATCH 01/20] bump Ray version --- .ci/docker/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 086633cf043..d9e7b338cfd 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -32,7 +32,7 @@ bs4 awscliv2==2.1.1 flask spacy==3.4.1 -ray[tune]==2.7.2 +ray[tune]==2.52.1 tensorboard jinja2==3.1.3 pytorch-lightning From bededb95ec9700067cd9805f0174714c85ded5ac Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 15:01:34 -0800 Subject: [PATCH 02/20] Ignore more data stuff --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 3f1f927ee33..ea478ca180d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,8 @@ beginner_source/hymenoptera_data/ intermediate_source/data/ *.zip MNIST/ +data/cifar-10-batches-py/* +*.tar.gz #builds _build/ @@ -132,3 +134,4 @@ dictionary.dic # linters /.lintbin + From 352ad9b3ddae3dd142dbf5fb09158c601c4d3403 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 15:20:25 -0800 Subject: [PATCH 03/20] Update Ray Tune tutorial to use new API and improve formatting --- .../hyperparameter_tuning_tutorial.py | 448 ++++++++++-------- 1 file changed, 240 insertions(+), 208 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index dd3fe65699e..e3f1b15f1a9 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -1,44 +1,51 @@ -# -*- coding: utf-8 -*- """ Hyperparameter tuning with Ray Tune =================================== -Hyperparameter tuning can make the difference between an average model and a highly -accurate one. Often simple things like choosing a different learning rate or changing -a network layer size can have a dramatic impact on your model performance. +Hyperparameter tuning can make the difference between an average model +and a highly accurate one. Often simple things like choosing a different +learning rate or changing a network layer size can have a dramatic +impact on your model performance. -Fortunately, there are tools that help with finding the best combination of parameters. -`Ray Tune `_ is an industry standard tool for -distributed hyperparameter tuning. Ray Tune includes the latest hyperparameter search -algorithms, integrates with various analysis libraries, and natively -supports distributed training through `Ray's distributed machine learning engine -`_. +Fortunately, there are tools that help with finding the best combination +of parameters. `Ray Tune `__ is +an industry standard tool for distributed hyperparameter tuning. Ray +Tune includes the latest hyperparameter search algorithms, integrates +with various analysis libraries, and natively supports distributed +training through `Ray’s distributed machine learning +engine `__. -In this tutorial, we will show you how to integrate Ray Tune into your PyTorch -training workflow. We will extend `this tutorial from the PyTorch documentation -`_ for training -a CIFAR10 image classifier. +In this tutorial, we will show you how to integrate Ray Tune into your +PyTorch training workflow. We will extend `this tutorial from the +PyTorch +documentation `__ +for training a CIFAR10 image classifier. 
-As you will see, we only need to add some slight modifications. In particular, we -need to +As you will see, we only need to add some slight modifications. In +particular, we need to 1. wrap data loading and training in functions, 2. make some network parameters configurable, 3. add checkpointing (optional), 4. and define the search space for the model tuning -| +| To run this tutorial, please make sure the following packages are installed: -- ``ray[tune]``: Distributed hyperparameter tuning library -- ``torchvision``: For the data transformers +- ``ray[tune]``: Distributed hyperparameter tuning library +- ``torchvision``: For the data transformers Setup / Imports --------------- -Let's start with the imports: + +Let’s start with the imports: + """ + +# %matplotlib inline + from functools import partial import os import tempfile @@ -51,28 +58,29 @@ import torchvision import torchvision.transforms as transforms # sphinx_gallery_start_ignore -# Fixes ``AttributeError: '_LoggingTee' object has no attribute 'fileno'``. +# Fixes `AttributeError: '_LoggingTee' object has no attribute 'fileno'`. # This is only needed to run with sphinx-build. import sys if not hasattr(sys.stdout, "encoding"): sys.stdout.encoding = "latin1" sys.stdout.fileno = lambda: 0 # sphinx_gallery_end_ignore +import ray from ray import tune -from ray import train -from ray.train import Checkpoint, get_checkpoint +from ray.tune import Checkpoint from ray.tune.schedulers import ASHAScheduler import ray.cloudpickle as pickle ###################################################################### -# Most of the imports are needed for building the PyTorch model. Only the last -# imports are for Ray Tune. +# Most of the imports are needed for building the PyTorch model. Only the +# last imports are for Ray Tune. # # Data loaders # ------------ -# We wrap the data loaders in their own function and pass a global data directory. -# This way we can share a data directory between different trials. - +# +# We wrap the data loaders in their own function and pass a global data +# directory. This way we can share a data directory between different +# trials. def load_data(data_dir="./data"): transform = transforms.Compose( @@ -89,14 +97,12 @@ def load_data(data_dir="./data"): return trainset, testset - ###################################################################### # Configurable neural network # --------------------------- -# We can only tune those parameters that are configurable. -# In this example, we can specify -# the layer sizes of the fully connected layers: - +# +# We can only tune those parameters that are configurable. In this +# example, we can specify the layer sizes of the fully connected layers: class Net(nn.Module): def __init__(self, l1=120, l2=84): @@ -117,76 +123,82 @@ def forward(self, x): x = self.fc3(x) return x - ###################################################################### # The train function # ------------------ -# Now it gets interesting, because we introduce some changes to the example `from the PyTorch -# documentation `_. -# -# We wrap the training script in a function ``train_cifar(config, data_dir=None)``. -# The ``config`` parameter will receive the hyperparameters we would like to -# train with. The ``data_dir`` specifies the directory where we load and store the data, -# so that multiple runs can share the same data source. -# We also load the model and optimizer state at the start of the run, if a checkpoint -# is provided. 
Further down in this tutorial you will find information on how +# +# Now it gets interesting, because we introduce some changes to the +# example `from the PyTorch +# documentation `__. +# +# We wrap the training script in a function +# ``train_cifar(config, data_dir=None)``. The ``config`` parameter will +# receive the hyperparameters we would like to train with. The +# ``data_dir`` specifies the directory where we load and store the data, +# so that multiple runs can share the same data source. We also load the +# model and optimizer state at the start of the run, if a checkpoint is +# provided. Further down in this tutorial you will find information on how # to save the checkpoint and what it is used for. # # .. code-block:: python # -# net = Net(config["l1"], config["l2"]) +# net = Net(config["l1"], config["l2"]) # -# checkpoint = get_checkpoint() -# if checkpoint: -# with checkpoint.as_directory() as checkpoint_dir: -# data_path = Path(checkpoint_dir) / "data.pkl" -# with open(data_path, "rb") as fp: -# checkpoint_state = pickle.load(fp) -# start_epoch = checkpoint_state["epoch"] -# net.load_state_dict(checkpoint_state["net_state_dict"]) -# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) -# else: -# start_epoch = 0 +# checkpoint = tune.get_checkpoint() +# if checkpoint: +# with checkpoint.as_directory() as checkpoint_dir: +# data_path = Path(checkpoint_dir) / "data.pkl" +# with open(data_path, "rb") as fp: +# checkpoint_state = pickle.load(fp) +# start_epoch = checkpoint_state["epoch"] +# net.load_state_dict(checkpoint_state["net_state_dict"]) +# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) +# else: +# start_epoch = 0 # # The learning rate of the optimizer is made configurable, too: # # .. code-block:: python # -# optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) +# optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) # -# We also split the training data into a training and validation subset. We thus train on -# 80% of the data and calculate the validation loss on the remaining 20%. The batch sizes -# with which we iterate through the training and test sets are configurable as well. +# We also split the training data into a training and validation subset. +# We thus train on 80% of the data and calculate the validation loss on +# the remaining 20%. The batch sizes with which we iterate through the +# training and test sets are configurable as well. # # Adding (multi) GPU support with DataParallel # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Image classification benefits largely from GPUs. Luckily, we can continue to use -# PyTorch's abstractions in Ray Tune. Thus, we can wrap our model in ``nn.DataParallel`` -# to support data parallel training on multiple GPUs: +# +# Image classification benefits largely from GPUs. Luckily, we can +# continue to use PyTorch’s abstractions in Ray Tune. Thus, we can wrap +# our model in ``nn.DataParallel`` to support data parallel training on +# multiple GPUs: # # .. code-block:: python # -# device = "cpu" -# if torch.cuda.is_available(): -# device = "cuda:0" -# if torch.cuda.device_count() > 1: -# net = nn.DataParallel(net) -# net.to(device) +# device = "cpu" +# if torch.cuda.is_available(): +# device = "cuda:0" +# if torch.cuda.device_count() > 1: +# net = nn.DataParallel(net) +# net.to(device) # -# By using a ``device`` variable we make sure that training also works when we have -# no GPUs available. 
PyTorch requires us to send our data to the GPU memory explicitly, -# like this: +# By using a ``device`` variable we make sure that training also works +# when we have no GPUs available. PyTorch requires us to send our data to +# the GPU memory explicitly, like this: # # .. code-block:: python # -# for i, data in enumerate(trainloader, 0): -# inputs, labels = data -# inputs, labels = inputs.to(device), labels.to(device) +# for i, data in enumerate(trainloader, 0): +# inputs, labels = data +# inputs, labels = inputs.to(device), labels.to(device) # -# The code now supports training on CPUs, on a single GPU, and on multiple GPUs. Notably, Ray -# also supports `fractional GPUs `_ -# so we can share GPUs among trials, as long as the model still fits on the GPU memory. We'll come back -# to that later. +# The code now supports training on CPUs, on a single GPU, and on multiple +# GPUs. Notably, Ray also supports `fractional +# GPUs `__ +# so we can share GPUs among trials, as long as the model still fits on +# the GPU memory. We’ll come back to that later. # # Communicating with Ray Tune # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -195,41 +207,42 @@ def forward(self, x): # # .. code-block:: python # -# checkpoint_data = { -# "epoch": epoch, -# "net_state_dict": net.state_dict(), -# "optimizer_state_dict": optimizer.state_dict(), -# } -# with tempfile.TemporaryDirectory() as checkpoint_dir: -# data_path = Path(checkpoint_dir) / "data.pkl" -# with open(data_path, "wb") as fp: -# pickle.dump(checkpoint_data, fp) -# -# checkpoint = Checkpoint.from_directory(checkpoint_dir) -# train.report( -# {"loss": val_loss / val_steps, "accuracy": correct / total}, -# checkpoint=checkpoint, -# ) -# -# Here we first save a checkpoint and then report some metrics back to Ray Tune. Specifically, -# we send the validation loss and accuracy back to Ray Tune. Ray Tune can then use these metrics -# to decide which hyperparameter configuration lead to the best results. These metrics -# can also be used to stop bad performing trials early in order to avoid wasting -# resources on those trials. -# -# The checkpoint saving is optional, however, it is necessary if we wanted to use advanced -# schedulers like -# `Population Based Training `_. -# Also, by saving the checkpoint we can later load the trained models and validate them -# on a test set. Lastly, saving checkpoints is useful for fault tolerance, and it allows -# us to interrupt training and continue training later. +# checkpoint_data = { +# "epoch": epoch, +# "net_state_dict": net.state_dict(), +# "optimizer_state_dict": optimizer.state_dict(), +# } +# with tempfile.TemporaryDirectory() as checkpoint_dir: +# data_path = Path(checkpoint_dir) / "data.pkl" +# with open(data_path, "wb") as fp: +# pickle.dump(checkpoint_data, fp) +# +# checkpoint = Checkpoint.from_directory(checkpoint_dir) +# tune.report( +# {"loss": val_loss / val_steps, "accuracy": correct / total}, +# checkpoint=checkpoint, +# ) +# +# Here we first save a checkpoint and then report some metrics back to Ray +# Tune. Specifically, we send the validation loss and accuracy back to Ray +# Tune. Ray Tune can then use these metrics to decide which hyperparameter +# configuration lead to the best results. These metrics can also be used +# to stop bad performing trials early in order to avoid wasting resources +# on those trials. +# +# The checkpoint saving is optional, however, it is necessary if we wanted +# to use advanced schedulers like `Population Based +# Training `__. 
+# Also, by saving the checkpoint we can later load the trained models and +# validate them on a test set. Lastly, saving checkpoints is useful for +# fault tolerance, and it allows us to interrupt training and continue +# training later. # # Full training function # ~~~~~~~~~~~~~~~~~~~~~~ # # The full code example looks like this: - def train_cifar(config, data_dir=None): net = Net(config["l1"], config["l2"]) @@ -243,7 +256,7 @@ def train_cifar(config, data_dir=None): criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) - checkpoint = get_checkpoint() + checkpoint = tune.get_checkpoint() if checkpoint: with checkpoint.as_directory() as checkpoint_dir: data_path = Path(checkpoint_dir) / "data.pkl" @@ -263,10 +276,10 @@ def train_cifar(config, data_dir=None): ) trainloader = torch.utils.data.DataLoader( - train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8 + train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=2 ) valloader = torch.utils.data.DataLoader( - val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8 + val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=2 ) for epoch in range(start_epoch, 10): # loop over the dataset multiple times @@ -326,23 +339,23 @@ def train_cifar(config, data_dir=None): pickle.dump(checkpoint_data, fp) checkpoint = Checkpoint.from_directory(checkpoint_dir) - train.report( + tune.report( {"loss": val_loss / val_steps, "accuracy": correct / total}, checkpoint=checkpoint, ) print("Finished Training") - ###################################################################### -# As you can see, most of the code is adapted directly from the original example. +# As you can see, most of the code is adapted directly from the original +# example. # # Test set accuracy # ----------------- -# Commonly the performance of a machine learning model is tested on a hold-out test -# set with data that has not been used for training the model. We also wrap this in a -# function: - +# +# Commonly the performance of a machine learning model is tested on a +# hold-out test set with data that has not been used for training the +# model. We also wrap this in a function: def test_accuracy(net, device="cpu"): trainset, testset = load_data() @@ -364,69 +377,83 @@ def test_accuracy(net, device="cpu"): return correct / total - ###################################################################### -# The function also expects a ``device`` parameter, so we can do the -# test set validation on a GPU. +# The function also expects a ``device`` parameter, so we can do the test +# set validation on a GPU. # # Configuring the search space # ---------------------------- -# Lastly, we need to define Ray Tune's search space. Here is an example: -# -# .. code-block:: python # -# config = { -# "l1": tune.choice([2 ** i for i in range(9)]), -# "l2": tune.choice([2 ** i for i in range(9)]), -# "lr": tune.loguniform(1e-4, 1e-1), -# "batch_size": tune.choice([2, 4, 8, 16]) -# } +# Lastly, we need to define Ray Tune’s search space. Here is an example: # -# The ``tune.choice()`` accepts a list of values that are uniformly sampled from. -# In this example, the ``l1`` and ``l2`` parameters -# should be powers of 2 between 4 and 256, so either 4, 8, 16, 32, 64, 128, or 256. -# The ``lr`` (learning rate) should be uniformly sampled between 0.0001 and 0.1. Lastly, -# the batch size is a choice between 2, 4, 8, and 16. 
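#
# Note that Ray Tune also provides other sampling primitives besides
# ``tune.choice()`` and ``tune.loguniform()``, such as ``tune.uniform()``,
# ``tune.randint()``, and ``tune.grid_search()``. The following sketch is
# illustrative only and is not used in the rest of this tutorial; the
# ``num_epochs`` key is a hypothetical parameter included just to show an
# integer range:
#
# .. code-block:: python
#
#    alt_config = {
#        "l1": tune.grid_search([64, 128, 256]),  # try every listed value
#        "lr": tune.uniform(1e-4, 1e-1),          # uniform instead of log-uniform
#        "num_epochs": tune.randint(1, 10),       # hypothetical integer parameter
#        "batch_size": tune.choice([2, 4, 8, 16]),
#    }
#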
-# -# At each trial, Ray Tune will now randomly sample a combination of parameters from these -# search spaces. It will then train a number of models in parallel and find the best -# performing one among these. We also use the ``ASHAScheduler`` which will terminate bad -# performing trials early. +# .. code-block:: python # -# We wrap the ``train_cifar`` function with ``functools.partial`` to set the constant -# ``data_dir`` parameter. We can also tell Ray Tune what resources should be -# available for each trial: +# config = { +# "l1": tune.choice([2 ** i for i in range(9)]), +# "l2": tune.choice([2 ** i for i in range(9)]), +# "lr": tune.loguniform(1e-4, 1e-1), +# "batch_size": tune.choice([2, 4, 8, 16]) +# } +# +# The ``tune.choice()`` accepts a list of values that are uniformly +# sampled from. In this example, the ``l1`` and ``l2`` parameters should +# be powers of 2 between 4 and 256, so either 4, 8, 16, 32, 64, 128, or +# 256. The ``lr`` (learning rate) should be uniformly sampled between +# 0.0001 and 0.1. Lastly, the batch size is a choice between 2, 4, 8, and +# 16. +# +# At each trial, Ray Tune will now randomly sample a combination of +# parameters from these search spaces. It will then train a number of +# models in parallel and find the best performing one among these. We also +# use the ``ASHAScheduler`` which will terminate bad performing trials +# early. +# +# We wrap the ``train_cifar`` function with ``functools.partial`` to set +# the constant ``data_dir`` parameter. We can also tell Ray Tune what +# resources should be available for each trial using +# ``tune.with_resources``: # # .. code-block:: python # -# gpus_per_trial = 2 -# # ... -# result = tune.run( -# partial(train_cifar, data_dir=data_dir), -# resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, -# config=config, -# num_samples=num_samples, -# scheduler=scheduler, -# checkpoint_at_end=True) -# -# You can specify the number of CPUs, which are then available e.g. -# to increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. The selected -# number of GPUs are made visible to PyTorch in each trial. Trials do not have access to -# GPUs that haven't been requested for them - so you don't have to care about two trials -# using the same set of resources. -# -# Here we can also specify fractional GPUs, so something like ``gpus_per_trial=0.5`` is -# completely valid. The trials will then share GPUs among each other. -# You just have to make sure that the models still fit in the GPU memory. -# -# After training the models, we will find the best performing one and load the trained -# network from the checkpoint file. We then obtain the test set accuracy and report -# everything by printing. +# gpus_per_trial = 2 +# # ... +# tuner = tune.Tuner( +# tune.with_resources( +# partial(train_cifar, data_dir=data_dir), +# resources={"cpu": 8, "gpu": gpus_per_trial} +# ), +# tune_config=tune.TuneConfig( +# metric="loss", +# mode="min", +# scheduler=scheduler, +# num_samples=num_samples, +# ), +# param_space=config, +# ) +# results = tuner.fit() +# +# You can specify the number of CPUs, which are then available e.g. to +# increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. +# The selected number of GPUs are made visible to PyTorch in each trial. +# Trials do not have access to GPUs that haven’t been requested for them - +# so you don’t have to care about two trials using the same set of +# resources. 
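#
# For illustration, a resource request that packs two trials onto one GPU
# could look like the sketch below, reusing the ``train_cifar`` and
# ``data_dir`` names from this tutorial; treat the exact numbers as an
# example rather than a recommendation:
#
# .. code-block:: python
#
#    trainable_with_resources = tune.with_resources(
#        partial(train_cifar, data_dir=data_dir),
#        resources={"cpu": 2, "gpu": 0.5},  # two trials can share one GPU
#    )
#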
+# +# Here we can also specify fractional GPUs, so something like +# ``gpus_per_trial=0.5`` is completely valid. The trials will then share +# GPUs among each other. You just have to make sure that the models still +# fit in the GPU memory. +# +# After training the models, we will find the best performing one and load +# the trained network from the checkpoint file. We then obtain the test +# set accuracy and report everything by printing. # # The full main function looks like this: - -def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): +def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): + print("Starting hyperparameter tuning.") + ray.init(include_dashboard=False, runtime_env={"RAY_enable_open_telemetry": "0"}) + data_dir = os.path.abspath("./data") load_data(data_dir) config = { @@ -436,26 +463,32 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): "batch_size": tune.choice([2, 4, 8, 16]), } scheduler = ASHAScheduler( - metric="loss", - mode="min", max_t=max_num_epochs, grace_period=1, reduction_factor=2, ) - result = tune.run( - partial(train_cifar, data_dir=data_dir), - resources_per_trial={"cpu": 2, "gpu": gpus_per_trial}, - config=config, - num_samples=num_samples, - scheduler=scheduler, + + tuner = tune.Tuner( + tune.with_resources( + partial(train_cifar, data_dir=data_dir), + resources={"cpu": 2, "gpu": gpus_per_trial} + ), + tune_config=tune.TuneConfig( + metric="loss", + mode="min", + scheduler=scheduler, + num_samples=num_trials, + ), + param_space=config, ) + results = tuner.fit() - best_trial = result.get_best_trial("loss", "min", "last") - print(f"Best trial config: {best_trial.config}") - print(f"Best trial final validation loss: {best_trial.last_result['loss']}") - print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}") + best_result = results.get_best_result("loss", "min") + print(f"Best trial config: {best_result.config}") + print(f"Best trial final validation loss: {best_result.metrics['loss']}") + print(f"Best trial final validation accuracy: {best_result.metrics['accuracy']}") - best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"]) + best_trained_model = Net(best_result.config["l1"], best_result.config["l2"]) device = "cpu" if torch.cuda.is_available(): device = "cuda:0" @@ -463,7 +496,7 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): best_trained_model = nn.DataParallel(best_trained_model) best_trained_model.to(device) - best_checkpoint = result.get_best_checkpoint(trial=best_trial, metric="accuracy", mode="max") + best_checkpoint = best_result.checkpoint with best_checkpoint.as_directory() as checkpoint_dir: data_path = Path(checkpoint_dir) / "data.pkl" with open(data_path, "rb") as fp: @@ -476,37 +509,36 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): if __name__ == "__main__": # You can change the number of GPUs per trial here: - main(num_samples=10, max_num_epochs=10, gpus_per_trial=0) - + main(num_trials=1, max_num_epochs=1, gpus_per_trial=0) ###################################################################### # If you run the code, an example output could look like this: # -# .. code-block:: sh -# -# Number of trials: 10/10 (10 TERMINATED) -# +-----+--------------+------+------+-------------+--------+---------+------------+ -# | ... | batch_size | l1 | l2 | lr | iter | loss | accuracy | -# |-----+--------------+------+------+-------------+--------+---------+------------| -# | ... | 2 | 1 | 256 | 0.000668163 | 1 | 2.31479 | 0.0977 | -# | ... 
| 4 | 64 | 8 | 0.0331514 | 1 | 2.31605 | 0.0983 | -# | ... | 4 | 2 | 1 | 0.000150295 | 1 | 2.30755 | 0.1023 | -# | ... | 16 | 32 | 32 | 0.0128248 | 10 | 1.66912 | 0.4391 | -# | ... | 4 | 8 | 128 | 0.00464561 | 2 | 1.7316 | 0.3463 | -# | ... | 8 | 256 | 8 | 0.00031556 | 1 | 2.19409 | 0.1736 | -# | ... | 4 | 16 | 256 | 0.00574329 | 2 | 1.85679 | 0.3368 | -# | ... | 8 | 2 | 2 | 0.00325652 | 1 | 2.30272 | 0.0984 | -# | ... | 2 | 2 | 2 | 0.000342987 | 2 | 1.76044 | 0.292 | -# | ... | 4 | 64 | 32 | 0.003734 | 8 | 1.53101 | 0.4761 | -# +-----+--------------+------+------+-------------+--------+---------+------------+ -# -# Best trial config: {'l1': 64, 'l2': 32, 'lr': 0.0037339984519545164, 'batch_size': 4} -# Best trial final validation loss: 1.5310075663924216 -# Best trial final validation accuracy: 0.4761 -# Best trial test set accuracy: 0.4737 +# .. code-block:: text +# +# Number of trials: 10/10 (10 TERMINATED) +# +-----+--------------+------+------+-------------+--------+---------+------------+ +# | ... | batch_size | l1 | l2 | lr | iter | loss | accuracy | +# |-----+--------------+------+------+-------------+--------+---------+------------| +# | ... | 2 | 1 | 256 | 0.000668163 | 1 | 2.31479 | 0.0977 | +# | ... | 4 | 64 | 8 | 0.0331514 | 1 | 2.31605 | 0.0983 | +# | ... | 4 | 2 | 1 | 0.000150295 | 1 | 2.30755 | 0.1023 | +# | ... | 16 | 32 | 32 | 0.0128248 | 10 | 1.66912 | 0.4391 | +# | ... | 4 | 8 | 128 | 0.00464561 | 2 | 1.7316 | 0.3463 | +# | ... | 8 | 256 | 8 | 0.00031556 | 1 | 2.19409 | 0.1736 | +# | ... | 4 | 16 | 256 | 0.00574329 | 2 | 1.85679 | 0.3368 | +# | ... | 8 | 2 | 2 | 0.00325652 | 1 | 2.30272 | 0.0984 | +# | ... | 2 | 2 | 2 | 0.000342987 | 2 | 1.76044 | 0.292 | +# | ... | 4 | 64 | 32 | 0.003734 | 8 | 1.53101 | 0.4761 | +# +-----+--------------+------+------+-------------+--------+---------+------------+ +# +# Best trial config: {'l1': 64, 'l2': 32, 'lr': 0.0037339984519545164, 'batch_size': 4} +# Best trial final validation loss: 1.5310075663924216 +# Best trial final validation accuracy: 0.4761 +# Best trial test set accuracy: 0.4737 # # Most trials have been stopped early in order to avoid wasting resources. -# The best performing trial achieved a validation accuracy of about 47%, which could -# be confirmed on the test set. +# The best performing trial achieved a validation accuracy of about 47%, +# which could be confirmed on the test set. # -# So that's it! You can now tune the parameters of your PyTorch models. +# So that’s it! You can now tune the parameters of your PyTorch models. From 10566e349ef1fb17986e6f5c495a3e9cdd276fcd Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 15:29:14 -0800 Subject: [PATCH 04/20] Remove Ray initialization from hyperparameter tuning tutorial and update code block formatting from text to bash. 
--- beginner_source/hyperparameter_tuning_tutorial.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index e3f1b15f1a9..d39e12b9370 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -452,7 +452,6 @@ def test_accuracy(net, device="cpu"): def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): print("Starting hyperparameter tuning.") - ray.init(include_dashboard=False, runtime_env={"RAY_enable_open_telemetry": "0"}) data_dir = os.path.abspath("./data") load_data(data_dir) @@ -514,7 +513,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): ###################################################################### # If you run the code, an example output could look like this: # -# .. code-block:: text +# .. code-block:: bash # # Number of trials: 10/10 (10 TERMINATED) # +-----+--------------+------+------+-------------+--------+---------+------------+ From ad79672502257c58cb7c59bff811887f612198bb Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 15:52:32 -0800 Subject: [PATCH 05/20] Clean up hparam tuning tutorial, modernize checkpointing --- .../hyperparameter_tuning_tutorial.py | 48 +++++++------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index d39e12b9370..ffc5f361db1 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -44,8 +44,6 @@ """ -# %matplotlib inline - from functools import partial import os import tempfile @@ -57,19 +55,10 @@ from torch.utils.data import random_split import torchvision import torchvision.transforms as transforms -# sphinx_gallery_start_ignore -# Fixes `AttributeError: '_LoggingTee' object has no attribute 'fileno'`. -# This is only needed to run with sphinx-build. -import sys -if not hasattr(sys.stdout, "encoding"): - sys.stdout.encoding = "latin1" - sys.stdout.fileno = lambda: 0 -# sphinx_gallery_end_ignore import ray from ray import tune from ray.tune import Checkpoint from ray.tune.schedulers import ASHAScheduler -import ray.cloudpickle as pickle ###################################################################### # Most of the imports are needed for building the PyTorch model. Only the @@ -135,10 +124,13 @@ def forward(self, x): # ``train_cifar(config, data_dir=None)``. The ``config`` parameter will # receive the hyperparameters we would like to train with. The # ``data_dir`` specifies the directory where we load and store the data, -# so that multiple runs can share the same data source. We also load the -# model and optimizer state at the start of the run, if a checkpoint is -# provided. Further down in this tutorial you will find information on how -# to save the checkpoint and what it is used for. +# so that multiple runs can share the same data source. This is especially +# useful in cluster environments, where you can mount a shared storage +# (e.g. NFS) to this directory so that the data is not downloaded to each +# node separately. We also load the model and optimizer state at the start +# of the run, if a checkpoint is provided. Further down in this tutorial +# you will find information on how to save the checkpoint and what it is +# used for. # # .. 
code-block:: python # @@ -147,9 +139,8 @@ def forward(self, x): # checkpoint = tune.get_checkpoint() # if checkpoint: # with checkpoint.as_directory() as checkpoint_dir: -# data_path = Path(checkpoint_dir) / "data.pkl" -# with open(data_path, "rb") as fp: -# checkpoint_state = pickle.load(fp) +# checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" +# checkpoint_state = torch.load(checkpoint_path) # start_epoch = checkpoint_state["epoch"] # net.load_state_dict(checkpoint_state["net_state_dict"]) # optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) @@ -213,9 +204,8 @@ def forward(self, x): # "optimizer_state_dict": optimizer.state_dict(), # } # with tempfile.TemporaryDirectory() as checkpoint_dir: -# data_path = Path(checkpoint_dir) / "data.pkl" -# with open(data_path, "wb") as fp: -# pickle.dump(checkpoint_data, fp) +# checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" +# torch.save(checkpoint_data, checkpoint_path) # # checkpoint = Checkpoint.from_directory(checkpoint_dir) # tune.report( @@ -259,9 +249,8 @@ def train_cifar(config, data_dir=None): checkpoint = tune.get_checkpoint() if checkpoint: with checkpoint.as_directory() as checkpoint_dir: - data_path = Path(checkpoint_dir) / "data.pkl" - with open(data_path, "rb") as fp: - checkpoint_state = pickle.load(fp) + checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" + checkpoint_state = torch.load(checkpoint_path) start_epoch = checkpoint_state["epoch"] net.load_state_dict(checkpoint_state["net_state_dict"]) optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) @@ -334,9 +323,8 @@ def train_cifar(config, data_dir=None): "optimizer_state_dict": optimizer.state_dict(), } with tempfile.TemporaryDirectory() as checkpoint_dir: - data_path = Path(checkpoint_dir) / "data.pkl" - with open(data_path, "wb") as fp: - pickle.dump(checkpoint_data, fp) + checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" + torch.save(checkpoint_data, checkpoint_path) checkpoint = Checkpoint.from_directory(checkpoint_dir) tune.report( @@ -452,6 +440,7 @@ def test_accuracy(net, device="cpu"): def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): print("Starting hyperparameter tuning.") + ray.init() data_dir = os.path.abspath("./data") load_data(data_dir) @@ -497,9 +486,8 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): best_checkpoint = best_result.checkpoint with best_checkpoint.as_directory() as checkpoint_dir: - data_path = Path(checkpoint_dir) / "data.pkl" - with open(data_path, "rb") as fp: - best_checkpoint_data = pickle.load(fp) + checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" + best_checkpoint_data = torch.load(checkpoint_path) best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"]) test_acc = test_accuracy(best_trained_model, device) From a941761df4b750a952b00afcf485d52cb56a382b Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 16:49:24 -0800 Subject: [PATCH 06/20] Polish hparam tutorial a bit --- .../hyperparameter_tuning_tutorial.py | 120 ++++++++++-------- 1 file changed, 70 insertions(+), 50 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index ffc5f361db1..c8dac6ef9c3 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -3,9 +3,9 @@ =================================== Hyperparameter tuning can make the difference between an average model -and a highly accurate one. 
Often simple things like choosing a different -learning rate or changing a network layer size can have a dramatic -impact on your model performance. +and a highly accurate one. Often, simple decisions like choosing a +different learning rate or changing a network layer size can +dramatically impact model performance. Fortunately, there are tools that help with finding the best combination of parameters. `Ray Tune `__ is @@ -21,15 +21,12 @@ documentation `__ for training a CIFAR10 image classifier. -As you will see, we only need to add some slight modifications. In -particular, we need to +We only need to make minor modifications: 1. wrap data loading and training in functions, 2. make some network parameters configurable, 3. add checkpointing (optional), -4. and define the search space for the model tuning - -| +4. define the search space for the model tuning To run this tutorial, please make sure the following packages are installed: @@ -62,14 +59,13 @@ ###################################################################### # Most of the imports are needed for building the PyTorch model. Only the -# last imports are for Ray Tune. +# last few are specific to Ray Tune. # # Data loaders # ------------ # -# We wrap the data loaders in their own function and pass a global data -# directory. This way we can share a data directory between different -# trials. +# We wrap the data loaders in a function and pass a global data directory. +# This allows us to share a data directory across different trials. def load_data(data_dir="./data"): transform = transforms.Compose( @@ -90,8 +86,8 @@ def load_data(data_dir="./data"): # Configurable neural network # --------------------------- # -# We can only tune those parameters that are configurable. In this -# example, we can specify the layer sizes of the fully connected layers: +# We can only tune parameters that are configurable. In this example, we +# specify the layer sizes of the fully connected layers: class Net(nn.Module): def __init__(self, l1=120, l2=84): @@ -121,14 +117,14 @@ def forward(self, x): # documentation `__. # # We wrap the training script in a function -# ``train_cifar(config, data_dir=None)``. The ``config`` parameter will -# receive the hyperparameters we would like to train with. The -# ``data_dir`` specifies the directory where we load and store the data, -# so that multiple runs can share the same data source. This is especially -# useful in cluster environments, where you can mount a shared storage -# (e.g. NFS) to this directory so that the data is not downloaded to each +# ``train_cifar(config, data_dir=None)``. The ``config`` parameter +# receives the hyperparameters we want to train with. The ``data_dir`` +# specifies the directory where we load and store the data, allowing +# multiple runs to share the same data source. This is especially useful +# in cluster environments where you can mount a shared storage (e.g. NFS) +# to this directory, preventing the data from being downloaded to each # node separately. We also load the model and optimizer state at the start -# of the run, if a checkpoint is provided. Further down in this tutorial +# of the run if a checkpoint is provided. Further down in this tutorial, # you will find information on how to save the checkpoint and what it is # used for. # @@ -175,9 +171,9 @@ def forward(self, x): # net = nn.DataParallel(net) # net.to(device) # -# By using a ``device`` variable we make sure that training also works -# when we have no GPUs available. 
PyTorch requires us to send our data to -# the GPU memory explicitly, like this: +# By using a ``device`` variable, we ensure that training works even +# without a GPU. PyTorch requires us to send our data to the GPU memory +# explicitly: # # .. code-block:: python # @@ -194,7 +190,9 @@ def forward(self, x): # Communicating with Ray Tune # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# The most interesting part is the communication with Ray Tune: +# The most interesting part is the communication with Ray Tune. As you’ll +# see, integrating Ray Tune into your training code requires only a few +# additional lines: # # .. code-block:: python # @@ -215,18 +213,27 @@ def forward(self, x): # # Here we first save a checkpoint and then report some metrics back to Ray # Tune. Specifically, we send the validation loss and accuracy back to Ray -# Tune. Ray Tune can then use these metrics to decide which hyperparameter -# configuration lead to the best results. These metrics can also be used -# to stop bad performing trials early in order to avoid wasting resources -# on those trials. +# Tune. Ray Tune uses these metrics to determine the best hyperparameter +# configuration and to stop underperforming trials early, saving +# resources. # # The checkpoint saving is optional, however, it is necessary if we wanted # to use advanced schedulers like `Population Based # Training `__. -# Also, by saving the checkpoint we can later load the trained models and -# validate them on a test set. Lastly, saving checkpoints is useful for -# fault tolerance, and it allows us to interrupt training and continue -# training later. +# Saving the checkpoint also allows us to later load the trained models +# for validation on a test set. Lastly, it provides fault tolerance, +# enabling us to pause and resume training. +# +# To summarize, integrating Ray Tune into your PyTorch training requires +# just a few key additions: +# +# - ``tune.report()`` to report metrics (and optionally checkpoints) to +# Ray Tune +# - ``tune.get_checkpoint()`` to load a model from a checkpoint +# - ``Checkpoint.from_directory()`` to create a checkpoint object from +# saved state +# +# The rest of your training code remains standard PyTorch! # # Full training function # ~~~~~~~~~~~~~~~~~~~~~~ @@ -246,6 +253,7 @@ def train_cifar(config, data_dir=None): criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) + # Load checkpoint if resuming training checkpoint = tune.get_checkpoint() if checkpoint: with checkpoint.as_directory() as checkpoint_dir: @@ -317,6 +325,7 @@ def train_cifar(config, data_dir=None): val_loss += loss.cpu().numpy() val_steps += 1 + # Save checkpoint and report metrics checkpoint_data = { "epoch": epoch, "net_state_dict": net.state_dict(), @@ -331,7 +340,7 @@ def train_cifar(config, data_dir=None): {"loss": val_loss / val_steps, "accuracy": correct / total}, checkpoint=checkpoint, ) - + print("Finished Training") ###################################################################### @@ -390,11 +399,21 @@ def test_accuracy(net, device="cpu"): # 0.0001 and 0.1. Lastly, the batch size is a choice between 2, 4, 8, and # 16. # -# At each trial, Ray Tune will now randomly sample a combination of -# parameters from these search spaces. It will then train a number of -# models in parallel and find the best performing one among these. We also -# use the ``ASHAScheduler`` which will terminate bad performing trials -# early. 
+# For each trial, Ray Tune samples a combination of parameters from these +# search spaces according to the search space configuration and search +# strategy. It then trains multiple models in parallel to identify the +# best performing one. +# +# By default, Ray Tune uses random search to pick the next hyperparameter +# configuration to try. However, Ray Tune also provides more sophisticated +# search algorithms that can more efficiently navigate the search space, +# such as +# `Optuna `__, +# `HyperOpt `__, +# and `Bayesian +# Optimization `__. +# +# We use the ``ASHAScheduler`` to terminate underperforming trials early. # # We wrap the ``train_cifar`` function with ``functools.partial`` to set # the constant ``data_dir`` parameter. We can also tell Ray Tune what @@ -423,20 +442,21 @@ def test_accuracy(net, device="cpu"): # You can specify the number of CPUs, which are then available e.g. to # increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. # The selected number of GPUs are made visible to PyTorch in each trial. -# Trials do not have access to GPUs that haven’t been requested for them - -# so you don’t have to care about two trials using the same set of -# resources. +# Trials do not have access to GPUs that haven’t been requested, so you +# don’t need to worry about resource contention. # -# Here we can also specify fractional GPUs, so something like -# ``gpus_per_trial=0.5`` is completely valid. The trials will then share -# GPUs among each other. You just have to make sure that the models still -# fit in the GPU memory. +# You can also specify fractional GPUs (e.g., ``gpus_per_trial=0.5``), +# which allows trials to share a GPU. Just ensure that the models fit +# within the GPU memory. # # After training the models, we will find the best performing one and load # the trained network from the checkpoint file. We then obtain the test # set accuracy and report everything by printing. # -# The full main function looks like this: +# The full main function looks like this. Note that the +# ``if __name__ == "__main__":`` block is configured for a quick run (1 +# trial, 1 epoch, CPU only) to verify that everything works. You should +# increase these values to perform an actual hyperparameter tuning search. def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): print("Starting hyperparameter tuning.") @@ -495,7 +515,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): if __name__ == "__main__": - # You can change the number of GPUs per trial here: + # Set the number of trials, epochs, and GPUs per trial here: main(num_trials=1, max_num_epochs=1, gpus_per_trial=0) ###################################################################### @@ -524,8 +544,8 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # Best trial final validation accuracy: 0.4761 # Best trial test set accuracy: 0.4737 # -# Most trials have been stopped early in order to avoid wasting resources. -# The best performing trial achieved a validation accuracy of about 47%, +# Most trials were stopped early to conserve resources. The best +# performing trial achieved a validation accuracy of approximately 47%, # which could be confirmed on the test set. # # So that’s it! You can now tune the parameters of your PyTorch models. 
From 64bc12eb09f3f50da9160eeafa460c1685276174 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 16:50:30 -0800 Subject: [PATCH 07/20] Use the actual CIFAR10 normalization values --- beginner_source/hyperparameter_tuning_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index c8dac6ef9c3..944a460c36e 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -69,7 +69,7 @@ def load_data(data_dir="./data"): transform = transforms.Compose( - [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + [transforms.ToTensor(), transforms.Normalize((0.4914, 0.48216, 0.44653), (0.2022, 0.19932, 0.20086))] ) trainset = torchvision.datasets.CIFAR10( From 1f11769e18186b20afb5d933dfb2e9ed31c6cc31 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 20:36:43 -0800 Subject: [PATCH 08/20] polish --- .../hyperparameter_tuning_tutorial.py | 110 ++++++++---------- 1 file changed, 47 insertions(+), 63 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 944a460c36e..203cd6d7dd2 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -7,35 +7,25 @@ different learning rate or changing a network layer size can dramatically impact model performance. -Fortunately, there are tools that help with finding the best combination -of parameters. `Ray Tune `__ is -an industry standard tool for distributed hyperparameter tuning. Ray -Tune includes the latest hyperparameter search algorithms, integrates -with various analysis libraries, and natively supports distributed -training through `Ray’s distributed machine learning -engine `__. +This page shows how to integrate `Ray +Tune `__ into your PyTorch +training workflow for distributed hyperparameter tuning. It extends the +PyTorch tutorial for training a CIFAR10 image classifier in the `CIFAR10 +tutorial (PyTorch +documentation) `__. -In this tutorial, we will show you how to integrate Ray Tune into your -PyTorch training workflow. We will extend `this tutorial from the -PyTorch -documentation `__ -for training a CIFAR10 image classifier. +Only minor modifications are needed. Specifically, this example wraps +data loading and training in functions, makes some network parameters +configurable, adds optional checkpointing, and defines the search space +for model tuning. -We only need to make minor modifications: +To run this tutorial, install the following prerequisites: -1. wrap data loading and training in functions, -2. make some network parameters configurable, -3. add checkpointing (optional), -4. define the search space for the model tuning +- ``ray[tune]`` – Distributed hyperparameter tuning library +- ``torchvision`` – Data transforms for computer vision datasets -To run this tutorial, please make sure the following packages are -installed: - -- ``ray[tune]``: Distributed hyperparameter tuning library -- ``torchvision``: For the data transformers - -Setup / Imports ---------------- +Setup and imports +----------------- Let’s start with the imports: @@ -86,8 +76,8 @@ def load_data(data_dir="./data"): # Configurable neural network # --------------------------- # -# We can only tune parameters that are configurable. 
In this example, we -# specify the layer sizes of the fully connected layers: +# In this example, we specify the layer sizes of the fully connected +# layers. class Net(nn.Module): def __init__(self, l1=120, l2=84): @@ -109,24 +99,23 @@ def forward(self, x): return x ###################################################################### -# The train function -# ------------------ +# Train function +# -------------- # # Now it gets interesting, because we introduce some changes to the -# example `from the PyTorch -# documentation `__. +# example from the `CIFAR10 tutorial (PyTorch +# documentation) `__. # # We wrap the training script in a function # ``train_cifar(config, data_dir=None)``. The ``config`` parameter # receives the hyperparameters we want to train with. The ``data_dir`` # specifies the directory where we load and store the data, allowing # multiple runs to share the same data source. This is especially useful -# in cluster environments where you can mount a shared storage (e.g. NFS) -# to this directory, preventing the data from being downloaded to each -# node separately. We also load the model and optimizer state at the start -# of the run if a checkpoint is provided. Further down in this tutorial, -# you will find information on how to save the checkpoint and what it is -# used for. +# in cluster environments where you can mount shared storage (for example +# NFS), preventing the data from being downloaded to each node separately. +# We also load the model and optimizer state at the start of the run if a +# checkpoint is provided. Further down in this tutorial, you will find +# information on how to save the checkpoint and what it is used for. # # .. code-block:: python # @@ -158,9 +147,9 @@ def forward(self, x): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # Image classification benefits largely from GPUs. Luckily, we can -# continue to use PyTorch’s abstractions in Ray Tune. Thus, we can wrap -# our model in ``nn.DataParallel`` to support data parallel training on -# multiple GPUs: +# continue to use PyTorch’s tools in Ray Tune. Thus, we can wrap our model +# in ``nn.DataParallel`` to support data parallel training on multiple +# GPUs: # # .. code-block:: python # @@ -185,7 +174,7 @@ def forward(self, x): # GPUs. Notably, Ray also supports `fractional # GPUs `__ # so we can share GPUs among trials, as long as the model still fits on -# the GPU memory. We’ll come back to that later. +# the GPU memory. We will return to that later. # # Communicating with Ray Tune # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -225,15 +214,11 @@ def forward(self, x): # enabling us to pause and resume training. # # To summarize, integrating Ray Tune into your PyTorch training requires -# just a few key additions: -# -# - ``tune.report()`` to report metrics (and optionally checkpoints) to -# Ray Tune -# - ``tune.get_checkpoint()`` to load a model from a checkpoint -# - ``Checkpoint.from_directory()`` to create a checkpoint object from -# saved state -# -# The rest of your training code remains standard PyTorch! +# just a few key additions: use ``tune.report()`` to report metrics (and +# optionally checkpoints) to Ray Tune, ``tune.get_checkpoint()`` to load a +# model from a checkpoint, and ``Checkpoint.from_directory()`` to create a +# checkpoint object from saved state. The rest of your training code +# remains standard PyTorch! 
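#
# If a particular experiment does not need checkpointing, the reporting call
# can be reduced to metrics only, since the ``checkpoint`` argument is
# optional. A minimal sketch, assuming the same validation variables as in
# the training function above:
#
# .. code-block:: python
#
#    # Metrics-only reporting; no checkpoint is attached to this result.
#    tune.report({"loss": val_loss / val_steps, "accuracy": correct / total})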
# # Full training function # ~~~~~~~~~~~~~~~~~~~~~~ @@ -351,7 +336,7 @@ def train_cifar(config, data_dir=None): # ----------------- # # Commonly the performance of a machine learning model is tested on a -# hold-out test set with data that has not been used for training the +# held-out test set with data that has not been used for training the # model. We also wrap this in a function: def test_accuracy(net, device="cpu"): @@ -375,11 +360,11 @@ def test_accuracy(net, device="cpu"): return correct / total ###################################################################### -# The function also expects a ``device`` parameter, so we can do the test +# The function also expects a ``device`` parameter so we can do the test # set validation on a GPU. # -# Configuring the search space -# ---------------------------- +# Search space configuration +# -------------------------- # # Lastly, we need to define Ray Tune’s search space. Here is an example: # @@ -394,10 +379,9 @@ def test_accuracy(net, device="cpu"): # # The ``tune.choice()`` accepts a list of values that are uniformly # sampled from. In this example, the ``l1`` and ``l2`` parameters should -# be powers of 2 between 4 and 256, so either 4, 8, 16, 32, 64, 128, or -# 256. The ``lr`` (learning rate) should be uniformly sampled between -# 0.0001 and 0.1. Lastly, the batch size is a choice between 2, 4, 8, and -# 16. +# be powers of 2 between 1 and 256: 1, 2, 4, 8, 16, 32, 64, 128, or 256. +# The ``lr`` (learning rate) should be uniformly sampled between 0.0001 +# and 0.1. Lastly, the batch size is a choice between 2, 4, 8, and 16. # # For each trial, Ray Tune samples a combination of parameters from these # search spaces according to the search space configuration and search @@ -439,13 +423,13 @@ def test_accuracy(net, device="cpu"): # ) # results = tuner.fit() # -# You can specify the number of CPUs, which are then available e.g. to +# Specify the number of CPUs, which are then available, for example to # increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. # The selected number of GPUs are made visible to PyTorch in each trial. -# Trials do not have access to GPUs that haven’t been requested, so you +# Trials do not have access to GPUs that have not been requested, so you # don’t need to worry about resource contention. # -# You can also specify fractional GPUs (e.g., ``gpus_per_trial=0.5``), +# You can specify fractional GPUs (for example, ``gpus_per_trial=0.5``), # which allows trials to share a GPU. Just ensure that the models fit # within the GPU memory. # @@ -519,7 +503,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): main(num_trials=1, max_num_epochs=1, gpus_per_trial=0) ###################################################################### -# If you run the code, an example output could look like this: +# Your output will look something like this: # # .. code-block:: bash # @@ -548,4 +532,4 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # performing trial achieved a validation accuracy of approximately 47%, # which could be confirmed on the test set. # -# So that’s it! You can now tune the parameters of your PyTorch models. +# You can now tune the parameters of your PyTorch models. 
From e3604484eebd9c32e64e50936bebb7d64b05b0bc Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Wed, 10 Dec 2025 18:58:44 -0800 Subject: [PATCH 09/20] finalize the hyperparameter tuning tutorial --- .../hyperparameter_tuning_tutorial.py | 255 +++++++++++------- ecosystem.rst | 2 +- index.rst | 2 +- 3 files changed, 160 insertions(+), 99 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 203cd6d7dd2..19c939020b8 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -2,34 +2,38 @@ Hyperparameter tuning with Ray Tune =================================== -Hyperparameter tuning can make the difference between an average model -and a highly accurate one. Often, simple decisions like choosing a -different learning rate or changing a network layer size can -dramatically impact model performance. - -This page shows how to integrate `Ray -Tune `__ into your PyTorch -training workflow for distributed hyperparameter tuning. It extends the -PyTorch tutorial for training a CIFAR10 image classifier in the `CIFAR10 -tutorial (PyTorch +This tutorial shows how to integrate Ray Tune into your PyTorch training +workflow to perform scalable and efficient hyperparameter tuning. + +`Ray `__, a project of the +PyTorch Foundation, is an open-source unified framework for scaling AI +and Python applications. It helps run distributed workloads by handling +the complexity of distributed computing. `Ray +Tune `__ is a library +built on Ray for hyperparameter tuning that enables you to scale a +hyperparameter sweep from your machine to a large cluster with no code +changes. + +This tutorial extends the PyTorch tutorial for training a CIFAR10 image +classifier in the `CIFAR10 tutorial (PyTorch documentation) `__. +Only minor modifications are needed to adapt the PyTorch tutorial for +Ray Tune. Specifically, this tutorial wraps the data loading and +training in functions, makes some network parameters configurable, adds +optional checkpointing, and defines the search space for model tuning. -Only minor modifications are needed. Specifically, this example wraps -data loading and training in functions, makes some network parameters -configurable, adds optional checkpointing, and defines the search space -for model tuning. +Setup +----- -To run this tutorial, install the following prerequisites: +To run this tutorial, install the dependencies: -- ``ray[tune]`` – Distributed hyperparameter tuning library -- ``torchvision`` – Data transforms for computer vision datasets - -Setup and imports ------------------ +""" -Let’s start with the imports: +# %%bash +# pip install "ray[tune]" torchvision -""" +###################################################################### +# Then start with the imports: from functools import partial import os @@ -42,20 +46,18 @@ from torch.utils.data import random_split import torchvision import torchvision.transforms as transforms +# New: imports for Ray Tune import ray from ray import tune from ray.tune import Checkpoint from ray.tune.schedulers import ASHAScheduler ###################################################################### -# Most of the imports are needed for building the PyTorch model. Only the -# last few are specific to Ray Tune. 
-# -# Data loaders -# ------------ +# How to use PyTorch data loaders with Ray Tune +# --------------------------------------------- # -# We wrap the data loaders in a function and pass a global data directory. -# This allows us to share a data directory across different trials. +# Wrap the data loaders in a constructor function. Pass a global data +# directory here to reuse the dataset across different trials. def load_data(data_dir="./data"): transform = transforms.Compose( @@ -73,15 +75,15 @@ def load_data(data_dir="./data"): return trainset, testset ###################################################################### -# Configurable neural network -# --------------------------- +# Configure the hyperparameters +# ----------------------------- # # In this example, we specify the layer sizes of the fully connected # layers. class Net(nn.Module): def __init__(self, l1=120, l2=84): - super(Net, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) @@ -99,12 +101,12 @@ def forward(self, x): return x ###################################################################### -# Train function -# -------------- +# Use a train function with Ray Tune +# ---------------------------------- # # Now it gets interesting, because we introduce some changes to the -# example from the `CIFAR10 tutorial (PyTorch -# documentation) `__. +# example `from the PyTorch +# documentation `__. # # We wrap the training script in a function # ``train_cifar(config, data_dir=None)``. The ``config`` parameter @@ -112,10 +114,10 @@ def forward(self, x): # specifies the directory where we load and store the data, allowing # multiple runs to share the same data source. This is especially useful # in cluster environments where you can mount shared storage (for example -# NFS), preventing the data from being downloaded to each node separately. +# NFS) to prevent the data from being downloaded to each node separately. # We also load the model and optimizer state at the start of the run if a # checkpoint is provided. Further down in this tutorial, you will find -# information on how to save the checkpoint and what it is used for. +# information on how to save the checkpoint and how it is used. # # .. code-block:: python # @@ -143,12 +145,12 @@ def forward(self, x): # the remaining 20%. The batch sizes with which we iterate through the # training and test sets are configurable as well. # -# Adding (multi) GPU support with DataParallel -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Add multi-GPU support with DataParallel +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Image classification benefits largely from GPUs. Luckily, we can -# continue to use PyTorch’s tools in Ray Tune. Thus, we can wrap our model -# in ``nn.DataParallel`` to support data parallel training on multiple +# Image classification benefits largely from GPUs. Luckily, you can +# continue to use PyTorch tools in Ray Tune. Thus, you can wrap the model +# in ``nn.DataParallel`` to support data-parallel training on multiple # GPUs: # # .. code-block:: python @@ -206,7 +208,7 @@ def forward(self, x): # configuration and to stop underperforming trials early, saving # resources. # -# The checkpoint saving is optional, however, it is necessary if we wanted +# The checkpoint saving is optional. However, it is necessary if we wanted # to use advanced schedulers like `Population Based # Training `__. 
# Saving the checkpoint also allows us to later load the trained models @@ -218,7 +220,7 @@ def forward(self, x): # optionally checkpoints) to Ray Tune, ``tune.get_checkpoint()`` to load a # model from a checkpoint, and ``Checkpoint.from_directory()`` to create a # checkpoint object from saved state. The rest of your training code -# remains standard PyTorch! +# remains standard PyTorch. # # Full training function # ~~~~~~~~~~~~~~~~~~~~~~ @@ -332,8 +334,8 @@ def train_cifar(config, data_dir=None): # As you can see, most of the code is adapted directly from the original # example. # -# Test set accuracy -# ----------------- +# Compute test set accuracy +# ------------------------- # # Commonly the performance of a machine learning model is tested on a # held-out test set with data that has not been used for training the @@ -360,58 +362,95 @@ def test_accuracy(net, device="cpu"): return correct / total ###################################################################### -# The function also expects a ``device`` parameter so we can do the test +# The function also expects a ``device`` parameter so you can run the test # set validation on a GPU. # -# Search space configuration +# Configure the search space # -------------------------- # -# Lastly, we need to define Ray Tune’s search space. Here is an example: +# Lastly, we need to define Ray Tune’s search space. Ray Tune offers a +# variety of `search space +# distributions `__ +# to suit different parameter types: ``loguniform``, ``uniform``, +# ``choice``, ``randint``, ``grid``, and more. It also lets you express +# complex dependencies between parameters with `conditional search +# spaces `__. +# +# Here is an example: # # .. code-block:: python # # config = { -# "l1": tune.choice([2 ** i for i in range(9)]), -# "l2": tune.choice([2 ** i for i in range(9)]), +# "l1": tune.choice([2**i for i in range(9)]), +# "l2": tune.choice([2**i for i in range(9)]), # "lr": tune.loguniform(1e-4, 1e-1), -# "batch_size": tune.choice([2, 4, 8, 16]) +# "batch_size": tune.choice([2, 4, 8, 16]), # } # # The ``tune.choice()`` accepts a list of values that are uniformly -# sampled from. In this example, the ``l1`` and ``l2`` parameters should -# be powers of 2 between 1 and 256: 1, 2, 4, 8, 16, 32, 64, 128, or 256. -# The ``lr`` (learning rate) should be uniformly sampled between 0.0001 -# and 0.1. Lastly, the batch size is a choice between 2, 4, 8, and 16. -# -# For each trial, Ray Tune samples a combination of parameters from these -# search spaces according to the search space configuration and search -# strategy. It then trains multiple models in parallel to identify the -# best performing one. -# -# By default, Ray Tune uses random search to pick the next hyperparameter -# configuration to try. However, Ray Tune also provides more sophisticated -# search algorithms that can more efficiently navigate the search space, -# such as -# `Optuna `__, -# `HyperOpt `__, -# and `Bayesian -# Optimization `__. -# -# We use the ``ASHAScheduler`` to terminate underperforming trials early. -# -# We wrap the ``train_cifar`` function with ``functools.partial`` to set -# the constant ``data_dir`` parameter. We can also tell Ray Tune what -# resources should be available for each trial using +# sampled from. In this example, the ``l1`` and ``l2`` parameter values +# will be powers of 2 between 1 and 256. The learning rate is sampled on a +# log scale between 0.0001 and 0.1. 
Sampling on a log scale ensures that +# the search space is explored efficiently across different magnitudes. +# +# Smarter sampling and scheduling +# ------------------------------- +# +# To make the hyperparameter search process efficient, Ray Tune provides +# two main controls: +# +# 1. It can intelligently pick the next set of hyperparameters to test +# based on previous results using `advanced search +# algorithms `__ +# such as +# `Optuna `__ +# or +# ```bayesopt`` `__, +# instead of relying only on random or grid search. +# 2. It can detect underperforming trials and stop them early using +# `schedulers `__, +# enabling you to explore the parameter space more on the same compute +# budget. +# +# In this tutorial, we use the ``ASHAScheduler``, which aggressively +# terminates low-performing trials to save computational resources. +# +# Configure the resources +# ----------------------- +# +# Tell Ray Tune what resources should be available for each trial using # ``tune.with_resources``: # # .. code-block:: python # -# gpus_per_trial = 2 -# # ... +# tune.with_resources( +# partial(train_cifar, data_dir=data_dir), +# resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} +# ) +# +# This tells Ray Tune to allocate ``cpus_per_trial`` CPUs and +# ``gpus_per_trial`` GPUs for each trial. Ray Tune automatically manages +# the placement of these trials and ensures they are isolated, so you +# don’t need to manually assign GPUs to processes. +# +# For example, if you are running this experiment on a cluster of 20 +# machines, each with 8 GPUs, you can set ``gpus_per_trial = 0.5`` to +# schedule 2 concurrent trials per GPU. This configuration runs 320 trials +# in parallel across the cluster. +# +# Putting it together +# ------------------- +# +# The Ray Tune API is designed to be modular and composable: you pass your +# configurations to the ``tune.Tuner`` class to create a tuner object, +# then execute ``tuner.fit()`` to start training: +# +# .. code-block:: python +# # tuner = tune.Tuner( # tune.with_resources( # partial(train_cifar, data_dir=data_dir), -# resources={"cpu": 8, "gpu": gpus_per_trial} +# resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} # ), # tune_config=tune.TuneConfig( # metric="loss", @@ -423,24 +462,9 @@ def test_accuracy(net, device="cpu"): # ) # results = tuner.fit() # -# Specify the number of CPUs, which are then available, for example to -# increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. -# The selected number of GPUs are made visible to PyTorch in each trial. -# Trials do not have access to GPUs that have not been requested, so you -# don’t need to worry about resource contention. -# -# You can specify fractional GPUs (for example, ``gpus_per_trial=0.5``), -# which allows trials to share a GPU. Just ensure that the models fit -# within the GPU memory. -# # After training the models, we will find the best performing one and load # the trained network from the checkpoint file. We then obtain the test -# set accuracy and report everything by printing. -# -# The full main function looks like this. Note that the -# ``if __name__ == "__main__":`` block is configured for a quick run (1 -# trial, 1 epoch, CPU only) to verify that everything works. You should -# increase these values to perform an actual hyperparameter tuning search. +# set accuracy and report the results. 
def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): print("Starting hyperparameter tuning.") @@ -500,10 +524,11 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): if __name__ == "__main__": # Set the number of trials, epochs, and GPUs per trial here: + # The following configuration is for a quick run (1 trial, 1 epoch, CPU only) for demonstration purposes. main(num_trials=1, max_num_epochs=1, gpus_per_trial=0) ###################################################################### -# Your output will look something like this: +# Your Ray Tune trial summary output will look something like this: # # .. code-block:: bash # @@ -533,3 +558,39 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # which could be confirmed on the test set. # # You can now tune the parameters of your PyTorch models. +# +# Observability +# ------------- +# +# When running large-scale experiments, monitoring is crucial. Ray +# provides a +# `Dashboard `__ +# that lets you view the status of your trials, check cluster resource +# utilization, and inspect logs in real-time. +# +# For debugging, Ray also offers `Distributed +# Debugging `__ +# tools that let you attach a debugger to running trials across the +# cluster. +# +# Conclusion +# ---------- +# +# In this tutorial, you learned how to tune the hyperparameters of a +# PyTorch model using Ray Tune. You saw how to integrate Ray Tune into +# your PyTorch training loop, define a search space for your +# hyperparameters, use an efficient scheduler like ASHA to terminate bad +# trials early, save checkpoints and report metrics to Ray Tune, and run +# the hyperparameter search and analyze the results. +# +# Ray Tune makes it easy to scale your experiments from a single machine +# to a large cluster, helping you find the best model configuration +# efficiently. +# +# Further reading +# --------------- +# +# - `Ray Tune +# documentation `__ +# - `Ray Tune +# examples `__ diff --git a/ecosystem.rst b/ecosystem.rst index da2a926851a..ddd6c505561 100644 --- a/ecosystem.rst +++ b/ecosystem.rst @@ -33,7 +33,7 @@ to production deployment. :card_description: Learn how to use Ray Tune to find the best performing set of hyperparameters for your model. :image: _static/img/ray-tune.png :link: beginner/hyperparameter_tuning_tutorial.html - :tags: Model-Optimization,Best-Practice,Ecosystem + :tags: Model-Optimization,Best-Practice,Ecosystem,Ray-Distributed,Parallel-and-Distributed-Training .. customcarditem:: :header: Multi-Objective Neural Architecture Search with Ax diff --git a/index.rst b/index.rst index 5a5e80abfbb..f9a76296750 100644 --- a/index.rst +++ b/index.rst @@ -493,7 +493,7 @@ Welcome to PyTorch Tutorials :card_description: Learn how to use Ray Tune to find the best performing set of hyperparameters for your model. :image: _static/img/ray-tune.png :link: beginner/hyperparameter_tuning_tutorial.html - :tags: Model-Optimization,Best-Practice + :tags: Model-Optimization,Best-Practice,Ray-Distributed,Parallel-and-Distributed-Training .. 
customcarditem:: :header: Parametrizations Tutorial From 90de3a93e41ad105343e7998d3c5bc2b09f52696 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 11 Dec 2025 19:21:56 -0800 Subject: [PATCH 10/20] add author --- beginner_source/hyperparameter_tuning_tutorial.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 19c939020b8..c1798cca8ac 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -2,6 +2,8 @@ Hyperparameter tuning with Ray Tune =================================== +**Author:** `Ricardo Decal `_ + This tutorial shows how to integrate Ray Tune into your PyTorch training workflow to perform scalable and efficient hyperparameter tuning. From 0ab1f75ecc975e03a180c6f798f3c1fc50ee5fe5 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Mon, 15 Dec 2025 14:42:01 -0800 Subject: [PATCH 11/20] Ignore more IDE stuff --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ea478ca180d..9722d93505a 100644 --- a/.gitignore +++ b/.gitignore @@ -126,8 +126,10 @@ cleanup.sh # PyTorch things *.pt -# VSCode +# IDEs *.vscode +.devtools/ +.cursor # pyspelling dictionary.dic From ce0bc81e5fd8acb5c10f49bae1aaf7a0de094010 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Mon, 15 Dec 2025 14:46:55 -0800 Subject: [PATCH 12/20] make linter happy --- .gitignore | 1 - beginner_source/hyperparameter_tuning_tutorial.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9722d93505a..67d6fefc303 100644 --- a/.gitignore +++ b/.gitignore @@ -136,4 +136,3 @@ dictionary.dic # linters /.lintbin - diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index c1798cca8ac..dd76a3258bb 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -571,7 +571,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # utilization, and inspect logs in real-time. # # For debugging, Ray also offers `Distributed -# Debugging `__ +# Debugging `__ # tools that let you attach a debugger to running trials across the # cluster. # From 8c18e688835f495d0d31de81add0c575b487cb77 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Mon, 15 Dec 2025 14:58:03 -0800 Subject: [PATCH 13/20] PR feedback --- .../hyperparameter_tuning_tutorial.py | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index dd76a3258bb..ada184f42cd 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -62,6 +62,7 @@ # directory here to reuse the dataset across different trials. def load_data(data_dir="./data"): + # Mean and standard deviation of the CIFAR10 training subset. transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.4914, 0.48216, 0.44653), (0.2022, 0.19932, 0.20086))] ) @@ -157,12 +158,11 @@ def forward(self, x): # # .. 
code-block:: python # -# device = "cpu" # if torch.cuda.is_available(): -# device = "cuda:0" +# # Must move the model to CUDA before wrapping it with ``DataParallel`` +# net = net.to("cuda") # if torch.cuda.device_count() > 1: # net = nn.DataParallel(net) -# net.to(device) # # By using a ``device`` variable, we ensure that training works even # without a GPU. PyTorch requires us to send our data to the GPU memory @@ -232,12 +232,9 @@ def forward(self, x): def train_cifar(config, data_dir=None): net = Net(config["l1"], config["l2"]) - device = "cpu" - if torch.cuda.is_available(): - device = "cuda:0" - if torch.cuda.device_count() > 1: - net = nn.DataParallel(net) - net.to(device) + net = net.to(config["device"]) + if torch.cuda.device_count() > 1: + net = nn.DataParallel(net) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) @@ -474,11 +471,13 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): data_dir = os.path.abspath("./data") load_data(data_dir) + device = "cuda" if torch.cuda.is_available() else "cpu" config = { "l1": tune.choice([2**i for i in range(9)]), "l2": tune.choice([2**i for i in range(9)]), "lr": tune.loguniform(1e-4, 1e-1), "batch_size": tune.choice([2, 4, 8, 16]), + "device": device, } scheduler = ASHAScheduler( max_t=max_num_epochs, @@ -507,12 +506,9 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): print(f"Best trial final validation accuracy: {best_result.metrics['accuracy']}") best_trained_model = Net(best_result.config["l1"], best_result.config["l2"]) - device = "cpu" - if torch.cuda.is_available(): - device = "cuda:0" - if gpus_per_trial > 1: - best_trained_model = nn.DataParallel(best_trained_model) - best_trained_model.to(device) + best_trained_model = best_trained_model.to(device) + if gpus_per_trial > 1: + best_trained_model = nn.DataParallel(best_trained_model) best_checkpoint = best_result.checkpoint with best_checkpoint.as_directory() as checkpoint_dir: From e1e0ea16ad0d989af4fd41bfcf91d6fcd7419d75 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Mon, 15 Dec 2025 15:00:44 -0800 Subject: [PATCH 14/20] PR feedback --- beginner_source/hyperparameter_tuning_tutorial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index ada184f42cd..062a8adda8d 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -143,10 +143,10 @@ def forward(self, x): # # optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) # -# We also split the training data into a training and validation subset. +# We also split the dataset into training and validation subsets. # We thus train on 80% of the data and calculate the validation loss on # the remaining 20%. The batch sizes with which we iterate through the -# training and test sets are configurable as well. +# training and test sets are configurable by Ray Tune. 
# # Add multi-GPU support with DataParallel # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From ffbb8cb00824d0b1c58c5331a370672c000b5609 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Mon, 15 Dec 2025 17:47:59 -0800 Subject: [PATCH 15/20] fix device loading --- beginner_source/hyperparameter_tuning_tutorial.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 062a8adda8d..c97bda8272f 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -231,8 +231,9 @@ def forward(self, x): def train_cifar(config, data_dir=None): net = Net(config["l1"], config["l2"]) + device = config["device"] - net = net.to(config["device"]) + net = net.to(device) if torch.cuda.device_count() > 1: net = nn.DataParallel(net) @@ -251,7 +252,7 @@ def train_cifar(config, data_dir=None): else: start_epoch = 0 - trainset, testset = load_data(data_dir) + trainset, _testset = load_data(data_dir) test_abs = int(len(trainset) * 0.8) train_subset, val_subset = random_split( @@ -341,7 +342,7 @@ def train_cifar(config, data_dir=None): # model. We also wrap this in a function: def test_accuracy(net, device="cpu"): - trainset, testset = load_data() + _trainset, testset = load_data() testloader = torch.utils.data.DataLoader( testset, batch_size=4, shuffle=False, num_workers=2 @@ -470,7 +471,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): ray.init() data_dir = os.path.abspath("./data") - load_data(data_dir) + load_data(data_dir) # Pre-download the dataset device = "cuda" if torch.cuda.is_available() else "cpu" config = { "l1": tune.choice([2**i for i in range(9)]), From 326952ec8201d29452d81077b713b092d18e0af3 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Mon, 15 Dec 2025 18:32:46 -0800 Subject: [PATCH 16/20] turn the pip code block to markdown section --- beginner_source/hyperparameter_tuning_tutorial.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index c97bda8272f..cc29461520f 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -29,10 +29,11 @@ To run this tutorial, install the dependencies: -""" +.. code-block:: bash + + pip install "ray[tune]" torchvision -# %%bash -# pip install "ray[tune]" torchvision +""" ###################################################################### # Then start with the imports: From a6b27bea7a6299395d0532777730b1a21ca8067a Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Mon, 15 Dec 2025 19:42:02 -0800 Subject: [PATCH 17/20] pr feedback and linting --- .../hyperparameter_tuning_tutorial.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index cc29461520f..89c8c6c6640 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -2,7 +2,7 @@ Hyperparameter tuning with Ray Tune =================================== -**Author:** `Ricardo Decal `_ +**Author:** `Ricardo Decal `__ This tutorial shows how to integrate Ray Tune into your PyTorch training workflow to perform scalable and efficient hyperparameter tuning. 
@@ -57,7 +57,7 @@ ###################################################################### # How to use PyTorch data loaders with Ray Tune -# --------------------------------------------- +# ============================================= # # Wrap the data loaders in a constructor function. Pass a global data # directory here to reuse the dataset across different trials. @@ -80,10 +80,11 @@ def load_data(data_dir="./data"): ###################################################################### # Configure the hyperparameters -# ----------------------------- +# ============================= # -# In this example, we specify the layer sizes of the fully connected -# layers. +# In this tutorial, we will tune the sizes of the fully connected layers +# and the learning rate. In order to do so, we need to expose the layer +# sizes and the learning rate as configurable parameters. class Net(nn.Module): def __init__(self, l1=120, l2=84): @@ -106,7 +107,7 @@ def forward(self, x): ###################################################################### # Use a train function with Ray Tune -# ---------------------------------- +# ================================== # # Now it gets interesting, because we introduce some changes to the # example `from the PyTorch @@ -144,13 +145,13 @@ def forward(self, x): # # optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) # -# We also split the dataset into training and validation subsets. -# We thus train on 80% of the data and calculate the validation loss on -# the remaining 20%. The batch sizes with which we iterate through the +# We also split the dataset into training and validation subsets. We thus +# train on 80% of the data and calculate the validation loss on the +# remaining 20%. The batch sizes with which we iterate through the # training and test sets are configurable by Ray Tune. # # Add multi-GPU support with DataParallel -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# --------------------------------------- # # Image classification benefits largely from GPUs. Luckily, you can # continue to use PyTorch tools in Ray Tune. Thus, you can wrap the model @@ -182,7 +183,7 @@ def forward(self, x): # the GPU memory. We will return to that later. # # Communicating with Ray Tune -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# --------------------------- # # The most interesting part is the communication with Ray Tune. As you’ll # see, integrating Ray Tune into your training code requires only a few @@ -226,7 +227,7 @@ def forward(self, x): # remains standard PyTorch. # # Full training function -# ~~~~~~~~~~~~~~~~~~~~~~ +# ---------------------- # # The full code example looks like this: @@ -336,7 +337,7 @@ def train_cifar(config, data_dir=None): # example. # # Compute test set accuracy -# ------------------------- +# ========================= # # Commonly the performance of a machine learning model is tested on a # held-out test set with data that has not been used for training the @@ -367,7 +368,7 @@ def test_accuracy(net, device="cpu"): # set validation on a GPU. # # Configure the search space -# -------------------------- +# ========================== # # Lastly, we need to define Ray Tune’s search space. Ray Tune offers a # variety of `search space @@ -395,7 +396,7 @@ def test_accuracy(net, device="cpu"): # the search space is explored efficiently across different magnitudes. 
# # Smarter sampling and scheduling -# ------------------------------- +# =============================== # # To make the hyperparameter search process efficient, Ray Tune provides # two main controls: @@ -406,7 +407,7 @@ def test_accuracy(net, device="cpu"): # such as # `Optuna `__ # or -# ```bayesopt`` `__, +# `BayesOpt `__, # instead of relying only on random or grid search. # 2. It can detect underperforming trials and stop them early using # `schedulers `__, @@ -417,7 +418,7 @@ def test_accuracy(net, device="cpu"): # terminates low-performing trials to save computational resources. # # Configure the resources -# ----------------------- +# ======================= # # Tell Ray Tune what resources should be available for each trial using # ``tune.with_resources``: @@ -436,11 +437,11 @@ def test_accuracy(net, device="cpu"): # # For example, if you are running this experiment on a cluster of 20 # machines, each with 8 GPUs, you can set ``gpus_per_trial = 0.5`` to -# schedule 2 concurrent trials per GPU. This configuration runs 320 trials -# in parallel across the cluster. +# schedule two concurrent trials per GPU. This configuration runs 320 +# trials in parallel across the cluster. # # Putting it together -# ------------------- +# =================== # # The Ray Tune API is designed to be modular and composable: you pass your # configurations to the ``tune.Tuner`` class to create a tuner object, @@ -560,7 +561,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # You can now tune the parameters of your PyTorch models. # # Observability -# ------------- +# ============= # # When running large-scale experiments, monitoring is crucial. Ray # provides a @@ -568,13 +569,12 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # that lets you view the status of your trials, check cluster resource # utilization, and inspect logs in real-time. # -# For debugging, Ray also offers `Distributed -# Debugging `__ -# tools that let you attach a debugger to running trials across the -# cluster. +# For debugging, Ray also offers `distributed debugging +# tools `__ +# that let you attach a debugger to running trials across the cluster. # # Conclusion -# ---------- +# ========== # # In this tutorial, you learned how to tune the hyperparameters of a # PyTorch model using Ray Tune. You saw how to integrate Ray Tune into @@ -588,7 +588,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # efficiently. # # Further reading -# --------------- +# =============== # # - `Ray Tune # documentation `__ From 60d7e3239c37f0793a06d9d14e9b8227aef66b42 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 16 Dec 2025 13:29:37 -0800 Subject: [PATCH 18/20] Restructure the hyperparameter tuning tutorial --- .../hyperparameter_tuning_tutorial.py | 463 +++++++++--------- 1 file changed, 232 insertions(+), 231 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 89c8c6c6640..3bae0fbc4bb 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -1,6 +1,6 @@ """ -Hyperparameter tuning with Ray Tune -=================================== +Hyperparameter tuning using Ray Tune +==================================== **Author:** `Ricardo Decal `__ @@ -8,21 +8,21 @@ workflow to perform scalable and efficient hyperparameter tuning. `Ray `__, a project of the -PyTorch Foundation, is an open-source unified framework for scaling AI -and Python applications. 
It helps run distributed workloads by handling -the complexity of distributed computing. `Ray +PyTorch Foundation, is an open source unified framework for scaling AI +and Python applications. It helps run distributed jobs by handling the +complexity of distributed computing. `Ray Tune `__ is a library built on Ray for hyperparameter tuning that enables you to scale a hyperparameter sweep from your machine to a large cluster with no code changes. -This tutorial extends the PyTorch tutorial for training a CIFAR10 image -classifier in the `CIFAR10 tutorial (PyTorch -documentation) `__. -Only minor modifications are needed to adapt the PyTorch tutorial for -Ray Tune. Specifically, this tutorial wraps the data loading and -training in functions, makes some network parameters configurable, adds -optional checkpointing, and defines the search space for model tuning. +This tutorial makes minor modifications to the `PyTorch tutorial for +training a CIFAR10 +classifier `__ +to adapt it for Ray Tune. Specifically, this tutorial wraps the data +loading and training in functions, defines a search space for model +tuning, exposes some parameters to make them configurable, adds optional +checkpointing, and supports multi-GPU training. Setup ----- @@ -56,11 +56,14 @@ from ray.tune.schedulers import ASHAScheduler ###################################################################### -# How to use PyTorch data loaders with Ray Tune -# ============================================= +# Data loading +# ============ # -# Wrap the data loaders in a constructor function. Pass a global data -# directory here to reuse the dataset across different trials. +# Wrap the data loaders in a constructor function. In this tutorial, a +# global data directory is passed to the function to enable reusing the +# dataset across different trials. In a cluster environment, you can use +# shared storage, such as network file systems, to prevent each node from +# downloading the data separately. def load_data(data_dir="./data"): # Mean and standard deviation of the CIFAR10 training subset. @@ -79,12 +82,13 @@ def load_data(data_dir="./data"): return trainset, testset ###################################################################### -# Configure the hyperparameters -# ============================= +# Model architecture +# ================== # -# In this tutorial, we will tune the sizes of the fully connected layers -# and the learning rate. In order to do so, we need to expose the layer -# sizes and the learning rate as configurable parameters. +# This tutorial searches for the best sizes for the fully connected layers +# and the learning rate. To enable this, the ``Net`` class exposes the +# layer sizes ``l1`` and ``l2`` as configurable parameters that Ray Tune +# can search over: class Net(nn.Module): def __init__(self, l1=120, l2=84): @@ -106,130 +110,45 @@ def forward(self, x): return x ###################################################################### -# Use a train function with Ray Tune -# ================================== -# -# Now it gets interesting, because we introduce some changes to the -# example `from the PyTorch -# documentation `__. -# -# We wrap the training script in a function -# ``train_cifar(config, data_dir=None)``. The ``config`` parameter -# receives the hyperparameters we want to train with. The ``data_dir`` -# specifies the directory where we load and store the data, allowing -# multiple runs to share the same data source. 
This is especially useful -# in cluster environments where you can mount shared storage (for example -# NFS) to prevent the data from being downloaded to each node separately. -# We also load the model and optimizer state at the start of the run if a -# checkpoint is provided. Further down in this tutorial, you will find -# information on how to save the checkpoint and how it is used. -# -# .. code-block:: python -# -# net = Net(config["l1"], config["l2"]) -# -# checkpoint = tune.get_checkpoint() -# if checkpoint: -# with checkpoint.as_directory() as checkpoint_dir: -# checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" -# checkpoint_state = torch.load(checkpoint_path) -# start_epoch = checkpoint_state["epoch"] -# net.load_state_dict(checkpoint_state["net_state_dict"]) -# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) -# else: -# start_epoch = 0 -# -# The learning rate of the optimizer is made configurable, too: -# -# .. code-block:: python -# -# optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) -# -# We also split the dataset into training and validation subsets. We thus -# train on 80% of the data and calculate the validation loss on the -# remaining 20%. The batch sizes with which we iterate through the -# training and test sets are configurable by Ray Tune. -# -# Add multi-GPU support with DataParallel -# --------------------------------------- -# -# Image classification benefits largely from GPUs. Luckily, you can -# continue to use PyTorch tools in Ray Tune. Thus, you can wrap the model -# in ``nn.DataParallel`` to support data-parallel training on multiple -# GPUs: -# -# .. code-block:: python -# -# if torch.cuda.is_available(): -# # Must move the model to CUDA before wrapping it with ``DataParallel`` -# net = net.to("cuda") -# if torch.cuda.device_count() > 1: -# net = nn.DataParallel(net) -# -# By using a ``device`` variable, we ensure that training works even -# without a GPU. PyTorch requires us to send our data to the GPU memory -# explicitly: -# -# .. code-block:: python -# -# for i, data in enumerate(trainloader, 0): -# inputs, labels = data -# inputs, labels = inputs.to(device), labels.to(device) -# -# The code now supports training on CPUs, on a single GPU, and on multiple -# GPUs. Notably, Ray also supports `fractional -# GPUs `__ -# so we can share GPUs among trials, as long as the model still fits on -# the GPU memory. We will return to that later. +# Define the search space +# ======================= # -# Communicating with Ray Tune -# --------------------------- +# Next, define the hyperparameters to tune and how Ray Tune samples them. +# Ray Tune offers a variety of `search space +# distributions `__ +# to suit different parameter types: ``loguniform``, ``uniform``, +# ``choice``, ``randint``, ``grid``, and more. You can also express +# complex dependencies between parameters with `conditional search +# spaces `__ +# or sample from arbitrary functions. # -# The most interesting part is the communication with Ray Tune. As you’ll -# see, integrating Ray Tune into your training code requires only a few -# additional lines: +# Here is the search space for this tutorial: # # .. 
code-block:: python # -# checkpoint_data = { -# "epoch": epoch, -# "net_state_dict": net.state_dict(), -# "optimizer_state_dict": optimizer.state_dict(), +# config = { +# "l1": tune.choice([2**i for i in range(9)]), +# "l2": tune.choice([2**i for i in range(9)]), +# "lr": tune.loguniform(1e-4, 1e-1), +# "batch_size": tune.choice([2, 4, 8, 16]), # } -# with tempfile.TemporaryDirectory() as checkpoint_dir: -# checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" -# torch.save(checkpoint_data, checkpoint_path) # -# checkpoint = Checkpoint.from_directory(checkpoint_dir) -# tune.report( -# {"loss": val_loss / val_steps, "accuracy": correct / total}, -# checkpoint=checkpoint, -# ) +# The ``tune.choice()`` accepts a list of values that are uniformly +# sampled from. In this example, the ``l1`` and ``l2`` parameter values +# are powers of 2 between 1 and 256, and the learning rate samples on a +# log scale between 0.0001 and 0.1. Sampling on a log scale enables +# exploration across a range of magnitudes on a relative scale, rather +# than an absolute scale. # -# Here we first save a checkpoint and then report some metrics back to Ray -# Tune. Specifically, we send the validation loss and accuracy back to Ray -# Tune. Ray Tune uses these metrics to determine the best hyperparameter -# configuration and to stop underperforming trials early, saving -# resources. +# Training function +# ================= # -# The checkpoint saving is optional. However, it is necessary if we wanted -# to use advanced schedulers like `Population Based -# Training `__. -# Saving the checkpoint also allows us to later load the trained models -# for validation on a test set. Lastly, it provides fault tolerance, -# enabling us to pause and resume training. -# -# To summarize, integrating Ray Tune into your PyTorch training requires -# just a few key additions: use ``tune.report()`` to report metrics (and -# optionally checkpoints) to Ray Tune, ``tune.get_checkpoint()`` to load a -# model from a checkpoint, and ``Checkpoint.from_directory()`` to create a -# checkpoint object from saved state. The rest of your training code -# remains standard PyTorch. -# -# Full training function -# ---------------------- +# Ray Tune requires a training function that accepts a configuration +# dictionary and runs the main training loop. As Ray Tune runs different +# trials, it updates the configuration dictionary for each trial. # -# The full code example looks like this: +# Here is the full training function, followed by explanations of the key +# Ray Tune integration points: def train_cifar(config, data_dir=None): net = Net(config["l1"], config["l2"]) @@ -333,18 +252,110 @@ def train_cifar(config, data_dir=None): print("Finished Training") ###################################################################### -# As you can see, most of the code is adapted directly from the original -# example. +# Key integration points +# ---------------------- +# +# Using hyperparameters from the configuration dictionary +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Ray Tune updates the ``config`` dictionary with the hyperparameters for +# each trial. In this example, the model architecture and optimizer +# receive the hyperparameters from the ``config`` dictionary: # -# Compute test set accuracy -# ========================= +# .. 
code-block:: python +# +# net = Net(config["l1"], config["l2"]) +# optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) +# +# Reporting metrics and saving checkpoints +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The most important integration is communicating with Ray Tune. Ray Tune +# uses the validation metrics to determine the best hyperparameter +# configuration and to stop underperforming trials early, saving +# resources. # -# Commonly the performance of a machine learning model is tested on a -# held-out test set with data that has not been used for training the -# model. We also wrap this in a function: +# Checkpointing enables you to later load the trained models, resume +# hyperparameter searches, and provides fault tolerance. It’s also +# required for some Ray Tune schedulers like `Population Based +# Training `__ +# that pause and resume trials during the search. +# +# This code from the training function loads model and optimizer state at +# the start if a checkpoint exists: +# +# .. code-block:: python +# +# checkpoint = tune.get_checkpoint() +# if checkpoint: +# with checkpoint.as_directory() as checkpoint_dir: +# checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" +# checkpoint_state = torch.load(checkpoint_path) +# start_epoch = checkpoint_state["epoch"] +# net.load_state_dict(checkpoint_state["net_state_dict"]) +# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) +# +# At the end of each epoch, save a checkpoint and report the validation +# metrics: +# +# .. code-block:: python +# +# checkpoint_data = { +# "epoch": epoch, +# "net_state_dict": net.state_dict(), +# "optimizer_state_dict": optimizer.state_dict(), +# } +# with tempfile.TemporaryDirectory() as checkpoint_dir: +# checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" +# torch.save(checkpoint_data, checkpoint_path) +# +# checkpoint = Checkpoint.from_directory(checkpoint_dir) +# tune.report( +# {"loss": val_loss / val_steps, "accuracy": correct / total}, +# checkpoint=checkpoint, +# ) +# +# Ray Tune checkpointing supports local file systems, cloud storage, and +# distributed file systems. For more information, see the `Ray Tune +# storage +# documentation `__. +# +# Multi-GPU support +# ~~~~~~~~~~~~~~~~~ +# +# Image classification models can be greatly accelerated by using GPUs. +# The training function supports multi-GPU training by wrapping the model +# in ``nn.DataParallel``: +# +# .. code-block:: python +# +# if torch.cuda.device_count() > 1: +# net = nn.DataParallel(net) +# +# This training function supports training on CPUs, a single GPU, or +# multiple GPUs without code changes. Ray Tune also supports `fractional +# GPUs `__ +# so that one GPU can be shared among multiple trials, provided that the +# models, optimizers, and data batches fit into the GPU memory. +# +# Validation split +# ~~~~~~~~~~~~~~~~ +# +# The original CIFAR10 dataset only has train and test subsets. This is +# sufficient for training a single model, however for hyperparameter +# tuning a validation subset is required. The training function creates a +# validation subset by reserving 20% of the training subset. The test +# subset is used to evaluate the best model’s generalization error after +# the search completes. 
+# +# Evaluation function +# =================== +# +# After finding the optimal hyperparameters, test the model on a held-out +# test set to estimate the generalization error: -def test_accuracy(net, device="cpu"): - _trainset, testset = load_data() +def test_accuracy(net, device="cpu", data_dir=None): + _trainset, testset = load_data(data_dir) testloader = torch.utils.data.DataLoader( testset, batch_size=4, shuffle=False, num_workers=2 @@ -354,9 +365,9 @@ def test_accuracy(net, device="cpu"): total = 0 with torch.no_grad(): for data in testloader: - images, labels = data - images, labels = images.to(device), labels.to(device) - outputs = net(images) + image_batch, labels = data + image_batch, labels = image_batch.to(device), labels.to(device) + outputs = net(image_batch) _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() @@ -364,109 +375,95 @@ def test_accuracy(net, device="cpu"): return correct / total ###################################################################### -# The function also expects a ``device`` parameter so you can run the test -# set validation on a GPU. -# -# Configure the search space +# Configure and run Ray Tune # ========================== # -# Lastly, we need to define Ray Tune’s search space. Ray Tune offers a -# variety of `search space -# distributions `__ -# to suit different parameter types: ``loguniform``, ``uniform``, -# ``choice``, ``randint``, ``grid``, and more. It also lets you express -# complex dependencies between parameters with `conditional search -# spaces `__. +# With the training and evaluation functions defined, configure Ray Tune +# to run the hyperparameter search. +# +# Scheduler for early stopping +# ---------------------------- # -# Here is an example: +# Ray Tune provides schedulers to improve the efficiency of the +# hyperparameter search by detecting underperforming trials and stopping +# them early. The ``ASHAScheduler`` uses the Asynchronous Successive +# Halving Algorithm (ASHA) to aggressively terminate low-performing +# trials: # # .. code-block:: python # -# config = { -# "l1": tune.choice([2**i for i in range(9)]), -# "l2": tune.choice([2**i for i in range(9)]), -# "lr": tune.loguniform(1e-4, 1e-1), -# "batch_size": tune.choice([2, 4, 8, 16]), -# } +# scheduler = ASHAScheduler( +# max_t=max_num_epochs, +# grace_period=1, +# reduction_factor=2, +# ) # -# The ``tune.choice()`` accepts a list of values that are uniformly -# sampled from. In this example, the ``l1`` and ``l2`` parameter values -# will be powers of 2 between 1 and 256. The learning rate is sampled on a -# log scale between 0.0001 and 0.1. Sampling on a log scale ensures that -# the search space is explored efficiently across different magnitudes. -# -# Smarter sampling and scheduling -# =============================== -# -# To make the hyperparameter search process efficient, Ray Tune provides -# two main controls: -# -# 1. It can intelligently pick the next set of hyperparameters to test -# based on previous results using `advanced search -# algorithms `__ -# such as -# `Optuna `__ -# or -# `BayesOpt `__, -# instead of relying only on random or grid search. -# 2. It can detect underperforming trials and stop them early using -# `schedulers `__, -# enabling you to explore the parameter space more on the same compute -# budget. -# -# In this tutorial, we use the ``ASHAScheduler``, which aggressively -# terminates low-performing trials to save computational resources. 
-# -# Configure the resources -# ======================= +# Ray Tune also provides `advanced search +# algorithms `__ +# to smartly pick the next set of hyperparameters based on previous +# results, instead of relying only on random or grid search. Examples +# include +# `Optuna `__ +# and +# `BayesOpt `__. # -# Tell Ray Tune what resources should be available for each trial using -# ``tune.with_resources``: +# Resource allocation +# ------------------- +# +# Tell Ray Tune what resources to allocate for each trial by passing a +# ``resources`` dictionary to ``tune.with_resources``: # # .. code-block:: python # # tune.with_resources( # partial(train_cifar, data_dir=data_dir), -# resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} +# resources={"cpu": 2, "gpu": gpus_per_trial} # ) # -# This tells Ray Tune to allocate ``cpus_per_trial`` CPUs and -# ``gpus_per_trial`` GPUs for each trial. Ray Tune automatically manages -# the placement of these trials and ensures they are isolated, so you -# don’t need to manually assign GPUs to processes. +# Ray Tune automatically manages the placement of these trials and ensures +# that the trials run in isolation, so you don’t need to manually assign +# GPUs to processes. # # For example, if you are running this experiment on a cluster of 20 # machines, each with 8 GPUs, you can set ``gpus_per_trial = 0.5`` to # schedule two concurrent trials per GPU. This configuration runs 320 # trials in parallel across the cluster. # -# Putting it together -# =================== +# **Note**: To run this tutorial without GPUs, set ``gpus_per_trial=0`` +# and expect significantly longer runtimes. +# +# To avoid long runtimes during development, start with a small number +# of trials and epochs. +# +# Creating the Tuner +# ------------------ # -# The Ray Tune API is designed to be modular and composable: you pass your -# configurations to the ``tune.Tuner`` class to create a tuner object, -# then execute ``tuner.fit()`` to start training: +# The Ray Tune API is modular and composable. Pass your configuration to +# the ``tune.Tuner`` class to create a tuner object, then run +# ``tuner.fit()`` to start training: # # .. code-block:: python # # tuner = tune.Tuner( # tune.with_resources( # partial(train_cifar, data_dir=data_dir), -# resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} +# resources={"cpu": 2, "gpu": gpus_per_trial} # ), # tune_config=tune.TuneConfig( # metric="loss", # mode="min", # scheduler=scheduler, -# num_samples=num_samples, +# num_samples=num_trials, # ), # param_space=config, # ) # results = tuner.fit() # -# After training the models, we will find the best performing one and load -# the trained network from the checkpoint file. We then obtain the test -# set accuracy and report the results. +# After training completes, retrieve the best performing trial, load its +# checkpoint, and evaluate on the test set. 
+# +# Putting it all together +# ----------------------- def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): print("Starting hyperparameter tuning.") @@ -519,17 +516,22 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): best_checkpoint_data = torch.load(checkpoint_path) best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"]) - test_acc = test_accuracy(best_trained_model, device) - print("Best trial test set accuracy: {}".format(test_acc)) + test_acc = test_accuracy(best_trained_model, device, data_dir) + print(f"Best trial test set accuracy: {test_acc}") if __name__ == "__main__": # Set the number of trials, epochs, and GPUs per trial here: - # The following configuration is for a quick run (1 trial, 1 epoch, CPU only) for demonstration purposes. + # The following configuration uses 1 trial, 1 epoch, and CPU only for demonstration purposes. main(num_trials=1, max_num_epochs=1, gpus_per_trial=0) ###################################################################### -# Your Ray Tune trial summary output will look something like this: +# Results +# ======= +# +# Your Ray Tune trial summary output looks something like this. The text +# table summarizes the validation performance of the trials and highlights +# the best hyperparameter configuration: # # .. code-block:: bash # @@ -554,20 +556,18 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # Best trial final validation accuracy: 0.4761 # Best trial test set accuracy: 0.4737 # -# Most trials were stopped early to conserve resources. The best -# performing trial achieved a validation accuracy of approximately 47%, -# which could be confirmed on the test set. -# -# You can now tune the parameters of your PyTorch models. +# Most trials stopped early to conserve resources. The best performing +# trial achieved a validation accuracy of approximately 47%, which the +# test set confirms. # # Observability # ============= # -# When running large-scale experiments, monitoring is crucial. Ray +# Monitoring is critical when running large-scale experiments. Ray # provides a -# `Dashboard `__ +# `dashboard `__ # that lets you view the status of your trials, check cluster resource -# utilization, and inspect logs in real-time. +# use, and inspect logs in real time. # # For debugging, Ray also offers `distributed debugging # tools `__ @@ -579,13 +579,14 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): # In this tutorial, you learned how to tune the hyperparameters of a # PyTorch model using Ray Tune. You saw how to integrate Ray Tune into # your PyTorch training loop, define a search space for your -# hyperparameters, use an efficient scheduler like ASHA to terminate bad -# trials early, save checkpoints and report metrics to Ray Tune, and run -# the hyperparameter search and analyze the results. -# -# Ray Tune makes it easy to scale your experiments from a single machine -# to a large cluster, helping you find the best model configuration -# efficiently. +# hyperparameters, use an efficient scheduler like ASHAScheduler to +# terminate low-performing trials early, save checkpoints and report +# metrics to Ray Tune, and run the hyperparameter search and analyze the +# results. +# +# Ray Tune makes it straightforward to scale your experiments from a +# single machine to a large cluster, helping you find the best model +# configuration efficiently. 
# # Further reading # =============== From 025c44b44cc63dd46454082aef12f128f4b84428 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 16 Dec 2025 14:47:34 -0800 Subject: [PATCH 19/20] add GPUs back to default config --- beginner_source/hyperparameter_tuning_tutorial.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 3bae0fbc4bb..bfb44ee521b 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -465,7 +465,7 @@ def test_accuracy(net, device="cpu", data_dir=None): # Putting it all together # ----------------------- -def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): +def main(num_trials=10, max_num_epochs=10, gpus_per_trial=0): print("Starting hyperparameter tuning.") ray.init() @@ -522,8 +522,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): if __name__ == "__main__": # Set the number of trials, epochs, and GPUs per trial here: - # The following configuration uses 1 trial, 1 epoch, and CPU only for demonstration purposes. - main(num_trials=1, max_num_epochs=1, gpus_per_trial=0) + main(num_trials=10, max_num_epochs=10, gpus_per_trial=1) ###################################################################### # Results From 94faa5397d6b0537cc4a24078a1fc02a51dd8a85 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 16 Dec 2025 15:23:30 -0800 Subject: [PATCH 20/20] Expose cpus_per_trial for configuration --- beginner_source/hyperparameter_tuning_tutorial.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index bfb44ee521b..fbc214db92c 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -417,7 +417,7 @@ def test_accuracy(net, device="cpu", data_dir=None): # # tune.with_resources( # partial(train_cifar, data_dir=data_dir), -# resources={"cpu": 2, "gpu": gpus_per_trial} +# resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} # ) # # Ray Tune automatically manages the placement of these trials and ensures @@ -447,7 +447,7 @@ def test_accuracy(net, device="cpu", data_dir=None): # tuner = tune.Tuner( # tune.with_resources( # partial(train_cifar, data_dir=data_dir), -# resources={"cpu": 2, "gpu": gpus_per_trial} +# resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} # ), # tune_config=tune.TuneConfig( # metric="loss", @@ -465,7 +465,7 @@ def test_accuracy(net, device="cpu", data_dir=None): # Putting it all together # ----------------------- -def main(num_trials=10, max_num_epochs=10, gpus_per_trial=0): +def main(num_trials=10, max_num_epochs=10, gpus_per_trial=0, cpus_per_trial=2): print("Starting hyperparameter tuning.") ray.init() @@ -488,7 +488,7 @@ def main(num_trials=10, max_num_epochs=10, gpus_per_trial=0): tuner = tune.Tuner( tune.with_resources( partial(train_cifar, data_dir=data_dir), - resources={"cpu": 2, "gpu": gpus_per_trial} + resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} ), tune_config=tune.TuneConfig( metric="loss",