From cdded6e7ccf1ff35608a93a879eee925511411dd Mon Sep 17 00:00:00 2001
From: jundi69
Date: Mon, 5 May 2025 08:45:17 +0000
Subject: [PATCH 1/3] Clip after allreduce + Zero outer optimizer grads + remove gradscaler + reduce effective batch size

---
 distributed_training/averaging/avg_handler.py | 14 ++++++++++++++
 distributed_training/utils/config.py          |  2 +-
 distributed_training/utils/state_loader.py    |  9 +--------
 neurons/miner.py                              |  7 +++----
 4 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/distributed_training/averaging/avg_handler.py b/distributed_training/averaging/avg_handler.py
index 104c7229..ec3d0e35 100644
--- a/distributed_training/averaging/avg_handler.py
+++ b/distributed_training/averaging/avg_handler.py
@@ -198,10 +198,16 @@ async def run_validator_allreduce(
 
                 # Update state_avgs main params with inner optimizer params
                 self.update_main_param_after_outer_step()
+
+                # Zero grads of outer optimizer
+                self.state_averager.optimizer.zero_grad()
 
                 bt.logging.info(
                     ":white_heavy_check_mark: Finished Outer Optimizer Step."
                 )
+
+                # Clip gradients again
+                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
 
                 # Validate weight updates
                 await self._validate_weight_update(initial_weights, block)
@@ -392,11 +398,19 @@ async def run_miner_allreduce(
                 self.state_averager.step(
                     increment_epoch=True, optimizer_step=True, zero_grad=False
                 )
+
+                # Update state_avgs main params with inner optimizer params
                 self.update_main_param_after_outer_step()
+
+                # Zero grads of outer optimizer
+                self.state_averager.optimizer.zero_grad()
 
                 bt.logging.info(
                     ":white_heavy_check_mark: Finished Outer Optimizer Step."
                 )
+
+                # Clip gradients again
+                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
 
                 # Validate weight updates
                 await self._validate_weight_update(initial_weights, block)
diff --git a/distributed_training/utils/config.py b/distributed_training/utils/config.py
index a2e03613..c0f418db 100644
--- a/distributed_training/utils/config.py
+++ b/distributed_training/utils/config.py
@@ -196,7 +196,7 @@ def add_args(cls, parser, prefix=None):
         "--neuron.local_batch_size_train_effective",
         type=int,
         help="Amount of micro batches for gradient accumulation",
-        default=2048,
+        default=512,
     )
 
     parser.add_argument(
diff --git a/distributed_training/utils/state_loader.py b/distributed_training/utils/state_loader.py
index 5e842677..5ebb6def 100644
--- a/distributed_training/utils/state_loader.py
+++ b/distributed_training/utils/state_loader.py
@@ -16,7 +16,6 @@
 import hivemind
 import psutil
 import torch
-from memory_profiler import profile
 
 from datetime import datetime
 from hivemind.compression import deserialize_torch_tensor
@@ -32,11 +31,7 @@
     scan_cache_dir,
     upload_folder,
 )
-from huggingface_hub.utils import (
-    HfHubHTTPError,
-    RepositoryNotFoundError,
-    EntryNotFoundError,
-)
+from huggingface_hub.utils import HfHubHTTPError
 from huggingface_hub.constants import HF_HUB_CACHE
 from transformers import (
     AutoModelForCausalLM,
@@ -547,8 +542,6 @@ def load_model_optimizer_gradient_averager(
             self.device,
         )
 
-        self.scaler = torch.amp.GradScaler(enabled=True)
-
         if (self.local_progress.inner_step != 0) and ("." in revision):
             self.state_averager.reset_main_parameters(
                 model_name,
diff --git a/neurons/miner.py b/neurons/miner.py
index ab370c49..128b6e90 100644
--- a/neurons/miner.py
+++ b/neurons/miner.py
@@ -545,7 +545,7 @@ def _process_training_batch(self, dataset):
                 outputs = self.model(input_ids=inputs, labels=labels)
                 loss = outputs[1] / self.number_of_local_steps
 
-            self.scaler.scale(loss).backward()
+            loss.backward()
 
             self.running_loss += loss.item() * self.number_of_local_steps
             self.batch_count += 1
@@ -581,9 +581,8 @@ def _process_training_batch(self, dataset):
 
     def inner_optimizer_step(self):
         torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
-        self.scaler.unscale_(optimizer=self.inner_optimizer)
-        self.scaler.step(self.inner_optimizer)
-        self.scaler.update()
+
+        self.inner_optimizer.step()
 
         self.scheduler.step()
 

From ac24a0155ec652cbf4acfe6aab1a97e6124e7285 Mon Sep 17 00:00:00 2001
From: jundi69
Date: Mon, 5 May 2025 14:17:46 +0200
Subject: [PATCH 2/3] Fix: switch param copying to correct params

---
 distributed_training/averaging/avg_handler.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/distributed_training/averaging/avg_handler.py b/distributed_training/averaging/avg_handler.py
index ec3d0e35..17990f79 100644
--- a/distributed_training/averaging/avg_handler.py
+++ b/distributed_training/averaging/avg_handler.py
@@ -197,7 +197,7 @@ async def run_validator_allreduce(
                 )
 
                 # Update state_avgs main params with inner optimizer params
-                self.update_main_param_after_outer_step()
+                self.update_local_model_after_outer_step()
 
                 # Zero grads of outer optimizer
                 self.state_averager.optimizer.zero_grad()
@@ -400,7 +400,7 @@ async def run_miner_allreduce(
                 )
 
                 # Update state_avgs main params with inner optimizer params
-                self.update_main_param_after_outer_step()
+                self.update_local_model_after_outer_step()
 
                 # Zero grads of outer optimizer
                 self.state_averager.optimizer.zero_grad()
@@ -433,14 +433,14 @@ async def run_miner_allreduce(
         bt.logging.success("Averaging Round Finished Succesfully")
         return synapse
 
-    def update_main_param_after_outer_step(self):
+    def update_local_model_after_outer_step(self):
         """Update the main parameters with the inner optimizer step"""
         opt_parameters = [
             param
             for group in self.inner_optimizer.param_groups
             for param in group["params"]
         ]
-        for main_param, opt_param in zip(
-            self.state_averager.main_parameters, opt_parameters
+        for local_model_param, avg_param in zip(
+            opt_parameters, self.state_averager.main_parameters
         ):
-            main_param.data.copy_(opt_param.data, non_blocking=True)
+            local_model_param.data.copy_(avg_param.data.to(model_param.device), non_blocking=True)

From 678e7f0f5db0abbb0a5b857ed71bc4970823afde Mon Sep 17 00:00:00 2001
From: jundi69
Date: Mon, 5 May 2025 14:25:41 +0200
Subject: [PATCH 3/3] Fix: change wrong var

---
 distributed_training/averaging/avg_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/distributed_training/averaging/avg_handler.py b/distributed_training/averaging/avg_handler.py
index 17990f79..50857845 100644
--- a/distributed_training/averaging/avg_handler.py
+++ b/distributed_training/averaging/avg_handler.py
@@ -443,4 +443,4 @@ def update_local_model_after_outer_step(self):
         for local_model_param, avg_param in zip(
             opt_parameters, self.state_averager.main_parameters
         ):
-            local_model_param.data.copy_(avg_param.data.to(model_param.device), non_blocking=True)
+            local_model_param.data.copy_(avg_param.data.to(local_model_param.device), non_blocking=True)