diff --git a/xtuner/v1/engine/train_engine.py b/xtuner/v1/engine/train_engine.py
index 4b3e99873..2e29c668e 100644
--- a/xtuner/v1/engine/train_engine.py
+++ b/xtuner/v1/engine/train_engine.py
@@ -77,7 +77,7 @@ def __init__(self, model_path, cache_dir=None, from_hub="huggingface"):
             self.use_safetensors = False
         elif "model.safetensors" in os.listdir(self.model_path):
             with safe_open(os.path.join(self.model_path, "model.safetensors"), framework="pt") as f:
-                self.weight_map = {k: "model.safetensors" for k in f.keys()}
+                self.weight_map = dict.fromkeys(f.keys(), "model.safetensors")
             self.use_safetensors = True
         else:
             raise FileNotFoundError
diff --git a/xtuner/v1/model/base.py b/xtuner/v1/model/base.py
index d558ce151..402058172 100644
--- a/xtuner/v1/model/base.py
+++ b/xtuner/v1/model/base.py
@@ -763,7 +763,7 @@ def _save_hf(self, hf_dir: Path | str, save_dtype: torch.dtype = torch.bfloat16)
             safetensor_index += 1
             safetensor_name = f"model-{safetensor_index:04d}-fused-save_rank{save_rank}.safetensors"
-            weight_map.update({name: safetensor_name for name in name_list})
+            weight_map.update(dict.fromkeys(name_list, safetensor_name))
             assert save_executor is not None, "Internal Error, save_executor should not be None"
             future = save_executor.submit(
                 _save_file,
diff --git a/xtuner/v1/train/trainer.py b/xtuner/v1/train/trainer.py
index 5d3ba020b..41dd5fd31 100644
--- a/xtuner/v1/train/trainer.py
+++ b/xtuner/v1/train/trainer.py
@@ -21,7 +21,7 @@
 from pydantic import BaseModel, ConfigDict, field_serializer, field_validator, model_validator
 from torch.distributed import init_process_group
 from torch.distributed.device_mesh import init_device_mesh
-from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR, LinearLR, SequentialLR
+from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR, LinearLR, LRScheduler, SequentialLR
 from typing_extensions import NotRequired, Self, TypedDict

 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -764,7 +764,7 @@ def build_engine(
         engine.model.set_hf(model_path)
         return engine

-    def build_lr_scheduler(self, lr_cfg: LRConfig, scheduler_step: int) -> torch.optim.lr_scheduler.LRScheduler:
+    def build_lr_scheduler(self, lr_cfg: LRConfig, scheduler_step: int) -> LRScheduler:
         """Build the learning rate scheduler.

         Args:
@@ -774,36 +774,49 @@ def build_lr_scheduler(self, lr_cfg: LRConfig, scheduler_step: int) -> torch.opt
             torch.optim.lr_scheduler.LRScheduler: Configured learning rate scheduler.
""" if lr_cfg.warmup_ratio < 1: - warmup_steps = int(lr_cfg.warmup_ratio * scheduler_step) + warmup_step = int(lr_cfg.warmup_ratio * scheduler_step) else: - warmup_steps = int(lr_cfg.warmup_ratio) + warmup_step = int(lr_cfg.warmup_ratio) def warmup_fn(x): - return x / warmup_steps if x < warmup_steps else 1 + return x / warmup_step if x < warmup_step else 1 warmup_scheduler = LambdaLR(self._engine.optimizer, warmup_fn) - scheduler: torch.optim.lr_scheduler.LRScheduler - if lr_cfg.lr_type == "linear": - scheduler = LinearLR( - self._engine.optimizer, - start_factor=1.0, - end_factor=lr_cfg.lr_min / self._engine.optimizer.defaults["lr"], - total_iters=scheduler_step - warmup_steps, + scheduler_after_warmup: LRScheduler + lr_scheduler: LRScheduler + + if warmup_step < scheduler_step: + if lr_cfg.lr_type == "linear": + scheduler_after_warmup = LinearLR( + self._engine.optimizer, + start_factor=1.0, + end_factor=lr_cfg.lr_min / self._engine.optimizer.defaults["lr"], + total_iters=scheduler_step - warmup_step, + ) + elif lr_cfg.lr_type == "cosine": + scheduler_after_warmup = CosineAnnealingLR( + self._engine.optimizer, T_max=scheduler_step - warmup_step, eta_min=lr_cfg.lr_min + ) + elif lr_cfg.lr_type == "constant": + scheduler_after_warmup = LambdaLR(self._engine.optimizer, lambda x: 1.0) + else: + raise ValueError(f"Unsupported lr type: {lr_cfg.lr_type}") + lr_scheduler = SequentialLR( + optimizer=self._engine.optimizer, + schedulers=[warmup_scheduler, scheduler_after_warmup], + milestones=[warmup_step], ) - elif lr_cfg.lr_type == "cosine": - scheduler = CosineAnnealingLR( - self._engine.optimizer, T_max=scheduler_step - warmup_steps, eta_min=lr_cfg.lr_min + elif warmup_step == scheduler_step: + self.logger.warning( + f"You're setting warmup_step ({warmup_step} to be equal to scheduler_step ({scheduler_step}), " + "which is generally not recommended." ) - elif lr_cfg.lr_type == "constant": - scheduler = LambdaLR(self._engine.optimizer, lambda x: 1.0) + lr_scheduler = warmup_scheduler else: - raise ValueError(f"Unsupported lr type: {lr_cfg.lr_type}") - lr_scheduler = SequentialLR( - optimizer=self._engine.optimizer, - schedulers=[warmup_scheduler, scheduler], - milestones=[warmup_steps], - ) + raise ValueError( + f"Expected warmup_step ({warmup_step}) to be no more than scheduler_step ({scheduler_step})" + ) return lr_scheduler def _maybe_save(self, is_snapshot: bool = False) -> bool: diff --git a/xtuner/v1/utils/loader.py b/xtuner/v1/utils/loader.py index 37050c677..bd91d2c36 100644 --- a/xtuner/v1/utils/loader.py +++ b/xtuner/v1/utils/loader.py @@ -68,7 +68,7 @@ def __init__( self.use_safetensors = False elif "model.safetensors" in os.listdir(self.model_path): with safe_open(os.path.join(self.model_path, "model.safetensors"), framework="pt") as f: - self.weight_map = {k: "model.safetensors" for k in f.keys()} + self.weight_map = dict.fromkeys(f.keys(), "model.safetensors") self.use_safetensors = True else: raise FileNotFoundError