From 33638826a1b9c1d4a3d738af86c41400550dddbd Mon Sep 17 00:00:00 2001 From: chenyushuo <297086016@qq.com> Date: Mon, 19 Jan 2026 19:23:17 +0800 Subject: [PATCH 1/5] add convert mode to launcher --- tests/trainer/trainer_test.py | 22 ++-- trinity/cli/launcher.py | 184 +++++++++++++++++++++++++++++++++ trinity/common/models/utils.py | 3 +- 3 files changed, 200 insertions(+), 9 deletions(-) diff --git a/tests/trainer/trainer_test.py b/tests/trainer/trainer_test.py index 99eb711975..94182937a2 100644 --- a/tests/trainer/trainer_test.py +++ b/tests/trainer/trainer_test.py @@ -28,7 +28,7 @@ get_vision_language_model_path, ) from trinity.buffer import get_buffer_reader -from trinity.cli.launcher import bench, both, explore, run, serve, train +from trinity.cli.launcher import bench, both, convert, explore, run, serve, train from trinity.common.config import ( AlgorithmConfig, BufferConfig, @@ -98,7 +98,7 @@ def test_trainer(self): eval_tasksets[0].repeat_times = 4 eval_tasksets[1].repeat_times = 4 self.config.trainer.save_interval = 4 - self.config.trainer.save_hf_checkpoint = "always" + self.config.trainer.save_hf_checkpoint = "never" if self.strategy == "megatron": self.config.trainer.trainer_strategy = "megatron" self.config.check_and_update() @@ -144,12 +144,18 @@ def test_trainer(self): ) self.assertGreater(len(os.listdir(os.path.join(checkpoint_step_4, "actor"))), 0) self.assertGreater(len(os.listdir(os.path.join(checkpoint_step_8, "actor"))), 0) - self.assertGreater( - len(os.listdir(os.path.join(checkpoint_step_4, "actor", "huggingface"))), 0 - ) - self.assertGreater( - len(os.listdir(os.path.join(checkpoint_step_8, "actor", "huggingface"))), 0 - ) + hf_dir_step_4 = os.listdir(os.path.join(checkpoint_step_4, "actor", "huggingface")) + hf_dir_step_8 = os.listdir(os.path.join(checkpoint_step_8, "actor", "huggingface")) + self.assertGreater(len(hf_dir_step_4), 0) + self.assertGreater(len(hf_dir_step_8), 0) + self.assertNotIn("model.safetensors", hf_dir_step_4) + self.assertNotIn("model.safetensors", hf_dir_step_8) + # test checkpoint convert + convert(self.config.checkpoint_job_dir) + hf_dir_step_4 = os.listdir(os.path.join(checkpoint_step_4, "actor", "huggingface")) + hf_dir_step_8 = os.listdir(os.path.join(checkpoint_step_8, "actor", "huggingface")) + self.assertIn("model.safetensors", hf_dir_step_4) + self.assertIn("model.safetensors", hf_dir_step_8) self.assertEqual(step_num, 8) ray.init(ignore_reinit_error=True, namespace=self.config.ray_namespace) # test bench mode diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py index 46e6cb2a0e..d9209bf589 100644 --- a/trinity/cli/launcher.py +++ b/trinity/cli/launcher.py @@ -6,6 +6,7 @@ import traceback from pathlib import Path from pprint import pprint +from typing import Optional import ray @@ -301,6 +302,171 @@ def debug( ) +class Converter: + def __init__(self, base_model_dir: Optional[str] = None): + self.logger = get_logger(__name__) + self.base_model_dir = base_model_dir + self.base_model = None + self._init_process_group = False + self.checkpoint_converter = None + + def init_base_model(self) -> bool: + if not self.base_model_dir: + return False + if self.base_model is not None: + return True + try: + self.base_model, _ = self._get_config_and_empty_model(self.base_model_dir) + except Exception: + return False + return True + + def init_process_group(self): + if self._init_process_group: + return + + import torch + from verl.utils.device import get_nccl_backend + from verl.utils.distributed import set_numa_affinity + + if 
"WORLD_SIZE" not in os.environ: + os.environ["RANK"] = "0" + os.environ["LOCAL_RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + + set_numa_affinity() + torch.distributed.init_process_group(get_nccl_backend()) + self._init_process_group = True + + def init_checkpoint_converter(self, checkpoint_dir) -> bool: + if self.checkpoint_converter is not None: + return True + if not os.path.basename(checkpoint_dir).startswith("global_step_"): + self.logger.error(f"Invalid checkpoint directory {checkpoint_dir}.") + return False + + actor_ckpt_dir = os.path.join(checkpoint_dir, "actor") + huggingface_dir = os.path.join(actor_ckpt_dir, "huggingface") + if not os.path.exists(os.path.join(huggingface_dir, "config.json")): + if not self.init_base_model(): + self.logger.error( + f"Failed to load base model from {self.base_model_dir}, " + "please check if the model exists." + ) + return False + self.base_model.config.save_pretrained(huggingface_dir) + + from trinity.common.models.utils import get_megatron_converter + + self.init_process_group() + self.checkpoint_converter = get_megatron_converter(actor_ckpt_dir) + return True + + def _get_config_and_empty_model(self, model_dir: str): + import torch + import transformers + from accelerate import init_empty_weights + + model_config = transformers.AutoConfig.from_pretrained(model_dir) + + if "ForTokenClassification" in model_config.architectures[0]: + from transformers import AutoModelForTokenClassification + + auto_model_cls = AutoModelForTokenClassification + elif "ForCausalLM" in model_config.architectures[0]: + from transformers import AutoModelForCausalLM + + auto_model_cls = AutoModelForCausalLM + elif "ForConditionalGeneration" in model_config.architectures[0]: + # Handle different transformers versions for Vision2Seq models + import transformers + from packaging import version + + if version.parse(transformers.__version__) >= version.parse("4.54.0"): + # transformers >= 4.54.0 uses AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText + + auto_model_cls = AutoModelForImageTextToText + else: + # transformers < 4.54.0 uses AutoModelForVision2Seq + from transformers import AutoModelForVision2Seq + + auto_model_cls = AutoModelForVision2Seq + else: + raise NotImplementedError(f"Unknown architecture {model_config['architectures']}") + + with init_empty_weights(): + model = auto_model_cls.from_config(model_config, dtype=torch.bfloat16) + model.to_empty(device="cpu") + + return model, auto_model_cls + + def convert(self, checkpoint_dir: str) -> None: + if os.path.basename(checkpoint_dir).startswith("global_step_"): + import torch + + actor_ckpt_dir = os.path.join(checkpoint_dir, "actor") + huggingface_dir = os.path.join(actor_ckpt_dir, "huggingface") + model = None + if os.path.exists(huggingface_dir): + has_hf_checkpoint = True + try: + model, auto_model_cls = self._get_config_and_empty_model(huggingface_dir) + auto_model_cls.from_pretrained(huggingface_dir) + except Exception: + has_hf_checkpoint = False + + if has_hf_checkpoint: + return + if model is None: + if not self.init_base_model(): + self.logger.error( + f"Failed to load base model from {self.base_model_dir}, please check if the model exists." 
+ ) + return + model = self.base_model + + self.logger.info(f"Converting {checkpoint_dir} to huggingface format...") + dist_cpkt_dir = os.path.join(actor_ckpt_dir, "dist_ckpt") + try: + if os.path.exists(dist_cpkt_dir): # megatron + if not self.init_checkpoint_converter(checkpoint_dir): + return + state_dict = self.checkpoint_converter.get_state_dict(actor_ckpt_dir) + else: # fsdp + from trinity.common.models.utils import ( + load_fsdp_state_dict_from_verl_checkpoint, + ) + + state_dict = load_fsdp_state_dict_from_verl_checkpoint(actor_ckpt_dir) + except Exception: + self.logger.error( + f"Failed to convert {checkpoint_dir} to huggingface format.", + exc_info=True, + ) + return + + state_dict = {k: v.to(torch.bfloat16) for k, v in state_dict.items()} + model.save_pretrained(huggingface_dir, state_dict=state_dict) + self.logger.info(f"Saved huggingface checkpoint to {huggingface_dir}") + + else: # recursive search + for sub_dir in os.listdir(checkpoint_dir): + sub_dir_path = os.path.join(checkpoint_dir, sub_dir) + if os.path.isdir(sub_dir_path): + self.convert(sub_dir_path) + + +def convert(checkpoint_dir: str, base_model_dir: Optional[str] = None) -> None: + if "global_step_" in checkpoint_dir: + while not os.path.basename(checkpoint_dir).startswith("global_step_"): + checkpoint_dir = os.path.dirname(checkpoint_dir) + converter = Converter(base_model_dir) + converter.convert(checkpoint_dir) + + def main() -> None: """The main entrypoint.""" parser = argparse.ArgumentParser() @@ -367,6 +533,22 @@ def main() -> None: help="The port for Experience Viewer.", ) + convert_parser = subparsers.add_parser( + "convert", help="Convert checkpoint to huggingface format." + ) + convert_parser.add_argument( + "--checkpoint-dir", + type=str, + required=True, + help="The path to the checkpoint directory.", + ) + convert_parser.add_argument( + "--base-model-dir", + type=str, + default=None, + help="The path to the base model.", + ) + args = parser.parse_args() if args.command == "run": # TODO: support parse all args from command line @@ -383,6 +565,8 @@ def main() -> None: args.port, args.plugin_dir, ) + elif args.command == "convert": + convert(args.checkpoint_dir, args.base_model_dir) if __name__ == "__main__": diff --git a/trinity/common/models/utils.py b/trinity/common/models/utils.py index 5e3b9f1020..c2c76519c9 100644 --- a/trinity/common/models/utils.py +++ b/trinity/common/models/utils.py @@ -413,7 +413,8 @@ def __init__(self, config: ModelMergerConfig): self.hf_config = AutoConfig.from_pretrained( self.config.hf_model_config_path, trust_remote_code=self.config.trust_remote_code ) - print(self.hf_config, flush=True) + self.logger = get_logger(__name__) + self.logger.debug(self.hf_config) self.params_mapping = { # megatron core gpt model name, huggingface model name From 05d01962231fdd132b3f83700604a5a806e67098 Mon Sep 17 00:00:00 2001 From: chenyushuo <297086016@qq.com> Date: Tue, 20 Jan 2026 11:43:14 +0800 Subject: [PATCH 2/5] apply review and update docs --- .../tutorial/example_reasoning_basic.md | 87 +++++++++ .../tutorial/example_reasoning_basic.md | 88 +++++++++ trinity/cli/launcher.py | 158 +--------------- trinity/manager/checkpoint_converter.py | 173 ++++++++++++++++++ 4 files changed, 349 insertions(+), 157 deletions(-) create mode 100644 trinity/manager/checkpoint_converter.py diff --git a/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md b/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md index 78db3831f2..e070da5112 100644 --- 
a/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md +++ b/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md @@ -117,6 +117,93 @@ Run the RFT process with the following command: trinity run --config examples/grpo_gsm8k/gsm8k.yaml ``` +## Optional: Convert Checkpoints to Hugging Face Format + +After running Trinity-RFT experiments, the system automatically saves training checkpoints to the following path: + +``` +${checkpoint_root_dir}/${project_name}/${name} +``` + +The directory structure is as follows: + +``` +${checkpoint_root_dir}/${project_name}/${name} +├── buffer +│ ├── experience_buffer.jsonl # Stores experience data generated during training +│ └── explorer_output.db # Database file output by the Explorer module +├── log # Contains logs from multiple Ray Actors +│ ├── checkpoint_monitor.log +│ ├── explorer.log +│ ├── explorer_experience_pipeline.log +│ ├── explorer_runner_0.log ... explorer_runner_31.log +│ ├── queue_experience_buffer.log +│ └── synchronizer.log +├── monitor # Monitoring-related files (may be empty) +├── global_step_58 # Example: Full checkpoint at step 58 +│ └── actor +│ ├── huggingface # (Optional) Hugging Face formatted model files +│ │ ├── added_tokens.json +│ │ ├── chat_template.jinja +│ │ ├── config.json +│ │ ├── generation_config.json +│ │ ├── merges.txt +│ │ ├── model.safetensors # ← Key model weights file +│ │ ├── special_tokens_map.json +│ │ ├── tokenizer.json +│ │ ├── tokenizer_config.json +│ │ └── vocab.json +│ ├── extra_state_world_size_4_rank_0.pt # Additional state (e.g., random seeds) +│ ├── ... +│ ├── fsdp_config.json # FSDP configuration file +│ ├── model_world_size_4_rank_0.pt ... model_world_size_4_rank_3.pt # Sharded model parameters +│ ├── optim_world_size_4_rank_0.pt ... optim_world_size_4_rank_3.pt # Sharded optimizer states +│ └── ... +├── explorer_meta.json # Metadata for the Explorer module +├── trainer_meta.json # Metadata for the Trainer module +├── latest_checkpointed_iteration.txt # Training step of the most recent full checkpoint +└── latest_state_dict_iteration.txt # Training step of the most recent model parameter save (used for checkpoint synchronization) +``` + +### When Is Conversion Needed? + +If you wish to use the model in **Hugging Face format** (e.g., for inference or deployment), but find that the `model.safetensors` file is **missing** from the `global_step_*/actor/huggingface/` directory, you need to manually perform the conversion. + +### Automatic Batch Conversion Feature + +The `trinity convert` command supports **recursively scanning** all `global_step_*` subdirectories under the specified directory and **automatically converting each checkpoint**. This means you don't need to manually specify each training step individually—just point to the project root directory to process all checkpoints in bulk. + +#### Basic Usage (Recommended) + +To convert **all saved checkpoints** to Hugging Face format, simply run: + +```bash +trinity convert --checkpoint-dir ${checkpoint_root_dir}/${project_name}/${name} +``` + +This command will: +- Automatically locate all subdirectories matching the pattern `global_step_`; +- Convert the `actor` model within each subdirectory; +- Save the resulting Hugging Face format files (including `model.safetensors`, etc.) into the corresponding `actor/huggingface/` directory. 
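+
+Once a checkpoint has been converted, its `actor/huggingface/` directory can be loaded like any ordinary Hugging Face model. The snippet below is a minimal sketch of loading a converted checkpoint for inference; the `checkpoints/my_project/my_run/global_step_8` path is only an example, so substitute your own `${checkpoint_root_dir}/${project_name}/${name}` and step:
+
+```python
+import os
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Example path; replace with your own checkpoint_root_dir/project_name/name and step.
+hf_dir = os.path.join("checkpoints", "my_project", "my_run", "global_step_8", "actor", "huggingface")
+
+tokenizer = AutoTokenizer.from_pretrained(hf_dir)
+# The converter saves weights in bfloat16, so load them in the same dtype.
+model = AutoModelForCausalLM.from_pretrained(hf_dir, torch_dtype=torch.bfloat16)
+
+inputs = tokenizer("What is 12 * 7?", return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```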
+ +#### Special Case: Missing Base Model Configuration + +If a `global_step_*/actor/huggingface/` directory is **missing `config.json`** (typically because the full configuration wasn't saved during training), the conversion process requires the configuration file from the original base model. In this case, specify the base model path using `--base-model-dir`: + +```bash +trinity convert \ + --checkpoint-dir ${checkpoint_root_dir}/${project_name}/${name} \ + --base-model-dir /path/to/your/base/model +``` + +> 💡 This parameter applies to **all scanned checkpoints**. If any checkpoint lacks `config.json`, you must provide this argument. + +### Notes + +- **Actor Model Only**: The current `trinity convert` command only processes model parameters in the `actor` folder and **does not handle `critic` models** (even if they exist). Converting Critic models requires separate operations. +- **Automatic Training Format Detection**: `trinity convert` natively supports checkpoints from both **FSDP** and **Megatron** distributed training formats. **No additional parameters are required**—the tool automatically detects the format and correctly merges the sharded weights. +- **Idempotency**: If a `global_step_*` checkpoint already contains a complete set of Hugging Face files (especially `model.safetensors`) in its `huggingface/` directory, the conversion will be skipped to avoid redundant processing. +- **Performance Tip**: The conversion process can be time-consuming, especially when dealing with many checkpoints or large models. It's recommended to run this during off-peak hours. ## Optional: RFT with SFT Warmup diff --git a/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md b/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md index a0f51d1015..2fd416f3af 100644 --- a/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md +++ b/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md @@ -118,6 +118,94 @@ trinity run --config examples/grpo_gsm8k/gsm8k.yaml ``` +## 可选:将检查点转换为 Hugging Face 格式 + +在运行 Trinity-RFT 进行实验后,系统会自动将训练过程中的检查点(checkpoint)保存到以下路径: + +``` +${checkpoint_root_dir}/${project_name}/${name} +``` + +该目录的结构如下: + +``` +${checkpoint_root_dir}/${project_name}/${name} +├── buffer +│ ├── experience_buffer.jsonl # 存储训练过程中生成的经验数据 +│ └── explorer_output.db # Explorer 模块输出的数据库文件 +├── log # 包含多个 Ray Actor 的日志 +│ ├── checkpoint_monitor.log +│ ├── explorer.log +│ ├── explorer_experience_pipeline.log +│ ├── explorer_runner_0.log ... explorer_runner_31.log +│ ├── queue_experience_buffer.log +│ └── synchronizer.log +├── monitor # 监控相关文件(可能为空) +├── global_step_58 # 示例:第 58 步的完整检查点 +│ └── actor +│ ├── huggingface # (可选)Hugging Face 格式的模型文件 +│ │ ├── added_tokens.json +│ │ ├── chat_template.jinja +│ │ ├── config.json +│ │ ├── generation_config.json +│ │ ├── merges.txt +│ │ ├── model.safetensors # ← 关键模型权重文件 +│ │ ├── special_tokens_map.json +│ │ ├── tokenizer.json +│ │ ├── tokenizer_config.json +│ │ └── vocab.json +│ ├── extra_state_world_size_4_rank_0.pt # 额外状态(如随机数种子等) +│ ├── ... +│ ├── fsdp_config.json # FSDP 配置文件 +│ ├── model_world_size_4_rank_0.pt ... model_world_size_4_rank_3.pt # 分片模型参数 +│ ├── optim_world_size_4_rank_0.pt ... optim_world_size_4_rank_3.pt # 分片优化器状态 +│ └── ... +├── explorer_meta.json # Explorer 模块的元数据 +├── trainer_meta.json # Trainer 模块的元数据 +├── latest_checkpointed_iteration.txt # 最近一次完整检查点的训练步数 +└── latest_state_dict_iteration.txt # 最近一次保存模型参数的训练步数(用于 checkpoint 同步) +``` + +### 何时需要转换? 
+ +如果你希望使用 **Hugging Face 格式** 的模型(例如用于推理或部署),但发现 `global_step_*/actor/huggingface/` 目录中 **缺少 `model.safetensors` 文件**,就需要手动执行转换。 + +### 自动批量转换功能 + +`trinity convert` 命令支持**递归扫描**指定目录下的所有 `global_step_*` 子文件夹,并**自动为每个检查点执行转换**。这意味着你无需逐一手动指定每个训练步,只需指向项目根目录即可完成批量处理。 + +#### 基本用法(推荐) + +如果你希望将 **所有已保存的检查点** 转换为 Hugging Face 格式,直接运行: + +```bash +trinity convert --checkpoint-dir ${checkpoint_root_dir}/${project_name}/${name} +``` + +该命令会: +- 自动查找目录下所有形如 `global_step_数字` 的子文件夹; +- 对每个子文件夹中的 `actor` 模型执行转换; +- 将生成的 Hugging Face 格式文件(包括 `model.safetensors` 等)保存到对应的 `actor/huggingface/` 目录中。 + +#### 特殊情况:缺少基础模型配置 + +如果某个 `global_step_*/actor/huggingface/` 目录中 **缺少 `config.json`**(通常是因为训练时未保存完整配置),转换过程需要原始基础模型的配置文件。此时,请通过 `--base-model-dir` 指定基础模型路径: + +```bash +trinity convert \ + --checkpoint-dir ${checkpoint_root_dir}/${project_name}/${name} \ + --base-model-dir /path/to/your/base/model +``` + +> 💡 此参数适用于**所有被扫描到的检查点**。只要任意一个检查点缺少 `config.json`,就需要提供该参数。 + +### 注意事项 + +- **仅转换 Actor 模型**:当前 `trinity convert` 仅处理 `actor` 文件夹中的模型参数,**不会处理 `critic`**(即使存在)。若需转换 Critic 模型,需另行操作。 +- **自动识别训练格式**:`trinity convert` 原生支持 **FSDP** 和 **Megatron** 两种分布式训练格式的检查点,**无需额外指定参数**,工具会自动检测并正确合并分片权重。 +- **幂等性**:如果某个 `global_step_*` 的 `huggingface/` 目录已包含完整的 Hugging Face 文件(特别是 `model.safetensors`),该检查点将被跳过,避免重复转换。 +- **性能提示**:转换过程可能较耗时,尤其是当检查点数量多或模型较大时。建议在空闲时段运行。 + ## 进阶选项:带 SFT warmup 的 RFT diff --git a/trinity/cli/launcher.py b/trinity/cli/launcher.py index d9209bf589..c077d0bcaa 100644 --- a/trinity/cli/launcher.py +++ b/trinity/cli/launcher.py @@ -14,6 +14,7 @@ from trinity.common.config import Config, load_config from trinity.common.constants import DEBUG_NAMESPACE, PLUGIN_DIRS_ENV_VAR from trinity.explorer.explorer import Explorer +from trinity.manager.checkpoint_converter import Converter from trinity.manager.state_manager import StateManager from trinity.trainer.trainer import Trainer from trinity.utils.dlc_utils import is_running, setup_ray_cluster, stop_ray_cluster @@ -302,163 +303,6 @@ def debug( ) -class Converter: - def __init__(self, base_model_dir: Optional[str] = None): - self.logger = get_logger(__name__) - self.base_model_dir = base_model_dir - self.base_model = None - self._init_process_group = False - self.checkpoint_converter = None - - def init_base_model(self) -> bool: - if not self.base_model_dir: - return False - if self.base_model is not None: - return True - try: - self.base_model, _ = self._get_config_and_empty_model(self.base_model_dir) - except Exception: - return False - return True - - def init_process_group(self): - if self._init_process_group: - return - - import torch - from verl.utils.device import get_nccl_backend - from verl.utils.distributed import set_numa_affinity - - if "WORLD_SIZE" not in os.environ: - os.environ["RANK"] = "0" - os.environ["LOCAL_RANK"] = "0" - os.environ["WORLD_SIZE"] = "1" - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - - set_numa_affinity() - torch.distributed.init_process_group(get_nccl_backend()) - self._init_process_group = True - - def init_checkpoint_converter(self, checkpoint_dir) -> bool: - if self.checkpoint_converter is not None: - return True - if not os.path.basename(checkpoint_dir).startswith("global_step_"): - self.logger.error(f"Invalid checkpoint directory {checkpoint_dir}.") - return False - - actor_ckpt_dir = os.path.join(checkpoint_dir, "actor") - huggingface_dir = os.path.join(actor_ckpt_dir, "huggingface") - if not os.path.exists(os.path.join(huggingface_dir, "config.json")): - if not 
self.init_base_model(): - self.logger.error( - f"Failed to load base model from {self.base_model_dir}, " - "please check if the model exists." - ) - return False - self.base_model.config.save_pretrained(huggingface_dir) - - from trinity.common.models.utils import get_megatron_converter - - self.init_process_group() - self.checkpoint_converter = get_megatron_converter(actor_ckpt_dir) - return True - - def _get_config_and_empty_model(self, model_dir: str): - import torch - import transformers - from accelerate import init_empty_weights - - model_config = transformers.AutoConfig.from_pretrained(model_dir) - - if "ForTokenClassification" in model_config.architectures[0]: - from transformers import AutoModelForTokenClassification - - auto_model_cls = AutoModelForTokenClassification - elif "ForCausalLM" in model_config.architectures[0]: - from transformers import AutoModelForCausalLM - - auto_model_cls = AutoModelForCausalLM - elif "ForConditionalGeneration" in model_config.architectures[0]: - # Handle different transformers versions for Vision2Seq models - import transformers - from packaging import version - - if version.parse(transformers.__version__) >= version.parse("4.54.0"): - # transformers >= 4.54.0 uses AutoModelForImageTextToText - from transformers import AutoModelForImageTextToText - - auto_model_cls = AutoModelForImageTextToText - else: - # transformers < 4.54.0 uses AutoModelForVision2Seq - from transformers import AutoModelForVision2Seq - - auto_model_cls = AutoModelForVision2Seq - else: - raise NotImplementedError(f"Unknown architecture {model_config['architectures']}") - - with init_empty_weights(): - model = auto_model_cls.from_config(model_config, dtype=torch.bfloat16) - model.to_empty(device="cpu") - - return model, auto_model_cls - - def convert(self, checkpoint_dir: str) -> None: - if os.path.basename(checkpoint_dir).startswith("global_step_"): - import torch - - actor_ckpt_dir = os.path.join(checkpoint_dir, "actor") - huggingface_dir = os.path.join(actor_ckpt_dir, "huggingface") - model = None - if os.path.exists(huggingface_dir): - has_hf_checkpoint = True - try: - model, auto_model_cls = self._get_config_and_empty_model(huggingface_dir) - auto_model_cls.from_pretrained(huggingface_dir) - except Exception: - has_hf_checkpoint = False - - if has_hf_checkpoint: - return - if model is None: - if not self.init_base_model(): - self.logger.error( - f"Failed to load base model from {self.base_model_dir}, please check if the model exists." 
- ) - return - model = self.base_model - - self.logger.info(f"Converting {checkpoint_dir} to huggingface format...") - dist_cpkt_dir = os.path.join(actor_ckpt_dir, "dist_ckpt") - try: - if os.path.exists(dist_cpkt_dir): # megatron - if not self.init_checkpoint_converter(checkpoint_dir): - return - state_dict = self.checkpoint_converter.get_state_dict(actor_ckpt_dir) - else: # fsdp - from trinity.common.models.utils import ( - load_fsdp_state_dict_from_verl_checkpoint, - ) - - state_dict = load_fsdp_state_dict_from_verl_checkpoint(actor_ckpt_dir) - except Exception: - self.logger.error( - f"Failed to convert {checkpoint_dir} to huggingface format.", - exc_info=True, - ) - return - - state_dict = {k: v.to(torch.bfloat16) for k, v in state_dict.items()} - model.save_pretrained(huggingface_dir, state_dict=state_dict) - self.logger.info(f"Saved huggingface checkpoint to {huggingface_dir}") - - else: # recursive search - for sub_dir in os.listdir(checkpoint_dir): - sub_dir_path = os.path.join(checkpoint_dir, sub_dir) - if os.path.isdir(sub_dir_path): - self.convert(sub_dir_path) - - def convert(checkpoint_dir: str, base_model_dir: Optional[str] = None) -> None: if "global_step_" in checkpoint_dir: while not os.path.basename(checkpoint_dir).startswith("global_step_"): diff --git a/trinity/manager/checkpoint_converter.py b/trinity/manager/checkpoint_converter.py new file mode 100644 index 0000000000..a21bd3db04 --- /dev/null +++ b/trinity/manager/checkpoint_converter.py @@ -0,0 +1,173 @@ + +import os +from typing import Optional +from trinity.utils.log import get_logger + + +class Converter: + def __init__(self, base_model_dir: Optional[str] = None): + self.logger = get_logger(__name__) + self.base_model_dir = base_model_dir + self.base_model = None + self._init_process_group = False + self.checkpoint_converter = None + + def init_base_model(self) -> bool: + if not self.base_model_dir: + self.logger.error( + "Base model directory is not specified. " + "Please specify it with `--base-model-dir /path/to/model`." + ) + return False + if self.base_model is not None: + return True + try: + self.base_model, _ = self._get_config_and_empty_model(self.base_model_dir) + except Exception: + self.logger.error( + f"Failed to initialize base model from {self.base_model_dir}", + exc_info=True + ) + return False + return True + + def init_process_group(self): + if self._init_process_group: + return + + import torch + from verl.utils.device import get_nccl_backend + from verl.utils.distributed import set_numa_affinity + + if "WORLD_SIZE" not in os.environ: + os.environ["RANK"] = "0" + os.environ["LOCAL_RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + + set_numa_affinity() + torch.distributed.init_process_group(get_nccl_backend()) + self._init_process_group = True + + def init_checkpoint_converter(self, checkpoint_dir) -> bool: + if self.checkpoint_converter is not None: + return True + if not os.path.basename(checkpoint_dir).startswith("global_step_"): + self.logger.error(f"Invalid checkpoint directory {checkpoint_dir}.") + return False + + actor_ckpt_dir = os.path.join(checkpoint_dir, "actor") + huggingface_dir = os.path.join(actor_ckpt_dir, "huggingface") + if not os.path.exists(os.path.join(huggingface_dir, "config.json")): + if not self.init_base_model(): + self.logger.error( + f"Failed to load base model from {self.base_model_dir}, " + "please check if the model exists." 
+                )
+                return False
+            self.base_model.config.save_pretrained(huggingface_dir)
+
+        from trinity.common.models.utils import get_megatron_converter
+
+        self.init_process_group()
+        self.checkpoint_converter = get_megatron_converter(actor_ckpt_dir)
+        return True
+
+    def _get_config_and_empty_model(self, model_dir: str):
+        import torch
+        import transformers
+        from accelerate import init_empty_weights
+
+        model_config = transformers.AutoConfig.from_pretrained(model_dir)
+
+        if "ForTokenClassification" in model_config.architectures[0]:
+            from transformers import AutoModelForTokenClassification
+
+            auto_model_cls = AutoModelForTokenClassification
+        elif "ForCausalLM" in model_config.architectures[0]:
+            from transformers import AutoModelForCausalLM
+
+            auto_model_cls = AutoModelForCausalLM
+        elif "ForConditionalGeneration" in model_config.architectures[0]:
+            # Handle different transformers versions for Vision2Seq models
+            from packaging import version
+
+            if version.parse(transformers.__version__) >= version.parse("4.54.0"):
+                # transformers >= 4.54.0 uses AutoModelForImageTextToText
+                from transformers import AutoModelForImageTextToText
+
+                auto_model_cls = AutoModelForImageTextToText
+            else:
+                # transformers < 4.54.0 uses AutoModelForVision2Seq
+                from transformers import AutoModelForVision2Seq
+
+                auto_model_cls = AutoModelForVision2Seq
+        else:
+            raise NotImplementedError(f"Unknown architecture {model_config.architectures}")
+
+        with init_empty_weights():
+            model = auto_model_cls.from_config(model_config, dtype=torch.bfloat16)
+        model.to_empty(device="cpu")
+
+        return model, auto_model_cls
+
+    def convert(self, checkpoint_dir: str) -> None:
+        if os.path.basename(checkpoint_dir).startswith("global_step_"):
+            import torch
+
+            actor_ckpt_dir = os.path.join(checkpoint_dir, "actor")
+            huggingface_dir = os.path.join(actor_ckpt_dir, "huggingface")
+            model = None
+            has_hf_checkpoint = False
+            if os.path.exists(huggingface_dir):
+                has_hf_checkpoint = True
+                try:
+                    model, auto_model_cls = self._get_config_and_empty_model(huggingface_dir)
+                    auto_model_cls.from_pretrained(huggingface_dir)
+                except Exception:
+                    self.logger.debug(
+                        f"Incomplete or invalid Hugging Face checkpoint in {huggingface_dir}, will re-convert.",
+                        exc_info=True,
+                    )
+                    has_hf_checkpoint = False
+
+            if has_hf_checkpoint:
+                return
+            if model is None:
+                if not self.init_base_model():
+                    self.logger.error(
+                        f"Failed to load base model from {self.base_model_dir}, please check if the model exists."
+ ) + return + model = self.base_model + + self.logger.info(f"Converting {checkpoint_dir} to huggingface format...") + dist_cpkt_dir = os.path.join(actor_ckpt_dir, "dist_ckpt") + try: + if os.path.exists(dist_cpkt_dir): # megatron + if not self.init_checkpoint_converter(checkpoint_dir): + return + state_dict = self.checkpoint_converter.get_state_dict(actor_ckpt_dir) + else: # fsdp + from trinity.common.models.utils import ( + load_fsdp_state_dict_from_verl_checkpoint, + ) + + state_dict = load_fsdp_state_dict_from_verl_checkpoint(actor_ckpt_dir) + except Exception: + self.logger.error( + f"Failed to convert {checkpoint_dir} to huggingface format.", + exc_info=True, + ) + return + + state_dict = {k: v.to(torch.bfloat16) for k, v in state_dict.items()} + model.save_pretrained(huggingface_dir, state_dict=state_dict) + self.logger.info(f"Saved huggingface checkpoint to {huggingface_dir}") + + else: # recursive search + for sub_dir in os.listdir(checkpoint_dir): + sub_dir_path = os.path.join(checkpoint_dir, sub_dir) + if os.path.isdir(sub_dir_path): + self.convert(sub_dir_path) From cec84a0d3092c2ace2fdc0c8e82b13918e8e7644 Mon Sep 17 00:00:00 2001 From: chenyushuo <297086016@qq.com> Date: Tue, 20 Jan 2026 12:08:44 +0800 Subject: [PATCH 3/5] apply reviews --- .../tutorial/example_reasoning_basic.md | 35 ++++++++++++------- .../tutorial/example_reasoning_basic.md | 35 ++++++++++++------- trinity/manager/checkpoint_converter.py | 7 ++-- 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md b/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md index e070da5112..c4e9e89765 100644 --- a/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md +++ b/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md @@ -122,13 +122,13 @@ trinity run --config examples/grpo_gsm8k/gsm8k.yaml After running Trinity-RFT experiments, the system automatically saves training checkpoints to the following path: ``` -${checkpoint_root_dir}/${project_name}/${name} +${checkpoint_root_dir}/${project}/${name} ``` The directory structure is as follows: ``` -${checkpoint_root_dir}/${project_name}/${name} +${checkpoint_root_dir}/${project}/${name} ├── buffer │ ├── experience_buffer.jsonl # Stores experience data generated during training │ └── explorer_output.db # Database file output by the Explorer module @@ -169,30 +169,39 @@ ${checkpoint_root_dir}/${project_name}/${name} If you wish to use the model in **Hugging Face format** (e.g., for inference or deployment), but find that the `model.safetensors` file is **missing** from the `global_step_*/actor/huggingface/` directory, you need to manually perform the conversion. -### Automatic Batch Conversion Feature +### Conversion Tool: `trinity convert` -The `trinity convert` command supports **recursively scanning** all `global_step_*` subdirectories under the specified directory and **automatically converting each checkpoint**. This means you don't need to manually specify each training step individually—just point to the project root directory to process all checkpoints in bulk. +The `trinity convert` command provides flexible model conversion capabilities and supports the following usage patterns: -#### Basic Usage (Recommended) - -To convert **all saved checkpoints** to Hugging Face format, simply run: +#### ✅ Batch Conversion (Recommended) +Point `--checkpoint-dir` to your project root directory (i.e., the path containing multiple `global_step_*` subdirectories). 
The tool will **automatically recursively scan for all `global_step_*` directories** and convert each checkpoint accordingly. ```bash -trinity convert --checkpoint-dir ${checkpoint_root_dir}/${project_name}/${name} +trinity convert --checkpoint-dir ${checkpoint_root_dir}/${project}/${name} ``` This command will: -- Automatically locate all subdirectories matching the pattern `global_step_`; +- Automatically detect all subdirectories matching the pattern `global_step_`; - Convert the `actor` model within each subdirectory; -- Save the resulting Hugging Face format files (including `model.safetensors`, etc.) into the corresponding `actor/huggingface/` directory. +- Save the resulting Hugging Face–formatted files (including `model.safetensors`, etc.) into the corresponding `actor/huggingface/` subdirectory. + +#### ✅ Single-step Conversion +If you only want to convert a model from a specific training step, directly point `--checkpoint-dir` to the corresponding `global_step_XXX` folder: + +```bash +trinity convert --checkpoint-dir ${checkpoint_root_dir}/${project}/${name}/global_step_120 +``` + +#### ✅ Path Tolerance +Even if you specify a subpath inside a `global_step_XXX` directory (e.g., `.../global_step_120/actor`), the tool can intelligently recognize the correct context and complete the conversion successfully—no need to strictly align the path to the `global_step_XXX` level. -#### Special Case: Missing Base Model Configuration +### Special Case: Missing Base Model Configuration -If a `global_step_*/actor/huggingface/` directory is **missing `config.json`** (typically because the full configuration wasn't saved during training), the conversion process requires the configuration file from the original base model. In this case, specify the base model path using `--base-model-dir`: +If a `config.json` file is **missing** from any `global_step_*/actor/huggingface/` directory (typically because the configuration wasn't fully saved during training), the conversion process requires the original base model's configuration. 
In this case, use `--base-model-dir` to specify the path to your base model: ```bash trinity convert \ - --checkpoint-dir ${checkpoint_root_dir}/${project_name}/${name} \ + --checkpoint-dir ${checkpoint_root_dir}/${project}/${name} \ --base-model-dir /path/to/your/base/model ``` diff --git a/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md b/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md index 2fd416f3af..96287f6f5c 100644 --- a/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md +++ b/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md @@ -123,13 +123,13 @@ trinity run --config examples/grpo_gsm8k/gsm8k.yaml 在运行 Trinity-RFT 进行实验后,系统会自动将训练过程中的检查点(checkpoint)保存到以下路径: ``` -${checkpoint_root_dir}/${project_name}/${name} +${checkpoint_root_dir}/${project}/${name} ``` 该目录的结构如下: ``` -${checkpoint_root_dir}/${project_name}/${name} +${checkpoint_root_dir}/${project}/${name} ├── buffer │ ├── experience_buffer.jsonl # 存储训练过程中生成的经验数据 │ └── explorer_output.db # Explorer 模块输出的数据库文件 @@ -170,30 +170,39 @@ ${checkpoint_root_dir}/${project_name}/${name} 如果你希望使用 **Hugging Face 格式** 的模型(例如用于推理或部署),但发现 `global_step_*/actor/huggingface/` 目录中 **缺少 `model.safetensors` 文件**,就需要手动执行转换。 -### 自动批量转换功能 +### 转换工具:`trinity convert` -`trinity convert` 命令支持**递归扫描**指定目录下的所有 `global_step_*` 子文件夹,并**自动为每个检查点执行转换**。这意味着你无需逐一手动指定每个训练步,只需指向项目根目录即可完成批量处理。 +`trinity convert` 命令提供了灵活的模型转换功能,支持以下几种使用方式: -#### 基本用法(推荐) - -如果你希望将 **所有已保存的检查点** 转换为 Hugging Face 格式,直接运行: +#### ✅ 批量转换(推荐) +将 `--checkpoint-dir` 指向项目根目录(即包含多个 `global_step_*` 子目录的路径),工具会**自动递归查找所有 `global_step_*` 目录**,并对每个检查点执行转换。 ```bash -trinity convert --checkpoint-dir ${checkpoint_root_dir}/${project_name}/${name} +trinity convert --checkpoint-dir ${checkpoint_root_dir}/${project}/${name} ``` 该命令会: -- 自动查找目录下所有形如 `global_step_数字` 的子文件夹; -- 对每个子文件夹中的 `actor` 模型执行转换; +- 自动识别所有形如 `global_step_数字` 的子目录; +- 对每个子目录中的 `actor` 模型进行转换; - 将生成的 Hugging Face 格式文件(包括 `model.safetensors` 等)保存到对应的 `actor/huggingface/` 目录中。 -#### 特殊情况:缺少基础模型配置 +#### ✅ 单步转换 +如果只想转换某一个特定训练步的模型,可直接将 `--checkpoint-dir` 指向对应的 `global_step_XXX` 文件夹: + +```bash +trinity convert --checkpoint-dir ${checkpoint_root_dir}/${project}/${name}/global_step_120 +``` + +#### ✅ 路径容错 +即使你指定了 `global_step_XXX` 内部的子路径(例如 `.../global_step_120/actor`),工具也能智能识别并正确完成转换,无需严格对齐到 `global_step_XXX` 层级。 + +### 特殊情况:缺少基础模型配置 -如果某个 `global_step_*/actor/huggingface/` 目录中 **缺少 `config.json`**(通常是因为训练时未保存完整配置),转换过程需要原始基础模型的配置文件。此时,请通过 `--base-model-dir` 指定基础模型路径: +如果某个 `global_step_*/actor/huggingface/` 目录中 **缺少 `config.json`**(通常是因为训练时未完整保存配置),转换过程需要原始基础模型的配置文件。此时,请通过 `--base-model-dir` 指定基础模型路径: ```bash trinity convert \ - --checkpoint-dir ${checkpoint_root_dir}/${project_name}/${name} \ + --checkpoint-dir ${checkpoint_root_dir}/${project}/${name} \ --base-model-dir /path/to/your/base/model ``` diff --git a/trinity/manager/checkpoint_converter.py b/trinity/manager/checkpoint_converter.py index a21bd3db04..1c37d69443 100644 --- a/trinity/manager/checkpoint_converter.py +++ b/trinity/manager/checkpoint_converter.py @@ -1,6 +1,6 @@ - import os from typing import Optional + from trinity.utils.log import get_logger @@ -25,8 +25,7 @@ def init_base_model(self) -> bool: self.base_model, _ = self._get_config_and_empty_model(self.base_model_dir) except Exception: self.logger.error( - f"Failed to initialize base model from {self.base_model_dir}", - exc_info=True + f"Failed to initialize base model from {self.base_model_dir}", exc_info=True ) return False return True @@ -127,7 +126,7 @@ def 
convert(self, checkpoint_dir: str) -> None:
                     auto_model_cls.from_pretrained(huggingface_dir)
                 except Exception:
                     self.logger.debug(
-                        f"Incomplete or invalid Hugging Face checkpoint in {huggingface_dir}, will re-convert.",
+                        f"Incomplete or invalid Hugging Face checkpoint in {huggingface_dir}, will re-convert.",
                         exc_info=True,
                     )
                     has_hf_checkpoint = False

From dd0c213ed5835be7238de8b5b380725a2d15d3db Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Tue, 20 Jan 2026 13:23:55 +0800
Subject: [PATCH 4/5] apply reviews

---
 docs/sphinx_doc/source/tutorial/faq.md        |  9 +++-
 .../tutorial/example_reasoning_basic.md       |  2 +-
 docs/sphinx_doc/source_zh/tutorial/faq.md     |  9 +++-
 examples/mix_chord/README.md                  | 45 +------------------
 4 files changed, 17 insertions(+), 48 deletions(-)

diff --git a/docs/sphinx_doc/source/tutorial/faq.md b/docs/sphinx_doc/source/tutorial/faq.md
index 8f7c8f51bb..137b3ad961 100644
--- a/docs/sphinx_doc/source/tutorial/faq.md
+++ b/docs/sphinx_doc/source/tutorial/faq.md
@@ -190,9 +190,14 @@ for exp in exp_list:
 
 **Q:** How to load the checkpoints outside of the Trinity-RFT framework?
 
-**A:** You need to specify model path and checkpoint path. The following code snippet gives an example with transformers.
+**A:** Currently, two loading methods are supported:
 
-Here is an example of loading from fsdp trainer checkpoints:
+1. **Recommended approach**: Use the `trinity convert` command to convert the original checkpoint into the standard Hugging Face format.
+   After conversion, you can load and use it directly just like any ordinary Hugging Face model.
+   For detailed instructions, please refer to the tutorial: [Optional: Convert Checkpoints to Hugging Face Format](https://agentscope-ai.github.io/Trinity-RFT/en/main/tutorial/example_reasoning_basic.html#optional-convert-checkpoints-to-hugging-face-format)
+
+2. **Direct loading (for actor checkpoints trained with FSDP)**:
+   If you prefer to load the checkpoint directly without converting its format, you can use the following code example:
 
 ```python
 import os
diff --git a/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md b/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md
index 96287f6f5c..9768c13552 100644
--- a/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md
+++ b/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md
@@ -118,7 +118,7 @@ trinity run --config examples/grpo_gsm8k/gsm8k.yaml
 ```
 
 
-## 可选:将检查点转换为 Hugging Face 格式
+## 进阶选项:将检查点转换为 Hugging Face 格式
 
 在运行 Trinity-RFT 进行实验后,系统会自动将训练过程中的检查点(checkpoint)保存到以下路径:
 
diff --git a/docs/sphinx_doc/source_zh/tutorial/faq.md b/docs/sphinx_doc/source_zh/tutorial/faq.md
index a0e5c91c7c..a79b4dc2bc 100644
--- a/docs/sphinx_doc/source_zh/tutorial/faq.md
+++ b/docs/sphinx_doc/source_zh/tutorial/faq.md
@@ -183,9 +183,14 @@ for exp in exp_list:
 
 **Q:** 如何在 Trinity-RFT 框架外加载 checkpoints?
 
-**A:** 你需要指定模型路径和检查点路径。以下代码片段展示了如何使用 transformers 库进行加载。
+**A:** 目前支持两种加载方式:
 
-以下是加载 FSDP trainer 检查点的示例:
+1. **推荐方式**:使用 `trinity convert` 命令将原始检查点转换为标准的 Hugging Face 格式。
+   转换后,你就可以像加载普通 Hugging Face 模型一样直接使用它。
+   详细操作请参考教程:[进阶选项:将检查点转换为 Hugging Face 格式](https://agentscope-ai.github.io/Trinity-RFT/zh/main/tutorial/example_reasoning_basic.html#hugging-face)
+
+2. **直接加载(适用于 FSDP 训练的 actor 检查点)**:
+   如果你希望不转换格式而直接加载,可以使用以下代码示例:
 
 ```python
 import os
diff --git a/examples/mix_chord/README.md b/examples/mix_chord/README.md
index 665ac8e7ea..6dbd8f1679 100644
--- a/examples/mix_chord/README.md
+++ b/examples/mix_chord/README.md
@@ -56,49 +56,8 @@ trinity run --config examples/mix_chord/mix_chord_toolace.yaml
 
 It takes around 3 hours to run on 8 H20 GPUs.
 
-After the run, you may also want to convert the checkpoint to a Hugging Face checkpoint.
-
-```python
-import os
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from trinity.common.models.utils import load_fsdp_state_dict_from_verl_checkpoint
-
-# The following variables are assumed to be predefined:
-# model_path, checkpoint_root_dir, project, name
-model = AutoModelForCausalLM.from_pretrained(model_path)
-ckp_path = os.path.join(checkpoint_root_dir, project, name, "global_step_100", "actor")
-state_dict = load_fsdp_state_dict_from_verl_checkpoint(ckp_path)
-model.load_state_dict(state_dict)
-output_dir = os.path.join(ckp_path, "huggingface")
-
-def save_to_huggingface_checkpoint(state_dict: dict, output_dir: str):
-    """Convert state dict to Hugging Face format and save it.
-
-    Args:
-        state_dict: The state dict loaded from the Verl checkpoint.
-        output_dir: The directory to save the Hugging Face checkpoint.
-    """
-    import os
-    import torch
-    from transformers import PreTrainedModel
-
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Convert state dict keys to Hugging Face format if needed
-    hf_state_dict = {}
-    for key, value in state_dict.items():
-        # Add any key mapping logic here if needed
-        # Example:
-        # if key.startswith("model."):
-        #     new_key = key.replace("model.", "")
-        #     hf_state_dict[new_key] = value
-        # else:
-        #     hf_state_dict[key] = value
-        hf_state_dict[key] = value
-    torch.save(hf_state_dict, os.path.join(output_dir, "pytorch_model.bin"))
-
-save_to_huggingface_checkpoint(state_dict, output_dir)
-```
+After the run, you can use the `trinity convert` command to convert the original checkpoint into the standard Hugging Face format. For detailed instructions, please refer to the tutorial: [Optional: Convert Checkpoints to Hugging Face Format](https://agentscope-ai.github.io/Trinity-RFT/en/main/tutorial/example_reasoning_basic.html#optional-convert-checkpoints-to-hugging-face-format)
+
 
 ## Evaluate the Trained Model on BFCL

From 578c74907deb8b39ad1f78df3becce36cfcfe3bd Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Tue, 20 Jan 2026 15:09:47 +0800
Subject: [PATCH 5/5] fix unittest.yaml

---
 .github/workflows/unittest.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml
index 89b6429aa6..53b486ae1c 100644
--- a/.github/workflows/unittest.yaml
+++ b/.github/workflows/unittest.yaml
@@ -12,7 +12,7 @@ permissions:
 jobs:
   unittest:
     # only run on pull request
-    if: ${{ github.event.issue.pull_request && (startsWith(github.event.comment.body, '/unittest')) && github.event.comment.author_association == 'COLLABORATOR' }}
+    if: ${{ github.event.issue.pull_request && (startsWith(github.event.comment.body, '/unittest')) && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') }}
     runs-on: self-hosted
     steps: