Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 158 additions & 40 deletions xtuner/v1/model/base.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion xtuner/v1/model/compose/intern_s1/modeling_intern_s1.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def fully_shard(
# Note: 非常关键,不能删除这个 assert
assert self.fsdp_mesh is not None

fully_shard(
self._fully_shard(
self,
mesh=self.fsdp_mesh,
mp_policy=mp_policy,
Expand Down
2 changes: 1 addition & 1 deletion xtuner/v1/model/compose/qwen3_vl/modeling_projector.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def fully_shard(
for param in self.parameters():
param.requires_grad = False

fully_shard(
self._fully_shard(
self,
mesh=self.fsdp_mesh,
mp_policy=mp_policy,
Expand Down
3 changes: 1 addition & 2 deletions xtuner/v1/model/compose/qwen3_vl/modeling_qwen3_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from torch.distributed.fsdp import (
CPUOffloadPolicy,
MixedPrecisionPolicy,
fully_shard,
FSDPModule,
)
import torch.distributed as dist
Expand Down Expand Up @@ -90,7 +89,7 @@ def fully_shard(
# Note: 非常关键,不能删除这个 assert
assert self.fsdp_mesh is not None

fully_shard(
self._fully_shard(
self,
mesh=self.fsdp_mesh,
mp_policy=mp_policy,
Expand Down
5 changes: 2 additions & 3 deletions xtuner/v1/model/compose/qwen3_vl/modeling_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from torch.distributed.fsdp import (
CPUOffloadPolicy,
MixedPrecisionPolicy,
fully_shard,
)
from transformers.models.llama.modeling_llama import repeat_kv
from xtuner.v1.float8.float8_handler import Float8Handler
Expand Down Expand Up @@ -349,7 +348,7 @@ def fully_shard(

self.blocks[layer_idx] = layer

fully_shard(
self._fully_shard(
layer,
mesh=self.fsdp_mesh,
mp_policy=mp_policy,
Expand All @@ -362,7 +361,7 @@ def fully_shard(
for layer_cur, layer_next in zip(self.blocks[:-1], self.blocks[1:]):
layer_cur.set_modules_to_forward_prefetch([layer_next])

fully_shard(
self._fully_shard(
self,
mesh=self.fsdp_mesh,
mp_policy=mp_policy,
Expand Down
11 changes: 5 additions & 6 deletions xtuner/v1/model/dense/dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from torch.distributed.fsdp import (
CPUOffloadPolicy,
MixedPrecisionPolicy,
fully_shard,
)
from torch.distributed.tensor import DTensor
from tqdm import tqdm
Expand Down Expand Up @@ -223,7 +222,7 @@ def fully_shard(
layer.forward = torch.compile(layer.forward, fullgraph=True)

self.layers[str(layer_idx)] = layer
fully_shard(
self._fully_shard(
layer,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
Expand All @@ -237,31 +236,31 @@ def fully_shard(
):
layer_cur.set_modules_to_forward_prefetch([layer_next]) # type: ignore

fully_shard(
self._fully_shard(
self.embed_tokens,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
reshard_after_forward=self.fsdp_config.reshard_after_forward,
offload_policy=CPUOffloadPolicy() if self.fsdp_config.cpu_offload else None,
)

fully_shard(
self._fully_shard(
self.norm,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
reshard_after_forward=self.fsdp_config.reshard_after_forward,
offload_policy=CPUOffloadPolicy() if self.fsdp_config.cpu_offload else None,
)

fully_shard(
self._fully_shard(
self.lm_head,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
reshard_after_forward=self.fsdp_config.reshard_after_forward,
offload_policy=CPUOffloadPolicy() if self.fsdp_config.cpu_offload else None,
)

fully_shard(
self._fully_shard(
self,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
Expand Down
11 changes: 5 additions & 6 deletions xtuner/v1/model/moe/moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from torch.distributed.fsdp import (
CPUOffloadPolicy,
MixedPrecisionPolicy,
fully_shard,
)
from torch.distributed.tensor import DTensor, Replicate, distribute_tensor
from tqdm import tqdm
Expand Down Expand Up @@ -733,7 +732,7 @@ def fully_shard(
reshard_after_forward = False
else:
reshard_after_forward = self.fsdp_config.reshard_after_forward
fully_shard(
self._fully_shard(
layer,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
Expand All @@ -747,31 +746,31 @@ def fully_shard(
):
layer_cur.set_modules_to_forward_prefetch([layer_next]) # type: ignore

fully_shard(
self._fully_shard(
self.embed_tokens,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
reshard_after_forward=self.fsdp_config.reshard_after_forward,
offload_policy=CPUOffloadPolicy() if self.fsdp_config.cpu_offload else None,
)

fully_shard(
self._fully_shard(
self.norm,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
reshard_after_forward=self.fsdp_config.reshard_after_forward,
offload_policy=CPUOffloadPolicy() if self.fsdp_config.cpu_offload else None,
)

fully_shard(
self._fully_shard(
self.lm_head,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
reshard_after_forward=self.fsdp_config.reshard_after_forward,
offload_policy=CPUOffloadPolicy() if self.fsdp_config.cpu_offload else None,
)

fully_shard(
self._fully_shard(
self,
mesh=self.fsdp_mesh if self.hsdp_mesh is None else self.hsdp_mesh,
mp_policy=mp_policy,
Expand Down
2 changes: 2 additions & 0 deletions xtuner/v1/utils/load_spec.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import torch.distributed as dist
from torch.distributed.device_mesh import DeviceMesh
from pydantic import BaseModel, ConfigDict

from .enum_helper import StrEnum
Expand All @@ -21,6 +22,7 @@ class LoadSpec(BaseModel):
shard_start: int | None = None
shard_end: int | None = None
group: dist.ProcessGroup | None = None
fsdp_mesh: DeviceMesh | None = None # TODO: (yehaochen) Only a workaround

def model_post_init(self, _) -> None:
if self.load_enum == LoadEnum.SAME:
Expand Down
Loading