Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/source/en/api/pipelines/skyreels_v2.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ The following SkyReels-V2 models are supported in Diffusers:
- [SkyReels-V2 I2V 1.3B - 540P](https://huggingface.co/Skywork/SkyReels-V2-I2V-1.3B-540P-Diffusers)
- [SkyReels-V2 I2V 14B - 540P](https://huggingface.co/Skywork/SkyReels-V2-I2V-14B-540P-Diffusers)
- [SkyReels-V2 I2V 14B - 720P](https://huggingface.co/Skywork/SkyReels-V2-I2V-14B-720P-Diffusers)
- [SkyReels-V2 FLF2V 1.3B - 540P](https://huggingface.co/Skywork/SkyReels-V2-FLF2V-1.3B-540P-Diffusers)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FLF2V produced poor results with the 1.3B model; I never even created its HF repo and had only listed it here as a placeholder at the time.


Copy link

Copilot AI Dec 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A model listing line for "SkyReels-V2 FLF2V 1.3B - 540P" appears to have been removed from the supported models list. If this model is no longer supported or was listed incorrectly, this change should be documented in the PR description to explain why it was removed.

Suggested change
> **Note:** The model "SkyReels-V2 FLF2V 1.3B - 540P" is no longer listed as a supported model. It was removed from the list as it is not currently supported in Diffusers, or was previously listed in error.

Copilot uses AI. Check for mistakes.
This model was contributed by [M. Tolga Cangöz](https://github.com/tolgacangoz).
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


> [!TIP]
> Click on the SkyReels-V2 models in the right sidebar for more examples of video generation.
Expand Down
24 changes: 13 additions & 11 deletions src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,22 +545,24 @@ def __call__(
latent_model_input = latents.to(transformer_dtype)
timestep = t.expand(latents.shape[0])

noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=prompt_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]

if self.do_classifier_free_guidance:
noise_uncond = self.transformer(
with self.transformer.cache_context("cond"):
noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
encoder_hidden_states=prompt_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]

if self.do_classifier_free_guidance:
with self.transformer.cache_context("uncond"):
noise_uncond = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)

# compute the previous noisy sample x_t -> x_t-1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -887,25 +887,28 @@ def __call__(
)
timestep[:, valid_interval_start:prefix_video_latents_frames] = addnoise_condition

noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
if self.do_classifier_free_guidance:
noise_uncond = self.transformer(
with self.transformer.cache_context("cond"):
noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
encoder_hidden_states=prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]

if self.do_classifier_free_guidance:
with self.transformer.cache_context("uncond"):
noise_uncond = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)

update_mask_i = step_update_mask[i]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -966,25 +966,28 @@ def __call__(
)
timestep[:, valid_interval_start:prefix_video_latents_frames] = addnoise_condition

noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
if self.do_classifier_free_guidance:
noise_uncond = self.transformer(
with self.transformer.cache_context("cond"):
noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
encoder_hidden_states=prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]

if self.do_classifier_free_guidance:
with self.transformer.cache_context("uncond"):
noise_uncond = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)

update_mask_i = step_update_mask[i]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -974,25 +974,28 @@ def __call__(
)
timestep[:, valid_interval_start:prefix_video_latents_frames] = addnoise_condition

noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
if self.do_classifier_free_guidance:
noise_uncond = self.transformer(
with self.transformer.cache_context("cond"):
noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
encoder_hidden_states=prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]

if self.do_classifier_free_guidance:
with self.transformer.cache_context("uncond"):
noise_uncond = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
enable_diffusion_forcing=True,
fps=fps_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)

update_mask_i = step_update_mask[i]
Expand Down
26 changes: 14 additions & 12 deletions src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,24 +678,26 @@ def __call__(
latent_model_input = torch.cat([latents, condition], dim=1).to(transformer_dtype)
timestep = t.expand(latents.shape[0])

noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=prompt_embeds,
encoder_hidden_states_image=image_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]

if self.do_classifier_free_guidance:
noise_uncond = self.transformer(
with self.transformer.cache_context("cond"):
noise_pred = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
encoder_hidden_states=prompt_embeds,
encoder_hidden_states_image=image_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]

if self.do_classifier_free_guidance:
with self.transformer.cache_context("uncond"):
noise_uncond = self.transformer(
hidden_states=latent_model_input,
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
encoder_hidden_states_image=image_embeds,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)

# compute the previous noisy sample x_t -> x_t-1
Expand Down