From f357b98c1aa522d08d1f2ce21a96901f100a11fe Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 02:55:32 -0700 Subject: [PATCH 01/11] Add modular pipeline support for LTX Video --- src/diffusers/__init__.py | 4 + src/diffusers/modular_pipelines/__init__.py | 5 + .../modular_pipelines/ltx/__init__.py | 47 +++ .../modular_pipelines/ltx/before_denoise.py | 281 ++++++++++++++++++ .../modular_pipelines/ltx/decoders.py | 137 +++++++++ .../modular_pipelines/ltx/denoise.py | 256 ++++++++++++++++ .../modular_pipelines/ltx/encoders.py | 205 +++++++++++++ .../ltx/modular_blocks_ltx.py | 65 ++++ .../modular_pipelines/ltx/modular_pipeline.py | 64 ++++ .../modular_pipelines/modular_pipeline.py | 1 + 10 files changed, 1065 insertions(+) create mode 100644 src/diffusers/modular_pipelines/ltx/__init__.py create mode 100644 src/diffusers/modular_pipelines/ltx/before_denoise.py create mode 100644 src/diffusers/modular_pipelines/ltx/decoders.py create mode 100644 src/diffusers/modular_pipelines/ltx/denoise.py create mode 100644 src/diffusers/modular_pipelines/ltx/encoders.py create mode 100644 src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py create mode 100644 src/diffusers/modular_pipelines/ltx/modular_pipeline.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 7d966452d1a2..637bf2685824 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -443,6 +443,8 @@ "HeliosPyramidAutoBlocks", "HeliosPyramidDistilledAutoBlocks", "HeliosPyramidDistilledModularPipeline", + "LTXBlocks", + "LTXModularPipeline", "HeliosPyramidModularPipeline", "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", @@ -1210,6 +1212,8 @@ HeliosPyramidAutoBlocks, HeliosPyramidDistilledAutoBlocks, HeliosPyramidDistilledModularPipeline, + LTXBlocks, + LTXModularPipeline, HeliosPyramidModularPipeline, QwenImageAutoBlocks, QwenImageEditAutoBlocks, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 
fd9bd691ca87..389a5416f3ea 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -88,6 +88,10 @@ "QwenImageLayeredModularPipeline", "QwenImageLayeredAutoBlocks", ] + _import_structure["ltx"] = [ + "LTXBlocks", + "LTXModularPipeline", + ] _import_structure["z_image"] = [ "ZImageAutoBlocks", "ZImageModularPipeline", @@ -141,6 +145,7 @@ QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline + from .ltx import LTXBlocks, LTXModularPipeline from .wan import ( Wan22Blocks, Wan22Image2VideoBlocks, diff --git a/src/diffusers/modular_pipelines/ltx/__init__.py b/src/diffusers/modular_pipelines/ltx/__init__.py new file mode 100644 index 000000000000..019fa96fef14 --- /dev/null +++ b/src/diffusers/modular_pipelines/ltx/__init__.py @@ -0,0 +1,47 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modular_blocks_ltx"] = ["LTXBlocks"] + _import_structure["modular_pipeline"] = ["LTXModularPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .modular_blocks_ltx import LTXBlocks + from .modular_pipeline import LTXModularPipeline +else: + import sys + + 
sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py new file mode 100644 index 000000000000..d3673c3ac1f8 --- /dev/null +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -0,0 +1,281 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect + +import numpy as np +import torch + +from ...models import LTXVideoTransformer3DModel +from ...pipelines.ltx.pipeline_ltx import LTXPipeline +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ...utils.torch_utils import randn_tensor +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import LTXModularPipeline + + +logger = logging.get_logger(__name__) + + +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: int | None = None, + device: str | torch.device | None = None, + timesteps: list[int] | None = None, + sigmas: list[float] | None = None, + **kwargs, +): + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed.") + if timesteps is not None: + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom sigmas." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class LTXTextInputStep(ModularPipelineBlocks): + model_name = "ltx" + + @property + def description(self) -> str: + return ( + "Input processing step that:\n" + " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n" + " 2. Adjusts input tensor shapes based on `batch_size` and `num_videos_per_prompt`" + ) + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("transformer", LTXVideoTransformer3DModel), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("num_videos_per_prompt", default=1), + InputParam("prompt_embeds", required=True, type_hint=torch.Tensor), + InputParam("prompt_attention_mask", type_hint=torch.Tensor), + InputParam("negative_prompt_embeds", type_hint=torch.Tensor), + InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("batch_size", type_hint=int), + OutputParam("dtype", type_hint=torch.dtype), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.batch_size = block_state.prompt_embeds.shape[0] + block_state.dtype = block_state.prompt_embeds.dtype + num_videos = block_state.num_videos_per_prompt + + # Repeat prompt_embeds for num_videos_per_prompt + _, seq_len, _ = block_state.prompt_embeds.shape + block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, num_videos, 1) + block_state.prompt_embeds = block_state.prompt_embeds.view( + block_state.batch_size * num_videos, seq_len, -1 + ) + + if block_state.prompt_attention_mask is 
class LTXTextInputStep(ModularPipelineBlocks):
    """Normalizes text-conditioning tensors before the LTX denoising loop."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return (
            "Input processing step that:\n"
            " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
            " 2. Adjusts input tensor shapes based on `batch_size` and `num_videos_per_prompt`"
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("transformer", LTXVideoTransformer3DModel),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("num_videos_per_prompt", default=1),
            InputParam("prompt_embeds", required=True, type_hint=torch.Tensor),
            InputParam("prompt_attention_mask", type_hint=torch.Tensor),
            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
            InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("batch_size", type_hint=int),
            OutputParam("dtype", type_hint=torch.dtype),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Expand prompt embeddings/masks to `batch_size * num_videos_per_prompt` rows."""
        block_state = self.get_block_state(state)

        # Batch size and dtype are inferred from the (required) prompt embeddings.
        block_state.batch_size = block_state.prompt_embeds.shape[0]
        block_state.dtype = block_state.prompt_embeds.dtype
        num_videos = block_state.num_videos_per_prompt

        # Repeat prompt_embeds for num_videos_per_prompt
        _, seq_len, _ = block_state.prompt_embeds.shape
        block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, num_videos, 1)
        block_state.prompt_embeds = block_state.prompt_embeds.view(
            block_state.batch_size * num_videos, seq_len, -1
        )

        # NOTE(review): the embeddings above are expanded per-prompt (interleaved),
        # while the masks below are tiled with `.repeat(num_videos, 1)`; for
        # batch_size > 1 combined with num_videos_per_prompt > 1 the row orders
        # differ — confirm against the reference LTXPipeline behavior.
        if block_state.prompt_attention_mask is not None:
            block_state.prompt_attention_mask = block_state.prompt_attention_mask.repeat(num_videos, 1)

        if block_state.negative_prompt_embeds is not None:
            _, seq_len, _ = block_state.negative_prompt_embeds.shape
            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.repeat(1, num_videos, 1)
            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.view(
                block_state.batch_size * num_videos, seq_len, -1
            )

        if block_state.negative_prompt_attention_mask is not None:
            block_state.negative_prompt_attention_mask = block_state.negative_prompt_attention_mask.repeat(num_videos, 1)

        self.set_block_state(state, block_state)
        return components, state


class LTXSetTimestepsStep(ModularPipelineBlocks):
    """Configures the scheduler's timestep schedule for LTX video inference."""

    model_name = "ltx"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
        ]

    @property
    def description(self) -> str:
        return "Step that sets the scheduler's timesteps for inference"

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("num_inference_steps", default=50),
            InputParam("timesteps"),
            InputParam("sigmas"),
            InputParam("height", type_hint=int, default=512),
            InputParam("width", type_hint=int, default=704),
            InputParam("num_frames", type_hint=int, default=161),
            InputParam("frame_rate", type_hint=int, default=25),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("timesteps", type_hint=torch.Tensor),
            OutputParam("num_inference_steps", type_hint=int),
            OutputParam("rope_interpolation_scale", type_hint=tuple),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Set scheduler timesteps (with resolution-dependent shift) and the RoPE scale."""
        block_state = self.get_block_state(state)
        device = components._execution_device

        height = block_state.height
        width = block_state.width
        num_frames = block_state.num_frames
        frame_rate = block_state.frame_rate

        # Latent-space dimensions derived from the VAE compression ratios; the token
        # count drives the flow-matching shift `mu` below.
        latent_num_frames = (num_frames - 1) // components.vae_temporal_compression_ratio + 1
        latent_height = height // components.vae_spatial_compression_ratio
        latent_width = width // components.vae_spatial_compression_ratio
        video_sequence_length = latent_num_frames * latent_height * latent_width

        custom_timesteps = block_state.timesteps
        sigmas = block_state.sigmas

        if custom_timesteps is not None:
            # User provided custom timesteps, don't compute sigmas
            block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
                components.scheduler,
                block_state.num_inference_steps,
                device,
                custom_timesteps,
            )
        else:
            # Default sigma schedule: linear from 1.0 down to 1/num_inference_steps.
            if sigmas is None:
                sigmas = np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps)

            mu = calculate_shift(
                video_sequence_length,
                components.scheduler.config.get("base_image_seq_len", 256),
                components.scheduler.config.get("max_image_seq_len", 4096),
                components.scheduler.config.get("base_shift", 0.5),
                components.scheduler.config.get("max_shift", 1.15),
            )

            block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
                components.scheduler,
                block_state.num_inference_steps,
                device,
                sigmas=sigmas,
                mu=mu,
            )

        # (temporal, height, width) interpolation factors consumed by the transformer's RoPE.
        block_state.rope_interpolation_scale = (
            components.vae_temporal_compression_ratio / frame_rate,
            components.vae_spatial_compression_ratio,
            components.vae_spatial_compression_ratio,
        )

        self.set_block_state(state, block_state)
        return components, state


class LTXPrepareLatentsStep(ModularPipelineBlocks):
    """Creates (or reuses) the initial packed noise latents for generation."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return "Prepare latents step that prepares the latents for the text-to-video generation process"

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("height", type_hint=int, default=512),
            InputParam("width", type_hint=int, default=704),
            InputParam("num_frames", type_hint=int, default=161),
            InputParam("latents", type_hint=torch.Tensor | None),
            InputParam("num_videos_per_prompt", type_hint=int, default=1),
            InputParam("generator"),
            InputParam("batch_size", required=True, type_hint=int),
            InputParam("dtype", type_hint=torch.dtype),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("latents", type_hint=torch.Tensor),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Sample fresh float32 noise (packed to the transformer's patch layout) unless latents are supplied."""
        block_state = self.get_block_state(state)
        device = components._execution_device

        batch_size = block_state.batch_size * block_state.num_videos_per_prompt
        num_channels_latents = components.transformer.config.in_channels

        if block_state.latents is not None:
            # Caller-provided latents are assumed to already be packed; only move/cast.
            # NOTE(review): user latents are kept in float32 like the sampled path — confirm intended.
            block_state.latents = block_state.latents.to(device=device, dtype=torch.float32)
        else:
            height = block_state.height // components.vae_spatial_compression_ratio
            width = block_state.width // components.vae_spatial_compression_ratio
            num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1

            shape = (batch_size, num_channels_latents, num_frames, height, width)
            block_state.latents = randn_tensor(shape, generator=block_state.generator, device=device, dtype=torch.float32)
            # Pack (B, C, F, H, W) latents into the flattened token sequence the transformer expects.
            block_state.latents = LTXPipeline._pack_latents(
                block_state.latents,
                components.transformer_spatial_patch_size,
                components.transformer_temporal_patch_size,
            )

        self.set_block_state(state, block_state)
        return components, state
class LTXVaeDecoderStep(ModularPipelineBlocks):
    """Decodes denoised, packed LTX latents into output video frames with the VAE."""

    model_name = "ltx"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKLLTXVideo),
            ComponentSpec(
                "video_processor",
                VideoProcessor,
                config=FrozenDict({"vae_scale_factor": 32}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def description(self) -> str:
        return "Step that decodes the denoised latents into videos"

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("latents", required=True, type_hint=torch.Tensor),
            InputParam("output_type", default="np", type_hint=str),
            InputParam("height", type_hint=int, default=512),
            InputParam("width", type_hint=int, default=704),
            InputParam("num_frames", type_hint=int, default=161),
            InputParam("decode_timestep", default=0.0),
            InputParam("decode_noise_scale", default=None),
            InputParam("generator"),
            InputParam("batch_size", type_hint=int, default=1),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                "videos",
                type_hint=list[list[PIL.Image.Image]] | list[torch.Tensor] | list[np.ndarray],
                description="The generated videos",
            )
        ]

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        """Unpack, denormalize and VAE-decode latents; postprocess to `output_type`."""
        block_state = self.get_block_state(state)
        vae = components.vae

        latents = block_state.latents

        # Short-circuit: caller wants raw latents, skip decoding entirely.
        if block_state.output_type == "latent":
            block_state.videos = latents
            self.set_block_state(state, block_state)
            return components, state

        height = block_state.height
        width = block_state.width
        num_frames = block_state.num_frames

        latent_num_frames = (num_frames - 1) // components.vae_temporal_compression_ratio + 1
        latent_height = height // components.vae_spatial_compression_ratio
        latent_width = width // components.vae_spatial_compression_ratio

        # Undo the patch packing done in the prepare-latents step, then undo the
        # per-channel latent normalization before decoding.
        latents = LTXPipeline._unpack_latents(
            latents,
            latent_num_frames,
            latent_height,
            latent_width,
            components.transformer_spatial_patch_size,
            components.transformer_temporal_patch_size,
        )
        latents = LTXPipeline._denormalize_latents(
            latents, vae.latents_mean, vae.latents_std, vae.config.scaling_factor
        )
        # `dtype` is an intermediate produced by the text-input step; it may be absent
        # when this block runs standalone, hence the hasattr guard.
        latents = latents.to(block_state.dtype if hasattr(block_state, 'dtype') else torch.float32)

        if not vae.config.timestep_conditioning:
            timestep = None
        else:
            # Timestep-conditioned VAE: blend a controlled amount of fresh noise into
            # the latents and pass the decode timestep to the decoder.
            device = latents.device
            batch_size = block_state.batch_size
            decode_timestep = block_state.decode_timestep
            decode_noise_scale = block_state.decode_noise_scale

            noise = randn_tensor(latents.shape, generator=block_state.generator, device=device, dtype=latents.dtype)
            if not isinstance(decode_timestep, list):
                decode_timestep = [decode_timestep] * batch_size
            if decode_noise_scale is None:
                decode_noise_scale = decode_timestep
            elif not isinstance(decode_noise_scale, list):
                decode_noise_scale = [decode_noise_scale] * batch_size

            timestep = torch.tensor(decode_timestep, device=device, dtype=latents.dtype)
            decode_noise_scale = torch.tensor(decode_noise_scale, device=device, dtype=latents.dtype)[
                :, None, None, None, None
            ]
            latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise

        latents = latents.to(vae.dtype)
        video = vae.decode(latents, timestep, return_dict=False)[0]
        block_state.videos = components.video_processor.postprocess_video(video, output_type=block_state.output_type)

        self.set_block_state(state, block_state)
        return components, state
class LTXLoopBeforeDenoiser(ModularPipelineBlocks):
    """Per-iteration step that prepares the latent model input for the denoiser."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return (
            "Step within the denoising loop that prepares the latent input for the denoiser. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `LTXDenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("latents", required=True, type_hint=torch.Tensor),
            InputParam("dtype", required=True, type_hint=torch.dtype),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        # Latents are kept in float32 between steps; cast to the model dtype per step.
        block_state.latent_model_input = block_state.latents.to(block_state.dtype)
        return components, block_state


class LTXLoopDenoiser(ModularPipelineBlocks):
    """Per-iteration step that runs the transformer under classifier-free guidance."""

    model_name = "ltx"

    def __init__(
        self,
        guider_input_fields: dict[str, Any] | None = None,
    ):
        """
        Args:
            guider_input_fields: Mapping from transformer kwarg name to either a single
                block-state field name or a `(cond, uncond)` pair of field names the
                guider batches over. Defaults to the standard CFG mapping for
                `encoder_hidden_states` / `encoder_attention_mask`.
        """
        # Use None as the default to avoid a shared mutable default argument.
        if guider_input_fields is None:
            guider_input_fields = {
                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
                "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"),
            }
        if not isinstance(guider_input_fields, dict):
            raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}")
        self._guider_input_fields = guider_input_fields
        super().__init__()

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 3.0}),
                default_creation_method="from_config",
            ),
            ComponentSpec("transformer", LTXVideoTransformer3DModel),
        ]

    @property
    def description(self) -> str:
        return (
            "Step within the denoising loop that denoises the latents with guidance. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `LTXDenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> list[InputParam]:
        inputs = [
            InputParam("attention_kwargs"),
            InputParam("num_inference_steps", required=True, type_hint=int),
            InputParam("rope_interpolation_scale", type_hint=tuple),
            InputParam("height", type_hint=int),
            InputParam("width", type_hint=int),
            InputParam("num_frames", type_hint=int),
        ]
        # Every field referenced by the guider mapping becomes a required input.
        guider_input_names = []
        for value in self._guider_input_fields.values():
            if isinstance(value, tuple):
                guider_input_names.extend(value)
            else:
                guider_input_names.append(value)

        for name in guider_input_names:
            inputs.append(InputParam(name=name, required=True, type_hint=torch.Tensor))
        return inputs

    @torch.no_grad()
    def __call__(
        self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
    ) -> PipelineState:
        """Run one guided transformer forward pass and store `noise_pred` on the block state."""
        components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)

        latent_num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1
        latent_height = block_state.height // components.vae_spatial_compression_ratio
        latent_width = block_state.width // components.vae_spatial_compression_ratio

        guider_state = components.guider.prepare_inputs_from_block_state(block_state, self._guider_input_fields)

        # One forward pass per guidance batch (e.g. conditional / unconditional).
        for guider_state_batch in guider_state:
            components.guider.prepare_models(components.transformer)
            cond_kwargs = guider_state_batch.as_dict()
            cond_kwargs = {
                k: v.to(block_state.dtype) if isinstance(v, torch.Tensor) else v
                for k, v in cond_kwargs.items()
                if k in self._guider_input_fields.keys()
            }

            guider_state_batch.noise_pred = components.transformer(
                hidden_states=block_state.latent_model_input,
                timestep=t.expand(block_state.latent_model_input.shape[0]).to(block_state.dtype),
                num_frames=latent_num_frames,
                height=latent_height,
                width=latent_width,
                rope_interpolation_scale=block_state.rope_interpolation_scale,
                attention_kwargs=block_state.attention_kwargs,
                return_dict=False,
                **cond_kwargs,
            )[0]
            components.guider.cleanup_models(components.transformer)

        # Combine the per-batch predictions into the final guided noise prediction.
        block_state.noise_pred = components.guider(guider_state)[0]

        return components, block_state


class LTXLoopAfterDenoiser(ModularPipelineBlocks):
    """Per-iteration step that advances the latents with the scheduler."""

    model_name = "ltx"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
        ]

    @property
    def description(self) -> str:
        return (
            "Step within the denoising loop that updates the latents. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `LTXDenoiseLoopWrapper`)"
        )

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        # The scheduler may upcast internally; restore the original latent dtype after stepping.
        latents_dtype = block_state.latents.dtype
        block_state.latents = components.scheduler.step(
            block_state.noise_pred,
            t,
            block_state.latents,
            return_dict=False,
        )[0]

        if block_state.latents.dtype != latents_dtype:
            block_state.latents = block_state.latents.to(latents_dtype)

        return components, block_state


class LTXDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
    """Loop container that iterates its sub-blocks once per scheduler timestep."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return (
            "Pipeline block that iteratively denoises the latents over `timesteps`. "
            "The specific steps within each iteration can be customized with `sub_blocks` attributes"
        )

    @property
    def loop_expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
            ComponentSpec("transformer", LTXVideoTransformer3DModel),
        ]

    @property
    def loop_inputs(self) -> list[InputParam]:
        return [
            InputParam("timesteps", required=True, type_hint=torch.Tensor),
            InputParam("num_inference_steps", required=True, type_hint=int),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Run `loop_step` for every timestep, updating the progress bar per real step."""
        block_state = self.get_block_state(state)

        # Warmup steps (from higher-order schedulers) don't advance the progress bar.
        block_state.num_warmup_steps = max(
            len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0
        )

        with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
            for i, t in enumerate(block_state.timesteps):
                components, block_state = self.loop_step(components, block_state, i=i, t=t)
                if i == len(block_state.timesteps) - 1 or (
                    (i + 1) > block_state.num_warmup_steps and (i + 1) % components.scheduler.order == 0
                ):
                    progress_bar.update()

        self.set_block_state(state, block_state)
        return components, state


class LTXDenoiseStep(LTXDenoiseLoopWrapper):
    """Preset denoising loop: prepare input → guided denoise → scheduler step."""

    block_classes = [
        LTXLoopBeforeDenoiser,
        LTXLoopDenoiser(
            guider_input_fields={
                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
                "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"),
            }
        ),
        LTXLoopAfterDenoiser,
    ]
    block_names = ["before_denoiser", "denoiser", "after_denoiser"]

    @property
    def description(self) -> str:
        return (
            "Denoise step that iteratively denoises the latents.\n"
            "Its loop logic is defined in `LTXDenoiseLoopWrapper.__call__` method.\n"
            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
            " - `LTXLoopBeforeDenoiser`\n"
            " - `LTXLoopDenoiser`\n"
            " - `LTXLoopAfterDenoiser`\n"
            "This block supports text-to-video tasks."
        )
class LTXTextEncoderStep(ModularPipelineBlocks):
    """Encodes prompts (and optional negative prompts) into T5 embeddings for LTX Video."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return "Text Encoder step that generates text embeddings to guide the video generation"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("text_encoder", T5EncoderModel),
            ComponentSpec("tokenizer", T5TokenizerFast),
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 3.0}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("prompt"),
            InputParam("negative_prompt"),
            InputParam("prompt_embeds", type_hint=torch.Tensor),
            InputParam("prompt_attention_mask", type_hint=torch.Tensor),
            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
            InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor),
            InputParam("max_sequence_length", default=128),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                "prompt_embeds",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="text embeddings used to guide the video generation",
            ),
            OutputParam(
                "prompt_attention_mask",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="attention mask for text embeddings",
            ),
            OutputParam(
                "negative_prompt_embeds",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="negative text embeddings",
            ),
            OutputParam(
                "negative_prompt_attention_mask",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="attention mask for negative text embeddings",
            ),
        ]

    @staticmethod
    def check_inputs(block_state):
        """Validate that `prompt`, if given, is a `str` or a `list`."""
        if block_state.prompt is not None and (
            not isinstance(block_state.prompt, str) and not isinstance(block_state.prompt, list)
        ):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}")

    @staticmethod
    def _get_t5_prompt_embeds(
        components,
        prompt: str | list[str],
        max_sequence_length: int,
        device: torch.device,
        dtype: torch.dtype,
    ):
        """Tokenize `prompt` (padded/truncated to `max_sequence_length`) and run the T5 encoder.

        Returns `(prompt_embeds, prompt_attention_mask)` on `device` with `prompt_embeds`
        cast to `dtype`.
        """
        prompt = [prompt] if isinstance(prompt, str) else prompt

        text_inputs = components.tokenizer(
            prompt,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        prompt_attention_mask = text_inputs.attention_mask
        prompt_attention_mask = prompt_attention_mask.bool().to(device)

        # The attention mask is returned to the caller but not passed to the encoder,
        # matching the reference LTXPipeline behavior.
        prompt_embeds = components.text_encoder(text_input_ids.to(device))[0]
        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        return prompt_embeds, prompt_attention_mask

    @staticmethod
    def encode_prompt(
        components,
        prompt: str,
        device: torch.device | None = None,
        prepare_unconditional_embeds: bool = True,
        negative_prompt: str | None = None,
        max_sequence_length: int = 128,
    ):
        """Encode `prompt` (and, when requested, `negative_prompt`) into T5 embeddings.

        Returns a 4-tuple `(prompt_embeds, prompt_attention_mask, negative_prompt_embeds,
        negative_prompt_attention_mask)`; the negative pair is `None` when
        `prepare_unconditional_embeds` is False.

        Raises:
            TypeError: if `negative_prompt` is not the same type as `prompt`.
            ValueError: if the negative prompt batch size does not match the prompt's.
        """
        device = device or components._execution_device
        dtype = components.text_encoder.dtype

        if not isinstance(prompt, list):
            prompt = [prompt]
        batch_size = len(prompt)

        prompt_embeds, prompt_attention_mask = LTXTextEncoderStep._get_t5_prompt_embeds(
            components=components,
            prompt=prompt,
            max_sequence_length=max_sequence_length,
            device=device,
            dtype=dtype,
        )

        negative_prompt_embeds = None
        negative_prompt_attention_mask = None

        if prepare_unconditional_embeds:
            # A string negative prompt is broadcast to the full batch.
            negative_prompt = negative_prompt or ""
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds, negative_prompt_attention_mask = LTXTextEncoderStep._get_t5_prompt_embeds(
                components=components,
                prompt=negative_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Populate prompt embedding fields on the state, skipping work if already provided."""
        block_state = self.get_block_state(state)
        self.check_inputs(block_state)

        block_state.device = components._execution_device

        # Skip encoding if pre-computed embeddings are provided
        if getattr(block_state, "prompt_embeds", None) is not None:
            self.set_block_state(state, block_state)
            return components, state

        (
            block_state.prompt_embeds,
            block_state.prompt_attention_mask,
            block_state.negative_prompt_embeds,
            block_state.negative_prompt_attention_mask,
        ) = self.encode_prompt(
            components=components,
            prompt=block_state.prompt,
            device=block_state.device,
            prepare_unconditional_embeds=components.requires_unconditional_embeds,
            negative_prompt=block_state.negative_prompt,
            max_sequence_length=block_state.max_sequence_length,
        )

        self.set_block_state(state, block_state)
        return components, state
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import logging +from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam +from .before_denoise import ( + LTXPrepareLatentsStep, + LTXSetTimestepsStep, + LTXTextInputStep, +) +from .decoders import LTXVaeDecoderStep +from .denoise import LTXDenoiseStep +from .encoders import LTXTextEncoderStep + + +logger = logging.get_logger(__name__) + + +class LTXCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "ltx" + block_classes = [ + LTXTextInputStep, + LTXSetTimestepsStep, + LTXPrepareLatentsStep, + LTXDenoiseStep, + ] + block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] + + @property + def description(self): + return "Denoise block that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +class LTXBlocks(SequentialPipelineBlocks): + model_name = "ltx" + block_classes = [ + LTXTextEncoderStep, + LTXCoreDenoiseStep, + LTXVaeDecoderStep, + ] + block_names = ["text_encoder", "denoise", "decode"] + + @property + def description(self): + return "Modular pipeline blocks for LTX Video." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py new file mode 100644 index 000000000000..3cce6845396b --- /dev/null +++ b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py @@ -0,0 +1,64 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...loaders import LTXVideoLoraLoaderMixin +from ...utils import logging +from ..modular_pipeline import ModularPipeline + + +logger = logging.get_logger(__name__) + + +class LTXModularPipeline( + ModularPipeline, + LTXVideoLoraLoaderMixin, +): + """ + A ModularPipeline for LTX Video. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. + """ + + default_blocks_name = "LTXBlocks" + + @property + def vae_spatial_compression_ratio(self): + if getattr(self, "vae", None) is not None: + return self.vae.spatial_compression_ratio + return 32 + + @property + def vae_temporal_compression_ratio(self): + if getattr(self, "vae", None) is not None: + return self.vae.temporal_compression_ratio + return 8 + + @property + def transformer_spatial_patch_size(self): + if getattr(self, "transformer", None) is not None: + return self.transformer.config.patch_size + return 1 + + @property + def transformer_temporal_patch_size(self): + if getattr(self, "transformer", None) is not None: + return self.transformer.config.patch_size_t + return 1 + + @property + def requires_unconditional_embeds(self): + if hasattr(self, "guider") and self.guider is not None: + return self.guider._enabled and self.guider.num_conditions > 1 + return False diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 9cd2f9f5c6ae..ace89f0d6f91 100644 --- 
a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -132,6 +132,7 @@ def _helios_pyramid_map_fn(config_dict=None): ("z-image", _create_default_map_fn("ZImageModularPipeline")), ("helios", _create_default_map_fn("HeliosModularPipeline")), ("helios-pyramid", _helios_pyramid_map_fn), + ("ltx", _create_default_map_fn("LTXModularPipeline")), ] ) From 11b891c124e54a36676cfc258cdb742123e324a4 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 03:22:47 -0700 Subject: [PATCH 02/11] Fix guidance_scale passthrough to guider --- src/diffusers/modular_pipelines/ltx/before_denoise.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index d3673c3ac1f8..176edbe2bb69 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -94,6 +94,7 @@ def expected_components(self) -> list[ComponentSpec]: def inputs(self) -> list[InputParam]: return [ InputParam("num_videos_per_prompt", default=1), + InputParam("guidance_scale", type_hint=float, default=3.0), InputParam("prompt_embeds", required=True, type_hint=torch.Tensor), InputParam("prompt_attention_mask", type_hint=torch.Tensor), InputParam("negative_prompt_embeds", type_hint=torch.Tensor), @@ -111,6 +112,11 @@ def intermediate_outputs(self) -> list[OutputParam]: def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) + # Set guidance_scale on guider so CFG is configured correctly + guidance_scale = getattr(block_state, "guidance_scale", 3.0) + if hasattr(components, "guider") and components.guider is not None: + components.guider.guidance_scale = guidance_scale + block_state.batch_size = block_state.prompt_embeds.shape[0] block_state.dtype = block_state.prompt_embeds.dtype num_videos = 
block_state.num_videos_per_prompt From 4d2d73eda5a3db7c34f61769cd4e627977938a54 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 07:04:20 -0700 Subject: [PATCH 03/11] Add LTX modular pipeline tests --- tests/modular_pipelines/ltx/__init__.py | 0 .../ltx/test_modular_pipeline_ltx.py | 49 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tests/modular_pipelines/ltx/__init__.py create mode 100644 tests/modular_pipelines/ltx/test_modular_pipeline_ltx.py diff --git a/tests/modular_pipelines/ltx/__init__.py b/tests/modular_pipelines/ltx/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/modular_pipelines/ltx/test_modular_pipeline_ltx.py b/tests/modular_pipelines/ltx/test_modular_pipeline_ltx.py new file mode 100644 index 000000000000..00e68d26fdee --- /dev/null +++ b/tests/modular_pipelines/ltx/test_modular_pipeline_ltx.py @@ -0,0 +1,49 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from diffusers.modular_pipelines import LTXBlocks, LTXModularPipeline + +from ..test_modular_pipelines_common import ModularPipelineTesterMixin + + +class TestLTXModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = LTXModularPipeline + pipeline_blocks_class = LTXBlocks + pretrained_model_name_or_path = "akshan-main/tiny-ltx-modular-pipe" + + params = frozenset(["prompt", "height", "width", "num_frames"]) + batch_params = frozenset(["prompt"]) + optional_params = frozenset(["num_inference_steps", "num_videos_per_prompt", "latents"]) + output_name = "videos" + + def get_dummy_inputs(self, seed=0): + generator = self.get_generator(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "height": 32, + "width": 32, + "num_frames": 9, + "max_sequence_length": 16, + "output_type": "pt", + } + return inputs + + @pytest.mark.skip(reason="num_videos_per_prompt") + def test_num_images_per_prompt(self): + pass From 7b645e6953c4a71f323caf851c7813c017eab2e2 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 07:41:47 -0700 Subject: [PATCH 04/11] Add LTX image-to-video modular pipeline --- src/diffusers/__init__.py | 2 + src/diffusers/modular_pipelines/__init__.py | 4 +- .../modular_pipelines/ltx/__init__.py | 8 +- .../modular_pipelines/ltx/before_denoise.py | 126 +++++++++++ .../modular_pipelines/ltx/denoise.py | 206 ++++++++++++++++++ .../ltx/modular_blocks_ltx.py | 42 +++- .../modular_pipelines/ltx/modular_pipeline.py | 10 + .../modular_pipelines/modular_pipeline.py | 1 + 8 files changed, 392 insertions(+), 7 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 637bf2685824..a1a82974eb50 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -444,6 +444,8 @@ "HeliosPyramidDistilledAutoBlocks", "HeliosPyramidDistilledModularPipeline", "LTXBlocks", + "LTXImage2VideoBlocks", + 
"LTXImage2VideoModularPipeline", "LTXModularPipeline", "HeliosPyramidModularPipeline", "QwenImageAutoBlocks", diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 389a5416f3ea..967401ba6e57 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -90,6 +90,8 @@ ] _import_structure["ltx"] = [ "LTXBlocks", + "LTXImage2VideoBlocks", + "LTXImage2VideoModularPipeline", "LTXModularPipeline", ] _import_structure["z_image"] = [ @@ -145,7 +147,7 @@ QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline - from .ltx import LTXBlocks, LTXModularPipeline + from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXImage2VideoModularPipeline, LTXModularPipeline from .wan import ( Wan22Blocks, Wan22Image2VideoBlocks, diff --git a/src/diffusers/modular_pipelines/ltx/__init__.py b/src/diffusers/modular_pipelines/ltx/__init__.py index 019fa96fef14..3939db1ac9d8 100644 --- a/src/diffusers/modular_pipelines/ltx/__init__.py +++ b/src/diffusers/modular_pipelines/ltx/__init__.py @@ -21,8 +21,8 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["modular_blocks_ltx"] = ["LTXBlocks"] - _import_structure["modular_pipeline"] = ["LTXModularPipeline"] + _import_structure["modular_blocks_ltx"] = ["LTXBlocks", "LTXImage2VideoBlocks"] + _import_structure["modular_pipeline"] = ["LTXModularPipeline", "LTXImage2VideoModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -31,8 +31,8 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .modular_blocks_ltx import LTXBlocks - from .modular_pipeline import LTXModularPipeline + from .modular_blocks_ltx import LTXBlocks, LTXImage2VideoBlocks + from .modular_pipeline import LTXImage2VideoModularPipeline, LTXModularPipeline else: import sys 
diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 176edbe2bb69..056cf8d5ad80 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -285,3 +285,129 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe self.set_block_state(state, block_state) return components, state + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +class LTXImage2VideoPrepareLatentsStep(ModularPipelineBlocks): + model_name = "ltx" + + @property + def description(self) -> str: + return "Prepare latents step for image-to-video: encodes the first frame and creates a conditioning mask" + + @property + def expected_components(self) -> list[ComponentSpec]: + from ...models import AutoencoderKLLTXVideo + return [ + ComponentSpec("vae", AutoencoderKLLTXVideo), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("image", required=True), + InputParam("height", type_hint=int, default=512), + InputParam("width", type_hint=int, default=704), + InputParam("num_frames", type_hint=int, default=161), + InputParam("latents", type_hint=torch.Tensor | None), + InputParam("num_videos_per_prompt", type_hint=int, default=1), + InputParam("generator"), + InputParam("batch_size", required=True, type_hint=int), + 
InputParam("dtype", type_hint=torch.dtype), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("latents", type_hint=torch.Tensor), + OutputParam("conditioning_mask", type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + batch_size = block_state.batch_size * block_state.num_videos_per_prompt + num_channels_latents = components.transformer.config.in_channels + + height = block_state.height // components.vae_spatial_compression_ratio + width = block_state.width // components.vae_spatial_compression_ratio + num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 + + shape = (batch_size, num_channels_latents, num_frames, height, width) + mask_shape = (batch_size, 1, num_frames, height, width) + + if block_state.latents is not None: + conditioning_mask = block_state.latents.new_zeros(mask_shape) + conditioning_mask[:, :, 0] = 1.0 + conditioning_mask = LTXPipeline._pack_latents( + conditioning_mask, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ).squeeze(-1) + block_state.latents = block_state.latents.to(device=device, dtype=torch.float32) + block_state.conditioning_mask = conditioning_mask + self.set_block_state(state, block_state) + return components, state + + image = block_state.image + if not isinstance(image, torch.Tensor): + from ...video_processor import VideoProcessor + processor = VideoProcessor(vae_scale_factor=components.vae_spatial_compression_ratio) + image = processor.preprocess(image, height=block_state.height, width=block_state.width) + image = image.to(device=device, dtype=torch.float32) + + if isinstance(block_state.generator, list): + init_latents = [ + retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2)), 
block_state.generator[i]) + for i in range(batch_size) + ] + else: + init_latents = [ + retrieve_latents(components.vae.encode(img.unsqueeze(0).unsqueeze(2)), block_state.generator) + for img in image + ] + + init_latents = torch.cat(init_latents, dim=0).to(torch.float32) + init_latents = LTXPipeline._normalize_latents( + init_latents, components.vae.latents_mean, components.vae.latents_std + ) + init_latents = init_latents.repeat(1, 1, num_frames, 1, 1) + + actual_mask_shape = (init_latents.shape[0], 1, init_latents.shape[2], init_latents.shape[3], init_latents.shape[4]) + conditioning_mask = torch.zeros(actual_mask_shape, device=device, dtype=torch.float32) + conditioning_mask[:, :, 0] = 1.0 + + noise = randn_tensor(init_latents.shape, generator=block_state.generator, device=device, dtype=torch.float32) + latents = init_latents * conditioning_mask + noise * (1 - conditioning_mask) + + conditioning_mask = LTXPipeline._pack_latents( + conditioning_mask, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ).squeeze(-1) + latents = LTXPipeline._pack_latents( + latents, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ) + + block_state.latents = latents + block_state.conditioning_mask = conditioning_mask + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index 184aeb2d1b72..04ff496f8477 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -254,3 +254,209 @@ def description(self) -> str: " - `LTXLoopAfterDenoiser`\n" "This block supports text-to-video tasks." 
) + + +class LTXImage2VideoLoopBeforeDenoiser(ModularPipelineBlocks): + model_name = "ltx" + + @property + def description(self) -> str: + return ( + "Step within the i2v denoising loop that prepares the latent input and modulates " + "the timestep with the conditioning mask." + ) + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("latents", required=True, type_hint=torch.Tensor), + InputParam("conditioning_mask", required=True, type_hint=torch.Tensor), + InputParam("dtype", required=True, type_hint=torch.dtype), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + block_state.latent_model_input = block_state.latents.to(block_state.dtype) + block_state.timestep_adjusted = t.expand(block_state.latent_model_input.shape[0]).unsqueeze(-1) * ( + 1 - block_state.conditioning_mask + ) + return components, block_state + + +class LTXImage2VideoLoopDenoiser(ModularPipelineBlocks): + model_name = "ltx" + + def __init__( + self, + guider_input_fields: dict[str, Any] = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), + }, + ): + if not isinstance(guider_input_fields, dict): + raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}") + self._guider_input_fields = guider_input_fields + super().__init__() + + @property + def expected_components(self) -> list[ComponentSpec]: + from ...configuration_utils import FrozenDict + from ...guiders import ClassifierFreeGuidance + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 3.0}), + default_creation_method="from_config", + ), + ComponentSpec("transformer", LTXVideoTransformer3DModel), + ] + + @property + def description(self) -> str: + return ( + "Step within the i2v denoising loop that denoises the latents with guidance " + 
"using timestep modulated by the conditioning mask." + ) + + @property + def inputs(self) -> list[tuple[str, Any]]: + inputs = [ + InputParam("attention_kwargs"), + InputParam("num_inference_steps", required=True, type_hint=int), + InputParam("rope_interpolation_scale", type_hint=tuple), + InputParam("height", type_hint=int), + InputParam("width", type_hint=int), + InputParam("num_frames", type_hint=int), + ] + guider_input_names = [] + for value in self._guider_input_fields.values(): + if isinstance(value, tuple): + guider_input_names.extend(value) + else: + guider_input_names.append(value) + for name in guider_input_names: + inputs.append(InputParam(name=name, required=True, type_hint=torch.Tensor)) + return inputs + + @torch.no_grad() + def __call__( + self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + + latent_num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 + latent_height = block_state.height // components.vae_spatial_compression_ratio + latent_width = block_state.width // components.vae_spatial_compression_ratio + + guider_state = components.guider.prepare_inputs_from_block_state(block_state, self._guider_input_fields) + + for guider_state_batch in guider_state: + components.guider.prepare_models(components.transformer) + cond_kwargs = guider_state_batch.as_dict() + cond_kwargs = { + k: v.to(block_state.dtype) if isinstance(v, torch.Tensor) else v + for k, v in cond_kwargs.items() + if k in self._guider_input_fields.keys() + } + + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + timestep=block_state.timestep_adjusted, + num_frames=latent_num_frames, + height=latent_height, + width=latent_width, + rope_interpolation_scale=block_state.rope_interpolation_scale, + 
attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] + components.guider.cleanup_models(components.transformer) + + block_state.noise_pred = components.guider(guider_state)[0] + + return components, block_state + + +class LTXImage2VideoLoopAfterDenoiser(ModularPipelineBlocks): + model_name = "ltx" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ] + + @property + def description(self) -> str: + return ( + "Step within the i2v denoising loop that updates the latents, " + "applying the scheduler step only to frames after the first (conditioned) frame." + ) + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("height", type_hint=int), + InputParam("width", type_hint=int), + InputParam("num_frames", type_hint=int), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + from ...pipelines.ltx.pipeline_ltx import LTXPipeline + + latent_num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 + latent_height = block_state.height // components.vae_spatial_compression_ratio + latent_width = block_state.width // components.vae_spatial_compression_ratio + + noise_pred = LTXPipeline._unpack_latents( + block_state.noise_pred, + latent_num_frames, latent_height, latent_width, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ) + latents = LTXPipeline._unpack_latents( + block_state.latents, + latent_num_frames, latent_height, latent_width, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ) + + noise_pred = noise_pred[:, :, 1:] + noise_latents = latents[:, :, 1:] + pred_latents = components.scheduler.step(noise_pred, t, noise_latents, return_dict=False)[0] + + latents = torch.cat([latents[:, :, :1], pred_latents], dim=2) + 
block_state.latents = LTXPipeline._pack_latents( + latents, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ) + + return components, block_state + + +class LTXImage2VideoDenoiseStep(LTXDenoiseLoopWrapper): + block_classes = [ + LTXImage2VideoLoopBeforeDenoiser, + LTXImage2VideoLoopDenoiser( + guider_input_fields={ + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), + } + ), + LTXImage2VideoLoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step for image-to-video that iteratively denoises the latents.\n" + "The first frame is kept fixed via a conditioning mask.\n" + "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n" + " - `LTXImage2VideoLoopBeforeDenoiser`\n" + " - `LTXImage2VideoLoopDenoiser`\n" + " - `LTXImage2VideoLoopAfterDenoiser`" + ) diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index f6871864aa2d..290e75051ea0 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -16,12 +16,13 @@ from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import OutputParam from .before_denoise import ( + LTXImage2VideoPrepareLatentsStep, LTXPrepareLatentsStep, LTXSetTimestepsStep, LTXTextInputStep, ) from .decoders import LTXVaeDecoderStep -from .denoise import LTXDenoiseStep +from .denoise import LTXDenoiseStep, LTXImage2VideoDenoiseStep from .encoders import LTXTextEncoderStep @@ -47,6 +48,25 @@ def outputs(self): return [OutputParam.template("latents")] +class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "ltx" + block_classes = [ + LTXTextInputStep, + LTXSetTimestepsStep, 
+ LTXImage2VideoPrepareLatentsStep, + LTXImage2VideoDenoiseStep, + ] + block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] + + @property + def description(self): + return "Denoise block for image-to-video that takes encoded conditions and an image, and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + class LTXBlocks(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ @@ -58,7 +78,25 @@ class LTXBlocks(SequentialPipelineBlocks): @property def description(self): - return "Modular pipeline blocks for LTX Video." + return "Modular pipeline blocks for LTX Video text-to-video." + + @property + def outputs(self): + return [OutputParam.template("videos")] + + +class LTXImage2VideoBlocks(SequentialPipelineBlocks): + model_name = "ltx" + block_classes = [ + LTXTextEncoderStep, + LTXImage2VideoCoreDenoiseStep, + LTXVaeDecoderStep, + ] + block_names = ["text_encoder", "denoise", "decode"] + + @property + def description(self): + return "Modular pipeline blocks for LTX Video image-to-video." @property def outputs(self): diff --git a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py index 3cce6845396b..9f4d1b45e93a 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py @@ -62,3 +62,13 @@ def requires_unconditional_embeds(self): if hasattr(self, "guider") and self.guider is not None: return self.guider._enabled and self.guider.num_conditions > 1 return False + + +class LTXImage2VideoModularPipeline(LTXModularPipeline): + """ + A ModularPipeline for LTX Video image-to-video. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
+ """ + + default_blocks_name = "LTXImage2VideoBlocks" diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index ace89f0d6f91..07636e95191b 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -133,6 +133,7 @@ def _helios_pyramid_map_fn(config_dict=None): ("helios", _create_default_map_fn("HeliosModularPipeline")), ("helios-pyramid", _helios_pyramid_map_fn), ("ltx", _create_default_map_fn("LTXModularPipeline")), + ("ltx-i2v", _create_default_map_fn("LTXImage2VideoModularPipeline")), ] ) From 7491d56e94d0d6869a91e2c2e2f12f58f205ed63 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 07:56:02 -0700 Subject: [PATCH 05/11] Fix i2v VAE dtype mismatch --- src/diffusers/modular_pipelines/ltx/before_denoise.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 056cf8d5ad80..6333dd493d8c 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -371,14 +371,16 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe image = processor.preprocess(image, height=block_state.height, width=block_state.width) image = image.to(device=device, dtype=torch.float32) + vae_dtype = components.vae.dtype + if isinstance(block_state.generator, list): init_latents = [ - retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2)), block_state.generator[i]) + retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i]) for i in range(batch_size) ] else: init_latents = [ - retrieve_latents(components.vae.encode(img.unsqueeze(0).unsqueeze(2)), block_state.generator) + 
retrieve_latents(components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator) for img in image ] From 1e53507166e3cd82d4db7696aa9511cf33d8b8e6 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 11:35:37 -0700 Subject: [PATCH 06/11] Add cache_context to denoiser for CFG parity --- .../modular_pipelines/ltx/denoise.py | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index 04ff496f8477..40c05fafcd74 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -135,17 +135,19 @@ def __call__( if k in self._guider_input_fields.keys() } - guider_state_batch.noise_pred = components.transformer( - hidden_states=block_state.latent_model_input, - timestep=t.expand(block_state.latent_model_input.shape[0]).to(block_state.dtype), - num_frames=latent_num_frames, - height=latent_height, - width=latent_width, - rope_interpolation_scale=block_state.rope_interpolation_scale, - attention_kwargs=block_state.attention_kwargs, - return_dict=False, - **cond_kwargs, - )[0] + context_name = getattr(guider_state_batch, components.guider._identifier_key, None) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + timestep=t.expand(block_state.latent_model_input.shape[0]).to(block_state.dtype), + num_frames=latent_num_frames, + height=latent_height, + width=latent_width, + rope_interpolation_scale=block_state.rope_interpolation_scale, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] components.guider.cleanup_models(components.transformer) block_state.noise_pred = components.guider(guider_state)[0] @@ -360,17 +362,19 @@ def __call__( if k in self._guider_input_fields.keys() } - 
guider_state_batch.noise_pred = components.transformer( - hidden_states=block_state.latent_model_input, - timestep=block_state.timestep_adjusted, - num_frames=latent_num_frames, - height=latent_height, - width=latent_width, - rope_interpolation_scale=block_state.rope_interpolation_scale, - attention_kwargs=block_state.attention_kwargs, - return_dict=False, - **cond_kwargs, - )[0] + context_name = getattr(guider_state_batch, components.guider._identifier_key, None) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + timestep=block_state.timestep_adjusted, + num_frames=latent_num_frames, + height=latent_height, + width=latent_width, + rope_interpolation_scale=block_state.rope_interpolation_scale, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] components.guider.cleanup_models(components.transformer) block_state.noise_pred = components.guider(guider_state)[0] From 322727d27ad84a4b77e5f4374fdb464f54df1e8b Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 12:29:54 -0700 Subject: [PATCH 07/11] Address review feedback --- src/diffusers/__init__.py | 6 +++--- src/diffusers/modular_pipelines/__init__.py | 3 +-- .../modular_pipelines/ltx/__init__.py | 4 ++-- .../modular_pipelines/ltx/before_denoise.py | 5 ++++- .../modular_pipelines/ltx/decoders.py | 3 ++- .../modular_pipelines/ltx/denoise.py | 20 +++++++++++-------- .../modular_pipelines/ltx/encoders.py | 11 ++++------ .../ltx/modular_blocks_ltx.py | 4 ++++ .../modular_pipelines/ltx/modular_pipeline.py | 10 ---------- .../modular_pipelines/modular_pipeline.py | 1 - 10 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a1a82974eb50..6c09a59bb8f9 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -443,11 +443,10 @@ "HeliosPyramidAutoBlocks", 
"HeliosPyramidDistilledAutoBlocks", "HeliosPyramidDistilledModularPipeline", + "HeliosPyramidModularPipeline", "LTXBlocks", "LTXImage2VideoBlocks", - "LTXImage2VideoModularPipeline", "LTXModularPipeline", - "HeliosPyramidModularPipeline", "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", "QwenImageEditModularPipeline", @@ -1214,9 +1213,10 @@ HeliosPyramidAutoBlocks, HeliosPyramidDistilledAutoBlocks, HeliosPyramidDistilledModularPipeline, + HeliosPyramidModularPipeline, LTXBlocks, + LTXImage2VideoBlocks, LTXModularPipeline, - HeliosPyramidModularPipeline, QwenImageAutoBlocks, QwenImageEditAutoBlocks, QwenImageEditModularPipeline, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 967401ba6e57..f34ff1cedcbb 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -91,7 +91,6 @@ _import_structure["ltx"] = [ "LTXBlocks", "LTXImage2VideoBlocks", - "LTXImage2VideoModularPipeline", "LTXModularPipeline", ] _import_structure["z_image"] = [ @@ -147,7 +146,7 @@ QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline - from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXImage2VideoModularPipeline, LTXModularPipeline + from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXModularPipeline from .wan import ( Wan22Blocks, Wan22Image2VideoBlocks, diff --git a/src/diffusers/modular_pipelines/ltx/__init__.py b/src/diffusers/modular_pipelines/ltx/__init__.py index 3939db1ac9d8..6be74e6b4112 100644 --- a/src/diffusers/modular_pipelines/ltx/__init__.py +++ b/src/diffusers/modular_pipelines/ltx/__init__.py @@ -22,7 +22,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: _import_structure["modular_blocks_ltx"] = ["LTXBlocks", "LTXImage2VideoBlocks"] - _import_structure["modular_pipeline"] = ["LTXModularPipeline", "LTXImage2VideoModularPipeline"] + 
_import_structure["modular_pipeline"] = ["LTXModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -32,7 +32,7 @@ from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: from .modular_blocks_ltx import LTXBlocks, LTXImage2VideoBlocks - from .modular_pipeline import LTXImage2VideoModularPipeline, LTXModularPipeline + from .modular_pipeline import LTXModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 6333dd493d8c..808a1fdd524f 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -373,10 +373,11 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe vae_dtype = components.vae.dtype + num_images = image.shape[0] if isinstance(block_state.generator, list): init_latents = [ retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i]) - for i in range(batch_size) + for i in range(num_images) ] else: init_latents = [ @@ -385,6 +386,8 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe ] init_latents = torch.cat(init_latents, dim=0).to(torch.float32) + if init_latents.shape[0] < batch_size: + init_latents = init_latents.repeat_interleave(batch_size // init_latents.shape[0], dim=0) init_latents = LTXPipeline._normalize_latents( init_latents, components.vae.latents_mean, components.vae.latents_std ) diff --git a/src/diffusers/modular_pipelines/ltx/decoders.py b/src/diffusers/modular_pipelines/ltx/decoders.py index eca22a5797a7..d7c85171c091 100644 --- a/src/diffusers/modular_pipelines/ltx/decoders.py +++ b/src/diffusers/modular_pipelines/ltx/decoders.py @@ -62,6 +62,7 @@ def inputs(self) -> list[tuple[str, Any]]: InputParam("decode_noise_scale", default=None), InputParam("generator"), InputParam("batch_size", type_hint=int, default=1), + 
InputParam("dtype", required=True, type_hint=torch.dtype), ] @property @@ -105,7 +106,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: latents = LTXPipeline._denormalize_latents( latents, vae.latents_mean, vae.latents_std, vae.config.scaling_factor ) - latents = latents.to(block_state.dtype if hasattr(block_state, 'dtype') else torch.float32) + latents = latents.to(block_state.dtype) if not vae.config.timestep_conditioning: timestep = None diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index 40c05fafcd74..bd25b6400aa6 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -63,11 +63,13 @@ class LTXLoopDenoiser(ModularPipelineBlocks): def __init__( self, - guider_input_fields: dict[str, Any] = { - "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), - "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), - }, + guider_input_fields: dict[str, Any] | None = None, ): + if guider_input_fields is None: + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), + } if not isinstance(guider_input_fields, dict): raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}") self._guider_input_fields = guider_input_fields @@ -290,11 +292,13 @@ class LTXImage2VideoLoopDenoiser(ModularPipelineBlocks): def __init__( self, - guider_input_fields: dict[str, Any] = { - "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), - "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), - }, + guider_input_fields: dict[str, Any] | None = None, ): + if guider_input_fields is None: + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + 
"encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), + } if not isinstance(guider_input_fields, dict): raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}") self._guider_input_fields = guider_input_fields diff --git a/src/diffusers/modular_pipelines/ltx/encoders.py b/src/diffusers/modular_pipelines/ltx/encoders.py index c1310fdbd6da..91e85e57009c 100644 --- a/src/diffusers/modular_pipelines/ltx/encoders.py +++ b/src/diffusers/modular_pipelines/ltx/encoders.py @@ -153,12 +153,7 @@ def encode_prompt( negative_prompt = negative_prompt or "" negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif batch_size != len(negative_prompt): + if batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" @@ -183,7 +178,9 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe block_state.device = components._execution_device # Skip encoding if pre-computed embeddings are provided - if getattr(block_state, "prompt_embeds", None) is not None: + has_prompt_embeds = getattr(block_state, "prompt_embeds", None) is not None + has_negative = getattr(block_state, "negative_prompt_embeds", None) is not None + if has_prompt_embeds and (has_negative or not components.requires_unconditional_embeds): self.set_block_state(state, block_state) return components, state diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index 290e75051ea0..3c7f85424926 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -29,6 +29,7 @@ logger = logging.get_logger(__name__) +# auto_docstring class LTXCoreDenoiseStep(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ @@ -48,6 +49,7 @@ def outputs(self): return [OutputParam.template("latents")] +# auto_docstring class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ @@ -67,6 +69,7 @@ def outputs(self): return [OutputParam.template("latents")] +# auto_docstring class LTXBlocks(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ @@ -85,6 +88,7 @@ def outputs(self): return [OutputParam.template("videos")] +# auto_docstring class LTXImage2VideoBlocks(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ diff --git a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py index 9f4d1b45e93a..3cce6845396b 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py @@ -62,13 +62,3 @@ def 
requires_unconditional_embeds(self): if hasattr(self, "guider") and self.guider is not None: return self.guider._enabled and self.guider.num_conditions > 1 return False - - -class LTXImage2VideoModularPipeline(LTXModularPipeline): - """ - A ModularPipeline for LTX Video image-to-video. - - > [!WARNING] > This is an experimental feature and is likely to change in the future. - """ - - default_blocks_name = "LTXImage2VideoBlocks" diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 07636e95191b..ace89f0d6f91 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -133,7 +133,6 @@ def _helios_pyramid_map_fn(config_dict=None): ("helios", _create_default_map_fn("HeliosModularPipeline")), ("helios-pyramid", _helios_pyramid_map_fn), ("ltx", _create_default_map_fn("LTXModularPipeline")), - ("ltx-i2v", _create_default_map_fn("LTXImage2VideoModularPipeline")), ] ) From 4b644f7cdef5b8120437eaf2d2db37b7cb005270 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Thu, 2 Apr 2026 17:15:33 -0700 Subject: [PATCH 08/11] Generate auto docstrings for LTX assembled blocks --- .../ltx/modular_blocks_ltx.py | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index 3c7f85424926..b4b9b1b4255b 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -31,6 +31,53 @@ # auto_docstring class LTXCoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block that takes encoded conditions and runs the denoising process. 
+ + Components: + transformer (`LTXVideoTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + guidance_scale (`float`, *optional*, defaults to 3.0): + TODO: Add description. + prompt_embeds (`Tensor`): + TODO: Add description. + prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 512): + TODO: Add description. + width (`int`, *optional*, defaults to 704): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 161): + TODO: Add description. + frame_rate (`int`, *optional*, defaults to 25): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "ltx" block_classes = [ LTXTextInputStep, @@ -51,6 +98,56 @@ def outputs(self): # auto_docstring class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block for image-to-video that takes encoded conditions and an image, and runs the denoising process. + + Components: + transformer (`LTXVideoTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + vae (`AutoencoderKLLTXVideo`) + guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. 
+ guidance_scale (`float`, *optional*, defaults to 3.0): + TODO: Add description. + prompt_embeds (`Tensor`): + TODO: Add description. + prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 512): + TODO: Add description. + width (`int`, *optional*, defaults to 704): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 161): + TODO: Add description. + frame_rate (`int`, *optional*, defaults to 25): + TODO: Add description. + image (`None`): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "ltx" block_classes = [ LTXTextInputStep, @@ -71,6 +168,69 @@ def outputs(self): # auto_docstring class LTXBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for LTX Video text-to-video. + + Components: + text_encoder (`T5EncoderModel`) + tokenizer (`T5TokenizerFast`) + guider (`ClassifierFreeGuidance`) + transformer (`LTXVideoTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + vae (`AutoencoderKLLTXVideo`) + video_processor (`VideoProcessor`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. 
+ negative_prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 128): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + guidance_scale (`float`, *optional*, defaults to 3.0): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 512): + TODO: Add description. + width (`int`, *optional*, defaults to 704): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 161): + TODO: Add description. + frame_rate (`int`, *optional*, defaults to 25): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + TODO: Add description. + decode_timestep (`None`, *optional*, defaults to 0.0): + TODO: Add description. + decode_noise_scale (`None`, *optional*): + TODO: Add description. + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "ltx" block_classes = [ LTXTextEncoderStep, @@ -90,6 +250,71 @@ def outputs(self): # auto_docstring class LTXImage2VideoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for LTX Video image-to-video. + + Components: + text_encoder (`T5EncoderModel`) + tokenizer (`T5TokenizerFast`) + guider (`ClassifierFreeGuidance`) + transformer (`LTXVideoTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + vae (`AutoencoderKLLTXVideo`) + video_processor (`VideoProcessor`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. 
+ prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 128): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + guidance_scale (`float`, *optional*, defaults to 3.0): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 512): + TODO: Add description. + width (`int`, *optional*, defaults to 704): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 161): + TODO: Add description. + frame_rate (`int`, *optional*, defaults to 25): + TODO: Add description. + image (`None`): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + TODO: Add description. + decode_timestep (`None`, *optional*, defaults to 0.0): + TODO: Add description. + decode_noise_scale (`None`, *optional*): + TODO: Add description. + + Outputs: + videos (`list`): + The generated videos. 
+ """ + model_name = "ltx" block_classes = [ LTXTextEncoderStep, From 3da70da038caec7948492e539ed08aa9b3f7713f Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Thu, 2 Apr 2026 17:20:58 -0700 Subject: [PATCH 09/11] Fix ruff lint and format issues --- .../modular_pipelines/ltx/before_denoise.py | 32 +++++++++++++------ .../modular_pipelines/ltx/denoise.py | 9 ++++-- .../modular_pipelines/ltx/encoders.py | 1 - 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 808a1fdd524f..25f80ccd473f 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -124,9 +124,7 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe # Repeat prompt_embeds for num_videos_per_prompt _, seq_len, _ = block_state.prompt_embeds.shape block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, num_videos, 1) - block_state.prompt_embeds = block_state.prompt_embeds.view( - block_state.batch_size * num_videos, seq_len, -1 - ) + block_state.prompt_embeds = block_state.prompt_embeds.view(block_state.batch_size * num_videos, seq_len, -1) if block_state.prompt_attention_mask is not None: block_state.prompt_attention_mask = block_state.prompt_attention_mask.repeat(num_videos, 1) @@ -139,7 +137,9 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe ) if block_state.negative_prompt_attention_mask is not None: - block_state.negative_prompt_attention_mask = block_state.negative_prompt_attention_mask.repeat(num_videos, 1) + block_state.negative_prompt_attention_mask = block_state.negative_prompt_attention_mask.repeat( + num_videos, 1 + ) self.set_block_state(state, block_state) return components, state @@ -276,7 +276,9 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe num_frames = (block_state.num_frames - 1) // 
components.vae_temporal_compression_ratio + 1 shape = (batch_size, num_channels_latents, num_frames, height, width) - block_state.latents = randn_tensor(shape, generator=block_state.generator, device=device, dtype=torch.float32) + block_state.latents = randn_tensor( + shape, generator=block_state.generator, device=device, dtype=torch.float32 + ) block_state.latents = LTXPipeline._pack_latents( block_state.latents, components.transformer_spatial_patch_size, @@ -311,6 +313,7 @@ def description(self) -> str: @property def expected_components(self) -> list[ComponentSpec]: from ...models import AutoencoderKLLTXVideo + return [ ComponentSpec("vae", AutoencoderKLLTXVideo), ] @@ -342,13 +345,11 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe device = components._execution_device batch_size = block_state.batch_size * block_state.num_videos_per_prompt - num_channels_latents = components.transformer.config.in_channels height = block_state.height // components.vae_spatial_compression_ratio width = block_state.width // components.vae_spatial_compression_ratio num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 - shape = (batch_size, num_channels_latents, num_frames, height, width) mask_shape = (batch_size, 1, num_frames, height, width) if block_state.latents is not None: @@ -367,6 +368,7 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe image = block_state.image if not isinstance(image, torch.Tensor): from ...video_processor import VideoProcessor + processor = VideoProcessor(vae_scale_factor=components.vae_spatial_compression_ratio) image = processor.preprocess(image, height=block_state.height, width=block_state.width) image = image.to(device=device, dtype=torch.float32) @@ -376,12 +378,16 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe num_images = image.shape[0] if isinstance(block_state.generator, list): init_latents = [ - 
retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i]) + retrieve_latents( + components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i] + ) for i in range(num_images) ] else: init_latents = [ - retrieve_latents(components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator) + retrieve_latents( + components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator + ) for img in image ] @@ -393,7 +399,13 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe ) init_latents = init_latents.repeat(1, 1, num_frames, 1, 1) - actual_mask_shape = (init_latents.shape[0], 1, init_latents.shape[2], init_latents.shape[3], init_latents.shape[4]) + actual_mask_shape = ( + init_latents.shape[0], + 1, + init_latents.shape[2], + init_latents.shape[3], + init_latents.shape[4], + ) conditioning_mask = torch.zeros(actual_mask_shape, device=device, dtype=torch.float32) conditioning_mask[:, :, 0] = 1.0 diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index bd25b6400aa6..eb6dbeeeee82 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -308,6 +308,7 @@ def __init__( def expected_components(self) -> list[ComponentSpec]: from ...configuration_utils import FrozenDict from ...guiders import ClassifierFreeGuidance + return [ ComponentSpec( "guider", @@ -420,13 +421,17 @@ def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: i noise_pred = LTXPipeline._unpack_latents( block_state.noise_pred, - latent_num_frames, latent_height, latent_width, + latent_num_frames, + latent_height, + latent_width, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ) latents = LTXPipeline._unpack_latents( block_state.latents, - latent_num_frames, latent_height, 
latent_width, + latent_num_frames, + latent_height, + latent_width, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ) diff --git a/src/diffusers/modular_pipelines/ltx/encoders.py b/src/diffusers/modular_pipelines/ltx/encoders.py index 91e85e57009c..9f15d33b0e18 100644 --- a/src/diffusers/modular_pipelines/ltx/encoders.py +++ b/src/diffusers/modular_pipelines/ltx/encoders.py @@ -103,7 +103,6 @@ def _get_t5_prompt_embeds( dtype: torch.dtype, ): prompt = [prompt] if isinstance(prompt, str) else prompt - batch_size = len(prompt) text_inputs = components.tokenizer( prompt, From 38cfc86d4708ac7a6f2f7cef497c749affa45dac Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Thu, 2 Apr 2026 19:53:36 -0700 Subject: [PATCH 10/11] use InputParam/OutputParam templates and ruff check --- src/diffusers/modular_pipelines/__init__.py | 2 +- .../modular_pipelines/ltx/before_denoise.py | 54 ++--- .../modular_pipelines/ltx/decoders.py | 24 +- .../modular_pipelines/ltx/denoise.py | 32 +-- .../modular_pipelines/ltx/encoders.py | 42 +--- .../ltx/modular_blocks_ltx.py | 208 +++++++++--------- 6 files changed, 167 insertions(+), 195 deletions(-) diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index f34ff1cedcbb..c76861df96d4 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -124,6 +124,7 @@ HeliosPyramidDistilledModularPipeline, HeliosPyramidModularPipeline, ) + from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXModularPipeline from .modular_pipeline import ( AutoPipelineBlocks, BlockState, @@ -146,7 +147,6 @@ QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline - from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXModularPipeline from .wan import ( Wan22Blocks, Wan22Image2VideoBlocks, diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py 
b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 25f80ccd473f..cca52af9248e 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -93,12 +93,12 @@ def expected_components(self) -> list[ComponentSpec]: @property def inputs(self) -> list[InputParam]: return [ - InputParam("num_videos_per_prompt", default=1), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), InputParam("guidance_scale", type_hint=float, default=3.0), - InputParam("prompt_embeds", required=True, type_hint=torch.Tensor), - InputParam("prompt_attention_mask", type_hint=torch.Tensor), - InputParam("negative_prompt_embeds", type_hint=torch.Tensor), - InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor), + InputParam.template("prompt_embeds", required=True), + InputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask", name="negative_prompt_attention_mask"), ] @property @@ -161,11 +161,11 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - InputParam("num_inference_steps", default=50), - InputParam("timesteps"), - InputParam("sigmas"), - InputParam("height", type_hint=int, default=512), - InputParam("width", type_hint=int, default=704), + InputParam.template("num_inference_steps"), + InputParam.template("timesteps"), + InputParam.template("sigmas"), + InputParam.template("height", default=512), + InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), InputParam("frame_rate", type_hint=int, default=25), ] @@ -244,20 +244,20 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - InputParam("height", type_hint=int, default=512), - InputParam("width", type_hint=int, default=704), + InputParam.template("height", default=512), + 
InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), - InputParam("latents", type_hint=torch.Tensor | None), - InputParam("num_videos_per_prompt", type_hint=int, default=1), - InputParam("generator"), - InputParam("batch_size", required=True, type_hint=int), - InputParam("dtype", type_hint=torch.dtype), + InputParam.template("latents"), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size", required=True), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor), + OutputParam.template("latents"), ] @torch.no_grad() @@ -321,21 +321,21 @@ def expected_components(self) -> list[ComponentSpec]: @property def inputs(self) -> list[InputParam]: return [ - InputParam("image", required=True), - InputParam("height", type_hint=int, default=512), - InputParam("width", type_hint=int, default=704), + InputParam.template("image"), + InputParam.template("height", default=512), + InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), - InputParam("latents", type_hint=torch.Tensor | None), - InputParam("num_videos_per_prompt", type_hint=int, default=1), - InputParam("generator"), - InputParam("batch_size", required=True, type_hint=int), - InputParam("dtype", type_hint=torch.dtype), + InputParam.template("latents"), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size", required=True), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor), + OutputParam.template("latents"), OutputParam("conditioning_mask", type_hint=torch.Tensor), ] diff --git a/src/diffusers/modular_pipelines/ltx/decoders.py 
b/src/diffusers/modular_pipelines/ltx/decoders.py index d7c85171c091..6259338e0147 100644 --- a/src/diffusers/modular_pipelines/ltx/decoders.py +++ b/src/diffusers/modular_pipelines/ltx/decoders.py @@ -14,8 +14,6 @@ from typing import Any -import numpy as np -import PIL import torch from ...configuration_utils import FrozenDict @@ -53,27 +51,21 @@ def description(self) -> str: @property def inputs(self) -> list[tuple[str, Any]]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("output_type", default="np", type_hint=str), - InputParam("height", type_hint=int, default=512), - InputParam("width", type_hint=int, default=704), + InputParam.template("latents", required=True), + InputParam.template("output_type", default="np"), + InputParam.template("height", default=512), + InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), InputParam("decode_timestep", default=0.0), InputParam("decode_noise_scale", default=None), - InputParam("generator"), - InputParam("batch_size", type_hint=int, default=1), - InputParam("dtype", required=True, type_hint=torch.dtype), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype", required=True), ] @property def intermediate_outputs(self) -> list[OutputParam]: - return [ - OutputParam( - "videos", - type_hint=list[list[PIL.Image.Image]] | list[torch.Tensor] | list[np.ndarray], - description="The generated videos", - ) - ] + return [OutputParam.template("videos")] @torch.no_grad() def __call__(self, components, state: PipelineState) -> PipelineState: diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index eb6dbeeeee82..3e7b7dca7a46 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -48,8 +48,8 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - 
InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("dtype", required=True, type_hint=torch.dtype), + InputParam.template("latents", required=True), + InputParam.template("dtype", required=True), ] @torch.no_grad() @@ -98,11 +98,11 @@ def description(self) -> str: @property def inputs(self) -> list[tuple[str, Any]]: inputs = [ - InputParam("attention_kwargs"), - InputParam("num_inference_steps", required=True, type_hint=int), + InputParam.template("attention_kwargs"), + InputParam.template("num_inference_steps", required=True), InputParam("rope_interpolation_scale", type_hint=tuple), - InputParam("height", type_hint=int), - InputParam("width", type_hint=int), + InputParam.template("height"), + InputParam.template("width"), InputParam("num_frames", type_hint=int), ] guider_input_names = [] @@ -210,8 +210,8 @@ def loop_expected_components(self) -> list[ComponentSpec]: @property def loop_inputs(self) -> list[InputParam]: return [ - InputParam("timesteps", required=True, type_hint=torch.Tensor), - InputParam("num_inference_steps", required=True, type_hint=int), + InputParam.template("timesteps", required=True), + InputParam.template("num_inference_steps", required=True), ] @torch.no_grad() @@ -273,9 +273,9 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), + InputParam.template("latents", required=True), InputParam("conditioning_mask", required=True, type_hint=torch.Tensor), - InputParam("dtype", required=True, type_hint=torch.dtype), + InputParam.template("dtype", required=True), ] @torch.no_grad() @@ -329,11 +329,11 @@ def description(self) -> str: @property def inputs(self) -> list[tuple[str, Any]]: inputs = [ - InputParam("attention_kwargs"), - InputParam("num_inference_steps", required=True, type_hint=int), + InputParam.template("attention_kwargs"), + InputParam.template("num_inference_steps", required=True), 
InputParam("rope_interpolation_scale", type_hint=tuple), - InputParam("height", type_hint=int), - InputParam("width", type_hint=int), + InputParam.template("height"), + InputParam.template("width"), InputParam("num_frames", type_hint=int), ] guider_input_names = [] @@ -406,8 +406,8 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - InputParam("height", type_hint=int), - InputParam("width", type_hint=int), + InputParam.template("height"), + InputParam.template("width"), InputParam("num_frames", type_hint=int), ] diff --git a/src/diffusers/modular_pipelines/ltx/encoders.py b/src/diffusers/modular_pipelines/ltx/encoders.py index 9f15d33b0e18..1f8d44bb24f3 100644 --- a/src/diffusers/modular_pipelines/ltx/encoders.py +++ b/src/diffusers/modular_pipelines/ltx/encoders.py @@ -49,42 +49,22 @@ def expected_components(self) -> list[ComponentSpec]: @property def inputs(self) -> list[InputParam]: return [ - InputParam("prompt"), - InputParam("negative_prompt"), - InputParam("prompt_embeds", type_hint=torch.Tensor), - InputParam("prompt_attention_mask", type_hint=torch.Tensor), - InputParam("negative_prompt_embeds", type_hint=torch.Tensor), - InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor), - InputParam("max_sequence_length", default=128), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), + InputParam.template("prompt_embeds"), + InputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask", name="negative_prompt_attention_mask"), + InputParam.template("max_sequence_length", default=128), ] @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam( - "prompt_embeds", - type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="text embeddings used to guide the video generation", - ), - OutputParam( - "prompt_attention_mask", - 
type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="attention mask for text embeddings", - ), - OutputParam( - "negative_prompt_embeds", - type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="negative text embeddings", - ), - OutputParam( - "negative_prompt_attention_mask", - type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="attention mask for negative text embeddings", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask", name="negative_prompt_attention_mask"), ] @staticmethod diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index b4b9b1b4255b..8f8dc58e1145 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -40,38 +40,38 @@ class LTXCoreDenoiseStep(SequentialPipelineBlocks): guider (`ClassifierFreeGuidance`) Inputs: - num_videos_per_prompt (`None`, *optional*, defaults to 1): - TODO: Add description. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 3.0): TODO: Add description. prompt_embeds (`Tensor`): - TODO: Add description. - prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_attention_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): - TODO: Add description. + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. 
negative_prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. - num_inference_steps (`None`, *optional*, defaults to 50): - TODO: Add description. - timesteps (`None`, *optional*): - TODO: Add description. - sigmas (`None`, *optional*): - TODO: Add description. + mask for the negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. height (`int`, *optional*, defaults to 512): - TODO: Add description. + The height in pixels of the generated image. width (`int`, *optional*, defaults to 704): - TODO: Add description. + The width in pixels of the generated image. num_frames (`int`, *optional*, defaults to 161): TODO: Add description. frame_rate (`int`, *optional*, defaults to 25): TODO: Add description. - latents (`Tensor | NoneType`, *optional*): - TODO: Add description. - generator (`None`, *optional*): - TODO: Add description. - attention_kwargs (`None`, *optional*): - TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. Outputs: latents (`Tensor`): @@ -108,40 +108,40 @@ class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): guider (`ClassifierFreeGuidance`) Inputs: - num_videos_per_prompt (`None`, *optional*, defaults to 1): - TODO: Add description. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 3.0): TODO: Add description. prompt_embeds (`Tensor`): - TODO: Add description. - prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. 
+ text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_attention_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): - TODO: Add description. + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. negative_prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. - num_inference_steps (`None`, *optional*, defaults to 50): - TODO: Add description. - timesteps (`None`, *optional*): - TODO: Add description. - sigmas (`None`, *optional*): - TODO: Add description. + mask for the negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. height (`int`, *optional*, defaults to 512): - TODO: Add description. + The height in pixels of the generated image. width (`int`, *optional*, defaults to 704): - TODO: Add description. + The width in pixels of the generated image. num_frames (`int`, *optional*, defaults to 161): TODO: Add description. frame_rate (`int`, *optional*, defaults to 25): TODO: Add description. - image (`None`): - TODO: Add description. - latents (`Tensor | NoneType`, *optional*): - TODO: Add description. - generator (`None`, *optional*): - TODO: Add description. - attention_kwargs (`None`, *optional*): - TODO: Add description. + image (`Image | list`): + Reference image(s) for denoising. Can be a single image or list of images. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. 
Outputs: latents (`Tensor`): @@ -181,46 +181,46 @@ class LTXBlocks(SequentialPipelineBlocks): video_processor (`VideoProcessor`) Inputs: - prompt (`None`, *optional*): - TODO: Add description. - negative_prompt (`None`, *optional*): - TODO: Add description. - prompt_embeds (`Tensor`, *optional*): - TODO: Add description. - prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_attention_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): - TODO: Add description. + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. negative_prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. - max_sequence_length (`None`, *optional*, defaults to 128): - TODO: Add description. - num_videos_per_prompt (`None`, *optional*, defaults to 1): - TODO: Add description. + mask for the negative text embeddings. Can be generated from text_encoder step. + max_sequence_length (`int`, *optional*, defaults to 128): + Maximum sequence length for prompt encoding. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 3.0): TODO: Add description. - num_inference_steps (`None`, *optional*, defaults to 50): - TODO: Add description. - timesteps (`None`, *optional*): - TODO: Add description. - sigmas (`None`, *optional*): - TODO: Add description. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. 
+ sigmas (`list`, *optional*): + Custom sigmas for the denoising process. height (`int`, *optional*, defaults to 512): - TODO: Add description. + The height in pixels of the generated image. width (`int`, *optional*, defaults to 704): - TODO: Add description. + The width in pixels of the generated image. num_frames (`int`, *optional*, defaults to 161): TODO: Add description. frame_rate (`int`, *optional*, defaults to 25): TODO: Add description. - latents (`Tensor | NoneType`, *optional*): - TODO: Add description. - generator (`None`, *optional*): - TODO: Add description. - attention_kwargs (`None`, *optional*): - TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. output_type (`str`, *optional*, defaults to np): - TODO: Add description. + Output format: 'pil', 'np', 'pt'. decode_timestep (`None`, *optional*, defaults to 0.0): TODO: Add description. decode_noise_scale (`None`, *optional*): @@ -263,48 +263,48 @@ class LTXImage2VideoBlocks(SequentialPipelineBlocks): video_processor (`VideoProcessor`) Inputs: - prompt (`None`, *optional*): - TODO: Add description. - negative_prompt (`None`, *optional*): - TODO: Add description. - prompt_embeds (`Tensor`, *optional*): - TODO: Add description. - prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_attention_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): - TODO: Add description. 
+ negative text embeddings used to guide the image generation. Can be generated from text_encoder step. negative_prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. - max_sequence_length (`None`, *optional*, defaults to 128): - TODO: Add description. - num_videos_per_prompt (`None`, *optional*, defaults to 1): - TODO: Add description. + mask for the negative text embeddings. Can be generated from text_encoder step. + max_sequence_length (`int`, *optional*, defaults to 128): + Maximum sequence length for prompt encoding. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 3.0): TODO: Add description. - num_inference_steps (`None`, *optional*, defaults to 50): - TODO: Add description. - timesteps (`None`, *optional*): - TODO: Add description. - sigmas (`None`, *optional*): - TODO: Add description. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. height (`int`, *optional*, defaults to 512): - TODO: Add description. + The height in pixels of the generated image. width (`int`, *optional*, defaults to 704): - TODO: Add description. + The width in pixels of the generated image. num_frames (`int`, *optional*, defaults to 161): TODO: Add description. frame_rate (`int`, *optional*, defaults to 25): TODO: Add description. - image (`None`): - TODO: Add description. - latents (`Tensor | NoneType`, *optional*): - TODO: Add description. - generator (`None`, *optional*): - TODO: Add description. - attention_kwargs (`None`, *optional*): - TODO: Add description. + image (`Image | list`): + Reference image(s) for denoising. Can be a single image or list of images. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. 
+ generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. output_type (`str`, *optional*, defaults to np): - TODO: Add description. + Output format: 'pil', 'np', 'pt'. decode_timestep (`None`, *optional*, defaults to 0.0): TODO: Add description. decode_noise_scale (`None`, *optional*): From 69c10cf7ddb3cdf95b424d0b22d3a9a7a9b9c313 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Thu, 2 Apr 2026 22:44:12 -0700 Subject: [PATCH 11/11] address all review --- .../modular_pipelines/ltx/before_denoise.py | 115 +++++------- .../modular_pipelines/ltx/decoders.py | 37 +++- .../modular_pipelines/ltx/denoise.py | 48 ++++- .../modular_pipelines/ltx/encoders.py | 173 ++++++++++++++---- .../ltx/modular_blocks_ltx.py | 64 ++----- 5 files changed, 266 insertions(+), 171 deletions(-) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index cca52af9248e..47344b55ea0d 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -18,7 +18,6 @@ import torch from ...models import LTXVideoTransformer3DModel -from ...pipelines.ltx.pipeline_ltx import LTXPipeline from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import logging from ...utils.torch_utils import randn_tensor @@ -73,6 +72,43 @@ def retrieve_timesteps( return timesteps, num_inference_steps +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._pack_latents +def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor: + # Unpacked latents of shape are [B, C, F, H, W] are patched into tokens of shape + # [B, C, F // p_t, p_t, H // p, p, W // p, p]. 
+ # The patch dimensions are then permuted and collapsed into the channel dimension of shape: + # [B, F // p_t * H // p * W // p, C * p_t * p * p] (an ndim=3 tensor). + # dim=0 is the batch size, dim=1 is the effective video sequence length, + # dim=2 is the effective number of input features + batch_size, num_channels, num_frames, height, width = latents.shape + post_patch_num_frames = num_frames // patch_size_t + post_patch_height = height // patch_size + post_patch_width = width // patch_size + latents = latents.reshape( + batch_size, + -1, + post_patch_num_frames, + patch_size_t, + post_patch_height, + patch_size, + post_patch_width, + patch_size, + ) + latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3) + return latents + + +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents +def _normalize_latents( + latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 +) -> torch.Tensor: + # Normalize latents across the channel dimension [B, C, F, H, W] + latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents = (latents - latents_mean) * scaling_factor / latents_std + return latents + + class LTXTextInputStep(ModularPipelineBlocks): model_name = "ltx" @@ -94,7 +130,6 @@ def expected_components(self) -> list[ComponentSpec]: def inputs(self) -> list[InputParam]: return [ InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), - InputParam("guidance_scale", type_hint=float, default=3.0), InputParam.template("prompt_embeds", required=True), InputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), InputParam.template("negative_prompt_embeds"), @@ -112,11 +147,6 @@ def intermediate_outputs(self) -> list[OutputParam]: def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: block_state 
= self.get_block_state(state) - # Set guidance_scale on guider so CFG is configured correctly - guidance_scale = getattr(block_state, "guidance_scale", 3.0) - if hasattr(components, "guider") and components.guider is not None: - components.guider.guidance_scale = guidance_scale - block_state.batch_size = block_state.prompt_embeds.shape[0] block_state.dtype = block_state.prompt_embeds.dtype num_videos = block_state.num_videos_per_prompt @@ -257,7 +287,7 @@ def inputs(self) -> list[InputParam]: @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam.template("latents"), + OutputParam("latents", type_hint=torch.Tensor), ] @torch.no_grad() @@ -279,7 +309,7 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe block_state.latents = randn_tensor( shape, generator=block_state.generator, device=device, dtype=torch.float32 ) - block_state.latents = LTXPipeline._pack_latents( + block_state.latents = _pack_latents( block_state.latents, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, @@ -289,39 +319,19 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe return components, state -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents -def retrieve_latents( - encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" -): - if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": - return encoder_output.latent_dist.sample(generator) - elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": - return encoder_output.latent_dist.mode() - elif hasattr(encoder_output, "latents"): - return encoder_output.latents - else: - raise AttributeError("Could not access latents of provided encoder_output") - - class LTXImage2VideoPrepareLatentsStep(ModularPipelineBlocks): model_name = "ltx" @property def description(self) -> str: - return "Prepare 
latents step for image-to-video: encodes the first frame and creates a conditioning mask" - - @property - def expected_components(self) -> list[ComponentSpec]: - from ...models import AutoencoderKLLTXVideo - - return [ - ComponentSpec("vae", AutoencoderKLLTXVideo), - ] + return ( + "Prepare latents step for image-to-video: takes pre-encoded image latents and creates a conditioning mask" + ) @property def inputs(self) -> list[InputParam]: return [ - InputParam.template("image"), + InputParam("image_latents", type_hint=torch.Tensor, required=True), InputParam.template("height", default=512), InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), @@ -335,7 +345,7 @@ def inputs(self) -> list[InputParam]: @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam.template("latents"), + OutputParam("latents", type_hint=torch.Tensor), OutputParam("conditioning_mask", type_hint=torch.Tensor), ] @@ -355,7 +365,7 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe if block_state.latents is not None: conditioning_mask = block_state.latents.new_zeros(mask_shape) conditioning_mask[:, :, 0] = 1.0 - conditioning_mask = LTXPipeline._pack_latents( + conditioning_mask = _pack_latents( conditioning_mask, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, @@ -365,38 +375,9 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe self.set_block_state(state, block_state) return components, state - image = block_state.image - if not isinstance(image, torch.Tensor): - from ...video_processor import VideoProcessor - - processor = VideoProcessor(vae_scale_factor=components.vae_spatial_compression_ratio) - image = processor.preprocess(image, height=block_state.height, width=block_state.width) - image = image.to(device=device, dtype=torch.float32) - - vae_dtype = components.vae.dtype - - num_images = image.shape[0] - if 
isinstance(block_state.generator, list): - init_latents = [ - retrieve_latents( - components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i] - ) - for i in range(num_images) - ] - else: - init_latents = [ - retrieve_latents( - components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator - ) - for img in image - ] - - init_latents = torch.cat(init_latents, dim=0).to(torch.float32) + init_latents = block_state.image_latents.to(device=device, dtype=torch.float32) if init_latents.shape[0] < batch_size: init_latents = init_latents.repeat_interleave(batch_size // init_latents.shape[0], dim=0) - init_latents = LTXPipeline._normalize_latents( - init_latents, components.vae.latents_mean, components.vae.latents_std - ) init_latents = init_latents.repeat(1, 1, num_frames, 1, 1) actual_mask_shape = ( @@ -412,12 +393,12 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe noise = randn_tensor(init_latents.shape, generator=block_state.generator, device=device, dtype=torch.float32) latents = init_latents * conditioning_mask + noise * (1 - conditioning_mask) - conditioning_mask = LTXPipeline._pack_latents( + conditioning_mask = _pack_latents( conditioning_mask, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ).squeeze(-1) - latents = LTXPipeline._pack_latents( + latents = _pack_latents( latents, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, diff --git a/src/diffusers/modular_pipelines/ltx/decoders.py b/src/diffusers/modular_pipelines/ltx/decoders.py index 6259338e0147..7524d6f7f67d 100644 --- a/src/diffusers/modular_pipelines/ltx/decoders.py +++ b/src/diffusers/modular_pipelines/ltx/decoders.py @@ -18,7 +18,6 @@ from ...configuration_utils import FrozenDict from ...models import AutoencoderKLLTXVideo -from ...pipelines.ltx.pipeline_ltx import LTXPipeline from ...utils import logging from 
...utils.torch_utils import randn_tensor from ...video_processor import VideoProcessor @@ -29,6 +28,31 @@ logger = logging.get_logger(__name__) +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._unpack_latents +def _unpack_latents( + latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1 +) -> torch.Tensor: + # Packed latents of shape [B, S, D] (S is the effective video sequence length, + # D is the effective feature dimensions) are unpacked and reshaped into a video tensor + # of shape [B, C, F, H, W]. This is the inverse operation of what happens in the + # `_pack_latents` method. + batch_size = latents.size(0) + latents = latents.reshape(batch_size, num_frames, height, width, -1, patch_size_t, patch_size, patch_size) + latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3) + return latents + + +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._denormalize_latents +def _denormalize_latents( + latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 +) -> torch.Tensor: + # Denormalize latents across the channel dimension [B, C, F, H, W] + latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents = latents * latents_std / scaling_factor + latents_mean + return latents + + class LTXVaeDecoderStep(ModularPipelineBlocks): model_name = "ltx" @@ -74,11 +98,6 @@ def __call__(self, components, state: PipelineState) -> PipelineState: latents = block_state.latents - if block_state.output_type == "latent": - block_state.videos = latents - self.set_block_state(state, block_state) - return components, state - height = block_state.height width = block_state.width num_frames = block_state.num_frames @@ -87,7 +106,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: 
latent_height = height // components.vae_spatial_compression_ratio latent_width = width // components.vae_spatial_compression_ratio - latents = LTXPipeline._unpack_latents( + latents = _unpack_latents( latents, latent_num_frames, latent_height, @@ -95,9 +114,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ) - latents = LTXPipeline._denormalize_latents( - latents, vae.latents_mean, vae.latents_std, vae.config.scaling_factor - ) + latents = _denormalize_latents(latents, vae.latents_mean, vae.latents_std, vae.config.scaling_factor) latents = latents.to(block_state.dtype) if not vae.config.timestep_conditioning: diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index 3e7b7dca7a46..e8f72ec4a477 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -34,6 +34,46 @@ logger = logging.get_logger(__name__) +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._pack_latents +def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor: + # Unpacked latents of shape are [B, C, F, H, W] are patched into tokens of shape + # [B, C, F // p_t, p_t, H // p, p, W // p, p]. + # The patch dimensions are then permuted and collapsed into the channel dimension of shape: + # [B, F // p_t * H // p * W // p, C * p_t * p * p] (an ndim=3 tensor). 
+ # dim=0 is the batch size, dim=1 is the effective video sequence length, + # dim=2 is the effective number of input features + batch_size, num_channels, num_frames, height, width = latents.shape + post_patch_num_frames = num_frames // patch_size_t + post_patch_height = height // patch_size + post_patch_width = width // patch_size + latents = latents.reshape( + batch_size, + -1, + post_patch_num_frames, + patch_size_t, + post_patch_height, + patch_size, + post_patch_width, + patch_size, + ) + latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3) + return latents + + +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._unpack_latents +def _unpack_latents( + latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1 +) -> torch.Tensor: + # Packed latents of shape [B, S, D] (S is the effective video sequence length, + # D is the effective feature dimensions) are unpacked and reshaped into a video tensor + # of shape [B, C, F, H, W]. This is the inverse operation of what happens in the + # `_pack_latents` method. 
+ batch_size = latents.size(0) + latents = latents.reshape(batch_size, num_frames, height, width, -1, patch_size_t, patch_size, patch_size) + latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3) + return latents + + class LTXLoopBeforeDenoiser(ModularPipelineBlocks): model_name = "ltx" @@ -413,13 +453,11 @@ def inputs(self) -> list[InputParam]: @torch.no_grad() def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): - from ...pipelines.ltx.pipeline_ltx import LTXPipeline - latent_num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 latent_height = block_state.height // components.vae_spatial_compression_ratio latent_width = block_state.width // components.vae_spatial_compression_ratio - noise_pred = LTXPipeline._unpack_latents( + noise_pred = _unpack_latents( block_state.noise_pred, latent_num_frames, latent_height, @@ -427,7 +465,7 @@ def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: i components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ) - latents = LTXPipeline._unpack_latents( + latents = _unpack_latents( block_state.latents, latent_num_frames, latent_height, @@ -441,7 +479,7 @@ def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: i pred_latents = components.scheduler.step(noise_pred, t, noise_latents, return_dict=False)[0] latents = torch.cat([latents[:, :, :1], pred_latents], dim=2) - block_state.latents = LTXPipeline._pack_latents( + block_state.latents = _pack_latents( latents, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, diff --git a/src/diffusers/modular_pipelines/ltx/encoders.py b/src/diffusers/modular_pipelines/ltx/encoders.py index 1f8d44bb24f3..ec76a86cf2f1 100644 --- a/src/diffusers/modular_pipelines/ltx/encoders.py +++ b/src/diffusers/modular_pipelines/ltx/encoders.py @@ -17,7 +17,9 @@ from 
...configuration_utils import FrozenDict from ...guiders import ClassifierFreeGuidance +from ...models import AutoencoderKLLTXVideo from ...utils import logging +from ...video_processor import VideoProcessor from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam from .modular_pipeline import LTXModularPipeline @@ -26,6 +28,33 @@ logger = logging.get_logger(__name__) +def _get_t5_prompt_embeds( + components, + prompt: str | list[str], + max_sequence_length: int, + device: torch.device, + dtype: torch.dtype, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + + text_inputs = components.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + prompt_attention_mask = text_inputs.attention_mask + prompt_attention_mask = prompt_attention_mask.bool().to(device) + + prompt_embeds = components.text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + return prompt_embeds, prompt_attention_mask + + class LTXTextEncoderStep(ModularPipelineBlocks): model_name = "ltx" @@ -51,10 +80,6 @@ def inputs(self) -> list[InputParam]: return [ InputParam.template("prompt"), InputParam.template("negative_prompt"), - InputParam.template("prompt_embeds"), - InputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), - InputParam.template("negative_prompt_embeds"), - InputParam.template("negative_prompt_embeds_mask", name="negative_prompt_attention_mask"), InputParam.template("max_sequence_length", default=128), ] @@ -74,33 +99,6 @@ def check_inputs(block_state): ): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}") - @staticmethod - def _get_t5_prompt_embeds( - components, - prompt: str | list[str], - max_sequence_length: int, - device: 
torch.device, - dtype: torch.dtype, - ): - prompt = [prompt] if isinstance(prompt, str) else prompt - - text_inputs = components.tokenizer( - prompt, - padding="max_length", - max_length=max_sequence_length, - truncation=True, - add_special_tokens=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - prompt_attention_mask = text_inputs.attention_mask - prompt_attention_mask = prompt_attention_mask.bool().to(device) - - prompt_embeds = components.text_encoder(text_input_ids.to(device))[0] - prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) - - return prompt_embeds, prompt_attention_mask - @staticmethod def encode_prompt( components, @@ -117,7 +115,7 @@ def encode_prompt( prompt = [prompt] batch_size = len(prompt) - prompt_embeds, prompt_attention_mask = LTXTextEncoderStep._get_t5_prompt_embeds( + prompt_embeds, prompt_attention_mask = _get_t5_prompt_embeds( components=components, prompt=prompt, max_sequence_length=max_sequence_length, @@ -139,7 +137,7 @@ def encode_prompt( " the batch size of `prompt`." 
) - negative_prompt_embeds, negative_prompt_attention_mask = LTXTextEncoderStep._get_t5_prompt_embeds( + negative_prompt_embeds, negative_prompt_attention_mask = _get_t5_prompt_embeds( components=components, prompt=negative_prompt, max_sequence_length=max_sequence_length, @@ -156,13 +154,6 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe block_state.device = components._execution_device - # Skip encoding if pre-computed embeddings are provided - has_prompt_embeds = getattr(block_state, "prompt_embeds", None) is not None - has_negative = getattr(block_state, "negative_prompt_embeds", None) is not None - if has_prompt_embeds and (has_negative or not components.requires_unconditional_embeds): - self.set_block_state(state, block_state) - return components, state - ( block_state.prompt_embeds, block_state.prompt_attention_mask, @@ -179,3 +170,105 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe self.set_block_state(state, block_state) return components, state + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents +def _normalize_latents( + latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 +) -> torch.Tensor: + # Normalize latents across the channel dimension [B, C, F, H, W] + 
latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents = (latents - latents_mean) * scaling_factor / latents_std + return latents + + +class LTXVaeEncoderStep(ModularPipelineBlocks): + model_name = "ltx" + + @property + def description(self) -> str: + return "VAE Encoder step that encodes an input image into latent space for image-to-video generation" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLLTXVideo), + ComponentSpec( + "video_processor", + VideoProcessor, + config=FrozenDict({"vae_scale_factor": 32}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image", required=True), + InputParam.template("height", default=512), + InputParam.template("width", default=704), + InputParam.template("generator"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "image_latents", + type_hint=torch.Tensor, + description="Encoded image latents from the VAE encoder", + ), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + image = block_state.image + if not isinstance(image, torch.Tensor): + image = components.video_processor.preprocess(image, height=block_state.height, width=block_state.width) + image = image.to(device=device, dtype=torch.float32) + + vae_dtype = components.vae.dtype + + num_images = image.shape[0] + if isinstance(block_state.generator, list): + init_latents = [ + retrieve_latents( + components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), + block_state.generator[i], + ) + for i in range(num_images) + ] + else: + init_latents = [ + retrieve_latents( + 
components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), + block_state.generator, + ) + for img in image + ] + + init_latents = torch.cat(init_latents, dim=0).to(torch.float32) + block_state.image_latents = _normalize_latents( + init_latents, components.vae.latents_mean, components.vae.latents_std + ) + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index 8f8dc58e1145..76c69e3f0fdb 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -23,7 +23,7 @@ ) from .decoders import LTXVaeDecoderStep from .denoise import LTXDenoiseStep, LTXImage2VideoDenoiseStep -from .encoders import LTXTextEncoderStep +from .encoders import LTXTextEncoderStep, LTXVaeEncoderStep logger = logging.get_logger(__name__) @@ -35,15 +35,12 @@ class LTXCoreDenoiseStep(SequentialPipelineBlocks): Denoise block that takes encoded conditions and runs the denoising process. Components: - transformer (`LTXVideoTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) + transformer (`LTXVideoTransformer3DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) Inputs: num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 3.0): - TODO: Add description. prompt_embeds (`Tensor`): text embeddings used to guide the image generation. Can be generated from text_encoder step. prompt_attention_mask (`Tensor`): @@ -102,16 +99,12 @@ class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): Denoise block for image-to-video that takes encoded conditions and an image, and runs the denoising process. 
Components: - transformer (`LTXVideoTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - vae (`AutoencoderKLLTXVideo`) - guider (`ClassifierFreeGuidance`) + transformer (`LTXVideoTransformer3DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) vae + (`AutoencoderKLLTXVideo`) video_processor (`VideoProcessor`) guider (`ClassifierFreeGuidance`) Inputs: num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 3.0): - TODO: Add description. prompt_embeds (`Tensor`): text embeddings used to guide the image generation. Can be generated from text_encoder step. prompt_attention_mask (`Tensor`): @@ -136,10 +129,10 @@ class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): TODO: Add description. image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. - latents (`Tensor`, *optional*): - Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. @@ -152,10 +145,11 @@ class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): block_classes = [ LTXTextInputStep, LTXSetTimestepsStep, + LTXVaeEncoderStep, LTXImage2VideoPrepareLatentsStep, LTXImage2VideoDenoiseStep, ] - block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] + block_names = ["input", "set_timesteps", "vae_encoder", "prepare_latents", "denoise"] @property def description(self): @@ -172,12 +166,8 @@ class LTXBlocks(SequentialPipelineBlocks): Modular pipeline blocks for LTX Video text-to-video. 
Components: - text_encoder (`T5EncoderModel`) - tokenizer (`T5TokenizerFast`) - guider (`ClassifierFreeGuidance`) - transformer (`LTXVideoTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - vae (`AutoencoderKLLTXVideo`) + text_encoder (`T5EncoderModel`) tokenizer (`T5TokenizerFast`) guider (`ClassifierFreeGuidance`) transformer + (`LTXVideoTransformer3DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLLTXVideo`) video_processor (`VideoProcessor`) Inputs: @@ -185,20 +175,10 @@ class LTXBlocks(SequentialPipelineBlocks): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - prompt_embeds (`Tensor`): - text embeddings used to guide the image generation. Can be generated from text_encoder step. - prompt_attention_mask (`Tensor`): - mask for the text embeddings. Can be generated from text_encoder step. - negative_prompt_embeds (`Tensor`, *optional*): - negative text embeddings used to guide the image generation. Can be generated from text_encoder step. - negative_prompt_attention_mask (`Tensor`, *optional*): - mask for the negative text embeddings. Can be generated from text_encoder step. max_sequence_length (`int`, *optional*, defaults to 128): Maximum sequence length for prompt encoding. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 3.0): - TODO: Add description. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. timesteps (`Tensor`, *optional*): @@ -254,12 +234,8 @@ class LTXImage2VideoBlocks(SequentialPipelineBlocks): Modular pipeline blocks for LTX Video image-to-video. 
Components: - text_encoder (`T5EncoderModel`) - tokenizer (`T5TokenizerFast`) - guider (`ClassifierFreeGuidance`) - transformer (`LTXVideoTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - vae (`AutoencoderKLLTXVideo`) + text_encoder (`T5EncoderModel`) tokenizer (`T5TokenizerFast`) guider (`ClassifierFreeGuidance`) transformer + (`LTXVideoTransformer3DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLLTXVideo`) video_processor (`VideoProcessor`) Inputs: @@ -267,20 +243,10 @@ class LTXImage2VideoBlocks(SequentialPipelineBlocks): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - prompt_embeds (`Tensor`): - text embeddings used to guide the image generation. Can be generated from text_encoder step. - prompt_attention_mask (`Tensor`): - mask for the text embeddings. Can be generated from text_encoder step. - negative_prompt_embeds (`Tensor`, *optional*): - negative text embeddings used to guide the image generation. Can be generated from text_encoder step. - negative_prompt_attention_mask (`Tensor`, *optional*): - mask for the negative text embeddings. Can be generated from text_encoder step. max_sequence_length (`int`, *optional*, defaults to 128): Maximum sequence length for prompt encoding. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 3.0): - TODO: Add description. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. timesteps (`Tensor`, *optional*): @@ -297,10 +263,10 @@ class LTXImage2VideoBlocks(SequentialPipelineBlocks): TODO: Add description. image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. - latents (`Tensor`, *optional*): - Pre-generated noisy latents for image generation. 
generator (`Generator`, *optional*): Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. output_type (`str`, *optional*, defaults to np):