From f357b98c1aa522d08d1f2ce21a96901f100a11fe Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 02:55:32 -0700 Subject: [PATCH 01/11] Add modular pipeline support for LTX Video --- src/diffusers/__init__.py | 4 + src/diffusers/modular_pipelines/__init__.py | 5 + .../modular_pipelines/ltx/__init__.py | 47 +++ .../modular_pipelines/ltx/before_denoise.py | 281 ++++++++++++++++++ .../modular_pipelines/ltx/decoders.py | 137 +++++++++ .../modular_pipelines/ltx/denoise.py | 256 ++++++++++++++++ .../modular_pipelines/ltx/encoders.py | 205 +++++++++++++ .../ltx/modular_blocks_ltx.py | 65 ++++ .../modular_pipelines/ltx/modular_pipeline.py | 64 ++++ .../modular_pipelines/modular_pipeline.py | 1 + 10 files changed, 1065 insertions(+) create mode 100644 src/diffusers/modular_pipelines/ltx/__init__.py create mode 100644 src/diffusers/modular_pipelines/ltx/before_denoise.py create mode 100644 src/diffusers/modular_pipelines/ltx/decoders.py create mode 100644 src/diffusers/modular_pipelines/ltx/denoise.py create mode 100644 src/diffusers/modular_pipelines/ltx/encoders.py create mode 100644 src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py create mode 100644 src/diffusers/modular_pipelines/ltx/modular_pipeline.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 7d966452d1a2..637bf2685824 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -443,6 +443,8 @@ "HeliosPyramidAutoBlocks", "HeliosPyramidDistilledAutoBlocks", "HeliosPyramidDistilledModularPipeline", + "LTXBlocks", + "LTXModularPipeline", "HeliosPyramidModularPipeline", "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", @@ -1210,6 +1212,8 @@ HeliosPyramidAutoBlocks, HeliosPyramidDistilledAutoBlocks, HeliosPyramidDistilledModularPipeline, + LTXBlocks, + LTXModularPipeline, HeliosPyramidModularPipeline, QwenImageAutoBlocks, QwenImageEditAutoBlocks, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 
fd9bd691ca87..389a5416f3ea 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -88,6 +88,10 @@ "QwenImageLayeredModularPipeline", "QwenImageLayeredAutoBlocks", ] + _import_structure["ltx"] = [ + "LTXBlocks", + "LTXModularPipeline", + ] _import_structure["z_image"] = [ "ZImageAutoBlocks", "ZImageModularPipeline", @@ -141,6 +145,7 @@ QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline + from .ltx import LTXBlocks, LTXModularPipeline from .wan import ( Wan22Blocks, Wan22Image2VideoBlocks, diff --git a/src/diffusers/modular_pipelines/ltx/__init__.py b/src/diffusers/modular_pipelines/ltx/__init__.py new file mode 100644 index 000000000000..019fa96fef14 --- /dev/null +++ b/src/diffusers/modular_pipelines/ltx/__init__.py @@ -0,0 +1,47 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modular_blocks_ltx"] = ["LTXBlocks"] + _import_structure["modular_pipeline"] = ["LTXModularPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .modular_blocks_ltx import LTXBlocks + from .modular_pipeline import LTXModularPipeline +else: + import sys + + 
sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py new file mode 100644 index 000000000000..d3673c3ac1f8 --- /dev/null +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -0,0 +1,281 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect + +import numpy as np +import torch + +from ...models import LTXVideoTransformer3DModel +from ...pipelines.ltx.pipeline_ltx import LTXPipeline +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ...utils.torch_utils import randn_tensor +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import LTXModularPipeline + + +logger = logging.get_logger(__name__) + + +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: int | None = None, + device: str | torch.device | None = None, + timesteps: list[int] | None = None, + sigmas: list[float] | None = None, + **kwargs, +): + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed.") + if timesteps is not None: + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom sigmas." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class LTXTextInputStep(ModularPipelineBlocks): + model_name = "ltx" + + @property + def description(self) -> str: + return ( + "Input processing step that:\n" + " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n" + " 2. Adjusts input tensor shapes based on `batch_size` and `num_videos_per_prompt`" + ) + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("transformer", LTXVideoTransformer3DModel), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("num_videos_per_prompt", default=1), + InputParam("prompt_embeds", required=True, type_hint=torch.Tensor), + InputParam("prompt_attention_mask", type_hint=torch.Tensor), + InputParam("negative_prompt_embeds", type_hint=torch.Tensor), + InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("batch_size", type_hint=int), + OutputParam("dtype", type_hint=torch.dtype), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.batch_size = block_state.prompt_embeds.shape[0] + block_state.dtype = block_state.prompt_embeds.dtype + num_videos = block_state.num_videos_per_prompt + + # Repeat prompt_embeds for num_videos_per_prompt + _, seq_len, _ = block_state.prompt_embeds.shape + block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, num_videos, 1) + block_state.prompt_embeds = block_state.prompt_embeds.view( + block_state.batch_size * num_videos, seq_len, -1 + ) + + if block_state.prompt_attention_mask is 
class LTXTextInputStep(ModularPipelineBlocks):
    """Normalizes text-conditioning tensors before the LTX denoising loop."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return (
            "Input processing step that:\n"
            " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
            " 2. Adjusts input tensor shapes based on `batch_size` and `num_videos_per_prompt`"
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("transformer", LTXVideoTransformer3DModel),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("num_videos_per_prompt", default=1),
            InputParam("prompt_embeds", required=True, type_hint=torch.Tensor),
            InputParam("prompt_attention_mask", type_hint=torch.Tensor),
            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
            InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("batch_size", type_hint=int),
            OutputParam("dtype", type_hint=torch.dtype),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Expand prompt embeddings/masks to `batch_size * num_videos_per_prompt` rows."""
        block_state = self.get_block_state(state)

        # Batch size and dtype are inferred from the (required) prompt embeddings.
        block_state.batch_size = block_state.prompt_embeds.shape[0]
        block_state.dtype = block_state.prompt_embeds.dtype
        num_videos = block_state.num_videos_per_prompt

        # Repeat prompt_embeds for num_videos_per_prompt
        _, seq_len, _ = block_state.prompt_embeds.shape
        block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, num_videos, 1)
        block_state.prompt_embeds = block_state.prompt_embeds.view(
            block_state.batch_size * num_videos, seq_len, -1
        )

        # NOTE(review): the embeddings above are expanded per-prompt (interleaved),
        # while the masks below are tiled with `.repeat(num_videos, 1)`; for
        # batch_size > 1 combined with num_videos_per_prompt > 1 the row orders
        # differ — confirm against the reference LTXPipeline behavior.
        if block_state.prompt_attention_mask is not None:
            block_state.prompt_attention_mask = block_state.prompt_attention_mask.repeat(num_videos, 1)

        if block_state.negative_prompt_embeds is not None:
            _, seq_len, _ = block_state.negative_prompt_embeds.shape
            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.repeat(1, num_videos, 1)
            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.view(
                block_state.batch_size * num_videos, seq_len, -1
            )

        if block_state.negative_prompt_attention_mask is not None:
            block_state.negative_prompt_attention_mask = block_state.negative_prompt_attention_mask.repeat(num_videos, 1)

        self.set_block_state(state, block_state)
        return components, state


class LTXSetTimestepsStep(ModularPipelineBlocks):
    """Configures the scheduler's timestep schedule for LTX video inference."""

    model_name = "ltx"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
        ]

    @property
    def description(self) -> str:
        return "Step that sets the scheduler's timesteps for inference"

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("num_inference_steps", default=50),
            InputParam("timesteps"),
            InputParam("sigmas"),
            InputParam("height", type_hint=int, default=512),
            InputParam("width", type_hint=int, default=704),
            InputParam("num_frames", type_hint=int, default=161),
            InputParam("frame_rate", type_hint=int, default=25),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("timesteps", type_hint=torch.Tensor),
            OutputParam("num_inference_steps", type_hint=int),
            OutputParam("rope_interpolation_scale", type_hint=tuple),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Set scheduler timesteps (with resolution-dependent shift) and the RoPE scale."""
        block_state = self.get_block_state(state)
        device = components._execution_device

        height = block_state.height
        width = block_state.width
        num_frames = block_state.num_frames
        frame_rate = block_state.frame_rate

        # Latent-space dimensions derived from the VAE compression ratios; the token
        # count drives the flow-matching shift `mu` below.
        latent_num_frames = (num_frames - 1) // components.vae_temporal_compression_ratio + 1
        latent_height = height // components.vae_spatial_compression_ratio
        latent_width = width // components.vae_spatial_compression_ratio
        video_sequence_length = latent_num_frames * latent_height * latent_width

        custom_timesteps = block_state.timesteps
        sigmas = block_state.sigmas

        if custom_timesteps is not None:
            # User provided custom timesteps, don't compute sigmas
            block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
                components.scheduler,
                block_state.num_inference_steps,
                device,
                custom_timesteps,
            )
        else:
            # Default sigma schedule: linear from 1.0 down to 1/num_inference_steps.
            if sigmas is None:
                sigmas = np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps)

            mu = calculate_shift(
                video_sequence_length,
                components.scheduler.config.get("base_image_seq_len", 256),
                components.scheduler.config.get("max_image_seq_len", 4096),
                components.scheduler.config.get("base_shift", 0.5),
                components.scheduler.config.get("max_shift", 1.15),
            )

            block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
                components.scheduler,
                block_state.num_inference_steps,
                device,
                sigmas=sigmas,
                mu=mu,
            )

        # (temporal, height, width) interpolation factors consumed by the transformer's RoPE.
        block_state.rope_interpolation_scale = (
            components.vae_temporal_compression_ratio / frame_rate,
            components.vae_spatial_compression_ratio,
            components.vae_spatial_compression_ratio,
        )

        self.set_block_state(state, block_state)
        return components, state


class LTXPrepareLatentsStep(ModularPipelineBlocks):
    """Creates (or reuses) the initial packed noise latents for generation."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return "Prepare latents step that prepares the latents for the text-to-video generation process"

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("height", type_hint=int, default=512),
            InputParam("width", type_hint=int, default=704),
            InputParam("num_frames", type_hint=int, default=161),
            InputParam("latents", type_hint=torch.Tensor | None),
            InputParam("num_videos_per_prompt", type_hint=int, default=1),
            InputParam("generator"),
            InputParam("batch_size", required=True, type_hint=int),
            InputParam("dtype", type_hint=torch.dtype),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("latents", type_hint=torch.Tensor),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Sample fresh float32 noise (packed to the transformer's patch layout) unless latents are supplied."""
        block_state = self.get_block_state(state)
        device = components._execution_device

        batch_size = block_state.batch_size * block_state.num_videos_per_prompt
        num_channels_latents = components.transformer.config.in_channels

        if block_state.latents is not None:
            # Caller-provided latents are assumed to already be packed; only move/cast.
            # NOTE(review): user latents are kept in float32 like the sampled path — confirm intended.
            block_state.latents = block_state.latents.to(device=device, dtype=torch.float32)
        else:
            height = block_state.height // components.vae_spatial_compression_ratio
            width = block_state.width // components.vae_spatial_compression_ratio
            num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1

            shape = (batch_size, num_channels_latents, num_frames, height, width)
            block_state.latents = randn_tensor(shape, generator=block_state.generator, device=device, dtype=torch.float32)
            # Pack (B, C, F, H, W) latents into the flattened token sequence the transformer expects.
            block_state.latents = LTXPipeline._pack_latents(
                block_state.latents,
                components.transformer_spatial_patch_size,
                components.transformer_temporal_patch_size,
            )

        self.set_block_state(state, block_state)
        return components, state
class LTXVaeDecoderStep(ModularPipelineBlocks):
    """Decodes denoised, packed LTX latents into output video frames with the VAE."""

    model_name = "ltx"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKLLTXVideo),
            ComponentSpec(
                "video_processor",
                VideoProcessor,
                config=FrozenDict({"vae_scale_factor": 32}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def description(self) -> str:
        return "Step that decodes the denoised latents into videos"

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("latents", required=True, type_hint=torch.Tensor),
            InputParam("output_type", default="np", type_hint=str),
            InputParam("height", type_hint=int, default=512),
            InputParam("width", type_hint=int, default=704),
            InputParam("num_frames", type_hint=int, default=161),
            InputParam("decode_timestep", default=0.0),
            InputParam("decode_noise_scale", default=None),
            InputParam("generator"),
            InputParam("batch_size", type_hint=int, default=1),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                "videos",
                type_hint=list[list[PIL.Image.Image]] | list[torch.Tensor] | list[np.ndarray],
                description="The generated videos",
            )
        ]

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        """Unpack, denormalize and VAE-decode latents; postprocess to `output_type`."""
        block_state = self.get_block_state(state)
        vae = components.vae

        latents = block_state.latents

        # Short-circuit: caller wants raw latents, skip decoding entirely.
        if block_state.output_type == "latent":
            block_state.videos = latents
            self.set_block_state(state, block_state)
            return components, state

        height = block_state.height
        width = block_state.width
        num_frames = block_state.num_frames

        latent_num_frames = (num_frames - 1) // components.vae_temporal_compression_ratio + 1
        latent_height = height // components.vae_spatial_compression_ratio
        latent_width = width // components.vae_spatial_compression_ratio

        # Undo the patch packing done in the prepare-latents step, then undo the
        # per-channel latent normalization before decoding.
        latents = LTXPipeline._unpack_latents(
            latents,
            latent_num_frames,
            latent_height,
            latent_width,
            components.transformer_spatial_patch_size,
            components.transformer_temporal_patch_size,
        )
        latents = LTXPipeline._denormalize_latents(
            latents, vae.latents_mean, vae.latents_std, vae.config.scaling_factor
        )
        # `dtype` is an intermediate produced by the text-input step; it may be absent
        # when this block runs standalone, hence the hasattr guard.
        latents = latents.to(block_state.dtype if hasattr(block_state, 'dtype') else torch.float32)

        if not vae.config.timestep_conditioning:
            timestep = None
        else:
            # Timestep-conditioned VAE: blend a controlled amount of fresh noise into
            # the latents and pass the decode timestep to the decoder.
            device = latents.device
            batch_size = block_state.batch_size
            decode_timestep = block_state.decode_timestep
            decode_noise_scale = block_state.decode_noise_scale

            noise = randn_tensor(latents.shape, generator=block_state.generator, device=device, dtype=latents.dtype)
            if not isinstance(decode_timestep, list):
                decode_timestep = [decode_timestep] * batch_size
            if decode_noise_scale is None:
                decode_noise_scale = decode_timestep
            elif not isinstance(decode_noise_scale, list):
                decode_noise_scale = [decode_noise_scale] * batch_size

            timestep = torch.tensor(decode_timestep, device=device, dtype=latents.dtype)
            decode_noise_scale = torch.tensor(decode_noise_scale, device=device, dtype=latents.dtype)[
                :, None, None, None, None
            ]
            latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise

        latents = latents.to(vae.dtype)
        video = vae.decode(latents, timestep, return_dict=False)[0]
        block_state.videos = components.video_processor.postprocess_video(video, output_type=block_state.output_type)

        self.set_block_state(state, block_state)
        return components, state
class LTXLoopBeforeDenoiser(ModularPipelineBlocks):
    """Per-iteration step that prepares the latent model input for the denoiser."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return (
            "Step within the denoising loop that prepares the latent input for the denoiser. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `LTXDenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("latents", required=True, type_hint=torch.Tensor),
            InputParam("dtype", required=True, type_hint=torch.dtype),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        # Latents are kept in float32 between steps; cast to the model dtype per step.
        block_state.latent_model_input = block_state.latents.to(block_state.dtype)
        return components, block_state


class LTXLoopDenoiser(ModularPipelineBlocks):
    """Per-iteration step that runs the transformer under classifier-free guidance."""

    model_name = "ltx"

    def __init__(
        self,
        guider_input_fields: dict[str, Any] | None = None,
    ):
        """
        Args:
            guider_input_fields: Mapping from transformer kwarg name to either a single
                block-state field name or a `(cond, uncond)` pair of field names the
                guider batches over. Defaults to the standard CFG mapping for
                `encoder_hidden_states` / `encoder_attention_mask`.
        """
        # Use None as the default to avoid a shared mutable default argument.
        if guider_input_fields is None:
            guider_input_fields = {
                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
                "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"),
            }
        if not isinstance(guider_input_fields, dict):
            raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}")
        self._guider_input_fields = guider_input_fields
        super().__init__()

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 3.0}),
                default_creation_method="from_config",
            ),
            ComponentSpec("transformer", LTXVideoTransformer3DModel),
        ]

    @property
    def description(self) -> str:
        return (
            "Step within the denoising loop that denoises the latents with guidance. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `LTXDenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> list[InputParam]:
        inputs = [
            InputParam("attention_kwargs"),
            InputParam("num_inference_steps", required=True, type_hint=int),
            InputParam("rope_interpolation_scale", type_hint=tuple),
            InputParam("height", type_hint=int),
            InputParam("width", type_hint=int),
            InputParam("num_frames", type_hint=int),
        ]
        # Every field referenced by the guider mapping becomes a required input.
        guider_input_names = []
        for value in self._guider_input_fields.values():
            if isinstance(value, tuple):
                guider_input_names.extend(value)
            else:
                guider_input_names.append(value)

        for name in guider_input_names:
            inputs.append(InputParam(name=name, required=True, type_hint=torch.Tensor))
        return inputs

    @torch.no_grad()
    def __call__(
        self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
    ) -> PipelineState:
        """Run one guided transformer forward pass and store `noise_pred` on the block state."""
        components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)

        latent_num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1
        latent_height = block_state.height // components.vae_spatial_compression_ratio
        latent_width = block_state.width // components.vae_spatial_compression_ratio

        guider_state = components.guider.prepare_inputs_from_block_state(block_state, self._guider_input_fields)

        # One forward pass per guidance batch (e.g. conditional / unconditional).
        for guider_state_batch in guider_state:
            components.guider.prepare_models(components.transformer)
            cond_kwargs = guider_state_batch.as_dict()
            cond_kwargs = {
                k: v.to(block_state.dtype) if isinstance(v, torch.Tensor) else v
                for k, v in cond_kwargs.items()
                if k in self._guider_input_fields.keys()
            }

            guider_state_batch.noise_pred = components.transformer(
                hidden_states=block_state.latent_model_input,
                timestep=t.expand(block_state.latent_model_input.shape[0]).to(block_state.dtype),
                num_frames=latent_num_frames,
                height=latent_height,
                width=latent_width,
                rope_interpolation_scale=block_state.rope_interpolation_scale,
                attention_kwargs=block_state.attention_kwargs,
                return_dict=False,
                **cond_kwargs,
            )[0]
            components.guider.cleanup_models(components.transformer)

        # Combine the per-batch predictions into the final guided noise prediction.
        block_state.noise_pred = components.guider(guider_state)[0]

        return components, block_state


class LTXLoopAfterDenoiser(ModularPipelineBlocks):
    """Per-iteration step that advances the latents with the scheduler."""

    model_name = "ltx"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
        ]

    @property
    def description(self) -> str:
        return (
            "Step within the denoising loop that updates the latents. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `LTXDenoiseLoopWrapper`)"
        )

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        # The scheduler may upcast internally; restore the original latent dtype after stepping.
        latents_dtype = block_state.latents.dtype
        block_state.latents = components.scheduler.step(
            block_state.noise_pred,
            t,
            block_state.latents,
            return_dict=False,
        )[0]

        if block_state.latents.dtype != latents_dtype:
            block_state.latents = block_state.latents.to(latents_dtype)

        return components, block_state


class LTXDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
    """Loop container that iterates its sub-blocks once per scheduler timestep."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return (
            "Pipeline block that iteratively denoises the latents over `timesteps`. "
            "The specific steps within each iteration can be customized with `sub_blocks` attributes"
        )

    @property
    def loop_expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
            ComponentSpec("transformer", LTXVideoTransformer3DModel),
        ]

    @property
    def loop_inputs(self) -> list[InputParam]:
        return [
            InputParam("timesteps", required=True, type_hint=torch.Tensor),
            InputParam("num_inference_steps", required=True, type_hint=int),
        ]

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Run `loop_step` for every timestep, updating the progress bar per real step."""
        block_state = self.get_block_state(state)

        # Warmup steps (from higher-order schedulers) don't advance the progress bar.
        block_state.num_warmup_steps = max(
            len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0
        )

        with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
            for i, t in enumerate(block_state.timesteps):
                components, block_state = self.loop_step(components, block_state, i=i, t=t)
                if i == len(block_state.timesteps) - 1 or (
                    (i + 1) > block_state.num_warmup_steps and (i + 1) % components.scheduler.order == 0
                ):
                    progress_bar.update()

        self.set_block_state(state, block_state)
        return components, state


class LTXDenoiseStep(LTXDenoiseLoopWrapper):
    """Preset denoising loop: prepare input → guided denoise → scheduler step."""

    block_classes = [
        LTXLoopBeforeDenoiser,
        LTXLoopDenoiser(
            guider_input_fields={
                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
                "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"),
            }
        ),
        LTXLoopAfterDenoiser,
    ]
    block_names = ["before_denoiser", "denoiser", "after_denoiser"]

    @property
    def description(self) -> str:
        return (
            "Denoise step that iteratively denoises the latents.\n"
            "Its loop logic is defined in `LTXDenoiseLoopWrapper.__call__` method.\n"
            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
            " - `LTXLoopBeforeDenoiser`\n"
            " - `LTXLoopDenoiser`\n"
            " - `LTXLoopAfterDenoiser`\n"
            "This block supports text-to-video tasks."
        )
class LTXTextEncoderStep(ModularPipelineBlocks):
    """Encodes prompts (and optional negative prompts) into T5 embeddings for LTX Video."""

    model_name = "ltx"

    @property
    def description(self) -> str:
        return "Text Encoder step that generates text embeddings to guide the video generation"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("text_encoder", T5EncoderModel),
            ComponentSpec("tokenizer", T5TokenizerFast),
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 3.0}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("prompt"),
            InputParam("negative_prompt"),
            InputParam("prompt_embeds", type_hint=torch.Tensor),
            InputParam("prompt_attention_mask", type_hint=torch.Tensor),
            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
            InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor),
            InputParam("max_sequence_length", default=128),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                "prompt_embeds",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="text embeddings used to guide the video generation",
            ),
            OutputParam(
                "prompt_attention_mask",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="attention mask for text embeddings",
            ),
            OutputParam(
                "negative_prompt_embeds",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="negative text embeddings",
            ),
            OutputParam(
                "negative_prompt_attention_mask",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="attention mask for negative text embeddings",
            ),
        ]

    @staticmethod
    def check_inputs(block_state):
        """Validate that `prompt`, if given, is a `str` or a `list`."""
        if block_state.prompt is not None and (
            not isinstance(block_state.prompt, str) and not isinstance(block_state.prompt, list)
        ):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}")

    @staticmethod
    def _get_t5_prompt_embeds(
        components,
        prompt: str | list[str],
        max_sequence_length: int,
        device: torch.device,
        dtype: torch.dtype,
    ):
        """Tokenize `prompt` (padded/truncated to `max_sequence_length`) and run the T5 encoder.

        Returns `(prompt_embeds, prompt_attention_mask)` on `device` with `prompt_embeds`
        cast to `dtype`.
        """
        prompt = [prompt] if isinstance(prompt, str) else prompt

        text_inputs = components.tokenizer(
            prompt,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        prompt_attention_mask = text_inputs.attention_mask
        prompt_attention_mask = prompt_attention_mask.bool().to(device)

        # The attention mask is returned to the caller but not passed to the encoder,
        # matching the reference LTXPipeline behavior.
        prompt_embeds = components.text_encoder(text_input_ids.to(device))[0]
        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        return prompt_embeds, prompt_attention_mask

    @staticmethod
    def encode_prompt(
        components,
        prompt: str,
        device: torch.device | None = None,
        prepare_unconditional_embeds: bool = True,
        negative_prompt: str | None = None,
        max_sequence_length: int = 128,
    ):
        """Encode `prompt` (and, when requested, `negative_prompt`) into T5 embeddings.

        Returns a 4-tuple `(prompt_embeds, prompt_attention_mask, negative_prompt_embeds,
        negative_prompt_attention_mask)`; the negative pair is `None` when
        `prepare_unconditional_embeds` is False.

        Raises:
            TypeError: if `negative_prompt` is not the same type as `prompt`.
            ValueError: if the negative prompt batch size does not match the prompt's.
        """
        device = device or components._execution_device
        dtype = components.text_encoder.dtype

        if not isinstance(prompt, list):
            prompt = [prompt]
        batch_size = len(prompt)

        prompt_embeds, prompt_attention_mask = LTXTextEncoderStep._get_t5_prompt_embeds(
            components=components,
            prompt=prompt,
            max_sequence_length=max_sequence_length,
            device=device,
            dtype=dtype,
        )

        negative_prompt_embeds = None
        negative_prompt_attention_mask = None

        if prepare_unconditional_embeds:
            # A string negative prompt is broadcast to the full batch.
            negative_prompt = negative_prompt or ""
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds, negative_prompt_attention_mask = LTXTextEncoderStep._get_t5_prompt_embeds(
                components=components,
                prompt=negative_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask

    @torch.no_grad()
    def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState:
        """Populate prompt embedding fields on the state, skipping work if already provided."""
        block_state = self.get_block_state(state)
        self.check_inputs(block_state)

        block_state.device = components._execution_device

        # Skip encoding if pre-computed embeddings are provided
        if getattr(block_state, "prompt_embeds", None) is not None:
            self.set_block_state(state, block_state)
            return components, state

        (
            block_state.prompt_embeds,
            block_state.prompt_attention_mask,
            block_state.negative_prompt_embeds,
            block_state.negative_prompt_attention_mask,
        ) = self.encode_prompt(
            components=components,
            prompt=block_state.prompt,
            device=block_state.device,
            prepare_unconditional_embeds=components.requires_unconditional_embeds,
            negative_prompt=block_state.negative_prompt,
            max_sequence_length=block_state.max_sequence_length,
        )

        self.set_block_state(state, block_state)
        return components, state
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import logging +from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam +from .before_denoise import ( + LTXPrepareLatentsStep, + LTXSetTimestepsStep, + LTXTextInputStep, +) +from .decoders import LTXVaeDecoderStep +from .denoise import LTXDenoiseStep +from .encoders import LTXTextEncoderStep + + +logger = logging.get_logger(__name__) + + +class LTXCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "ltx" + block_classes = [ + LTXTextInputStep, + LTXSetTimestepsStep, + LTXPrepareLatentsStep, + LTXDenoiseStep, + ] + block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] + + @property + def description(self): + return "Denoise block that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +class LTXBlocks(SequentialPipelineBlocks): + model_name = "ltx" + block_classes = [ + LTXTextEncoderStep, + LTXCoreDenoiseStep, + LTXVaeDecoderStep, + ] + block_names = ["text_encoder", "denoise", "decode"] + + @property + def description(self): + return "Modular pipeline blocks for LTX Video." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py new file mode 100644 index 000000000000..3cce6845396b --- /dev/null +++ b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py @@ -0,0 +1,64 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...loaders import LTXVideoLoraLoaderMixin +from ...utils import logging +from ..modular_pipeline import ModularPipeline + + +logger = logging.get_logger(__name__) + + +class LTXModularPipeline( + ModularPipeline, + LTXVideoLoraLoaderMixin, +): + """ + A ModularPipeline for LTX Video. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. + """ + + default_blocks_name = "LTXBlocks" + + @property + def vae_spatial_compression_ratio(self): + if getattr(self, "vae", None) is not None: + return self.vae.spatial_compression_ratio + return 32 + + @property + def vae_temporal_compression_ratio(self): + if getattr(self, "vae", None) is not None: + return self.vae.temporal_compression_ratio + return 8 + + @property + def transformer_spatial_patch_size(self): + if getattr(self, "transformer", None) is not None: + return self.transformer.config.patch_size + return 1 + + @property + def transformer_temporal_patch_size(self): + if getattr(self, "transformer", None) is not None: + return self.transformer.config.patch_size_t + return 1 + + @property + def requires_unconditional_embeds(self): + if hasattr(self, "guider") and self.guider is not None: + return self.guider._enabled and self.guider.num_conditions > 1 + return False diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 9cd2f9f5c6ae..ace89f0d6f91 100644 --- 
a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -132,6 +132,7 @@ def _helios_pyramid_map_fn(config_dict=None): ("z-image", _create_default_map_fn("ZImageModularPipeline")), ("helios", _create_default_map_fn("HeliosModularPipeline")), ("helios-pyramid", _helios_pyramid_map_fn), + ("ltx", _create_default_map_fn("LTXModularPipeline")), ] ) From 11b891c124e54a36676cfc258cdb742123e324a4 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 03:22:47 -0700 Subject: [PATCH 02/11] Fix guidance_scale passthrough to guider --- src/diffusers/modular_pipelines/ltx/before_denoise.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index d3673c3ac1f8..176edbe2bb69 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -94,6 +94,7 @@ def expected_components(self) -> list[ComponentSpec]: def inputs(self) -> list[InputParam]: return [ InputParam("num_videos_per_prompt", default=1), + InputParam("guidance_scale", type_hint=float, default=3.0), InputParam("prompt_embeds", required=True, type_hint=torch.Tensor), InputParam("prompt_attention_mask", type_hint=torch.Tensor), InputParam("negative_prompt_embeds", type_hint=torch.Tensor), @@ -111,6 +112,11 @@ def intermediate_outputs(self) -> list[OutputParam]: def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) + # Set guidance_scale on guider so CFG is configured correctly + guidance_scale = getattr(block_state, "guidance_scale", 3.0) + if hasattr(components, "guider") and components.guider is not None: + components.guider.guidance_scale = guidance_scale + block_state.batch_size = block_state.prompt_embeds.shape[0] block_state.dtype = block_state.prompt_embeds.dtype num_videos = 
block_state.num_videos_per_prompt From 4d2d73eda5a3db7c34f61769cd4e627977938a54 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 07:04:20 -0700 Subject: [PATCH 03/11] Add LTX modular pipeline tests --- tests/modular_pipelines/ltx/__init__.py | 0 .../ltx/test_modular_pipeline_ltx.py | 49 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tests/modular_pipelines/ltx/__init__.py create mode 100644 tests/modular_pipelines/ltx/test_modular_pipeline_ltx.py diff --git a/tests/modular_pipelines/ltx/__init__.py b/tests/modular_pipelines/ltx/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/modular_pipelines/ltx/test_modular_pipeline_ltx.py b/tests/modular_pipelines/ltx/test_modular_pipeline_ltx.py new file mode 100644 index 000000000000..00e68d26fdee --- /dev/null +++ b/tests/modular_pipelines/ltx/test_modular_pipeline_ltx.py @@ -0,0 +1,49 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from diffusers.modular_pipelines import LTXBlocks, LTXModularPipeline + +from ..test_modular_pipelines_common import ModularPipelineTesterMixin + + +class TestLTXModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = LTXModularPipeline + pipeline_blocks_class = LTXBlocks + pretrained_model_name_or_path = "akshan-main/tiny-ltx-modular-pipe" + + params = frozenset(["prompt", "height", "width", "num_frames"]) + batch_params = frozenset(["prompt"]) + optional_params = frozenset(["num_inference_steps", "num_videos_per_prompt", "latents"]) + output_name = "videos" + + def get_dummy_inputs(self, seed=0): + generator = self.get_generator(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "height": 32, + "width": 32, + "num_frames": 9, + "max_sequence_length": 16, + "output_type": "pt", + } + return inputs + + @pytest.mark.skip(reason="num_videos_per_prompt") + def test_num_images_per_prompt(self): + pass From 7b645e6953c4a71f323caf851c7813c017eab2e2 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 07:41:47 -0700 Subject: [PATCH 04/11] Add LTX image-to-video modular pipeline --- src/diffusers/__init__.py | 2 + src/diffusers/modular_pipelines/__init__.py | 4 +- .../modular_pipelines/ltx/__init__.py | 8 +- .../modular_pipelines/ltx/before_denoise.py | 126 +++++++++++ .../modular_pipelines/ltx/denoise.py | 206 ++++++++++++++++++ .../ltx/modular_blocks_ltx.py | 42 +++- .../modular_pipelines/ltx/modular_pipeline.py | 10 + .../modular_pipelines/modular_pipeline.py | 1 + 8 files changed, 392 insertions(+), 7 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 637bf2685824..a1a82974eb50 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -444,6 +444,8 @@ "HeliosPyramidDistilledAutoBlocks", "HeliosPyramidDistilledModularPipeline", "LTXBlocks", + "LTXImage2VideoBlocks", + 
"LTXImage2VideoModularPipeline", "LTXModularPipeline", "HeliosPyramidModularPipeline", "QwenImageAutoBlocks", diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 389a5416f3ea..967401ba6e57 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -90,6 +90,8 @@ ] _import_structure["ltx"] = [ "LTXBlocks", + "LTXImage2VideoBlocks", + "LTXImage2VideoModularPipeline", "LTXModularPipeline", ] _import_structure["z_image"] = [ @@ -145,7 +147,7 @@ QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline - from .ltx import LTXBlocks, LTXModularPipeline + from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXImage2VideoModularPipeline, LTXModularPipeline from .wan import ( Wan22Blocks, Wan22Image2VideoBlocks, diff --git a/src/diffusers/modular_pipelines/ltx/__init__.py b/src/diffusers/modular_pipelines/ltx/__init__.py index 019fa96fef14..3939db1ac9d8 100644 --- a/src/diffusers/modular_pipelines/ltx/__init__.py +++ b/src/diffusers/modular_pipelines/ltx/__init__.py @@ -21,8 +21,8 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["modular_blocks_ltx"] = ["LTXBlocks"] - _import_structure["modular_pipeline"] = ["LTXModularPipeline"] + _import_structure["modular_blocks_ltx"] = ["LTXBlocks", "LTXImage2VideoBlocks"] + _import_structure["modular_pipeline"] = ["LTXModularPipeline", "LTXImage2VideoModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -31,8 +31,8 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .modular_blocks_ltx import LTXBlocks - from .modular_pipeline import LTXModularPipeline + from .modular_blocks_ltx import LTXBlocks, LTXImage2VideoBlocks + from .modular_pipeline import LTXImage2VideoModularPipeline, LTXModularPipeline else: import sys 
diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 176edbe2bb69..056cf8d5ad80 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -285,3 +285,129 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe self.set_block_state(state, block_state) return components, state + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +class LTXImage2VideoPrepareLatentsStep(ModularPipelineBlocks): + model_name = "ltx" + + @property + def description(self) -> str: + return "Prepare latents step for image-to-video: encodes the first frame and creates a conditioning mask" + + @property + def expected_components(self) -> list[ComponentSpec]: + from ...models import AutoencoderKLLTXVideo + return [ + ComponentSpec("vae", AutoencoderKLLTXVideo), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("image", required=True), + InputParam("height", type_hint=int, default=512), + InputParam("width", type_hint=int, default=704), + InputParam("num_frames", type_hint=int, default=161), + InputParam("latents", type_hint=torch.Tensor | None), + InputParam("num_videos_per_prompt", type_hint=int, default=1), + InputParam("generator"), + InputParam("batch_size", required=True, type_hint=int), + 
InputParam("dtype", type_hint=torch.dtype), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("latents", type_hint=torch.Tensor), + OutputParam("conditioning_mask", type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + batch_size = block_state.batch_size * block_state.num_videos_per_prompt + num_channels_latents = components.transformer.config.in_channels + + height = block_state.height // components.vae_spatial_compression_ratio + width = block_state.width // components.vae_spatial_compression_ratio + num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 + + shape = (batch_size, num_channels_latents, num_frames, height, width) + mask_shape = (batch_size, 1, num_frames, height, width) + + if block_state.latents is not None: + conditioning_mask = block_state.latents.new_zeros(mask_shape) + conditioning_mask[:, :, 0] = 1.0 + conditioning_mask = LTXPipeline._pack_latents( + conditioning_mask, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ).squeeze(-1) + block_state.latents = block_state.latents.to(device=device, dtype=torch.float32) + block_state.conditioning_mask = conditioning_mask + self.set_block_state(state, block_state) + return components, state + + image = block_state.image + if not isinstance(image, torch.Tensor): + from ...video_processor import VideoProcessor + processor = VideoProcessor(vae_scale_factor=components.vae_spatial_compression_ratio) + image = processor.preprocess(image, height=block_state.height, width=block_state.width) + image = image.to(device=device, dtype=torch.float32) + + if isinstance(block_state.generator, list): + init_latents = [ + retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2)), 
block_state.generator[i]) + for i in range(batch_size) + ] + else: + init_latents = [ + retrieve_latents(components.vae.encode(img.unsqueeze(0).unsqueeze(2)), block_state.generator) + for img in image + ] + + init_latents = torch.cat(init_latents, dim=0).to(torch.float32) + init_latents = LTXPipeline._normalize_latents( + init_latents, components.vae.latents_mean, components.vae.latents_std + ) + init_latents = init_latents.repeat(1, 1, num_frames, 1, 1) + + actual_mask_shape = (init_latents.shape[0], 1, init_latents.shape[2], init_latents.shape[3], init_latents.shape[4]) + conditioning_mask = torch.zeros(actual_mask_shape, device=device, dtype=torch.float32) + conditioning_mask[:, :, 0] = 1.0 + + noise = randn_tensor(init_latents.shape, generator=block_state.generator, device=device, dtype=torch.float32) + latents = init_latents * conditioning_mask + noise * (1 - conditioning_mask) + + conditioning_mask = LTXPipeline._pack_latents( + conditioning_mask, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ).squeeze(-1) + latents = LTXPipeline._pack_latents( + latents, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ) + + block_state.latents = latents + block_state.conditioning_mask = conditioning_mask + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index 184aeb2d1b72..04ff496f8477 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -254,3 +254,209 @@ def description(self) -> str: " - `LTXLoopAfterDenoiser`\n" "This block supports text-to-video tasks." 
) + + +class LTXImage2VideoLoopBeforeDenoiser(ModularPipelineBlocks): + model_name = "ltx" + + @property + def description(self) -> str: + return ( + "Step within the i2v denoising loop that prepares the latent input and modulates " + "the timestep with the conditioning mask." + ) + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("latents", required=True, type_hint=torch.Tensor), + InputParam("conditioning_mask", required=True, type_hint=torch.Tensor), + InputParam("dtype", required=True, type_hint=torch.dtype), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + block_state.latent_model_input = block_state.latents.to(block_state.dtype) + block_state.timestep_adjusted = t.expand(block_state.latent_model_input.shape[0]).unsqueeze(-1) * ( + 1 - block_state.conditioning_mask + ) + return components, block_state + + +class LTXImage2VideoLoopDenoiser(ModularPipelineBlocks): + model_name = "ltx" + + def __init__( + self, + guider_input_fields: dict[str, Any] = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), + }, + ): + if not isinstance(guider_input_fields, dict): + raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}") + self._guider_input_fields = guider_input_fields + super().__init__() + + @property + def expected_components(self) -> list[ComponentSpec]: + from ...configuration_utils import FrozenDict + from ...guiders import ClassifierFreeGuidance + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 3.0}), + default_creation_method="from_config", + ), + ComponentSpec("transformer", LTXVideoTransformer3DModel), + ] + + @property + def description(self) -> str: + return ( + "Step within the i2v denoising loop that denoises the latents with guidance " + 
"using timestep modulated by the conditioning mask." + ) + + @property + def inputs(self) -> list[tuple[str, Any]]: + inputs = [ + InputParam("attention_kwargs"), + InputParam("num_inference_steps", required=True, type_hint=int), + InputParam("rope_interpolation_scale", type_hint=tuple), + InputParam("height", type_hint=int), + InputParam("width", type_hint=int), + InputParam("num_frames", type_hint=int), + ] + guider_input_names = [] + for value in self._guider_input_fields.values(): + if isinstance(value, tuple): + guider_input_names.extend(value) + else: + guider_input_names.append(value) + for name in guider_input_names: + inputs.append(InputParam(name=name, required=True, type_hint=torch.Tensor)) + return inputs + + @torch.no_grad() + def __call__( + self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + + latent_num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 + latent_height = block_state.height // components.vae_spatial_compression_ratio + latent_width = block_state.width // components.vae_spatial_compression_ratio + + guider_state = components.guider.prepare_inputs_from_block_state(block_state, self._guider_input_fields) + + for guider_state_batch in guider_state: + components.guider.prepare_models(components.transformer) + cond_kwargs = guider_state_batch.as_dict() + cond_kwargs = { + k: v.to(block_state.dtype) if isinstance(v, torch.Tensor) else v + for k, v in cond_kwargs.items() + if k in self._guider_input_fields.keys() + } + + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + timestep=block_state.timestep_adjusted, + num_frames=latent_num_frames, + height=latent_height, + width=latent_width, + rope_interpolation_scale=block_state.rope_interpolation_scale, + 
attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] + components.guider.cleanup_models(components.transformer) + + block_state.noise_pred = components.guider(guider_state)[0] + + return components, block_state + + +class LTXImage2VideoLoopAfterDenoiser(ModularPipelineBlocks): + model_name = "ltx" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ] + + @property + def description(self) -> str: + return ( + "Step within the i2v denoising loop that updates the latents, " + "applying the scheduler step only to frames after the first (conditioned) frame." + ) + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("height", type_hint=int), + InputParam("width", type_hint=int), + InputParam("num_frames", type_hint=int), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + from ...pipelines.ltx.pipeline_ltx import LTXPipeline + + latent_num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 + latent_height = block_state.height // components.vae_spatial_compression_ratio + latent_width = block_state.width // components.vae_spatial_compression_ratio + + noise_pred = LTXPipeline._unpack_latents( + block_state.noise_pred, + latent_num_frames, latent_height, latent_width, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ) + latents = LTXPipeline._unpack_latents( + block_state.latents, + latent_num_frames, latent_height, latent_width, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ) + + noise_pred = noise_pred[:, :, 1:] + noise_latents = latents[:, :, 1:] + pred_latents = components.scheduler.step(noise_pred, t, noise_latents, return_dict=False)[0] + + latents = torch.cat([latents[:, :, :1], pred_latents], dim=2) + 
block_state.latents = LTXPipeline._pack_latents( + latents, + components.transformer_spatial_patch_size, + components.transformer_temporal_patch_size, + ) + + return components, block_state + + +class LTXImage2VideoDenoiseStep(LTXDenoiseLoopWrapper): + block_classes = [ + LTXImage2VideoLoopBeforeDenoiser, + LTXImage2VideoLoopDenoiser( + guider_input_fields={ + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), + } + ), + LTXImage2VideoLoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step for image-to-video that iteratively denoises the latents.\n" + "The first frame is kept fixed via a conditioning mask.\n" + "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n" + " - `LTXImage2VideoLoopBeforeDenoiser`\n" + " - `LTXImage2VideoLoopDenoiser`\n" + " - `LTXImage2VideoLoopAfterDenoiser`" + ) diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index f6871864aa2d..290e75051ea0 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -16,12 +16,13 @@ from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import OutputParam from .before_denoise import ( + LTXImage2VideoPrepareLatentsStep, LTXPrepareLatentsStep, LTXSetTimestepsStep, LTXTextInputStep, ) from .decoders import LTXVaeDecoderStep -from .denoise import LTXDenoiseStep +from .denoise import LTXDenoiseStep, LTXImage2VideoDenoiseStep from .encoders import LTXTextEncoderStep @@ -47,6 +48,25 @@ def outputs(self): return [OutputParam.template("latents")] +class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "ltx" + block_classes = [ + LTXTextInputStep, + LTXSetTimestepsStep, 
+ LTXImage2VideoPrepareLatentsStep, + LTXImage2VideoDenoiseStep, + ] + block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] + + @property + def description(self): + return "Denoise block for image-to-video that takes encoded conditions and an image, and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + class LTXBlocks(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ @@ -58,7 +78,25 @@ class LTXBlocks(SequentialPipelineBlocks): @property def description(self): - return "Modular pipeline blocks for LTX Video." + return "Modular pipeline blocks for LTX Video text-to-video." + + @property + def outputs(self): + return [OutputParam.template("videos")] + + +class LTXImage2VideoBlocks(SequentialPipelineBlocks): + model_name = "ltx" + block_classes = [ + LTXTextEncoderStep, + LTXImage2VideoCoreDenoiseStep, + LTXVaeDecoderStep, + ] + block_names = ["text_encoder", "denoise", "decode"] + + @property + def description(self): + return "Modular pipeline blocks for LTX Video image-to-video." @property def outputs(self): diff --git a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py index 3cce6845396b..9f4d1b45e93a 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py @@ -62,3 +62,13 @@ def requires_unconditional_embeds(self): if hasattr(self, "guider") and self.guider is not None: return self.guider._enabled and self.guider.num_conditions > 1 return False + + +class LTXImage2VideoModularPipeline(LTXModularPipeline): + """ + A ModularPipeline for LTX Video image-to-video. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
+ """ + + default_blocks_name = "LTXImage2VideoBlocks" diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index ace89f0d6f91..07636e95191b 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -133,6 +133,7 @@ def _helios_pyramid_map_fn(config_dict=None): ("helios", _create_default_map_fn("HeliosModularPipeline")), ("helios-pyramid", _helios_pyramid_map_fn), ("ltx", _create_default_map_fn("LTXModularPipeline")), + ("ltx-i2v", _create_default_map_fn("LTXImage2VideoModularPipeline")), ] ) From 7491d56e94d0d6869a91e2c2e2f12f58f205ed63 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 07:56:02 -0700 Subject: [PATCH 05/11] Fix i2v VAE dtype mismatch --- src/diffusers/modular_pipelines/ltx/before_denoise.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 056cf8d5ad80..6333dd493d8c 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -371,14 +371,16 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe image = processor.preprocess(image, height=block_state.height, width=block_state.width) image = image.to(device=device, dtype=torch.float32) + vae_dtype = components.vae.dtype + if isinstance(block_state.generator, list): init_latents = [ - retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2)), block_state.generator[i]) + retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i]) for i in range(batch_size) ] else: init_latents = [ - retrieve_latents(components.vae.encode(img.unsqueeze(0).unsqueeze(2)), block_state.generator) + 
retrieve_latents(components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator) for img in image ] From 1e53507166e3cd82d4db7696aa9511cf33d8b8e6 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 11:35:37 -0700 Subject: [PATCH 06/11] Add cache_context to denoiser for CFG parity --- .../modular_pipelines/ltx/denoise.py | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index 04ff496f8477..40c05fafcd74 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -135,17 +135,19 @@ def __call__( if k in self._guider_input_fields.keys() } - guider_state_batch.noise_pred = components.transformer( - hidden_states=block_state.latent_model_input, - timestep=t.expand(block_state.latent_model_input.shape[0]).to(block_state.dtype), - num_frames=latent_num_frames, - height=latent_height, - width=latent_width, - rope_interpolation_scale=block_state.rope_interpolation_scale, - attention_kwargs=block_state.attention_kwargs, - return_dict=False, - **cond_kwargs, - )[0] + context_name = getattr(guider_state_batch, components.guider._identifier_key, None) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + timestep=t.expand(block_state.latent_model_input.shape[0]).to(block_state.dtype), + num_frames=latent_num_frames, + height=latent_height, + width=latent_width, + rope_interpolation_scale=block_state.rope_interpolation_scale, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] components.guider.cleanup_models(components.transformer) block_state.noise_pred = components.guider(guider_state)[0] @@ -360,17 +362,19 @@ def __call__( if k in self._guider_input_fields.keys() } - 
guider_state_batch.noise_pred = components.transformer( - hidden_states=block_state.latent_model_input, - timestep=block_state.timestep_adjusted, - num_frames=latent_num_frames, - height=latent_height, - width=latent_width, - rope_interpolation_scale=block_state.rope_interpolation_scale, - attention_kwargs=block_state.attention_kwargs, - return_dict=False, - **cond_kwargs, - )[0] + context_name = getattr(guider_state_batch, components.guider._identifier_key, None) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + timestep=block_state.timestep_adjusted, + num_frames=latent_num_frames, + height=latent_height, + width=latent_width, + rope_interpolation_scale=block_state.rope_interpolation_scale, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] components.guider.cleanup_models(components.transformer) block_state.noise_pred = components.guider(guider_state)[0] From 322727d27ad84a4b77e5f4374fdb464f54df1e8b Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 1 Apr 2026 12:29:54 -0700 Subject: [PATCH 07/11] Address review feedback --- src/diffusers/__init__.py | 6 +++--- src/diffusers/modular_pipelines/__init__.py | 3 +-- .../modular_pipelines/ltx/__init__.py | 4 ++-- .../modular_pipelines/ltx/before_denoise.py | 5 ++++- .../modular_pipelines/ltx/decoders.py | 3 ++- .../modular_pipelines/ltx/denoise.py | 20 +++++++++++-------- .../modular_pipelines/ltx/encoders.py | 11 ++++------ .../ltx/modular_blocks_ltx.py | 4 ++++ .../modular_pipelines/ltx/modular_pipeline.py | 10 ---------- .../modular_pipelines/modular_pipeline.py | 1 - 10 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a1a82974eb50..6c09a59bb8f9 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -443,11 +443,10 @@ "HeliosPyramidAutoBlocks", 
"HeliosPyramidDistilledAutoBlocks", "HeliosPyramidDistilledModularPipeline", + "HeliosPyramidModularPipeline", "LTXBlocks", "LTXImage2VideoBlocks", - "LTXImage2VideoModularPipeline", "LTXModularPipeline", - "HeliosPyramidModularPipeline", "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", "QwenImageEditModularPipeline", @@ -1214,9 +1213,10 @@ HeliosPyramidAutoBlocks, HeliosPyramidDistilledAutoBlocks, HeliosPyramidDistilledModularPipeline, + HeliosPyramidModularPipeline, LTXBlocks, + LTXImage2VideoBlocks, LTXModularPipeline, - HeliosPyramidModularPipeline, QwenImageAutoBlocks, QwenImageEditAutoBlocks, QwenImageEditModularPipeline, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 967401ba6e57..f34ff1cedcbb 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -91,7 +91,6 @@ _import_structure["ltx"] = [ "LTXBlocks", "LTXImage2VideoBlocks", - "LTXImage2VideoModularPipeline", "LTXModularPipeline", ] _import_structure["z_image"] = [ @@ -147,7 +146,7 @@ QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline - from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXImage2VideoModularPipeline, LTXModularPipeline + from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXModularPipeline from .wan import ( Wan22Blocks, Wan22Image2VideoBlocks, diff --git a/src/diffusers/modular_pipelines/ltx/__init__.py b/src/diffusers/modular_pipelines/ltx/__init__.py index 3939db1ac9d8..6be74e6b4112 100644 --- a/src/diffusers/modular_pipelines/ltx/__init__.py +++ b/src/diffusers/modular_pipelines/ltx/__init__.py @@ -22,7 +22,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: _import_structure["modular_blocks_ltx"] = ["LTXBlocks", "LTXImage2VideoBlocks"] - _import_structure["modular_pipeline"] = ["LTXModularPipeline", "LTXImage2VideoModularPipeline"] + 
_import_structure["modular_pipeline"] = ["LTXModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -32,7 +32,7 @@ from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: from .modular_blocks_ltx import LTXBlocks, LTXImage2VideoBlocks - from .modular_pipeline import LTXImage2VideoModularPipeline, LTXModularPipeline + from .modular_pipeline import LTXModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 6333dd493d8c..808a1fdd524f 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -373,10 +373,11 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe vae_dtype = components.vae.dtype + num_images = image.shape[0] if isinstance(block_state.generator, list): init_latents = [ retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i]) - for i in range(batch_size) + for i in range(num_images) ] else: init_latents = [ @@ -385,6 +386,8 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe ] init_latents = torch.cat(init_latents, dim=0).to(torch.float32) + if init_latents.shape[0] < batch_size: + init_latents = init_latents.repeat_interleave(batch_size // init_latents.shape[0], dim=0) init_latents = LTXPipeline._normalize_latents( init_latents, components.vae.latents_mean, components.vae.latents_std ) diff --git a/src/diffusers/modular_pipelines/ltx/decoders.py b/src/diffusers/modular_pipelines/ltx/decoders.py index eca22a5797a7..d7c85171c091 100644 --- a/src/diffusers/modular_pipelines/ltx/decoders.py +++ b/src/diffusers/modular_pipelines/ltx/decoders.py @@ -62,6 +62,7 @@ def inputs(self) -> list[tuple[str, Any]]: InputParam("decode_noise_scale", default=None), InputParam("generator"), InputParam("batch_size", type_hint=int, default=1), + 
InputParam("dtype", required=True, type_hint=torch.dtype), ] @property @@ -105,7 +106,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: latents = LTXPipeline._denormalize_latents( latents, vae.latents_mean, vae.latents_std, vae.config.scaling_factor ) - latents = latents.to(block_state.dtype if hasattr(block_state, 'dtype') else torch.float32) + latents = latents.to(block_state.dtype) if not vae.config.timestep_conditioning: timestep = None diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index 40c05fafcd74..bd25b6400aa6 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -63,11 +63,13 @@ class LTXLoopDenoiser(ModularPipelineBlocks): def __init__( self, - guider_input_fields: dict[str, Any] = { - "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), - "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), - }, + guider_input_fields: dict[str, Any] | None = None, ): + if guider_input_fields is None: + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), + } if not isinstance(guider_input_fields, dict): raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}") self._guider_input_fields = guider_input_fields @@ -290,11 +292,13 @@ class LTXImage2VideoLoopDenoiser(ModularPipelineBlocks): def __init__( self, - guider_input_fields: dict[str, Any] = { - "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), - "encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), - }, + guider_input_fields: dict[str, Any] | None = None, ): + if guider_input_fields is None: + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + 
"encoder_attention_mask": ("prompt_attention_mask", "negative_prompt_attention_mask"), + } if not isinstance(guider_input_fields, dict): raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}") self._guider_input_fields = guider_input_fields diff --git a/src/diffusers/modular_pipelines/ltx/encoders.py b/src/diffusers/modular_pipelines/ltx/encoders.py index c1310fdbd6da..91e85e57009c 100644 --- a/src/diffusers/modular_pipelines/ltx/encoders.py +++ b/src/diffusers/modular_pipelines/ltx/encoders.py @@ -153,12 +153,7 @@ def encode_prompt( negative_prompt = negative_prompt or "" negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif batch_size != len(negative_prompt): + if batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" @@ -183,7 +178,9 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe block_state.device = components._execution_device # Skip encoding if pre-computed embeddings are provided - if getattr(block_state, "prompt_embeds", None) is not None: + has_prompt_embeds = getattr(block_state, "prompt_embeds", None) is not None + has_negative = getattr(block_state, "negative_prompt_embeds", None) is not None + if has_prompt_embeds and (has_negative or not components.requires_unconditional_embeds): self.set_block_state(state, block_state) return components, state diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index 290e75051ea0..3c7f85424926 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -29,6 +29,7 @@ logger = logging.get_logger(__name__) +# auto_docstring class LTXCoreDenoiseStep(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ @@ -48,6 +49,7 @@ def outputs(self): return [OutputParam.template("latents")] +# auto_docstring class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ @@ -67,6 +69,7 @@ def outputs(self): return [OutputParam.template("latents")] +# auto_docstring class LTXBlocks(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ @@ -85,6 +88,7 @@ def outputs(self): return [OutputParam.template("videos")] +# auto_docstring class LTXImage2VideoBlocks(SequentialPipelineBlocks): model_name = "ltx" block_classes = [ diff --git a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py index 9f4d1b45e93a..3cce6845396b 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/ltx/modular_pipeline.py @@ -62,13 +62,3 @@ def 
requires_unconditional_embeds(self): if hasattr(self, "guider") and self.guider is not None: return self.guider._enabled and self.guider.num_conditions > 1 return False - - -class LTXImage2VideoModularPipeline(LTXModularPipeline): - """ - A ModularPipeline for LTX Video image-to-video. - - > [!WARNING] > This is an experimental feature and is likely to change in the future. - """ - - default_blocks_name = "LTXImage2VideoBlocks" diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 07636e95191b..ace89f0d6f91 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -133,7 +133,6 @@ def _helios_pyramid_map_fn(config_dict=None): ("helios", _create_default_map_fn("HeliosModularPipeline")), ("helios-pyramid", _helios_pyramid_map_fn), ("ltx", _create_default_map_fn("LTXModularPipeline")), - ("ltx-i2v", _create_default_map_fn("LTXImage2VideoModularPipeline")), ] ) From 4b644f7cdef5b8120437eaf2d2db37b7cb005270 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Thu, 2 Apr 2026 17:15:33 -0700 Subject: [PATCH 08/11] Generate auto docstrings for LTX assembled blocks --- .../ltx/modular_blocks_ltx.py | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index 3c7f85424926..b4b9b1b4255b 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -31,6 +31,53 @@ # auto_docstring class LTXCoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block that takes encoded conditions and runs the denoising process. 
+ + Components: + transformer (`LTXVideoTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + guidance_scale (`float`, *optional*, defaults to 3.0): + TODO: Add description. + prompt_embeds (`Tensor`): + TODO: Add description. + prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 512): + TODO: Add description. + width (`int`, *optional*, defaults to 704): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 161): + TODO: Add description. + frame_rate (`int`, *optional*, defaults to 25): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "ltx" block_classes = [ LTXTextInputStep, @@ -51,6 +98,56 @@ def outputs(self): # auto_docstring class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block for image-to-video that takes encoded conditions and an image, and runs the denoising process. + + Components: + transformer (`LTXVideoTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + vae (`AutoencoderKLLTXVideo`) + guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. 
+ guidance_scale (`float`, *optional*, defaults to 3.0): + TODO: Add description. + prompt_embeds (`Tensor`): + TODO: Add description. + prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 512): + TODO: Add description. + width (`int`, *optional*, defaults to 704): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 161): + TODO: Add description. + frame_rate (`int`, *optional*, defaults to 25): + TODO: Add description. + image (`None`): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "ltx" block_classes = [ LTXTextInputStep, @@ -71,6 +168,69 @@ def outputs(self): # auto_docstring class LTXBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for LTX Video text-to-video. + + Components: + text_encoder (`T5EncoderModel`) + tokenizer (`T5TokenizerFast`) + guider (`ClassifierFreeGuidance`) + transformer (`LTXVideoTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + vae (`AutoencoderKLLTXVideo`) + video_processor (`VideoProcessor`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. 
+ negative_prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 128): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + guidance_scale (`float`, *optional*, defaults to 3.0): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 512): + TODO: Add description. + width (`int`, *optional*, defaults to 704): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 161): + TODO: Add description. + frame_rate (`int`, *optional*, defaults to 25): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + TODO: Add description. + decode_timestep (`None`, *optional*, defaults to 0.0): + TODO: Add description. + decode_noise_scale (`None`, *optional*): + TODO: Add description. + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "ltx" block_classes = [ LTXTextEncoderStep, @@ -90,6 +250,71 @@ def outputs(self): # auto_docstring class LTXImage2VideoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for LTX Video image-to-video. + + Components: + text_encoder (`T5EncoderModel`) + tokenizer (`T5TokenizerFast`) + guider (`ClassifierFreeGuidance`) + transformer (`LTXVideoTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + vae (`AutoencoderKLLTXVideo`) + video_processor (`VideoProcessor`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. 
+ prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_attention_mask (`Tensor`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 128): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + guidance_scale (`float`, *optional*, defaults to 3.0): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 512): + TODO: Add description. + width (`int`, *optional*, defaults to 704): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 161): + TODO: Add description. + frame_rate (`int`, *optional*, defaults to 25): + TODO: Add description. + image (`None`): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + TODO: Add description. + decode_timestep (`None`, *optional*, defaults to 0.0): + TODO: Add description. + decode_noise_scale (`None`, *optional*): + TODO: Add description. + + Outputs: + videos (`list`): + The generated videos. 
+ """ + model_name = "ltx" block_classes = [ LTXTextEncoderStep, From 3da70da038caec7948492e539ed08aa9b3f7713f Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Thu, 2 Apr 2026 17:20:58 -0700 Subject: [PATCH 09/11] Fix ruff lint and format issues --- .../modular_pipelines/ltx/before_denoise.py | 32 +++++++++++++------ .../modular_pipelines/ltx/denoise.py | 9 ++++-- .../modular_pipelines/ltx/encoders.py | 1 - 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 808a1fdd524f..25f80ccd473f 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -124,9 +124,7 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe # Repeat prompt_embeds for num_videos_per_prompt _, seq_len, _ = block_state.prompt_embeds.shape block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, num_videos, 1) - block_state.prompt_embeds = block_state.prompt_embeds.view( - block_state.batch_size * num_videos, seq_len, -1 - ) + block_state.prompt_embeds = block_state.prompt_embeds.view(block_state.batch_size * num_videos, seq_len, -1) if block_state.prompt_attention_mask is not None: block_state.prompt_attention_mask = block_state.prompt_attention_mask.repeat(num_videos, 1) @@ -139,7 +137,9 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe ) if block_state.negative_prompt_attention_mask is not None: - block_state.negative_prompt_attention_mask = block_state.negative_prompt_attention_mask.repeat(num_videos, 1) + block_state.negative_prompt_attention_mask = block_state.negative_prompt_attention_mask.repeat( + num_videos, 1 + ) self.set_block_state(state, block_state) return components, state @@ -276,7 +276,9 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe num_frames = (block_state.num_frames - 1) // 
components.vae_temporal_compression_ratio + 1 shape = (batch_size, num_channels_latents, num_frames, height, width) - block_state.latents = randn_tensor(shape, generator=block_state.generator, device=device, dtype=torch.float32) + block_state.latents = randn_tensor( + shape, generator=block_state.generator, device=device, dtype=torch.float32 + ) block_state.latents = LTXPipeline._pack_latents( block_state.latents, components.transformer_spatial_patch_size, @@ -311,6 +313,7 @@ def description(self) -> str: @property def expected_components(self) -> list[ComponentSpec]: from ...models import AutoencoderKLLTXVideo + return [ ComponentSpec("vae", AutoencoderKLLTXVideo), ] @@ -342,13 +345,11 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe device = components._execution_device batch_size = block_state.batch_size * block_state.num_videos_per_prompt - num_channels_latents = components.transformer.config.in_channels height = block_state.height // components.vae_spatial_compression_ratio width = block_state.width // components.vae_spatial_compression_ratio num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 - shape = (batch_size, num_channels_latents, num_frames, height, width) mask_shape = (batch_size, 1, num_frames, height, width) if block_state.latents is not None: @@ -367,6 +368,7 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe image = block_state.image if not isinstance(image, torch.Tensor): from ...video_processor import VideoProcessor + processor = VideoProcessor(vae_scale_factor=components.vae_spatial_compression_ratio) image = processor.preprocess(image, height=block_state.height, width=block_state.width) image = image.to(device=device, dtype=torch.float32) @@ -376,12 +378,16 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe num_images = image.shape[0] if isinstance(block_state.generator, list): init_latents = [ - 
retrieve_latents(components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i]) + retrieve_latents( + components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i] + ) for i in range(num_images) ] else: init_latents = [ - retrieve_latents(components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator) + retrieve_latents( + components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator + ) for img in image ] @@ -393,7 +399,13 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe ) init_latents = init_latents.repeat(1, 1, num_frames, 1, 1) - actual_mask_shape = (init_latents.shape[0], 1, init_latents.shape[2], init_latents.shape[3], init_latents.shape[4]) + actual_mask_shape = ( + init_latents.shape[0], + 1, + init_latents.shape[2], + init_latents.shape[3], + init_latents.shape[4], + ) conditioning_mask = torch.zeros(actual_mask_shape, device=device, dtype=torch.float32) conditioning_mask[:, :, 0] = 1.0 diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index bd25b6400aa6..eb6dbeeeee82 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -308,6 +308,7 @@ def __init__( def expected_components(self) -> list[ComponentSpec]: from ...configuration_utils import FrozenDict from ...guiders import ClassifierFreeGuidance + return [ ComponentSpec( "guider", @@ -420,13 +421,17 @@ def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: i noise_pred = LTXPipeline._unpack_latents( block_state.noise_pred, - latent_num_frames, latent_height, latent_width, + latent_num_frames, + latent_height, + latent_width, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ) latents = LTXPipeline._unpack_latents( block_state.latents, - latent_num_frames, latent_height, 
latent_width, + latent_num_frames, + latent_height, + latent_width, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ) diff --git a/src/diffusers/modular_pipelines/ltx/encoders.py b/src/diffusers/modular_pipelines/ltx/encoders.py index 91e85e57009c..9f15d33b0e18 100644 --- a/src/diffusers/modular_pipelines/ltx/encoders.py +++ b/src/diffusers/modular_pipelines/ltx/encoders.py @@ -103,7 +103,6 @@ def _get_t5_prompt_embeds( dtype: torch.dtype, ): prompt = [prompt] if isinstance(prompt, str) else prompt - batch_size = len(prompt) text_inputs = components.tokenizer( prompt, From 38cfc86d4708ac7a6f2f7cef497c749affa45dac Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Thu, 2 Apr 2026 19:53:36 -0700 Subject: [PATCH 10/11] use InputParam/OutputParam templates and ruff check --- src/diffusers/modular_pipelines/__init__.py | 2 +- .../modular_pipelines/ltx/before_denoise.py | 54 ++--- .../modular_pipelines/ltx/decoders.py | 24 +- .../modular_pipelines/ltx/denoise.py | 32 +-- .../modular_pipelines/ltx/encoders.py | 42 +--- .../ltx/modular_blocks_ltx.py | 208 +++++++++--------- 6 files changed, 167 insertions(+), 195 deletions(-) diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index f34ff1cedcbb..c76861df96d4 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -124,6 +124,7 @@ HeliosPyramidDistilledModularPipeline, HeliosPyramidModularPipeline, ) + from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXModularPipeline from .modular_pipeline import ( AutoPipelineBlocks, BlockState, @@ -146,7 +147,6 @@ QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline - from .ltx import LTXBlocks, LTXImage2VideoBlocks, LTXModularPipeline from .wan import ( Wan22Blocks, Wan22Image2VideoBlocks, diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py 
b/src/diffusers/modular_pipelines/ltx/before_denoise.py index 25f80ccd473f..cca52af9248e 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -93,12 +93,12 @@ def expected_components(self) -> list[ComponentSpec]: @property def inputs(self) -> list[InputParam]: return [ - InputParam("num_videos_per_prompt", default=1), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), InputParam("guidance_scale", type_hint=float, default=3.0), - InputParam("prompt_embeds", required=True, type_hint=torch.Tensor), - InputParam("prompt_attention_mask", type_hint=torch.Tensor), - InputParam("negative_prompt_embeds", type_hint=torch.Tensor), - InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor), + InputParam.template("prompt_embeds", required=True), + InputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask", name="negative_prompt_attention_mask"), ] @property @@ -161,11 +161,11 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - InputParam("num_inference_steps", default=50), - InputParam("timesteps"), - InputParam("sigmas"), - InputParam("height", type_hint=int, default=512), - InputParam("width", type_hint=int, default=704), + InputParam.template("num_inference_steps"), + InputParam.template("timesteps"), + InputParam.template("sigmas"), + InputParam.template("height", default=512), + InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), InputParam("frame_rate", type_hint=int, default=25), ] @@ -244,20 +244,20 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - InputParam("height", type_hint=int, default=512), - InputParam("width", type_hint=int, default=704), + InputParam.template("height", default=512), + 
InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), - InputParam("latents", type_hint=torch.Tensor | None), - InputParam("num_videos_per_prompt", type_hint=int, default=1), - InputParam("generator"), - InputParam("batch_size", required=True, type_hint=int), - InputParam("dtype", type_hint=torch.dtype), + InputParam.template("latents"), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size", required=True), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor), + OutputParam.template("latents"), ] @torch.no_grad() @@ -321,21 +321,21 @@ def expected_components(self) -> list[ComponentSpec]: @property def inputs(self) -> list[InputParam]: return [ - InputParam("image", required=True), - InputParam("height", type_hint=int, default=512), - InputParam("width", type_hint=int, default=704), + InputParam.template("image"), + InputParam.template("height", default=512), + InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), - InputParam("latents", type_hint=torch.Tensor | None), - InputParam("num_videos_per_prompt", type_hint=int, default=1), - InputParam("generator"), - InputParam("batch_size", required=True, type_hint=int), - InputParam("dtype", type_hint=torch.dtype), + InputParam.template("latents"), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size", required=True), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor), + OutputParam.template("latents"), OutputParam("conditioning_mask", type_hint=torch.Tensor), ] diff --git a/src/diffusers/modular_pipelines/ltx/decoders.py 
b/src/diffusers/modular_pipelines/ltx/decoders.py index d7c85171c091..6259338e0147 100644 --- a/src/diffusers/modular_pipelines/ltx/decoders.py +++ b/src/diffusers/modular_pipelines/ltx/decoders.py @@ -14,8 +14,6 @@ from typing import Any -import numpy as np -import PIL import torch from ...configuration_utils import FrozenDict @@ -53,27 +51,21 @@ def description(self) -> str: @property def inputs(self) -> list[tuple[str, Any]]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("output_type", default="np", type_hint=str), - InputParam("height", type_hint=int, default=512), - InputParam("width", type_hint=int, default=704), + InputParam.template("latents", required=True), + InputParam.template("output_type", default="np"), + InputParam.template("height", default=512), + InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), InputParam("decode_timestep", default=0.0), InputParam("decode_noise_scale", default=None), - InputParam("generator"), - InputParam("batch_size", type_hint=int, default=1), - InputParam("dtype", required=True, type_hint=torch.dtype), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype", required=True), ] @property def intermediate_outputs(self) -> list[OutputParam]: - return [ - OutputParam( - "videos", - type_hint=list[list[PIL.Image.Image]] | list[torch.Tensor] | list[np.ndarray], - description="The generated videos", - ) - ] + return [OutputParam.template("videos")] @torch.no_grad() def __call__(self, components, state: PipelineState) -> PipelineState: diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index eb6dbeeeee82..3e7b7dca7a46 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -48,8 +48,8 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - 
InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("dtype", required=True, type_hint=torch.dtype), + InputParam.template("latents", required=True), + InputParam.template("dtype", required=True), ] @torch.no_grad() @@ -98,11 +98,11 @@ def description(self) -> str: @property def inputs(self) -> list[tuple[str, Any]]: inputs = [ - InputParam("attention_kwargs"), - InputParam("num_inference_steps", required=True, type_hint=int), + InputParam.template("attention_kwargs"), + InputParam.template("num_inference_steps", required=True), InputParam("rope_interpolation_scale", type_hint=tuple), - InputParam("height", type_hint=int), - InputParam("width", type_hint=int), + InputParam.template("height"), + InputParam.template("width"), InputParam("num_frames", type_hint=int), ] guider_input_names = [] @@ -210,8 +210,8 @@ def loop_expected_components(self) -> list[ComponentSpec]: @property def loop_inputs(self) -> list[InputParam]: return [ - InputParam("timesteps", required=True, type_hint=torch.Tensor), - InputParam("num_inference_steps", required=True, type_hint=int), + InputParam.template("timesteps", required=True), + InputParam.template("num_inference_steps", required=True), ] @torch.no_grad() @@ -273,9 +273,9 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), + InputParam.template("latents", required=True), InputParam("conditioning_mask", required=True, type_hint=torch.Tensor), - InputParam("dtype", required=True, type_hint=torch.dtype), + InputParam.template("dtype", required=True), ] @torch.no_grad() @@ -329,11 +329,11 @@ def description(self) -> str: @property def inputs(self) -> list[tuple[str, Any]]: inputs = [ - InputParam("attention_kwargs"), - InputParam("num_inference_steps", required=True, type_hint=int), + InputParam.template("attention_kwargs"), + InputParam.template("num_inference_steps", required=True), 
InputParam("rope_interpolation_scale", type_hint=tuple), - InputParam("height", type_hint=int), - InputParam("width", type_hint=int), + InputParam.template("height"), + InputParam.template("width"), InputParam("num_frames", type_hint=int), ] guider_input_names = [] @@ -406,8 +406,8 @@ def description(self) -> str: @property def inputs(self) -> list[InputParam]: return [ - InputParam("height", type_hint=int), - InputParam("width", type_hint=int), + InputParam.template("height"), + InputParam.template("width"), InputParam("num_frames", type_hint=int), ] diff --git a/src/diffusers/modular_pipelines/ltx/encoders.py b/src/diffusers/modular_pipelines/ltx/encoders.py index 9f15d33b0e18..1f8d44bb24f3 100644 --- a/src/diffusers/modular_pipelines/ltx/encoders.py +++ b/src/diffusers/modular_pipelines/ltx/encoders.py @@ -49,42 +49,22 @@ def expected_components(self) -> list[ComponentSpec]: @property def inputs(self) -> list[InputParam]: return [ - InputParam("prompt"), - InputParam("negative_prompt"), - InputParam("prompt_embeds", type_hint=torch.Tensor), - InputParam("prompt_attention_mask", type_hint=torch.Tensor), - InputParam("negative_prompt_embeds", type_hint=torch.Tensor), - InputParam("negative_prompt_attention_mask", type_hint=torch.Tensor), - InputParam("max_sequence_length", default=128), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), + InputParam.template("prompt_embeds"), + InputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask", name="negative_prompt_attention_mask"), + InputParam.template("max_sequence_length", default=128), ] @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam( - "prompt_embeds", - type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="text embeddings used to guide the video generation", - ), - OutputParam( - "prompt_attention_mask", - 
type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="attention mask for text embeddings", - ), - OutputParam( - "negative_prompt_embeds", - type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="negative text embeddings", - ), - OutputParam( - "negative_prompt_attention_mask", - type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", - description="attention mask for negative text embeddings", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask", name="negative_prompt_attention_mask"), ] @staticmethod diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index b4b9b1b4255b..8f8dc58e1145 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -40,38 +40,38 @@ class LTXCoreDenoiseStep(SequentialPipelineBlocks): guider (`ClassifierFreeGuidance`) Inputs: - num_videos_per_prompt (`None`, *optional*, defaults to 1): - TODO: Add description. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 3.0): TODO: Add description. prompt_embeds (`Tensor`): - TODO: Add description. - prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_attention_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): - TODO: Add description. + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. 
negative_prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. - num_inference_steps (`None`, *optional*, defaults to 50): - TODO: Add description. - timesteps (`None`, *optional*): - TODO: Add description. - sigmas (`None`, *optional*): - TODO: Add description. + mask for the negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. height (`int`, *optional*, defaults to 512): - TODO: Add description. + The height in pixels of the generated image. width (`int`, *optional*, defaults to 704): - TODO: Add description. + The width in pixels of the generated image. num_frames (`int`, *optional*, defaults to 161): TODO: Add description. frame_rate (`int`, *optional*, defaults to 25): TODO: Add description. - latents (`Tensor | NoneType`, *optional*): - TODO: Add description. - generator (`None`, *optional*): - TODO: Add description. - attention_kwargs (`None`, *optional*): - TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. Outputs: latents (`Tensor`): @@ -108,40 +108,40 @@ class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): guider (`ClassifierFreeGuidance`) Inputs: - num_videos_per_prompt (`None`, *optional*, defaults to 1): - TODO: Add description. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 3.0): TODO: Add description. prompt_embeds (`Tensor`): - TODO: Add description. - prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. 
+ text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_attention_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): - TODO: Add description. + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. negative_prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. - num_inference_steps (`None`, *optional*, defaults to 50): - TODO: Add description. - timesteps (`None`, *optional*): - TODO: Add description. - sigmas (`None`, *optional*): - TODO: Add description. + mask for the negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. height (`int`, *optional*, defaults to 512): - TODO: Add description. + The height in pixels of the generated image. width (`int`, *optional*, defaults to 704): - TODO: Add description. + The width in pixels of the generated image. num_frames (`int`, *optional*, defaults to 161): TODO: Add description. frame_rate (`int`, *optional*, defaults to 25): TODO: Add description. - image (`None`): - TODO: Add description. - latents (`Tensor | NoneType`, *optional*): - TODO: Add description. - generator (`None`, *optional*): - TODO: Add description. - attention_kwargs (`None`, *optional*): - TODO: Add description. + image (`Image | list`): + Reference image(s) for denoising. Can be a single image or list of images. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. 
Outputs: latents (`Tensor`): @@ -181,46 +181,46 @@ class LTXBlocks(SequentialPipelineBlocks): video_processor (`VideoProcessor`) Inputs: - prompt (`None`, *optional*): - TODO: Add description. - negative_prompt (`None`, *optional*): - TODO: Add description. - prompt_embeds (`Tensor`, *optional*): - TODO: Add description. - prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_attention_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): - TODO: Add description. + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. negative_prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. - max_sequence_length (`None`, *optional*, defaults to 128): - TODO: Add description. - num_videos_per_prompt (`None`, *optional*, defaults to 1): - TODO: Add description. + mask for the negative text embeddings. Can be generated from text_encoder step. + max_sequence_length (`int`, *optional*, defaults to 128): + Maximum sequence length for prompt encoding. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 3.0): TODO: Add description. - num_inference_steps (`None`, *optional*, defaults to 50): - TODO: Add description. - timesteps (`None`, *optional*): - TODO: Add description. - sigmas (`None`, *optional*): - TODO: Add description. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. 
+ sigmas (`list`, *optional*): + Custom sigmas for the denoising process. height (`int`, *optional*, defaults to 512): - TODO: Add description. + The height in pixels of the generated image. width (`int`, *optional*, defaults to 704): - TODO: Add description. + The width in pixels of the generated image. num_frames (`int`, *optional*, defaults to 161): TODO: Add description. frame_rate (`int`, *optional*, defaults to 25): TODO: Add description. - latents (`Tensor | NoneType`, *optional*): - TODO: Add description. - generator (`None`, *optional*): - TODO: Add description. - attention_kwargs (`None`, *optional*): - TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. output_type (`str`, *optional*, defaults to np): - TODO: Add description. + Output format: 'pil', 'np', 'pt'. decode_timestep (`None`, *optional*, defaults to 0.0): TODO: Add description. decode_noise_scale (`None`, *optional*): @@ -263,48 +263,48 @@ class LTXImage2VideoBlocks(SequentialPipelineBlocks): video_processor (`VideoProcessor`) Inputs: - prompt (`None`, *optional*): - TODO: Add description. - negative_prompt (`None`, *optional*): - TODO: Add description. - prompt_embeds (`Tensor`, *optional*): - TODO: Add description. - prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_attention_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. negative_prompt_embeds (`Tensor`, *optional*): - TODO: Add description. 
+ negative text embeddings used to guide the image generation. Can be generated from text_encoder step. negative_prompt_attention_mask (`Tensor`, *optional*): - TODO: Add description. - max_sequence_length (`None`, *optional*, defaults to 128): - TODO: Add description. - num_videos_per_prompt (`None`, *optional*, defaults to 1): - TODO: Add description. + mask for the negative text embeddings. Can be generated from text_encoder step. + max_sequence_length (`int`, *optional*, defaults to 128): + Maximum sequence length for prompt encoding. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. guidance_scale (`float`, *optional*, defaults to 3.0): TODO: Add description. - num_inference_steps (`None`, *optional*, defaults to 50): - TODO: Add description. - timesteps (`None`, *optional*): - TODO: Add description. - sigmas (`None`, *optional*): - TODO: Add description. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. height (`int`, *optional*, defaults to 512): - TODO: Add description. + The height in pixels of the generated image. width (`int`, *optional*, defaults to 704): - TODO: Add description. + The width in pixels of the generated image. num_frames (`int`, *optional*, defaults to 161): TODO: Add description. frame_rate (`int`, *optional*, defaults to 25): TODO: Add description. - image (`None`): - TODO: Add description. - latents (`Tensor | NoneType`, *optional*): - TODO: Add description. - generator (`None`, *optional*): - TODO: Add description. - attention_kwargs (`None`, *optional*): - TODO: Add description. + image (`Image | list`): + Reference image(s) for denoising. Can be a single image or list of images. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. 
+ generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. output_type (`str`, *optional*, defaults to np): - TODO: Add description. + Output format: 'pil', 'np', 'pt'. decode_timestep (`None`, *optional*, defaults to 0.0): TODO: Add description. decode_noise_scale (`None`, *optional*): From 69c10cf7ddb3cdf95b424d0b22d3a9a7a9b9c313 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Thu, 2 Apr 2026 22:44:12 -0700 Subject: [PATCH 11/11] address all review --- .../modular_pipelines/ltx/before_denoise.py | 115 +++++------- .../modular_pipelines/ltx/decoders.py | 37 +++- .../modular_pipelines/ltx/denoise.py | 48 ++++- .../modular_pipelines/ltx/encoders.py | 173 ++++++++++++++---- .../ltx/modular_blocks_ltx.py | 64 ++----- 5 files changed, 266 insertions(+), 171 deletions(-) diff --git a/src/diffusers/modular_pipelines/ltx/before_denoise.py b/src/diffusers/modular_pipelines/ltx/before_denoise.py index cca52af9248e..47344b55ea0d 100644 --- a/src/diffusers/modular_pipelines/ltx/before_denoise.py +++ b/src/diffusers/modular_pipelines/ltx/before_denoise.py @@ -18,7 +18,6 @@ import torch from ...models import LTXVideoTransformer3DModel -from ...pipelines.ltx.pipeline_ltx import LTXPipeline from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import logging from ...utils.torch_utils import randn_tensor @@ -73,6 +72,43 @@ def retrieve_timesteps( return timesteps, num_inference_steps +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._pack_latents +def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor: + # Unpacked latents of shape are [B, C, F, H, W] are patched into tokens of shape + # [B, C, F // p_t, p_t, H // p, p, W // p, p]. 
+ # The patch dimensions are then permuted and collapsed into the channel dimension of shape: + # [B, F // p_t * H // p * W // p, C * p_t * p * p] (an ndim=3 tensor). + # dim=0 is the batch size, dim=1 is the effective video sequence length, + # dim=2 is the effective number of input features + batch_size, num_channels, num_frames, height, width = latents.shape + post_patch_num_frames = num_frames // patch_size_t + post_patch_height = height // patch_size + post_patch_width = width // patch_size + latents = latents.reshape( + batch_size, + -1, + post_patch_num_frames, + patch_size_t, + post_patch_height, + patch_size, + post_patch_width, + patch_size, + ) + latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3) + return latents + + +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents +def _normalize_latents( + latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 +) -> torch.Tensor: + # Normalize latents across the channel dimension [B, C, F, H, W] + latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents = (latents - latents_mean) * scaling_factor / latents_std + return latents + + class LTXTextInputStep(ModularPipelineBlocks): model_name = "ltx" @@ -94,7 +130,6 @@ def expected_components(self) -> list[ComponentSpec]: def inputs(self) -> list[InputParam]: return [ InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), - InputParam("guidance_scale", type_hint=float, default=3.0), InputParam.template("prompt_embeds", required=True), InputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), InputParam.template("negative_prompt_embeds"), @@ -112,11 +147,6 @@ def intermediate_outputs(self) -> list[OutputParam]: def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: block_state 
= self.get_block_state(state) - # Set guidance_scale on guider so CFG is configured correctly - guidance_scale = getattr(block_state, "guidance_scale", 3.0) - if hasattr(components, "guider") and components.guider is not None: - components.guider.guidance_scale = guidance_scale - block_state.batch_size = block_state.prompt_embeds.shape[0] block_state.dtype = block_state.prompt_embeds.dtype num_videos = block_state.num_videos_per_prompt @@ -257,7 +287,7 @@ def inputs(self) -> list[InputParam]: @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam.template("latents"), + OutputParam("latents", type_hint=torch.Tensor), ] @torch.no_grad() @@ -279,7 +309,7 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe block_state.latents = randn_tensor( shape, generator=block_state.generator, device=device, dtype=torch.float32 ) - block_state.latents = LTXPipeline._pack_latents( + block_state.latents = _pack_latents( block_state.latents, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, @@ -289,39 +319,19 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe return components, state -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents -def retrieve_latents( - encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" -): - if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": - return encoder_output.latent_dist.sample(generator) - elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": - return encoder_output.latent_dist.mode() - elif hasattr(encoder_output, "latents"): - return encoder_output.latents - else: - raise AttributeError("Could not access latents of provided encoder_output") - - class LTXImage2VideoPrepareLatentsStep(ModularPipelineBlocks): model_name = "ltx" @property def description(self) -> str: - return "Prepare 
latents step for image-to-video: encodes the first frame and creates a conditioning mask" - - @property - def expected_components(self) -> list[ComponentSpec]: - from ...models import AutoencoderKLLTXVideo - - return [ - ComponentSpec("vae", AutoencoderKLLTXVideo), - ] + return ( + "Prepare latents step for image-to-video: takes pre-encoded image latents and creates a conditioning mask" + ) @property def inputs(self) -> list[InputParam]: return [ - InputParam.template("image"), + InputParam("image_latents", type_hint=torch.Tensor, required=True), InputParam.template("height", default=512), InputParam.template("width", default=704), InputParam("num_frames", type_hint=int, default=161), @@ -335,7 +345,7 @@ def inputs(self) -> list[InputParam]: @property def intermediate_outputs(self) -> list[OutputParam]: return [ - OutputParam.template("latents"), + OutputParam("latents", type_hint=torch.Tensor), OutputParam("conditioning_mask", type_hint=torch.Tensor), ] @@ -355,7 +365,7 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe if block_state.latents is not None: conditioning_mask = block_state.latents.new_zeros(mask_shape) conditioning_mask[:, :, 0] = 1.0 - conditioning_mask = LTXPipeline._pack_latents( + conditioning_mask = _pack_latents( conditioning_mask, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, @@ -365,38 +375,9 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe self.set_block_state(state, block_state) return components, state - image = block_state.image - if not isinstance(image, torch.Tensor): - from ...video_processor import VideoProcessor - - processor = VideoProcessor(vae_scale_factor=components.vae_spatial_compression_ratio) - image = processor.preprocess(image, height=block_state.height, width=block_state.width) - image = image.to(device=device, dtype=torch.float32) - - vae_dtype = components.vae.dtype - - num_images = image.shape[0] - if 
isinstance(block_state.generator, list): - init_latents = [ - retrieve_latents( - components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator[i] - ) - for i in range(num_images) - ] - else: - init_latents = [ - retrieve_latents( - components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), block_state.generator - ) - for img in image - ] - - init_latents = torch.cat(init_latents, dim=0).to(torch.float32) + init_latents = block_state.image_latents.to(device=device, dtype=torch.float32) if init_latents.shape[0] < batch_size: init_latents = init_latents.repeat_interleave(batch_size // init_latents.shape[0], dim=0) - init_latents = LTXPipeline._normalize_latents( - init_latents, components.vae.latents_mean, components.vae.latents_std - ) init_latents = init_latents.repeat(1, 1, num_frames, 1, 1) actual_mask_shape = ( @@ -412,12 +393,12 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe noise = randn_tensor(init_latents.shape, generator=block_state.generator, device=device, dtype=torch.float32) latents = init_latents * conditioning_mask + noise * (1 - conditioning_mask) - conditioning_mask = LTXPipeline._pack_latents( + conditioning_mask = _pack_latents( conditioning_mask, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ).squeeze(-1) - latents = LTXPipeline._pack_latents( + latents = _pack_latents( latents, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, diff --git a/src/diffusers/modular_pipelines/ltx/decoders.py b/src/diffusers/modular_pipelines/ltx/decoders.py index 6259338e0147..7524d6f7f67d 100644 --- a/src/diffusers/modular_pipelines/ltx/decoders.py +++ b/src/diffusers/modular_pipelines/ltx/decoders.py @@ -18,7 +18,6 @@ from ...configuration_utils import FrozenDict from ...models import AutoencoderKLLTXVideo -from ...pipelines.ltx.pipeline_ltx import LTXPipeline from ...utils import logging from 
...utils.torch_utils import randn_tensor from ...video_processor import VideoProcessor @@ -29,6 +28,31 @@ logger = logging.get_logger(__name__) +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._unpack_latents +def _unpack_latents( + latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1 +) -> torch.Tensor: + # Packed latents of shape [B, S, D] (S is the effective video sequence length, + # D is the effective feature dimensions) are unpacked and reshaped into a video tensor + # of shape [B, C, F, H, W]. This is the inverse operation of what happens in the + # `_pack_latents` method. + batch_size = latents.size(0) + latents = latents.reshape(batch_size, num_frames, height, width, -1, patch_size_t, patch_size, patch_size) + latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3) + return latents + + +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._denormalize_latents +def _denormalize_latents( + latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 +) -> torch.Tensor: + # Denormalize latents across the channel dimension [B, C, F, H, W] + latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents = latents * latents_std / scaling_factor + latents_mean + return latents + + class LTXVaeDecoderStep(ModularPipelineBlocks): model_name = "ltx" @@ -74,11 +98,6 @@ def __call__(self, components, state: PipelineState) -> PipelineState: latents = block_state.latents - if block_state.output_type == "latent": - block_state.videos = latents - self.set_block_state(state, block_state) - return components, state - height = block_state.height width = block_state.width num_frames = block_state.num_frames @@ -87,7 +106,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: 
latent_height = height // components.vae_spatial_compression_ratio latent_width = width // components.vae_spatial_compression_ratio - latents = LTXPipeline._unpack_latents( + latents = _unpack_latents( latents, latent_num_frames, latent_height, @@ -95,9 +114,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ) - latents = LTXPipeline._denormalize_latents( - latents, vae.latents_mean, vae.latents_std, vae.config.scaling_factor - ) + latents = _denormalize_latents(latents, vae.latents_mean, vae.latents_std, vae.config.scaling_factor) latents = latents.to(block_state.dtype) if not vae.config.timestep_conditioning: diff --git a/src/diffusers/modular_pipelines/ltx/denoise.py b/src/diffusers/modular_pipelines/ltx/denoise.py index 3e7b7dca7a46..e8f72ec4a477 100644 --- a/src/diffusers/modular_pipelines/ltx/denoise.py +++ b/src/diffusers/modular_pipelines/ltx/denoise.py @@ -34,6 +34,46 @@ logger = logging.get_logger(__name__) +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._pack_latents +def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor: + # Unpacked latents of shape are [B, C, F, H, W] are patched into tokens of shape + # [B, C, F // p_t, p_t, H // p, p, W // p, p]. + # The patch dimensions are then permuted and collapsed into the channel dimension of shape: + # [B, F // p_t * H // p * W // p, C * p_t * p * p] (an ndim=3 tensor). 
+ # dim=0 is the batch size, dim=1 is the effective video sequence length, + # dim=2 is the effective number of input features + batch_size, num_channels, num_frames, height, width = latents.shape + post_patch_num_frames = num_frames // patch_size_t + post_patch_height = height // patch_size + post_patch_width = width // patch_size + latents = latents.reshape( + batch_size, + -1, + post_patch_num_frames, + patch_size_t, + post_patch_height, + patch_size, + post_patch_width, + patch_size, + ) + latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3) + return latents + + +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._unpack_latents +def _unpack_latents( + latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1 +) -> torch.Tensor: + # Packed latents of shape [B, S, D] (S is the effective video sequence length, + # D is the effective feature dimensions) are unpacked and reshaped into a video tensor + # of shape [B, C, F, H, W]. This is the inverse operation of what happens in the + # `_pack_latents` method. 
+ batch_size = latents.size(0) + latents = latents.reshape(batch_size, num_frames, height, width, -1, patch_size_t, patch_size, patch_size) + latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3) + return latents + + class LTXLoopBeforeDenoiser(ModularPipelineBlocks): model_name = "ltx" @@ -413,13 +453,11 @@ def inputs(self) -> list[InputParam]: @torch.no_grad() def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): - from ...pipelines.ltx.pipeline_ltx import LTXPipeline - latent_num_frames = (block_state.num_frames - 1) // components.vae_temporal_compression_ratio + 1 latent_height = block_state.height // components.vae_spatial_compression_ratio latent_width = block_state.width // components.vae_spatial_compression_ratio - noise_pred = LTXPipeline._unpack_latents( + noise_pred = _unpack_latents( block_state.noise_pred, latent_num_frames, latent_height, @@ -427,7 +465,7 @@ def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: i components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, ) - latents = LTXPipeline._unpack_latents( + latents = _unpack_latents( block_state.latents, latent_num_frames, latent_height, @@ -441,7 +479,7 @@ def __call__(self, components: LTXModularPipeline, block_state: BlockState, i: i pred_latents = components.scheduler.step(noise_pred, t, noise_latents, return_dict=False)[0] latents = torch.cat([latents[:, :, :1], pred_latents], dim=2) - block_state.latents = LTXPipeline._pack_latents( + block_state.latents = _pack_latents( latents, components.transformer_spatial_patch_size, components.transformer_temporal_patch_size, diff --git a/src/diffusers/modular_pipelines/ltx/encoders.py b/src/diffusers/modular_pipelines/ltx/encoders.py index 1f8d44bb24f3..ec76a86cf2f1 100644 --- a/src/diffusers/modular_pipelines/ltx/encoders.py +++ b/src/diffusers/modular_pipelines/ltx/encoders.py @@ -17,7 +17,9 @@ from 
...configuration_utils import FrozenDict from ...guiders import ClassifierFreeGuidance +from ...models import AutoencoderKLLTXVideo from ...utils import logging +from ...video_processor import VideoProcessor from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam from .modular_pipeline import LTXModularPipeline @@ -26,6 +28,33 @@ logger = logging.get_logger(__name__) +def _get_t5_prompt_embeds( + components, + prompt: str | list[str], + max_sequence_length: int, + device: torch.device, + dtype: torch.dtype, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + + text_inputs = components.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + prompt_attention_mask = text_inputs.attention_mask + prompt_attention_mask = prompt_attention_mask.bool().to(device) + + prompt_embeds = components.text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + return prompt_embeds, prompt_attention_mask + + class LTXTextEncoderStep(ModularPipelineBlocks): model_name = "ltx" @@ -51,10 +80,6 @@ def inputs(self) -> list[InputParam]: return [ InputParam.template("prompt"), InputParam.template("negative_prompt"), - InputParam.template("prompt_embeds"), - InputParam.template("prompt_embeds_mask", name="prompt_attention_mask"), - InputParam.template("negative_prompt_embeds"), - InputParam.template("negative_prompt_embeds_mask", name="negative_prompt_attention_mask"), InputParam.template("max_sequence_length", default=128), ] @@ -74,33 +99,6 @@ def check_inputs(block_state): ): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}") - @staticmethod - def _get_t5_prompt_embeds( - components, - prompt: str | list[str], - max_sequence_length: int, - device: 
torch.device, - dtype: torch.dtype, - ): - prompt = [prompt] if isinstance(prompt, str) else prompt - - text_inputs = components.tokenizer( - prompt, - padding="max_length", - max_length=max_sequence_length, - truncation=True, - add_special_tokens=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - prompt_attention_mask = text_inputs.attention_mask - prompt_attention_mask = prompt_attention_mask.bool().to(device) - - prompt_embeds = components.text_encoder(text_input_ids.to(device))[0] - prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) - - return prompt_embeds, prompt_attention_mask - @staticmethod def encode_prompt( components, @@ -117,7 +115,7 @@ def encode_prompt( prompt = [prompt] batch_size = len(prompt) - prompt_embeds, prompt_attention_mask = LTXTextEncoderStep._get_t5_prompt_embeds( + prompt_embeds, prompt_attention_mask = _get_t5_prompt_embeds( components=components, prompt=prompt, max_sequence_length=max_sequence_length, @@ -139,7 +137,7 @@ def encode_prompt( " the batch size of `prompt`." 
) - negative_prompt_embeds, negative_prompt_attention_mask = LTXTextEncoderStep._get_t5_prompt_embeds( + negative_prompt_embeds, negative_prompt_attention_mask = _get_t5_prompt_embeds( components=components, prompt=negative_prompt, max_sequence_length=max_sequence_length, @@ -156,13 +154,6 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe block_state.device = components._execution_device - # Skip encoding if pre-computed embeddings are provided - has_prompt_embeds = getattr(block_state, "prompt_embeds", None) is not None - has_negative = getattr(block_state, "negative_prompt_embeds", None) is not None - if has_prompt_embeds and (has_negative or not components.requires_unconditional_embeds): - self.set_block_state(state, block_state) - return components, state - ( block_state.prompt_embeds, block_state.prompt_attention_mask, @@ -179,3 +170,105 @@ def __call__(self, components: LTXModularPipeline, state: PipelineState) -> Pipe self.set_block_state(state, block_state) return components, state + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.ltx.pipeline_ltx.LTXPipeline._normalize_latents +def _normalize_latents( + latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 +) -> torch.Tensor: + # Normalize latents across the channel dimension [B, C, F, H, W] + 
latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents = (latents - latents_mean) * scaling_factor / latents_std + return latents + + +class LTXVaeEncoderStep(ModularPipelineBlocks): + model_name = "ltx" + + @property + def description(self) -> str: + return "VAE Encoder step that encodes an input image into latent space for image-to-video generation" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLLTXVideo), + ComponentSpec( + "video_processor", + VideoProcessor, + config=FrozenDict({"vae_scale_factor": 32}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image", required=True), + InputParam.template("height", default=512), + InputParam.template("width", default=704), + InputParam.template("generator"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "image_latents", + type_hint=torch.Tensor, + description="Encoded image latents from the VAE encoder", + ), + ] + + @torch.no_grad() + def __call__(self, components: LTXModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + image = block_state.image + if not isinstance(image, torch.Tensor): + image = components.video_processor.preprocess(image, height=block_state.height, width=block_state.width) + image = image.to(device=device, dtype=torch.float32) + + vae_dtype = components.vae.dtype + + num_images = image.shape[0] + if isinstance(block_state.generator, list): + init_latents = [ + retrieve_latents( + components.vae.encode(image[i].unsqueeze(0).unsqueeze(2).to(vae_dtype)), + block_state.generator[i], + ) + for i in range(num_images) + ] + else: + init_latents = [ + retrieve_latents( + 
components.vae.encode(img.unsqueeze(0).unsqueeze(2).to(vae_dtype)), + block_state.generator, + ) + for img in image + ] + + init_latents = torch.cat(init_latents, dim=0).to(torch.float32) + block_state.image_latents = _normalize_latents( + init_latents, components.vae.latents_mean, components.vae.latents_std + ) + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py index 8f8dc58e1145..76c69e3f0fdb 100644 --- a/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py +++ b/src/diffusers/modular_pipelines/ltx/modular_blocks_ltx.py @@ -23,7 +23,7 @@ ) from .decoders import LTXVaeDecoderStep from .denoise import LTXDenoiseStep, LTXImage2VideoDenoiseStep -from .encoders import LTXTextEncoderStep +from .encoders import LTXTextEncoderStep, LTXVaeEncoderStep logger = logging.get_logger(__name__) @@ -35,15 +35,12 @@ class LTXCoreDenoiseStep(SequentialPipelineBlocks): Denoise block that takes encoded conditions and runs the denoising process. Components: - transformer (`LTXVideoTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) + transformer (`LTXVideoTransformer3DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) Inputs: num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 3.0): - TODO: Add description. prompt_embeds (`Tensor`): text embeddings used to guide the image generation. Can be generated from text_encoder step. prompt_attention_mask (`Tensor`): @@ -102,16 +99,12 @@ class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): Denoise block for image-to-video that takes encoded conditions and an image, and runs the denoising process. 
Components: - transformer (`LTXVideoTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - vae (`AutoencoderKLLTXVideo`) - guider (`ClassifierFreeGuidance`) + transformer (`LTXVideoTransformer3DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) vae + (`AutoencoderKLLTXVideo`) video_processor (`VideoProcessor`) guider (`ClassifierFreeGuidance`) Inputs: num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 3.0): - TODO: Add description. prompt_embeds (`Tensor`): text embeddings used to guide the image generation. Can be generated from text_encoder step. prompt_attention_mask (`Tensor`): @@ -136,10 +129,10 @@ class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): TODO: Add description. image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. - latents (`Tensor`, *optional*): - Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. @@ -152,10 +145,11 @@ class LTXImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): block_classes = [ LTXTextInputStep, LTXSetTimestepsStep, + LTXVaeEncoderStep, LTXImage2VideoPrepareLatentsStep, LTXImage2VideoDenoiseStep, ] - block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] + block_names = ["input", "set_timesteps", "vae_encoder", "prepare_latents", "denoise"] @property def description(self): @@ -172,12 +166,8 @@ class LTXBlocks(SequentialPipelineBlocks): Modular pipeline blocks for LTX Video text-to-video. 
Components: - text_encoder (`T5EncoderModel`) - tokenizer (`T5TokenizerFast`) - guider (`ClassifierFreeGuidance`) - transformer (`LTXVideoTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - vae (`AutoencoderKLLTXVideo`) + text_encoder (`T5EncoderModel`) tokenizer (`T5TokenizerFast`) guider (`ClassifierFreeGuidance`) transformer + (`LTXVideoTransformer3DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLLTXVideo`) video_processor (`VideoProcessor`) Inputs: @@ -185,20 +175,10 @@ class LTXBlocks(SequentialPipelineBlocks): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - prompt_embeds (`Tensor`): - text embeddings used to guide the image generation. Can be generated from text_encoder step. - prompt_attention_mask (`Tensor`): - mask for the text embeddings. Can be generated from text_encoder step. - negative_prompt_embeds (`Tensor`, *optional*): - negative text embeddings used to guide the image generation. Can be generated from text_encoder step. - negative_prompt_attention_mask (`Tensor`, *optional*): - mask for the negative text embeddings. Can be generated from text_encoder step. max_sequence_length (`int`, *optional*, defaults to 128): Maximum sequence length for prompt encoding. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 3.0): - TODO: Add description. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. timesteps (`Tensor`, *optional*): @@ -254,12 +234,8 @@ class LTXImage2VideoBlocks(SequentialPipelineBlocks): Modular pipeline blocks for LTX Video image-to-video. 
Components: - text_encoder (`T5EncoderModel`) - tokenizer (`T5TokenizerFast`) - guider (`ClassifierFreeGuidance`) - transformer (`LTXVideoTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - vae (`AutoencoderKLLTXVideo`) + text_encoder (`T5EncoderModel`) tokenizer (`T5TokenizerFast`) guider (`ClassifierFreeGuidance`) transformer + (`LTXVideoTransformer3DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLLTXVideo`) video_processor (`VideoProcessor`) Inputs: @@ -267,20 +243,10 @@ class LTXImage2VideoBlocks(SequentialPipelineBlocks): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - prompt_embeds (`Tensor`): - text embeddings used to guide the image generation. Can be generated from text_encoder step. - prompt_attention_mask (`Tensor`): - mask for the text embeddings. Can be generated from text_encoder step. - negative_prompt_embeds (`Tensor`, *optional*): - negative text embeddings used to guide the image generation. Can be generated from text_encoder step. - negative_prompt_attention_mask (`Tensor`, *optional*): - mask for the negative text embeddings. Can be generated from text_encoder step. max_sequence_length (`int`, *optional*, defaults to 128): Maximum sequence length for prompt encoding. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - guidance_scale (`float`, *optional*, defaults to 3.0): - TODO: Add description. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. timesteps (`Tensor`, *optional*): @@ -297,10 +263,10 @@ class LTXImage2VideoBlocks(SequentialPipelineBlocks): TODO: Add description. image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. - latents (`Tensor`, *optional*): - Pre-generated noisy latents for image generation. 
generator (`Generator`, *optional*): Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. output_type (`str`, *optional*, defaults to np):