huggingface · RuixiangMa · Apr 2, 2026 · Apr 2, 2026 · Apr 6, 2026
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -488,6 +488,8 @@
         title: AudioLDM 2
       - local: api/pipelines/stable_audio
         title: Stable Audio
+      - local: api/pipelines/longcat_audio_dit
+        title: LongCat-AudioDiT
       title: Audio
     - sections:
       - local: api/pipelines/animatediff

diff --git a/docs/source/en/api/pipelines/longcat_audio_dit.md b/docs/source/en/api/pipelines/longcat_audio_dit.md
@@ -0,0 +1,61 @@
+<!--Copyright 2026 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# LongCat-AudioDiT
+
+LongCat-AudioDiT is a text-to-audio diffusion model from Meituan LongCat. The Diffusers integration exposes a standard [`DiffusionPipeline`] interface for text-conditioned audio generation.
+
+This pipeline supports loading the original flat LongCat checkpoint layout from either a local directory or a Hugging Face Hub repository containing:
+
+- `config.json`
+- `model.safetensors`
+
+The loader builds the text encoder, transformer, and VAE from `config.json`, restores component weights from `model.safetensors`, and ties the shared UMT5 embedding when needed.
+
+This pipeline was adapted from the [LongCat-AudioDiT reference implementation](https://github.com/meituan-longcat/LongCat-AudioDiT).
+
+## Usage
+
+```py
+import soundfile as sf
+import torch
+from diffusers import LongCatAudioDiTPipeline
+
+pipeline = LongCatAudioDiTPipeline.from_pretrained(
+    "meituan-longcat/LongCat-AudioDiT-1B",
+    torch_dtype=torch.float16,
+)
+pipeline = pipeline.to("cuda")
+
+audio = pipeline(
+    prompt="A calm ocean wave ambience with soft wind in the background.",
+    audio_end_in_s=5.0,
+    num_inference_steps=16,
+    guidance_scale=4.0,
+    output_type="pt",
+).audios
+
+output = audio[0, 0].float().cpu().numpy()
+sf.write("longcat.wav", output, pipeline.sample_rate)
+```
+
+## Tips
+
+- `audio_end_in_s` is the most direct way to control output duration.
+- `output_type="pt"` returns a PyTorch tensor shaped `(batch, channels, samples)`.
+
+## LongCatAudioDiTPipeline
+
+[[autodoc]] LongCatAudioDiTPipeline
+	- all
+	- __call__
+	- from_pretrained
diff --git a/docs/source/en/api/pipelines/overview.md b/docs/source/en/api/pipelines/overview.md
@@ -29,6 +29,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 |---|---|
 | [AnimateDiff](animatediff) | text2video |
 | [AudioLDM2](audioldm2) | text2audio |
+| [LongCat-AudioDiT](longcat_audio_dit) | text2audio |
 | [AuraFlow](aura_flow) | text2image |
 | [Bria 3.2](bria_3_2) | text2image |
 | [CogVideoX](cogvideox) | text2video |

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -212,6 +212,7 @@
             "AutoencoderKLTemporalDecoder",
             "AutoencoderKLWan",
             "AutoencoderOobleck",
+            "LongCatAudioDiTVae",
             "AutoencoderRAE",
             "AutoencoderTiny",
             "AutoencoderVidTok",
@@ -253,6 +254,7 @@
             "Kandinsky5Transformer3DModel",
             "LatteTransformer3DModel",
             "LongCatImageTransformer2DModel",
+            "LongCatAudioDiTTransformer",
             "LTX2VideoTransformer3DModel",
             "LTXVideoTransformer3DModel",
             "Lumina2Transformer2DModel",
@@ -594,6 +596,7 @@
             "LLaDA2PipelineOutput",
             "LongCatImageEditPipeline",
             "LongCatImagePipeline",
+            "LongCatAudioDiTPipeline",
             "LTX2ConditionPipeline",
             "LTX2ImageToVideoPipeline",
             "LTX2LatentUpsamplePipeline",
@@ -1007,6 +1010,7 @@
             AutoencoderKLTemporalDecoder,
             AutoencoderKLWan,
             AutoencoderOobleck,
+            LongCatAudioDiTVae,
             AutoencoderRAE,
             AutoencoderTiny,
             AutoencoderVidTok,
@@ -1048,6 +1052,7 @@
             Kandinsky5Transformer3DModel,
             LatteTransformer3DModel,
             LongCatImageTransformer2DModel,
+            LongCatAudioDiTTransformer,
             LTX2VideoTransformer3DModel,
             LTXVideoTransformer3DModel,
             Lumina2Transformer2DModel,
@@ -1365,6 +1370,7 @@
             LLaDA2PipelineOutput,
             LongCatImageEditPipeline,
             LongCatImagePipeline,
+            LongCatAudioDiTPipeline,
             LTX2ConditionPipeline,
             LTX2ImageToVideoPipeline,
             LTX2LatentUpsamplePipeline,

diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
@@ -51,6 +51,7 @@
     _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
     _import_structure["autoencoders.autoencoder_kl_wan"] = ["AutoencoderKLWan"]
     _import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"]
+    _import_structure["autoencoders.autoencoder_longcat_audio_dit"] = ["LongCatAudioDiTVae"]
     _import_structure["autoencoders.autoencoder_rae"] = ["AutoencoderRAE"]
     _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"]
     _import_structure["autoencoders.autoencoder_vidtok"] = ["AutoencoderVidTok"]
@@ -112,6 +113,7 @@
     _import_structure["transformers.transformer_hunyuanimage"] = ["HunyuanImageTransformer2DModel"]
     _import_structure["transformers.transformer_kandinsky"] = ["Kandinsky5Transformer3DModel"]
     _import_structure["transformers.transformer_longcat_image"] = ["LongCatImageTransformer2DModel"]
+    _import_structure["transformers.transformer_longcat_audio_dit"] = ["LongCatAudioDiTTransformer"]
     _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
     _import_structure["transformers.transformer_ltx2"] = ["LTX2VideoTransformer3DModel"]
     _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]

diff --git a/src/diffusers/models/autoencoders/__init__.py b/src/diffusers/models/autoencoders/__init__.py
@@ -20,6 +20,7 @@
 from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
 from .autoencoder_kl_wan import AutoencoderKLWan
 from .autoencoder_oobleck import AutoencoderOobleck
+from .autoencoder_longcat_audio_dit import LongCatAudioDiTVae
 from .autoencoder_rae import AutoencoderRAE
 from .autoencoder_tiny import AutoencoderTiny
 from .autoencoder_vidtok import AutoencoderVidTok