alex committed on
Commit
25889c7
·
1 Parent(s): 43067da

now with audio support

Browse files
app.py CHANGED
@@ -2,6 +2,80 @@ import sys
2
  from pathlib import Path
3
  import uuid
4
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  # Add packages to Python path
7
  current_dir = Path(__file__).parent
@@ -17,9 +91,11 @@ import random
17
  import torch
18
  from typing import Optional
19
  from pathlib import Path
 
20
  from huggingface_hub import hf_hub_download, snapshot_download
21
  from ltx_pipelines.distilled import DistilledPipeline
22
  from ltx_core.model.video_vae import TilingConfig
 
23
  from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
24
  from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
25
  from ltx_pipelines.utils.constants import (
@@ -31,6 +107,8 @@ from ltx_pipelines.utils.constants import (
31
  DEFAULT_LORA_STRENGTH,
32
  )
33
  from ltx_core.loader.single_gpu_model_builder import enable_only_lora
 
 
34
  from PIL import Image
35
 
36
  MAX_SEED = np.iinfo(np.int32).max
@@ -38,6 +116,11 @@ MAX_SEED = np.iinfo(np.int32).max
38
  # Install with: pip install git+https://github.com/Lightricks/LTX-2.git
39
  from ltx_pipelines.utils import ModelLedger
40
  from ltx_pipelines.utils.helpers import generate_enhanced_prompt
 
 
 
 
 
41
 
42
  # HuggingFace Hub defaults
43
  DEFAULT_REPO_ID = "Lightricks/LTX-2"
@@ -82,6 +165,8 @@ model_ledger = ModelLedger(
82
  local_files_only=False
83
  )
84
 
 
 
85
 
86
  # Load text encoder once and keep it in memory
87
  text_encoder = model_ledger.text_encoder()
@@ -90,6 +175,109 @@ print("=" * 80)
90
  print("Text encoder loaded and ready!")
91
  print("=" * 80)
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def encode_text_simple(text_encoder, prompt: str):
94
  """Simple text encoding without using pipeline_utils."""
95
  v_context, a_context, _ = text_encoder(prompt)
@@ -262,6 +450,7 @@ RUNTIME_LORA_CHOICES = [
262
  ("Slide Right", 5),
263
  ("Slide Down", 6),
264
  ("Slide Up", 7),
 
265
  ]
266
 
267
  # Initialize pipeline WITHOUT text encoder (gemma_root=None)
@@ -580,7 +769,7 @@ def generate_video_example(input_image, prompt, camera_lora, resolution, progres
580
 
581
  w, h = apply_resolution(resolution)
582
 
583
- output_video, seed = generate_video(
584
  input_image,
585
  prompt,
586
  10, # duration seconds
@@ -589,18 +778,18 @@ def generate_video_example(input_image, prompt, camera_lora, resolution, progres
589
  True, # randomize_seed
590
  h, # height
591
  w, # width
592
- camera_lora,
 
593
  progress
594
  )
595
 
596
  return output_video
597
-
598
-
599
  def generate_video_example_t2v(prompt, camera_lora, resolution, progress=gr.Progress(track_tqdm=True)):
600
 
601
  w, h = apply_resolution(resolution)
602
 
603
- output_video, seed = generate_video(
604
  None,
605
  prompt,
606
  15, # duration seconds
@@ -609,11 +798,32 @@ def generate_video_example_t2v(prompt, camera_lora, resolution, progress=gr.Prog
609
  True, # randomize_seed
610
  h, # height
611
  w, # width
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
  camera_lora,
 
613
  progress
614
  )
 
615
  return output_video
616
-
617
  def get_duration(
618
  input_image,
619
  prompt,
@@ -624,14 +834,20 @@ def get_duration(
624
  height,
625
  width,
626
  camera_lora,
 
627
  progress
628
  ):
 
 
 
 
 
629
  if duration <= 5:
630
- return 80
631
  elif duration <= 10:
632
- return 120
633
  else:
634
- return 180
635
 
636
  @spaces.GPU(duration=get_duration)
637
  def generate_video(
@@ -644,6 +860,7 @@ def generate_video(
644
  height: int = DEFAULT_1_STAGE_HEIGHT,
645
  width: int = DEFAULT_1_STAGE_WIDTH,
646
  camera_lora: str = "No LoRA",
 
647
  progress=gr.Progress(track_tqdm=True),
648
  ):
649
  """
@@ -705,10 +922,21 @@ def generate_video(
705
  audio_context = embeddings["audio_context"].to("cuda", non_blocking=True)
706
  print("✓ Embeddings loaded successfully")
707
 
 
708
  # free prompt enhancer / encoder temps ASAP
709
  del embeddings, final_prompt, status
710
  torch.cuda.empty_cache()
711
 
 
 
 
 
 
 
 
 
 
 
712
 
713
  # Map dropdown name -> adapter index
714
  name_to_idx = {name: idx for name, idx in RUNTIME_LORA_CHOICES}
@@ -717,6 +945,22 @@ def generate_video(
717
  enable_only_lora(pipeline._transformer, selected_idx)
718
  torch.cuda.empty_cache()
719
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
  # Run inference - progress automatically tracks tqdm from pipeline
721
  with torch.inference_mode():
722
  pipeline(
@@ -731,12 +975,14 @@ def generate_video(
731
  tiling_config=TilingConfig.default(),
732
  video_context=video_context,
733
  audio_context=audio_context,
 
 
734
  )
735
  del video_context, audio_context
736
  torch.cuda.empty_cache()
737
  print("successful generation")
738
 
739
- return str(output_path), current_seed
740
 
741
 
742
 
@@ -1160,12 +1406,13 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1160
  height=512
1161
  )
1162
 
1163
-
1164
  prompt_ui = PromptBox(
1165
  value="Make this image come alive with cinematic motion, smooth animation",
1166
  elem_id="prompt_ui",
1167
  )
1168
 
 
 
1169
  prompt = gr.Textbox(
1170
  label="Prompt",
1171
  value="Make this image come alive with cinematic motion, smooth animation",
@@ -1302,11 +1549,13 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1302
  height,
1303
  width,
1304
  camera_lora,
 
1305
  ],
1306
- outputs=[output_video,seed]
1307
  )
1308
 
1309
 
 
1310
  timestep_prompt = """Style: Realistic live-action, cinematic, shallow depth of field, 24 fps, natural and dramatic lighting
1311
 
1312
  Environment: Interior of a space station module or realistic mock-up, metal panels, blinking lights, Earth visible through a large window
@@ -1331,6 +1580,24 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1331
 
1332
  Music: subtle cinematic synth or ambient pad, futuristic and minimal, emphasizing awe and solitude"""
1333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1334
 
1335
  gr.Examples(
1336
  examples=[
@@ -1402,6 +1669,5 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1402
  )
1403
 
1404
 
1405
-
1406
  if __name__ == "__main__":
1407
  demo.launch(ssr_mode=False, mcp_server=True, css=css)
 
2
  from pathlib import Path
3
  import uuid
4
  import tempfile
5
+ import subprocess
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import torchaudio
9
+ import os
10
+ from typing import Any
11
+
12
+ def _coerce_audio_path(audio_path: Any) -> str:
13
+ # Common Gradio case: tuple where first item is the filepath
14
+ if isinstance(audio_path, tuple) and len(audio_path) > 0:
15
+ audio_path = audio_path[0]
16
+
17
+ # Some gradio versions pass a dict-like object
18
+ if isinstance(audio_path, dict):
19
+ # common keys: "name", "path"
20
+ audio_path = audio_path.get("name") or audio_path.get("path")
21
+
22
+ # pathlib.Path etc.
23
+ if not isinstance(audio_path, (str, bytes, os.PathLike)):
24
+ raise TypeError(f"audio_path must be a path-like, got {type(audio_path)}: {audio_path}")
25
+
26
+ return os.fspath(audio_path)
27
+
28
+
29
+
30
def match_audio_to_duration(
    audio_path: str,
    target_seconds: float,
    target_sr: int = 48000,
    to_mono: bool = True,
    pad_mode: str = "silence",  # "silence" | "repeat"
    device: str = "cuda",
):
    """Load audio, resample, optionally downmix to mono, and force an exact duration.

    Returns:
        (waveform, sample_rate) — waveform is [1, T] (or [C, T] when
        to_mono=False), moved to `device`.
    """
    resolved = _coerce_audio_path(audio_path)

    waveform, sample_rate = torchaudio.load(resolved)  # [C, T] float32 CPU

    # Resample so the duration arithmetic below is exact at target_sr.
    if sample_rate != target_sr:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)
        sample_rate = target_sr

    # Downmix to mono unless the caller's model wants stereo.
    if to_mono and waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # [1, T]

    # Trim or pad to exactly target_seconds worth of samples.
    wanted = int(round(target_seconds * sample_rate))
    have = waveform.shape[-1]

    if have > wanted:
        waveform = waveform[..., :wanted]
    elif have < wanted:
        if pad_mode == "repeat" and have > 0:
            # Tile the clip, then cut to the exact length.
            copies = (wanted + have - 1) // have
            waveform = waveform.repeat(1, copies)[..., :wanted]
        else:
            # Pad the tail with silence.
            waveform = F.pad(waveform, (0, wanted - have))

    return waveform.to(device, non_blocking=True), sample_rate
74
+
75
+
76
# Run a shell command string, raising CalledProcessError on nonzero exit.
# NOTE(review): shell=True executes through the shell — keep `cmd` hard-coded
# (as below) and never interpolate user-supplied text into it.
def sh(cmd): subprocess.check_call(cmd, shell=True)

# Install easy_dwpose at import time; --no-deps avoids disturbing the
# already-pinned torch/vision stack in the runtime image.
sh("pip install --no-deps easy_dwpose")
79
 
80
  # Add packages to Python path
81
  current_dir = Path(__file__).parent
 
91
  import torch
92
  from typing import Optional
93
  from pathlib import Path
94
+ import torchaudio
95
  from huggingface_hub import hf_hub_download, snapshot_download
96
  from ltx_pipelines.distilled import DistilledPipeline
97
  from ltx_core.model.video_vae import TilingConfig
98
+ from ltx_core.model.audio_vae.ops import AudioProcessor
99
  from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
100
  from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
101
  from ltx_pipelines.utils.constants import (
 
107
  DEFAULT_LORA_STRENGTH,
108
  )
109
  from ltx_core.loader.single_gpu_model_builder import enable_only_lora
110
+ from ltx_core.model.audio_vae import decode_audio
111
+ from ltx_core.model.audio_vae import encode_audio
112
  from PIL import Image
113
 
114
  MAX_SEED = np.iinfo(np.int32).max
 
116
  # Install with: pip install git+https://github.com/Lightricks/LTX-2.git
117
  from ltx_pipelines.utils import ModelLedger
118
  from ltx_pipelines.utils.helpers import generate_enhanced_prompt
119
+ import imageio
120
+ import cv2
121
+ from controlnet_aux import CannyDetector
122
+ from easy_dwpose import DWposeDetector
123
+
124
 
125
  # HuggingFace Hub defaults
126
  DEFAULT_REPO_ID = "Lightricks/LTX-2"
 
165
  local_files_only=False
166
  )
167
 
168
+ canny_processor = CannyDetector()
169
+
170
 
171
  # Load text encoder once and keep it in memory
172
  text_encoder = model_ledger.text_encoder()
 
175
  print("Text encoder loaded and ready!")
176
  print("=" * 80)
177
 
178
def on_lora_change(selected: str):
    """Sync the media inputs with the chosen LoRA.

    Control LoRAs ("Pose", "Canny", "Detailer") take a video input; every
    other choice takes an image input.

    Returns:
        (selected, image_input_update, video_input_update) for Gradio outputs.
    """
    needs_video = selected in {"Pose", "Canny", "Detailer"}
    # Clear both inputs on every switch so stale media never carries over to
    # the other mode; only the relevant widget is made visible.
    # (The original `None if needs_video else None` was always None.)
    return (
        selected,
        gr.update(visible=not needs_video, value=None),
        gr.update(visible=needs_video, value=None),
    )
185
+
186
+
187
def process_video_for_pose(frames, width: int, height: int):
    """Render a DWpose skeleton image for each input frame.

    Args:
        frames: iterable of (H, W, 3) uint8 numpy frames (imageio output).
        width: conditioning width the pose frames are resized to.
        height: conditioning height the pose frames are resized to.

    Returns:
        List of (height, width, 3) float32 arrays in [0..1].
    """
    # Bail out before paying the cost of loading the DWpose model.
    if not frames:
        return []

    pose_processor = DWposeDetector("cuda")

    pose_frames = []
    for frame in frames:
        # imageio frame -> PIL
        pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")

        # Do NOT pass width/height here — easy_dwpose handles drawing sizes internally.
        pose_img = pose_processor(pil)

        # Some easy_dwpose versions return a numpy array instead of PIL.
        if not isinstance(pose_img, Image.Image):
            pose_img = Image.fromarray(pose_img.astype(np.uint8))

        pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)

        pose_frames.append(np.array(pose_img).astype(np.float32) / 255.0)

    return pose_frames
213
+
214
+
215
def preprocess_video_to_pose_mp4(video_path: str, width: int, height: int, fps: float):
    """End-to-end: read video -> DWpose frames -> temp mp4 -> return its path."""
    raw_frames = load_video_frames(video_path)
    rendered = process_video_for_pose(raw_frames, width=width, height=height)
    out = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    out.close()
    return write_video_mp4(rendered, fps=fps, out_path=out.name)
221
+
222
+
223
def load_video_frames(video_path: str):
    """Return list of frames as numpy arrays (H,W,3) uint8."""
    with imageio.get_reader(video_path) as reader:
        return [frame for frame in reader]
230
+
231
+
232
def process_video_for_canny(frames, width: int, height: int,
                            low_threshold=50, high_threshold=200):
    """
    Convert RGB frames -> canny edge frames.
    Returns list of np arrays (H,W,3) in float [0..1] (like controlnet_aux output).
    """
    if not frames:
        return []

    # Resolutions are derived once from the first frame / target size.
    first = frames[0]
    detect_res = max(first.shape[0], first.shape[1])
    out_res = max(width, height)

    # controlnet_aux CannyDetector returns a float image in [0..1] for output_type="np".
    return [
        canny_processor(
            frame,
            low_threshold=low_threshold,
            high_threshold=high_threshold,
            detect_resolution=detect_res,
            image_resolution=out_res,
            output_type="np",
        )
        for frame in frames
    ]
258
+
259
+
260
def write_video_mp4(frames_float_01, fps: float, out_path: str):
    """Write frames in float [0..1] to mp4 as uint8; returns out_path."""
    # PyAV backend doesn't support `quality=...`, so only fps/macro_block_size are set.
    with imageio.get_writer(out_path, fps=fps, macro_block_size=1) as writer:
        for frame in frames_float_01:
            writer.append_data((frame * 255).astype(np.uint8))
    return out_path
269
+
270
+
271
+
272
def preprocess_video_to_canny_mp4(video_path: str, width: int, height: int, fps: float):
    """End-to-end: read video -> canny -> write temp mp4 -> return path."""
    edge_frames = process_video_for_canny(
        load_video_frames(video_path), width=width, height=height
    )
    out = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    out.close()
    return write_video_mp4(edge_frames, fps=fps, out_path=out.name)
+
280
+
281
  def encode_text_simple(text_encoder, prompt: str):
282
  """Simple text encoding without using pipeline_utils."""
283
  v_context, a_context, _ = text_encoder(prompt)
 
450
  ("Slide Right", 5),
451
  ("Slide Down", 6),
452
  ("Slide Up", 7),
453
+
454
  ]
455
 
456
  # Initialize pipeline WITHOUT text encoder (gemma_root=None)
 
769
 
770
  w, h = apply_resolution(resolution)
771
 
772
+ output_video = generate_video(
773
  input_image,
774
  prompt,
775
  10, # duration seconds
 
778
  True, # randomize_seed
779
  h, # height
780
  w, # width
781
+ camera_lora,
782
+ None,
783
  progress
784
  )
785
 
786
  return output_video
787
+
 
788
  def generate_video_example_t2v(prompt, camera_lora, resolution, progress=gr.Progress(track_tqdm=True)):
789
 
790
  w, h = apply_resolution(resolution)
791
 
792
+ output_video = generate_video(
793
  None,
794
  prompt,
795
  15, # duration seconds
 
798
  True, # randomize_seed
799
  h, # height
800
  w, # width
801
+ camera_lora,
802
+ None,
803
+ progress
804
+ )
805
+ return output_video
806
+
807
def generate_video_example_s2v(input_image, prompt, camera_lora, resolution, audio_path, progress=gr.Progress(track_tqdm=True)):
    """Cached-example wrapper: image + audio -> 10 s video via generate_video."""
    width, height = apply_resolution(resolution)

    return generate_video(
        input_image,
        prompt,
        10,    # duration seconds
        True,  # enhance_prompt
        42,    # seed
        True,  # randomize_seed
        height,
        width,
        camera_lora,
        audio_path,
        progress,
    )
826
+
827
def get_duration(
    input_image,
    prompt,
    duration,
    enhance_prompt,
    seed,
    randomize_seed,
    height,
    width,
    camera_lora,
    audio_path,
    progress
):
    """Estimate the GPU-seconds budget for the @spaces.GPU decorator.

    Mirrors generate_video's signature — spaces forwards the same arguments.
    Tiers by requested clip duration (80/120/180 s) with a flat +10 s when an
    audio track must be encoded or when running text-to-video (no input image),
    since both add extra work before/after denoising.
    """
    extra_time = 10 if (audio_path is not None or input_image is None) else 0

    if duration <= 5:
        return 80 + extra_time
    if duration <= 10:
        return 120 + extra_time
    return 180 + extra_time
851
 
852
  @spaces.GPU(duration=get_duration)
853
  def generate_video(
 
860
  height: int = DEFAULT_1_STAGE_HEIGHT,
861
  width: int = DEFAULT_1_STAGE_WIDTH,
862
  camera_lora: str = "No LoRA",
863
+ audio_path = None,
864
  progress=gr.Progress(track_tqdm=True),
865
  ):
866
  """
 
922
  audio_context = embeddings["audio_context"].to("cuda", non_blocking=True)
923
  print("✓ Embeddings loaded successfully")
924
 
925
+
926
  # free prompt enhancer / encoder temps ASAP
927
  del embeddings, final_prompt, status
928
  torch.cuda.empty_cache()
929
 
930
+ # ✅ if user provided audio, use a neutral audio_context
931
+ n_audio_context = None
932
+
933
+ if audio_path is not None:
934
+ with torch.inference_mode():
935
+ _, n_audio_context = encode_text_simple(text_encoder, "") # returns tensors on GPU already
936
+ del audio_context
937
+ audio_context = n_audio_context
938
+
939
+ torch.cuda.empty_cache()
940
 
941
  # Map dropdown name -> adapter index
942
  name_to_idx = {name: idx for name, idx in RUNTIME_LORA_CHOICES}
 
945
  enable_only_lora(pipeline._transformer, selected_idx)
946
  torch.cuda.empty_cache()
947
 
948
+ # True video duration in seconds based on your rounding
949
+ video_seconds = (num_frames - 1) / frame_rate
950
+
951
+ if audio_path is not None:
952
+ input_waveform, input_waveform_sample_rate = match_audio_to_duration(
953
+ audio_path=audio_path,
954
+ target_seconds=video_seconds,
955
+ target_sr=48000, # pick what your model expects; 48k is common for AV models
956
+ to_mono=True, # set False if your model wants stereo
957
+ pad_mode="silence", # or "repeat" if you prefer looping over silence
958
+ device="cuda",
959
+ )
960
+ else:
961
+ input_waveform = None
962
+ input_waveform_sample_rate = None
963
+
964
  # Run inference - progress automatically tracks tqdm from pipeline
965
  with torch.inference_mode():
966
  pipeline(
 
975
  tiling_config=TilingConfig.default(),
976
  video_context=video_context,
977
  audio_context=audio_context,
978
+ input_waveform=input_waveform,
979
+ input_waveform_sample_rate=input_waveform_sample_rate,
980
  )
981
  del video_context, audio_context
982
  torch.cuda.empty_cache()
983
  print("successful generation")
984
 
985
+ return str(output_path)
986
 
987
 
988
 
 
1406
  height=512
1407
  )
1408
 
 
1409
  prompt_ui = PromptBox(
1410
  value="Make this image come alive with cinematic motion, smooth animation",
1411
  elem_id="prompt_ui",
1412
  )
1413
 
1414
+ audio_input = gr.Audio(label="Audio (Optional)", type="filepath")
1415
+
1416
  prompt = gr.Textbox(
1417
  label="Prompt",
1418
  value="Make this image come alive with cinematic motion, smooth animation",
 
1549
  height,
1550
  width,
1551
  camera_lora,
1552
+ audio_input
1553
  ],
1554
+ outputs=[output_video]
1555
  )
1556
 
1557
 
1558
+
1559
  timestep_prompt = """Style: Realistic live-action, cinematic, shallow depth of field, 24 fps, natural and dramatic lighting
1560
 
1561
  Environment: Interior of a space station module or realistic mock-up, metal panels, blinking lights, Earth visible through a large window
 
1580
 
1581
  Music: subtle cinematic synth or ambient pad, futuristic and minimal, emphasizing awe and solitude"""
1582
 
1583
+ gr.Examples(
1584
+ examples=[
1585
+
1586
+ [
1587
+ "supergirl-2.png",
1588
+ "A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit sleeping in bed and just waking up, she gradually gets up, rubbing her eyes and looking at her dog that just popped on the bed. the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
1589
+ "Static",
1590
+ "16:9",
1591
+ "supergirl.m4a"
1592
+ ],
1593
+
1594
+ ],
1595
+ fn=generate_video_example_s2v,
1596
+ inputs=[input_image, prompt_ui, camera_lora_ui, radioanimated_resolution, audio_input],
1597
+ outputs = [output_video],
1598
+ label="S2V Example",
1599
+ cache_examples=True,
1600
+ )
1601
 
1602
  gr.Examples(
1603
  examples=[
 
1669
  )
1670
 
1671
 
 
1672
  if __name__ == "__main__":
1673
  demo.launch(ssr_mode=False, mcp_server=True, css=css)
packages/ltx-core/src/ltx_core/model/audio_vae/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
  """Audio VAE model components."""
2
 
3
- from ltx_core.model.audio_vae.audio_vae import AudioDecoder, AudioEncoder, decode_audio
4
  from ltx_core.model.audio_vae.model_configurator import (
5
  AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
6
  AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
@@ -24,4 +24,5 @@ __all__ = [
24
  "Vocoder",
25
  "VocoderConfigurator",
26
  "decode_audio",
 
27
  ]
 
1
  """Audio VAE model components."""
2
 
3
+ from ltx_core.model.audio_vae.audio_vae import AudioDecoder, AudioEncoder, decode_audio, encode_audio
4
  from ltx_core.model.audio_vae.model_configurator import (
5
  AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
6
  AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
 
24
  "Vocoder",
25
  "VocoderConfigurator",
26
  "decode_audio",
27
+ "encode_audio",
28
  ]
packages/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py CHANGED
@@ -8,7 +8,7 @@ from ltx_core.model.audio_vae.attention import AttentionType, make_attn
8
  from ltx_core.model.audio_vae.causal_conv_2d import make_conv2d
9
  from ltx_core.model.audio_vae.causality_axis import CausalityAxis
10
  from ltx_core.model.audio_vae.downsample import build_downsampling_path
11
- from ltx_core.model.audio_vae.ops import PerChannelStatistics
12
  from ltx_core.model.audio_vae.resnet import ResnetBlock
13
  from ltx_core.model.audio_vae.upsample import build_upsampling_path
14
  from ltx_core.model.audio_vae.vocoder import Vocoder
@@ -464,6 +464,57 @@ class AudioDecoder(torch.nn.Module):
464
  h = self.conv_out(h)
465
  return torch.tanh(h) if self.tanh_out else h
466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
 
468
  def decode_audio(latent: torch.Tensor, audio_decoder: "AudioDecoder", vocoder: "Vocoder") -> torch.Tensor:
469
  """
 
8
  from ltx_core.model.audio_vae.causal_conv_2d import make_conv2d
9
  from ltx_core.model.audio_vae.causality_axis import CausalityAxis
10
  from ltx_core.model.audio_vae.downsample import build_downsampling_path
11
+ from ltx_core.model.audio_vae.ops import PerChannelStatistics, AudioProcessor
12
  from ltx_core.model.audio_vae.resnet import ResnetBlock
13
  from ltx_core.model.audio_vae.upsample import build_upsampling_path
14
  from ltx_core.model.audio_vae.vocoder import Vocoder
 
464
  h = self.conv_out(h)
465
  return torch.tanh(h) if self.tanh_out else h
466
 
467
@torch.no_grad()
def encode_audio(
    waveform: torch.Tensor,
    waveform_sample_rate: int,
    *,
    audio_encoder: "AudioEncoder",
    audio_processor: "AudioProcessor",
    return_mean_only: bool = False,
) -> torch.Tensor:
    """
    Encode a waveform into an audio latent representation.

    Args:
        waveform: Audio waveform tensor.
            Expected shapes:
            - (T,) -> treated as (1,1,T)
            - (B,T) -> treated as (B,1,T)
            - (B,C,T) -> used as-is
        waveform_sample_rate: Sample rate of the provided waveform.
        audio_encoder: AudioEncoder that consumes (B, C, frames, mel_bins).
        audio_processor: AudioProcessor from ops.py that produces log-mel features.
        return_mean_only: If True and encoder outputs double_z, return only the mean half.

    Returns:
        Latent tensor from AudioEncoder.
        If return_mean_only=True and double_z=True: returns (B, z_channels, frames, mel_bins).
        Otherwise returns the raw encoder output (often (B, 2*z_channels, frames, mel_bins)).

    Raises:
        ValueError: If the waveform is not 1-, 2-, or 3-dimensional.
    """
    # --- normalize waveform shape to (B, C, T) ---
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0).unsqueeze(0)
    elif waveform.dim() == 2:
        waveform = waveform.unsqueeze(1)
    elif waveform.dim() != 3:
        raise ValueError(f"Unexpected waveform shape: {tuple(waveform.shape)}")

    waveform = waveform.float()

    # --- waveform -> log-mel spectrogram (B, C, frames, mel_bins) ---
    mel = audio_processor.waveform_to_mel(waveform, waveform_sample_rate)

    # --- mel -> latent ---
    latent = audio_encoder(mel)

    # double_z encoders emit [mean | logvar] stacked along channels; keep the mean half.
    if return_mean_only and getattr(audio_encoder, "double_z", False):
        latent = torch.chunk(latent, 2, dim=1)[0]

    return latent
517
+
518
 
519
  def decode_audio(latent: torch.Tensor, audio_decoder: "AudioDecoder", vocoder: "Vocoder") -> torch.Tensor:
520
  """
packages/ltx-pipelines/src/ltx_pipelines/distilled.py CHANGED
@@ -8,7 +8,7 @@ from ltx_core.components.diffusion_steps import EulerDiffusionStep
8
  from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
9
  from ltx_core.components.noisers import GaussianNoiser
10
  from ltx_core.components.protocols import DiffusionStepProtocol
11
- from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex
12
  from ltx_core.loader import LoraPathStrengthAndSDOps
13
  from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
14
  from ltx_core.model.upsampler import upsample_video
@@ -16,6 +16,7 @@ from ltx_core.model.video_vae import TilingConfig, VideoEncoder, get_video_chunk
16
  from ltx_core.model.video_vae import decode_video as vae_decode_video
17
  from ltx_core.text_encoders.gemma import encode_text
18
  from ltx_core.types import LatentState, VideoPixelShape
 
19
  from ltx_pipelines import utils
20
  from ltx_pipelines.utils import ModelLedger
21
  from ltx_pipelines.utils.args import default_2_stage_distilled_arg_parser
@@ -38,6 +39,42 @@ from ltx_pipelines.utils.helpers import (
38
  from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
39
  from ltx_pipelines.utils.types import PipelineComponents
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  device = get_device()
42
 
43
 
@@ -74,6 +111,151 @@ class DistilledPipeline:
74
  # Cached models to avoid reloading
75
  self._video_encoder = None
76
  self._transformer = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  @torch.inference_mode()
79
  def __call__(
@@ -92,12 +274,27 @@ class DistilledPipeline:
92
  tiling_config: TilingConfig | None = None,
93
  video_context: torch.Tensor | None = None,
94
  audio_context: torch.Tensor | None = None,
 
 
 
95
  ) -> None:
96
  generator = torch.Generator(device=self.device).manual_seed(seed)
97
  noiser = GaussianNoiser(generator=generator)
98
  stepper = EulerDiffusionStep()
99
  dtype = torch.bfloat16
100
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  # Use pre-computed embeddings if provided, otherwise encode text
102
  if video_context is None or audio_context is None:
103
  text_encoder = self.model_ledger.text_encoder()
@@ -153,6 +350,7 @@ class DistilledPipeline:
153
  video_state, audio_state = denoise_audio_video(
154
  output_shape=stage_1_output_shape,
155
  conditionings=stage_1_conditionings,
 
156
  noiser=noiser,
157
  sigmas=stage_1_sigmas,
158
  stepper=stepper,
@@ -197,6 +395,7 @@ class DistilledPipeline:
197
  video_state, audio_state = denoise_audio_video(
198
  output_shape=stage_2_output_shape,
199
  conditionings=stage_2_conditionings,
 
200
  noiser=noiser,
201
  sigmas=stage_2_sigmas,
202
  stepper=stepper,
@@ -227,8 +426,6 @@ class DistilledPipeline:
227
  )
228
 
229
 
230
-
231
-
232
  def _create_conditionings(
233
  self,
234
  images: list[tuple[str, int, float]],
@@ -275,4 +472,4 @@ class DistilledPipeline:
275
  )
276
  )
277
 
278
- return conditionings
 
8
  from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
9
  from ltx_core.components.noisers import GaussianNoiser
10
  from ltx_core.components.protocols import DiffusionStepProtocol
11
+ from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex, ConditioningError
12
  from ltx_core.loader import LoraPathStrengthAndSDOps
13
  from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
14
  from ltx_core.model.upsampler import upsample_video
 
16
  from ltx_core.model.video_vae import decode_video as vae_decode_video
17
  from ltx_core.text_encoders.gemma import encode_text
18
  from ltx_core.types import LatentState, VideoPixelShape
19
+ from ltx_core.tools import LatentTools
20
  from ltx_pipelines import utils
21
  from ltx_pipelines.utils import ModelLedger
22
  from ltx_pipelines.utils.args import default_2_stage_distilled_arg_parser
 
39
  from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
40
  from ltx_pipelines.utils.types import PipelineComponents
41
 
42
+ import torchaudio
43
+ from ltx_core.model.audio_vae import AudioProcessor
44
+ from ltx_core.types import AudioLatentShape, VideoPixelShape
45
+
46
class AudioConditionByLatent(ConditioningItem):
    """Inject a pre-computed audio latent sequence as conditioning.

    The supplied latents overwrite the matching tokens of the latent state
    (both noisy and clean copies), and the denoise mask for those tokens is
    set to ``1 - strength`` so the diffusion process preserves them in
    proportion to ``strength``.
    """

    def __init__(self, latent: torch.Tensor, strength: float):
        # latent: audio latent shaped (batch, channels, frames, mel_bins)
        # strength: conditioning strength; 1.0 keeps the latent fully fixed
        self.latent = latent
        self.strength = strength

    def apply_to(self, latent_state: LatentState, latent_tools: LatentTools) -> LatentState:
        """Return a copy of ``latent_state`` with the audio latents applied."""
        if not isinstance(latent_tools.target_shape, AudioLatentShape):
            raise ConditioningError("Audio conditioning requires an audio latent target shape.")

        # Unpacking enforces 4-D tensors on both sides before comparing.
        src_b, src_c, src_f, src_m = self.latent.shape
        tgt_batch, tgt_channels, tgt_frames, tgt_bins = latent_tools.target_shape.to_torch_shape()
        if (src_b, src_c, src_f, src_m) != (tgt_batch, tgt_channels, tgt_frames, tgt_bins):
            raise ConditioningError(
                f"Can't apply audio conditioning item to latent with shape {latent_tools.target_shape}, expected "
                f"shape is ({tgt_batch}, {tgt_channels}, {tgt_frames}, {tgt_bins})."
            )

        tokens = latent_tools.patchifier.patchify(self.latent)
        token_count = tokens.shape[1]

        conditioned = latent_state.clone()
        conditioned.latent[:, :token_count] = tokens
        conditioned.clean_latent[:, :token_count] = tokens
        conditioned.denoise_mask[:, :token_count] = 1.0 - self.strength
        return conditioned
77
+
78
  device = get_device()
79
 
80
 
 
111
  # Cached models to avoid reloading
112
  self._video_encoder = None
113
  self._transformer = None
114
+
115
+ def _build_audio_conditionings_from_waveform(
116
+ self,
117
+ input_waveform: torch.Tensor,
118
+ input_sample_rate: int,
119
+ num_frames: int,
120
+ fps: float,
121
+ strength: float,
122
+ ) -> list[AudioConditionByLatent] | None:
123
+ strength = float(strength)
124
+ if strength <= 0.0:
125
+ return None
126
+
127
+ # Expect waveform as:
128
+ # - (T,) or (C,T) or (B,C,T). Convert to (B,C,T)
129
+ waveform = input_waveform
130
+ if waveform.ndim == 1:
131
+ waveform = waveform.unsqueeze(0).unsqueeze(0)
132
+ elif waveform.ndim == 2:
133
+ waveform = waveform.unsqueeze(0)
134
+ elif waveform.ndim != 3:
135
+ raise ValueError(f"input_waveform must be 1D/2D/3D, got shape {tuple(waveform.shape)}")
136
+
137
+ # Get audio encoder + its config
138
+ audio_encoder = self.model_ledger.audio_encoder() # assumes ledger exposes it
139
+ # If you want to cache it like video_encoder/transformer, you can.
140
+ target_sr = int(getattr(audio_encoder, "sample_rate"))
141
+ target_channels = int(getattr(audio_encoder, "in_channels", waveform.shape[1]))
142
+ mel_bins = int(getattr(audio_encoder, "mel_bins"))
143
+ mel_hop = int(getattr(audio_encoder, "mel_hop_length"))
144
+ n_fft = int(getattr(audio_encoder, "n_fft"))
145
+
146
+ # Match channels
147
+ if waveform.shape[1] != target_channels:
148
+ if waveform.shape[1] == 1 and target_channels > 1:
149
+ waveform = waveform.repeat(1, target_channels, 1)
150
+ elif target_channels == 1:
151
+ waveform = waveform.mean(dim=1, keepdim=True)
152
+ else:
153
+ waveform = waveform[:, :target_channels, :]
154
+ if waveform.shape[1] < target_channels:
155
+ pad_ch = target_channels - waveform.shape[1]
156
+ pad = torch.zeros((waveform.shape[0], pad_ch, waveform.shape[2]), dtype=waveform.dtype)
157
+ waveform = torch.cat([waveform, pad], dim=1)
158
+
159
+ # Resample if needed (CPU float32 is safest for torchaudio)
160
+ waveform = waveform.to(device="cpu", dtype=torch.float32)
161
+ if int(input_sample_rate) != target_sr:
162
+ waveform = torchaudio.functional.resample(waveform, int(input_sample_rate), target_sr)
163
+
164
+ # Waveform -> Mel
165
+ audio_processor = AudioProcessor(
166
+ sample_rate=target_sr,
167
+ mel_bins=mel_bins,
168
+ mel_hop_length=mel_hop,
169
+ n_fft=n_fft,
170
+ ).to(waveform.device)
171
+
172
+ mel = audio_processor.waveform_to_mel(waveform, target_sr)
173
+
174
+ # Mel -> latent (run encoder on its own device/dtype)
175
+ audio_params = next(audio_encoder.parameters(), None)
176
+ enc_device = audio_params.device if audio_params is not None else self.device
177
+ enc_dtype = audio_params.dtype if audio_params is not None else self.dtype
178
+
179
+ mel = mel.to(device=enc_device, dtype=enc_dtype)
180
+ with torch.inference_mode():
181
+ audio_latent = audio_encoder(mel)
182
+
183
+ # Pad/trim latent to match the target video duration
184
+ audio_downsample = getattr(getattr(audio_encoder, "patchifier", None), "audio_latent_downsample_factor", 4)
185
+ target_shape = AudioLatentShape.from_video_pixel_shape(
186
+ VideoPixelShape(batch=audio_latent.shape[0], frames=int(num_frames), width=1, height=1, fps=float(fps)),
187
+ channels=audio_latent.shape[1],
188
+ mel_bins=audio_latent.shape[3],
189
+ sample_rate=target_sr,
190
+ hop_length=mel_hop,
191
+ audio_latent_downsample_factor=audio_downsample,
192
+ )
193
+ target_frames = int(target_shape.frames)
194
+
195
+ if audio_latent.shape[2] < target_frames:
196
+ pad_frames = target_frames - audio_latent.shape[2]
197
+ pad = torch.zeros(
198
+ (audio_latent.shape[0], audio_latent.shape[1], pad_frames, audio_latent.shape[3]),
199
+ device=audio_latent.device,
200
+ dtype=audio_latent.dtype,
201
+ )
202
+ audio_latent = torch.cat([audio_latent, pad], dim=2)
203
+ elif audio_latent.shape[2] > target_frames:
204
+ audio_latent = audio_latent[:, :, :target_frames, :]
205
+
206
+ # Move latent to pipeline device/dtype for conditioning object
207
+ audio_latent = audio_latent.to(device=self.device, dtype=self.dtype)
208
+
209
+ return [AudioConditionByLatent(audio_latent, strength)]
210
+
211
+ def _prepare_output_waveform(
212
+ self,
213
+ input_waveform: torch.Tensor,
214
+ input_sample_rate: int,
215
+ target_sample_rate: int,
216
+ num_frames: int,
217
+ fps: float,
218
+ ) -> torch.Tensor:
219
+ """
220
+ Returns waveform on CPU, float32, resampled to target_sample_rate and
221
+ trimmed/padded to match video duration.
222
+ Output shape: (T,) for mono or (C, T) for multi-channel.
223
+ """
224
+ wav = input_waveform
225
+
226
+ # Accept (T,), (C,T), (B,C,T)
227
+ if wav.ndim == 3:
228
+ wav = wav[0]
229
+ elif wav.ndim == 2:
230
+ pass
231
+ elif wav.ndim == 1:
232
+ wav = wav.unsqueeze(0)
233
+ else:
234
+ raise ValueError(f"input_waveform must be 1D/2D/3D, got {tuple(wav.shape)}")
235
+
236
+ # Now wav is (C, T)
237
+ wav = wav.detach().to("cpu", dtype=torch.float32)
238
+
239
+ # Resample if needed
240
+ if int(input_sample_rate) != int(target_sample_rate):
241
+ wav = torchaudio.functional.resample(wav, int(input_sample_rate), int(target_sample_rate))
242
+
243
+ # Match video duration
244
+ duration_sec = float(num_frames) / float(fps)
245
+ target_len = int(round(duration_sec * float(target_sample_rate)))
246
+
247
+ cur_len = int(wav.shape[-1])
248
+ if cur_len > target_len:
249
+ wav = wav[..., :target_len]
250
+ elif cur_len < target_len:
251
+ pad = target_len - cur_len
252
+ wav = torch.nn.functional.pad(wav, (0, pad))
253
+
254
+ # If mono, return (T,) for convenience
255
+ if wav.shape[0] == 1:
256
+ return wav[0]
257
+ return wav
258
+
259
 
260
  @torch.inference_mode()
261
  def __call__(
 
274
  tiling_config: TilingConfig | None = None,
275
  video_context: torch.Tensor | None = None,
276
  audio_context: torch.Tensor | None = None,
277
+ input_waveform: torch.Tensor | None = None,
278
+ input_waveform_sample_rate: int | None = None,
279
+ audio_strength: float = 1.0, # or audio_scale, your naming
280
  ) -> None:
281
  generator = torch.Generator(device=self.device).manual_seed(seed)
282
  noiser = GaussianNoiser(generator=generator)
283
  stepper = EulerDiffusionStep()
284
  dtype = torch.bfloat16
285
 
286
+ audio_conditionings = None
287
+ if input_waveform is not None:
288
+ if input_waveform_sample_rate is None:
289
+ raise ValueError("input_waveform_sample_rate must be provided when input_waveform is set.")
290
+ audio_conditionings = self._build_audio_conditionings_from_waveform(
291
+ input_waveform=input_waveform,
292
+ input_sample_rate=int(input_waveform_sample_rate),
293
+ num_frames=num_frames,
294
+ fps=frame_rate,
295
+ strength=audio_strength,
296
+ )
297
+
298
  # Use pre-computed embeddings if provided, otherwise encode text
299
  if video_context is None or audio_context is None:
300
  text_encoder = self.model_ledger.text_encoder()
 
350
  video_state, audio_state = denoise_audio_video(
351
  output_shape=stage_1_output_shape,
352
  conditionings=stage_1_conditionings,
353
+ audio_conditionings=audio_conditionings,
354
  noiser=noiser,
355
  sigmas=stage_1_sigmas,
356
  stepper=stepper,
 
395
  video_state, audio_state = denoise_audio_video(
396
  output_shape=stage_2_output_shape,
397
  conditionings=stage_2_conditionings,
398
+ audio_conditionings=audio_conditionings,
399
  noiser=noiser,
400
  sigmas=stage_2_sigmas,
401
  stepper=stepper,
 
426
  )
427
 
428
 
 
 
429
  def _create_conditionings(
430
  self,
431
  images: list[tuple[str, int, float]],
 
472
  )
473
  )
474
 
475
+ return conditionings
packages/ltx-pipelines/src/ltx_pipelines/utils/helpers.py CHANGED
@@ -245,6 +245,7 @@ def noise_audio_state(
245
  device: torch.device,
246
  noise_scale: float = 1.0,
247
  initial_latent: torch.Tensor | None = None,
 
248
  ) -> tuple[LatentState, AudioLatentTools]:
249
  """Initialize and noise an audio latent state for the diffusion pipeline.
250
  Creates an audio latent state from the output shape, applies conditionings,
@@ -262,6 +263,7 @@ def noise_audio_state(
262
  device=device,
263
  noise_scale=noise_scale,
264
  initial_latent=initial_latent,
 
265
  )
266
 
267
  return audio_state, audio_tools
@@ -275,18 +277,35 @@ def create_noised_state(
275
  device: torch.device,
276
  noise_scale: float = 1.0,
277
  initial_latent: torch.Tensor | None = None,
 
278
  ) -> LatentState:
279
- """Create a noised latent state from empty state, conditionings, and noiser.
280
- Creates an empty latent state, applies conditionings, and then adds noise
281
- using the provided noiser. Returns the final noised state ready for diffusion.
282
- """
283
  state = tools.create_initial_state(device, dtype, initial_latent)
284
  state = state_with_conditionings(state, conditionings, tools)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  state = noiser(state, noise_scale)
286
 
 
 
 
 
 
287
  return state
288
 
289
 
 
290
  def state_with_conditionings(
291
  latent_state: LatentState, conditioning_items: list[ConditioningItem], latent_tools: LatentTools
292
  ) -> LatentState:
@@ -302,7 +321,9 @@ def state_with_conditionings(
302
 
303
  def post_process_latent(denoised: torch.Tensor, denoise_mask: torch.Tensor, clean: torch.Tensor) -> torch.Tensor:
304
  """Blend denoised output with clean state based on mask."""
305
- return (denoised * denoise_mask + clean.float() * (1 - denoise_mask)).to(denoised.dtype)
 
 
306
 
307
 
308
  def modality_from_latent_state(
@@ -386,10 +407,12 @@ def denoise_audio_video( # noqa: PLR0913
386
  components: PipelineComponents,
387
  dtype: torch.dtype,
388
  device: torch.device,
 
389
  noise_scale: float = 1.0,
390
  initial_video_latent: torch.Tensor | None = None,
391
  initial_audio_latent: torch.Tensor | None = None,
392
- ) -> tuple[LatentState, LatentState]:
 
393
  video_state, video_tools = noise_video_state(
394
  output_shape=output_shape,
395
  noiser=noiser,
@@ -403,7 +426,7 @@ def denoise_audio_video( # noqa: PLR0913
403
  audio_state, audio_tools = noise_audio_state(
404
  output_shape=output_shape,
405
  noiser=noiser,
406
- conditionings=[],
407
  components=components,
408
  dtype=dtype,
409
  device=device,
@@ -411,13 +434,22 @@ def denoise_audio_video( # noqa: PLR0913
411
  initial_latent=initial_audio_latent,
412
  )
413
 
 
 
 
 
 
414
  video_state, audio_state = denoising_loop_fn(
415
  sigmas,
416
  video_state,
417
  audio_state,
418
  stepper,
 
419
  )
420
 
 
 
 
421
  video_state = video_tools.clear_conditioning(video_state)
422
  video_state = video_tools.unpatchify(video_state)
423
  audio_state = audio_tools.clear_conditioning(audio_state)
@@ -426,6 +458,7 @@ def denoise_audio_video( # noqa: PLR0913
426
  return video_state, audio_state
427
 
428
 
 
429
  _UNICODE_REPLACEMENTS = str.maketrans("\u2018\u2019\u201c\u201d\u2014\u2013\u00a0\u2032\u2212", "''\"\"-- '-")
430
 
431
 
 
245
  device: torch.device,
246
  noise_scale: float = 1.0,
247
  initial_latent: torch.Tensor | None = None,
248
+ denoise_mask: torch.Tensor | None = None
249
  ) -> tuple[LatentState, AudioLatentTools]:
250
  """Initialize and noise an audio latent state for the diffusion pipeline.
251
  Creates an audio latent state from the output shape, applies conditionings,
 
263
  device=device,
264
  noise_scale=noise_scale,
265
  initial_latent=initial_latent,
266
+ denoise_mask=denoise_mask,
267
  )
268
 
269
  return audio_state, audio_tools
 
277
  device: torch.device,
278
  noise_scale: float = 1.0,
279
  initial_latent: torch.Tensor | None = None,
280
+ denoise_mask: torch.Tensor | None = None, # <-- add
281
  ) -> LatentState:
 
 
 
 
282
  state = tools.create_initial_state(device, dtype, initial_latent)
283
  state = state_with_conditionings(state, conditionings, tools)
284
+
285
+ if denoise_mask is not None:
286
+ # Convert any tensor mask into a single scalar (solid mask behavior)
287
+ if isinstance(denoise_mask, torch.Tensor):
288
+ mask_value = float(denoise_mask.mean().item())
289
+ else:
290
+ mask_value = float(denoise_mask)
291
+
292
+ state = replace(
293
+ state,
294
+ clean_latent=state.latent.clone(),
295
+ denoise_mask=torch.full_like(state.denoise_mask, mask_value), # <- matches internal shape
296
+ )
297
+
298
  state = noiser(state, noise_scale)
299
 
300
+ if denoise_mask is not None:
301
+ m = state.denoise_mask.to(dtype=state.latent.dtype, device=state.latent.device)
302
+ clean = state.clean_latent.to(dtype=state.latent.dtype, device=state.latent.device)
303
+ state = replace(state, latent=state.latent * m + clean * (1 - m))
304
+
305
  return state
306
 
307
 
308
+
309
  def state_with_conditionings(
310
  latent_state: LatentState, conditioning_items: list[ConditioningItem], latent_tools: LatentTools
311
  ) -> LatentState:
 
321
 
322
def post_process_latent(denoised: torch.Tensor, denoise_mask: torch.Tensor, clean: torch.Tensor) -> torch.Tensor:
    """Blend the denoised latent with the clean latent according to the mask.

    A mask value of 1 keeps the denoised output, 0 keeps the clean
    (conditioned) latent. Both operands are cast to the denoised tensor's
    dtype before blending so the result stays in that dtype.
    """
    mask = denoise_mask.to(dtype=denoised.dtype)
    keep = clean.to(dtype=denoised.dtype)
    return denoised * mask + keep * (1 - mask)
327
 
328
 
329
  def modality_from_latent_state(
 
407
  components: PipelineComponents,
408
  dtype: torch.dtype,
409
  device: torch.device,
410
+ audio_conditionings: list[ConditioningItem] | None = None,
411
  noise_scale: float = 1.0,
412
  initial_video_latent: torch.Tensor | None = None,
413
  initial_audio_latent: torch.Tensor | None = None,
414
+ # mask_context: MaskInjection | None = None,
415
+ ) -> tuple[LatentState | None, LatentState | None]:
416
  video_state, video_tools = noise_video_state(
417
  output_shape=output_shape,
418
  noiser=noiser,
 
426
  audio_state, audio_tools = noise_audio_state(
427
  output_shape=output_shape,
428
  noiser=noiser,
429
+ conditionings=audio_conditionings or [],
430
  components=components,
431
  dtype=dtype,
432
  device=device,
 
434
  initial_latent=initial_audio_latent,
435
  )
436
 
437
+ loop_kwargs = {}
438
+ # if "preview_tools" in inspect.signature(denoising_loop_fn).parameters:
439
+ # loop_kwargs["preview_tools"] = video_tools
440
+ # if "mask_context" in inspect.signature(denoising_loop_fn).parameters:
441
+ # loop_kwargs["mask_context"] = mask_context
442
  video_state, audio_state = denoising_loop_fn(
443
  sigmas,
444
  video_state,
445
  audio_state,
446
  stepper,
447
+ **loop_kwargs,
448
  )
449
 
450
+ if video_state is None or audio_state is None:
451
+ return None, None
452
+
453
  video_state = video_tools.clear_conditioning(video_state)
454
  video_state = video_tools.unpatchify(video_state)
455
  audio_state = audio_tools.clear_conditioning(audio_state)
 
458
  return video_state, audio_state
459
 
460
 
461
+
462
  _UNICODE_REPLACEMENTS = str.maketrans("\u2018\u2019\u201c\u201d\u2014\u2013\u00a0\u2032\u2212", "''\"\"-- '-")
463
 
464
 
packages/ltx-pipelines/src/ltx_pipelines/utils/model_ledger.py CHANGED
@@ -36,10 +36,24 @@ from ltx_core.text_encoders.gemma import (
36
  module_ops_from_gemma_root,
37
  )
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  class ModelLedger:
41
  """
42
- Central coordinator for loading and building models used in an LTX pipeline.
43
  The ledger wires together multiple model builders (transformer, video VAE encoder/decoder,
44
  audio VAE decoder, vocoder, text encoder, and optional latent upsampler) and exposes
45
  factory methods for constructing model instances.
@@ -144,6 +158,14 @@ class ModelLedger:
144
  registry=self.registry,
145
  )
146
 
 
 
 
 
 
 
 
 
147
  if self.gemma_root_path is not None:
148
  self.text_encoder_builder = Builder(
149
  model_path=self.checkpoint_path,
@@ -197,6 +219,14 @@ class ModelLedger:
197
  .eval()
198
  )
199
 
 
 
 
 
 
 
 
 
200
  def video_decoder(self) -> VideoDecoder:
201
  if not hasattr(self, "vae_decoder_builder"):
202
  raise ValueError(
 
36
  module_ops_from_gemma_root,
37
  )
38
 
39
+ from ltx_core.model.audio_vae import (
40
+ AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
41
+ VOCODER_COMFY_KEYS_FILTER,
42
+ AudioDecoder,
43
+ AudioDecoderConfigurator,
44
+ Vocoder,
45
+ VocoderConfigurator,
46
+ AudioEncoder,
47
+ )
48
+ from ltx_core.model.audio_vae.model_configurator import (
49
+ AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
50
+ AudioEncoderConfigurator,
51
+ )
52
+
53
 
54
  class ModelLedger:
55
  """
56
+ Central coordinator for loading and building models used in an LTX pipeline.
57
  The ledger wires together multiple model builders (transformer, video VAE encoder/decoder,
58
  audio VAE decoder, vocoder, text encoder, and optional latent upsampler) and exposes
59
  factory methods for constructing model instances.
 
158
  registry=self.registry,
159
  )
160
 
161
+ self.audio_encoder_builder = Builder(
162
+ model_path=self.checkpoint_path,
163
+ model_class_configurator=AudioEncoderConfigurator,
164
+ model_sd_ops=AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
165
+ registry=self.registry,
166
+ )
167
+
168
+
169
  if self.gemma_root_path is not None:
170
  self.text_encoder_builder = Builder(
171
  model_path=self.checkpoint_path,
 
219
  .eval()
220
  )
221
 
222
+ def audio_encoder(self) -> AudioEncoder:
223
+ if not hasattr(self, "audio_encoder_builder"):
224
+ raise ValueError(
225
+ "Audio encoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
226
+ )
227
+ return self.audio_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()
228
+
229
+
230
  def video_decoder(self) -> VideoDecoder:
231
  if not hasattr(self, "vae_decoder_builder"):
232
  raise ValueError(
requirements.txt CHANGED
@@ -6,9 +6,15 @@ safetensors
6
  accelerate
7
  flashpack==0.1.2
8
  scikit-image>=0.25.2
 
 
9
  av
10
  tqdm
11
  pillow
12
  scipy>=1.14
13
  flash-attn-3 @ https://huggingface.co/alexnasa/flash-attn-3/resolve/main/128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
14
- bitsandbytes
 
 
 
 
 
6
  accelerate
7
  flashpack==0.1.2
8
  scikit-image>=0.25.2
9
+ imageio
10
+ imageio-ffmpeg
11
  av
12
  tqdm
13
  pillow
14
  scipy>=1.14
15
  flash-attn-3 @ https://huggingface.co/alexnasa/flash-attn-3/resolve/main/128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
16
+ bitsandbytes
17
+ opencv-python
18
+ controlnet_aux
19
+ onnxruntime-gpu
20
+ matplotlib