LongCat-Video

Running on Zero

App Files Files Community

rahul7star commited on Oct 30

Commit

b2baa38

verified ·

1 Parent(s): 7ccc31d

Create app_exp.py

Browse files

Files changed (1) hide show

app_exp.py +268 -0

app_exp.py ADDED Viewed

	@@ -0,0 +1,268 @@

+import gradio as gr
+import torch
+import os
+import sys
+import subprocess
+import tempfile
+import numpy as np
+import spaces
+import importlib
+import site
+from PIL import Image
+from huggingface_hub import snapshot_download, hf_hub_download
+# ============================================================
+# 1️⃣ FlashAttention 3 Setup (Auto-install from HF repo)
+# ============================================================
+try:
+    print("Attempting to download and install FlashAttention 3 wheel...")
+    fa3_wheel = hf_hub_download(
+        repo_id="rahul7star/flash-attn-3",
+        repo_type="model",
+        filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
+    )
+    subprocess.run(["pip", "install", fa3_wheel], check=True)
+    site.addsitedir(site.getsitepackages()[0])
+    importlib.invalidate_caches()
+    print("✅ FlashAttention 3 installed successfully.")
+except Exception as e:
+    print(f"⚠️ FlashAttention install failed: {e}")
+    print("Proceeding without FA3 acceleration...")
+# ============================================================
+# 2️⃣ Define model and repo paths
+# ============================================================
+REPO_PATH = "LongCat-Video"
+CHECKPOINT_DIR = os.path.join(REPO_PATH, "weights", "LongCat-Video")
+# ============================================================
+# 3️⃣ Clone the model repo if needed
+# ============================================================
+if not os.path.exists(REPO_PATH):
+    print(f"Cloning LongCat-Video repository to '{REPO_PATH}'...")
+    subprocess.run(
+        ["git", "clone", "https://github.com/meituan-longcat/LongCat-Video.git", REPO_PATH],
+        check=True
+    )
+    print("✅ Repository cloned successfully.")
+# Make repo importable
+sys.path.insert(0, os.path.abspath(REPO_PATH))
+# ============================================================
+# 4️⃣ Import model modules after repo setup
+# ============================================================
+from longcat_video.pipeline_longcat_video import LongCatVideoPipeline
+from longcat_video.modules.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
+from longcat_video.modules.autoencoder_kl_wan import AutoencoderKLWan
+from longcat_video.modules.longcat_video_dit import LongCatVideoTransformer3DModel
+from longcat_video.context_parallel import context_parallel_util
+from transformers import AutoTokenizer, UMT5EncoderModel
+from diffusers.utils import export_to_video
+# ============================================================
+# 5️⃣ Download weights (snapshot)
+# ============================================================
+if not os.path.exists(CHECKPOINT_DIR):
+    print(f"Downloading model weights to '{CHECKPOINT_DIR}'...")
+    snapshot_download(
+        repo_id="meituan-longcat/LongCat-Video",
+        local_dir=CHECKPOINT_DIR,
+        local_dir_use_symlinks=False,
+        ignore_patterns=["*.md", "*.gitattributes", "assets/*"]
+    )
+    print("✅ Model weights ready.")
+# ============================================================
+# 6️⃣ Initialize model pipeline
+# ============================================================
+pipe = None
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
+print("--- Initializing Models (once at startup) ---")
+try:
+    cp_split_hw = context_parallel_util.get_optimal_split(1)
+    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR, subfolder="tokenizer", torch_dtype=torch_dtype)
+    text_encoder = UMT5EncoderModel.from_pretrained(CHECKPOINT_DIR, subfolder="text_encoder", torch_dtype=torch_dtype)
+    vae = AutoencoderKLWan.from_pretrained(CHECKPOINT_DIR, subfolder="vae", torch_dtype=torch_dtype)
+    scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(CHECKPOINT_DIR, subfolder="scheduler", torch_dtype=torch_dtype)
+    # ✅ Enable FA3 acceleration
+    dit = LongCatVideoTransformer3DModel.from_pretrained(
+        CHECKPOINT_DIR,
+        enable_flashattn3=True,
+        enable_flashattn2=False,
+        enable_xformers=True,
+        subfolder="dit",
+        cp_split_hw=cp_split_hw,
+        torch_dtype=torch_dtype,
+    )
+    pipe = LongCatVideoPipeline(
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        vae=vae,
+        scheduler=scheduler,
+        dit=dit,
+    ).to(device)
+    # Load LoRAs
+    lora_dir = os.path.join(CHECKPOINT_DIR, "lora")
+    pipe.dit.load_lora(os.path.join(lora_dir, "cfg_step_lora.safetensors"), "cfg_step_lora")
+    pipe.dit.load_lora(os.path.join(lora_dir, "refinement_lora.safetensors"), "refinement_lora")
+    print("✅ Models loaded successfully.")
+except Exception as e:
+    print(f"❌ FATAL: Model initialization failed.\n{e}")
+    pipe = None
+# ============================================================
+# 7️⃣ GPU cleanup utility
+# ============================================================
+def torch_gc():
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+# ============================================================
+# 8️⃣ Dynamic GPU duration logic
+# ============================================================
+def compute_duration(mode, prompt, neg_prompt, image, height, width, resolution, seed, use_distill, use_refine, progress):
+    """
+    Adaptive GPU time allocation based on resolution & refinement usage.
+    """
+    base = 120  # baseline (seconds)
+    if resolution == "720p": base += 60
+    if use_refine: base += 60
+    if use_distill: base -= 30
+    return min(base, 240)  # cap at 4 min
+# ============================================================
+# 9️⃣ Generation function
+# ============================================================
+@spaces.GPU(duration=compute_duration)
+def generate_video(
+    mode,
+    prompt,
+    neg_prompt,
+    image,
+    height, width, resolution,
+    seed,
+    use_distill,
+    use_refine,
+    progress=gr.Progress(track_tqdm=True)
+):
+    if pipe is None:
+        raise gr.Error("⚠️ Models failed to load. Restart the app.")
+    generator = torch.Generator(device=device).manual_seed(int(seed))
+    num_frames = 48  # shorter for faster test runs
+    is_distill = use_distill or use_refine
+    pipe.dit.enable_loras(["cfg_step_lora"] if is_distill else [])
+    num_inference_steps = 12 if is_distill else 24
+    guidance_scale = 2.0 if is_distill else 4.0
+    # --- Stage 1 ---
+    progress(0.2, desc="Stage 1: Generating Base Video...")
+    if mode == "t2v":
+        output = pipe.generate_t2v(
+            prompt=prompt,
+            negative_prompt=neg_prompt,
+            height=height,
+            width=width,
+            num_frames=num_frames,
+            num_inference_steps=num_inference_steps,
+            use_distill=is_distill,
+            guidance_scale=guidance_scale,
+            generator=generator,
+        )[0]
+    else:
+        pil_img = Image.fromarray(image)
+        output = pipe.generate_i2v(
+            image=pil_img,
+            prompt=prompt,
+            negative_prompt=neg_prompt,
+            resolution=resolution,
+            num_frames=num_frames,
+            num_inference_steps=num_inference_steps,
+            use_distill=is_distill,
+            guidance_scale=guidance_scale,
+            generator=generator,
+        )[0]
+    pipe.dit.disable_all_loras()
+    torch_gc()
+    # --- Stage 2 ---
+    if use_refine:
+        progress(0.6, desc="Stage 2: Refining Video...")
+        pipe.dit.enable_loras(["refinement_lora"])
+        refined = pipe.generate_refine(
+            image=Image.fromarray(image) if mode == "i2v" else None,
+            prompt=prompt,
+            stage1_video=[Image.fromarray((f * 255).astype(np.uint8)) for f in output],
+            num_cond_frames=1 if mode == "i2v" else 0,
+            num_inference_steps=20,
+            generator=generator,
+        )[0]
+        output = refined
+        pipe.dit.disable_all_loras()
+        torch_gc()
+    # --- Export ---
+    progress(1.0, desc="Exporting video...")
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_vid:
+        export_to_video(output, tmp_vid.name, fps=24)
+        return tmp_vid.name
+# ============================================================
+# 🔟 Gradio UI
+# ============================================================
+css = ".fillable{max-width:960px!important}"
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("# 🎬 LongCat-Video + FA3 Accelerated 🚀")
+    gr.Markdown("13.6B parameter dense video model — with FlashAttention 3 for speed ⚡")
+    with gr.Tabs():
+        # Text-to-Video
+        with gr.TabItem("Text-to-Video"):
+            prompt_t2v = gr.Textbox(label="Prompt", lines=3, placeholder="A cinematic shot of a corgi running on the beach.")
+            neg_t2v = gr.Textbox(label="Negative Prompt", value="ugly, blurry, static")
+            h_t2v = gr.Slider(256, 1024, 480, step=64, label="Height")
+            w_t2v = gr.Slider(256, 1024, 832, step=64, label="Width")
+            seed_t2v = gr.Number(value=42, label="Seed")
+            distill_t2v = gr.Checkbox(label="Distill Mode", value=True)
+            refine_t2v = gr.Checkbox(label="Refine Mode", value=False)
+            btn_t2v = gr.Button("Generate Video", variant="primary")
+            out_t2v = gr.Video(label="Output Video")
+            btn_t2v.click(
+                generate_video,
+                inputs=["t2v", prompt_t2v, neg_t2v, gr.State(None), h_t2v, w_t2v, gr.State("480p"), seed_t2v, distill_t2v, refine_t2v],
+                outputs=out_t2v,
+            )
+        # Image-to-Video
+        with gr.TabItem("Image-to-Video"):
+            img_i2v = gr.Image(type="numpy", label="Input Image")
+            prompt_i2v = gr.Textbox(label="Prompt", placeholder="The cat in the image blinks.")
+            neg_i2v = gr.Textbox(label="Negative Prompt", value="ugly, blurry")
+            resolution_i2v = gr.Dropdown(["480p", "720p"], value="480p", label="Resolution")
+            seed_i2v = gr.Number(value=42, label="Seed")
+            distill_i2v = gr.Checkbox(label="Distill Mode", value=True)
+            refine_i2v = gr.Checkbox(label="Refine Mode", value=False)
+            btn_i2v = gr.Button("Generate Video", variant="primary")
+            out_i2v = gr.Video(label="Output Video")
+            btn_i2v.click(
+                generate_video,
+                inputs=["i2v", prompt_i2v, neg_i2v, img_i2v, gr.State(None), gr.State(None), resolution_i2v, seed_i2v, distill_i2v, refine_i2v],
+                outputs=out_i2v,
+            )
+if __name__ == "__main__":
+    demo.launch()