Spaces:

seawolf2357
/

YuE-music-generator-demo-zero

Runtime error

App Files Files Community

KingNish committed on Jan 30, 2025

Commit

51043fd

verified ·

1 Parent(s): d7227ce

Update app.py

Browse files

Files changed (1) hide show

app.py +224 -307

app.py CHANGED Viewed

@@ -5,8 +5,9 @@ import shutil
 import tempfile
 import spaces
 import torch
-import torch.nn.functional as F
 import sys
 print("Installing flash-attn...")
 # Install flash attention
@@ -45,6 +46,7 @@ except FileNotFoundError:
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 # don't change above code
 import argparse
@@ -55,7 +57,6 @@ import torchaudio
 from torchaudio.transforms import Resample
 import soundfile as sf
-import uuid
 from tqdm import tqdm
 from einops import rearrange
 from codecmanipulator import CodecManipulator
@@ -68,36 +69,16 @@ from collections import Counter
 from models.soundstream_hubert_new import SoundStream
 from vocoder import build_codec_model, process_audio
 from post_process_audio import replace_low_freq_with_energy_matched
-import re
-import multiprocessing
-def empty_output_folder(output_dir):
-    # List all files in the output directory
-    files = os.listdir(output_dir)
-    # Iterate over the files and remove them
-    for file in files:
-        file_path = os.path.join(output_dir, file)
-        try:
-            if os.path.isdir(file_path):
-                # If it's a directory, remove it recursively
-                shutil.rmtree(file_path)
-            else:
-                # If it's a file, delete it
-                os.remove(file_path)
-        except Exception as e:
-            print(f"Error deleting file {file_path}: {e}")
 device = "cuda:0"
-# --- Model Loading and Quantization ---
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",  # To enable flashattn, you have to install flash-attn
-).to(device)
 model.eval()
-# gonna use either gguf or vllm later
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
@@ -115,30 +96,7 @@ codec_model.load_state_dict(parameter_dict['codec_model'])
 codec_model.to(device)
 codec_model.eval()
-# --- Parallel Audio Processing ---
-def process_audio_wrapper(args):
-    # Unpack arguments and call the original process_audio function
-    npy, output_path, rescale, other_args, decoder, codec_model = args
-    return process_audio(npy, output_path, rescale, other_args, decoder, codec_model)
-def parallel_process_audio(stage1_output_set, vocoder_stems_dir, rescale, other_args, vocal_decoder, inst_decoder,
-                           codec_model, num_processes=4):
-    with multiprocessing.Pool(processes=num_processes) as pool:
-        tasks = []
-        for npy in stage1_output_set:
-            if 'instrumental' in npy:
-                output_path = os.path.join(vocoder_stems_dir, 'instrumental.mp3')
-                decoder = inst_decoder
-            else:
-                output_path = os.path.join(vocoder_stems_dir, 'vocal.mp3')
-                decoder = vocal_decoder
-            tasks.append((npy, output_path, rescale, other_args, decoder, codec_model))
-        results = pool.map(process_audio_wrapper, tasks)
-    return results
-# --- Optimized Music Generation ---
 def generate_music(
         max_new_tokens=5,
         run_n_segments=2,
@@ -148,91 +106,75 @@ def generate_music(
         audio_prompt_path="",
         prompt_start_time=0.0,
         prompt_end_time=30.0,
-        output_dir="./output",
         rescale=False,
-        beam_width=3,  # Add beam search
-        length_penalty=1.0,  # Add length penalty
-        repetition_penalty=1.5, # Add repetition penalty
-        batch_size=2
 ):
     if use_audio_prompt and not audio_prompt_path:
-        raise FileNotFoundError(
-            "Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
     max_new_tokens = max_new_tokens * 100
-    stage1_output_dir = os.path.join(output_dir, f"stage1")
-    os.makedirs(stage1_output_dir, exist_ok=True)
-    class BlockTokenRangeProcessor(LogitsProcessor):
-        def __init__(self, start_id, end_id):
-            self.blocked_token_ids = list(range(start_id, end_id))
-        def __call__(self, input_ids, scores):
-            scores[:, self.blocked_token_ids] = -float("inf")
-            return scores
-    def load_audio_mono(filepath, sampling_rate=16000):
-        audio, sr = torchaudio.load(filepath)
-        # Convert to mono
-        audio = torch.mean(audio, dim=0, keepdim=True)
-        # Resample if needed
-        if sr != sampling_rate:
-            resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
-            audio = resampler(audio)
-        return audio
-    def split_lyrics(lyrics: str):
-        pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
-        segments = re.findall(pattern, lyrics, re.DOTALL)
-        structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
-        return structured_lyrics
-    # Call the function and print the result
-    stage1_output_set = []
-    genres = genre_txt.strip()
-    lyrics = split_lyrics(lyrics_txt + "\n")
-    # intruction
-    full_lyrics = "\n".join(lyrics)
-    prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
-    prompt_texts += lyrics
-    random_id = uuid.uuid4()
-    output_seq = None
-    # Here is suggested decoding config
-    top_p = 0.93
-    temperature = 1.0
-    # special tokens
-    start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
-    end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
-    raw_output = None
-    segment_cache = {}  # Cache for repeated segments
-    # Format text prompt
-    run_n_segments = min(run_n_segments + 1, len(lyrics))
-    print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
-    # Modified loop for batching and caching
-    for i in range(1, run_n_segments, batch_size):
-        batch_segments = []
-        batch_prompts = []
-        for j in range(i, min(i + batch_size, run_n_segments)):
-            section_text = prompt_texts[j].replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-            # Check cache
-            if section_text in segment_cache:
-                cached_output = segment_cache[section_text]
-                if j > 1:
-                    raw_output = torch.cat([raw_output, cached_output], dim=1)
-                else:
-                    raw_output = cached_output
                 continue
-            batch_segments.append(section_text)
-            guidance_scale = 1.5 if j <= 1 else 1.2
-            if j == 1:
                 if use_audio_prompt:
                     audio_prompt = load_audio_mono(audio_prompt_path)
                     audio_prompt.unsqueeze_(0)
@@ -242,8 +184,7 @@ def generate_music(
                     raw_codes = raw_codes.cpu().numpy().astype(np.int16)
                     # Format audio prompt
                     code_ids = codectool.npy2ids(raw_codes[0])
-                    audio_prompt_codec = code_ids[
-                                         int(prompt_start_time * 50): int(prompt_end_time * 50)]  # 50 is tps of xcodec
                     audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
                         mmtokenizer.eoa]
                     sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
@@ -253,201 +194,177 @@ def generate_music(
                     head_id = mmtokenizer.tokenize(prompt_texts[0])
                 prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
             else:
-                prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [
-                    mmtokenizer.soa] + codectool.sep_ids
             prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
-            input_ids = torch.cat([raw_output, prompt_ids], dim=1) if j > 1 else prompt_ids
             # Use window slicing in case output sequence exceeds the context of model
             max_context = 16384 - max_new_tokens - 1
             if input_ids.shape[-1] > max_context:
                 print(
-                    f'Section {j}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
-            batch_prompts.append(input_ids)
-        if not batch_prompts:
-            continue  # All segments in the batch were cached
-        # Pad prompts in the batch to the same length
-        max_len = max(p.size(1) for p in batch_prompts)
-        padded_prompts = []
-        for p in batch_prompts:
-            pad_len = max_len - p.size(1)
-            padded_prompt = F.pad(p, (0, pad_len), value=mmtokenizer.eoa)
-            padded_prompts.append(padded_prompt)
-        batch_input_ids = torch.cat(padded_prompts, dim=0)
-        with torch.no_grad():
-            output_seqs = model.generate(
-                input_ids=batch_input_ids,
-                max_new_tokens=max_new_tokens,
-                min_new_tokens=100,
-                do_sample=True,
-                top_p=top_p,
-                temperature=temperature,
-                repetition_penalty=repetition_penalty,
-                eos_token_id=mmtokenizer.eoa,
-                pad_token_id=mmtokenizer.eoa,
-                logits_processor=LogitsProcessorList(
-                    [BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
-                guidance_scale=guidance_scale,
-                use_cache=True,
-                num_beams=beam_width,  # Use beam search
-                length_penalty=length_penalty,  # Apply length penalty
-            )
-        # Process each output in the batch
-        for k, output_seq in enumerate(output_seqs):
-            if output_seq[0][-1].item() != mmtokenizer.eoa:
-                tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
-                output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
             if i > 1:
-                raw_output = torch.cat([raw_output, batch_prompts[k][:, :batch_input_ids.shape[-1]],
-                                        output_seq[:, batch_input_ids.shape[-1]:]], dim=1)
             else:
                 raw_output = output_seq
-            # Cache the generated output if not already cached
-            if batch_segments[k] not in segment_cache:
-                segment_cache[batch_segments[k]] = output_seq[:, batch_input_ids.shape[-1]:].cpu()
-    # save raw output and check sanity
-    ids = raw_output[0].cpu().numpy()
-    soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
-    eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
-    if len(soa_idx) != len(eoa_idx):
-        raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
-    vocals = []
-    instrumentals = []
-    range_begin = 1 if use_audio_prompt else 0
-    for i in range(range_begin, len(soa_idx)):
-        codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
-        if codec_ids[0] == 32016:
-            codec_ids = codec_ids[1:]
-        codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
-        vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
-        vocals.append(vocals_ids)
-        instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
-        instrumentals.append(instrumentals_ids)
-    vocals = np.concatenate(vocals, axis=1)
-    instrumentals = np.concatenate(instrumentals, axis=1)
-    vocal_save_path = os.path.join(stage1_output_dir,
-                                   f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_vocal_{random_id}".replace(
-                                       '.', '@') + '.npy')
-    inst_save_path = os.path.join(stage1_output_dir,
-                                  f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_instrumental_{random_id}".replace(
-                                      '.', '@') + '.npy')
-    np.save(vocal_save_path, vocals)
-    np.save(inst_save_path, instrumentals)
-    stage1_output_set.append(vocal_save_path)
-    stage1_output_set.append(inst_save_path)
-    print("Converting to Audio...")
-    # convert audio tokens to audio
-    def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
-        folder_path = os.path.dirname(path)
-        if not os.path.exists(folder_path):
-            os.makedirs(folder_path)
-        limit = 0.99
-        max_val = wav.abs().max()
-        wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
-        torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
-    # reconstruct tracks
-    recons_output_dir = os.path.join(output_dir, "recons")
-    recons_mix_dir = os.path.join(recons_output_dir, 'mix')
-    os.makedirs(recons_mix_dir, exist_ok=True)
-    tracks = []
-    for npy in stage1_output_set:
-        codec_result = np.load(npy)
-        decodec_rlt = []
-        with torch.no_grad():
-            decoded_waveform = codec_model.decode(
-                torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
-                    device))
-        decoded_waveform = decoded_waveform.cpu().squeeze(0)
-        decodec_rlt.append(torch.as_tensor(decoded_waveform))
-        decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-        save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
-        tracks.append(save_path)
-        save_audio(decodec_rlt, save_path, 16000)
-    # mix tracks
-    for inst_path in tracks:
         try:
-            if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
-                    and 'instrumental' in inst_path:
-                # find pair
-                vocal_path = inst_path.replace('instrumental', 'vocal')
-                if not os.path.exists(vocal_path):
-                    continue
-                # mix
-                recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
-                vocal_stem, sr = sf.read(inst_path)
-                instrumental_stem, _ = sf.read(vocal_path)
-                mix_stem = (vocal_stem + instrumental_stem) / 1
-                sf.write(recons_mix, mix_stem, sr)
-        except Exception as e:
             print(e)
-    # vocoder to upsample audios
-    vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
-    vocoder_output_dir = os.path.join(output_dir, 'vocoder')
-    vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
-    vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
-    os.makedirs(vocoder_mix_dir, exist_ok=True)
-    os.makedirs(vocoder_stems_dir, exist_ok=True)
-    # Use parallel processing for vocoding
-    parallel_process_audio(stage1_output_set, vocoder_stems_dir, rescale, argparse.Namespace(**locals()), vocal_decoder,
-                           inst_decoder, codec_model)
-    # mix tracks after parallel processing
-    instrumental_output_path = os.path.join(vocoder_stems_dir, 'instrumental.mp3')
-    vocal_output_path = os.path.join(vocoder_stems_dir, 'vocal.mp3')
-    if os.path.exists(instrumental_output_path) and os.path.exists(vocal_output_path):
-      instrumental_output, sr = torchaudio.load(instrumental_output_path)
-      vocal_output, _ = torchaudio.load(vocal_output_path)
-      try:
-          mix_output = instrumental_output + vocal_output
-          vocoder_mix = os.path.join(vocoder_mix_dir, os.path.basename(recons_mix))
-          save_audio(mix_output, vocoder_mix, 44100, rescale)
-          print(f"Created mix: {vocoder_mix}")
-      except RuntimeError as e:
-          print(e)
-          print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
-    else:
-      print("Skipping mix creation, instrumental or vocal output missing.")
-    # Post process
-    replace_low_freq_with_energy_matched(
-        a_file=recons_mix,  # 16kHz
-        b_file=vocoder_mix,  # 48kHz
-        c_file=os.path.join(output_dir, os.path.basename(recons_mix)),
-        cutoff_freq=5500.0
-    )
-    print("All process Done")
-    return recons_mix
 @spaces.GPU(duration=120)
-def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=5):
-    # Ensure the output folder exists
-    output_dir = "./output"
-    os.makedirs(output_dir, exist_ok=True)
-    print(f"Output folder ensured at: {output_dir}")
-    empty_output_folder(output_dir)
     # Execute the command
     try:
         music = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
-                               output_dir=output_dir, max_new_tokens=max_new_tokens)
         return music
     except Exception as e:
         gr.Warning("An Error Occured: " + str(e))
@@ -455,8 +372,8 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=
     finally:
         print("Temporary files deleted.")
-# Gradio
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")

 import tempfile
 import spaces
 import torch
 import sys
+import uuid
+import re
 print("Installing flash-attn...")
 # Install flash attention
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 # don't change above code
 import argparse
 from torchaudio.transforms import Resample
 import soundfile as sf
 from tqdm import tqdm
 from einops import rearrange
 from codecmanipulator import CodecManipulator
 from models.soundstream_hubert_new import SoundStream
 from vocoder import build_codec_model, process_audio
 from post_process_audio import replace_low_freq_with_energy_matched
 device = "cuda:0"
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",  # To enable flashattn, you have to install flash-attn
+)
+model.to(device)
 model.eval()
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
 codec_model.to(device)
 codec_model.eval()
 def generate_music(
         max_new_tokens=5,
         run_n_segments=2,
         audio_prompt_path="",
         prompt_start_time=0.0,
         prompt_end_time=30.0,
+        cuda_idx=0,
         rescale=False,
 ):
     if use_audio_prompt and not audio_prompt_path:
+        raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
+    cuda_idx = cuda_idx
     max_new_tokens = max_new_tokens * 100
+    with tempfile.TemporaryDirectory() as output_dir:
+        stage1_output_dir = os.path.join(output_dir, f"stage1")
+        os.makedirs(stage1_output_dir, exist_ok=True)
+        class BlockTokenRangeProcessor(LogitsProcessor):
+            def __init__(self, start_id, end_id):
+                self.blocked_token_ids = list(range(start_id, end_id))
+            def __call__(self, input_ids, scores):
+                scores[:, self.blocked_token_ids] = -float("inf")
+                return scores
+        def load_audio_mono(filepath, sampling_rate=16000):
+            audio, sr = torchaudio.load(filepath)
+            # Convert to mono
+            audio = torch.mean(audio, dim=0, keepdim=True)
+            # Resample if needed
+            if sr != sampling_rate:
+                resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
+                audio = resampler(audio)
+            return audio
+        def split_lyrics(lyrics: str):
+            pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
+            segments = re.findall(pattern, lyrics, re.DOTALL)
+            structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
+            return structured_lyrics
+        # Call the function and print the result
+        stage1_output_set = []
+        genres = genre_txt.strip()
+        lyrics = split_lyrics(lyrics_txt + "\n")
+        # instruction
+        full_lyrics = "\n".join(lyrics)
+        prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
+        prompt_texts += lyrics
+        random_id = uuid.uuid4()
+        output_seq = None
+        # Here is suggested decoding config
+        top_p = 0.93
+        temperature = 1.0
+        repetition_penalty = 1.2
+        # special tokens
+        start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
+        end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
+        raw_output = None
+        # Format text prompt
+        run_n_segments = min(run_n_segments + 1, len(lyrics))
+        print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
+        for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
+            section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
+            guidance_scale = 1.5 if i <= 1 else 1.2
+            if i == 0:
                 continue
+            if i == 1:
                 if use_audio_prompt:
                     audio_prompt = load_audio_mono(audio_prompt_path)
                     audio_prompt.unsqueeze_(0)
                     raw_codes = raw_codes.cpu().numpy().astype(np.int16)
                     # Format audio prompt
                     code_ids = codectool.npy2ids(raw_codes[0])
+                    audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]  # 50 is tps of xcodec
                     audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
                         mmtokenizer.eoa]
                     sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
                     head_id = mmtokenizer.tokenize(prompt_texts[0])
                 prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
             else:
+                prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
             prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
+            input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
             # Use window slicing in case output sequence exceeds the context of model
             max_context = 16384 - max_new_tokens - 1
             if input_ids.shape[-1] > max_context:
                 print(
+                    f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
+            with torch.no_grad():
+                output_seq = model.generate(
+                    input_ids=input_ids,
+                    max_new_tokens=max_new_tokens,
+                    min_new_tokens=100,
+                    do_sample=True,
+                    top_p=top_p,
+                    temperature=temperature,
+                    repetition_penalty=repetition_penalty,
+                    eos_token_id=mmtokenizer.eoa,
+                    pad_token_id=mmtokenizer.eoa,
+                    logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002),
+                                                        BlockTokenRangeProcessor(32016, 32016)]),
+                    guidance_scale=guidance_scale,
+                    use_cache=True,
+                )
+                if output_seq[0][-1].item() != mmtokenizer.eoa:
+                    tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
+                    output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
             if i > 1:
+                raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
             else:
                 raw_output = output_seq
+            print(len(raw_output))
+        # save raw output and check sanity
+        ids = raw_output[0].cpu().numpy()
+        soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
+        eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
+        if len(soa_idx) != len(eoa_idx):
+            raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
+        vocals = []
+        instrumentals = []
+        range_begin = 1 if use_audio_prompt else 0
+        for i in range(range_begin, len(soa_idx)):
+            codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
+            if codec_ids[0] == 32016:
+                codec_ids = codec_ids[1:]
+            codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
+            vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
+            vocals.append(vocals_ids)
+            instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
+            instrumentals.append(instrumentals_ids)
+        vocals = np.concatenate(vocals, axis=1)
+        instrumentals = np.concatenate(instrumentals, axis=1)
+        vocal_save_path = os.path.join(stage1_output_dir, f"vocal_{random_id}".replace('.', '@') + '.npy')
+        inst_save_path = os.path.join(stage1_output_dir, f"instrumental_{random_id}".replace('.', '@') + '.npy')
+        np.save(vocal_save_path, vocals)
+        np.save(inst_save_path, instrumentals)
+        stage1_output_set.append(vocal_save_path)
+        stage1_output_set.append(inst_save_path)
+        print("Converting to Audio...")
+        # convert audio tokens to audio
+        def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
+            folder_path = os.path.dirname(path)
+            if not os.path.exists(folder_path):
+                os.makedirs(folder_path)
+            limit = 0.99
+            max_val = wav.abs().max()
+            wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
+            torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
+        # reconstruct tracks
+        recons_output_dir = os.path.join(output_dir, "recons")
+        recons_mix_dir = os.path.join(recons_output_dir, 'mix')
+        os.makedirs(recons_mix_dir, exist_ok=True)
+        tracks = []
+        for npy in stage1_output_set:
+            codec_result = np.load(npy)
+            decodec_rlt = []
+            with torch.no_grad():
+                decoded_waveform = codec_model.decode(
+                    torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
+                        device))
+            decoded_waveform = decoded_waveform.cpu().squeeze(0)
+            decodec_rlt.append(torch.as_tensor(decoded_waveform))
+            decodec_rlt = torch.cat(decodec_rlt, dim=-1)
+            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
+            tracks.append(save_path)
+            save_audio(decodec_rlt, save_path, 16000)
+        # mix tracks
+        for inst_path in tracks:
+            try:
+                if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
+                        and 'instrumental' in inst_path:
+                    # find pair
+                    vocal_path = inst_path.replace('instrumental', 'vocal')
+                    if not os.path.exists(vocal_path):
+                        continue
+                    # mix
+                    recons_mix = os.path.join(recons_mix_dir,
+                                              os.path.basename(inst_path).replace('instrumental', 'mixed'))
+                    vocal_stem, sr = sf.read(inst_path)
+                    instrumental_stem, _ = sf.read(vocal_path)
+                    mix_stem = (vocal_stem + instrumental_stem) / 1
+                    sf.write(recons_mix, mix_stem, sr)
+            except Exception as e:
+                print(e)
+        # vocoder to upsample audios
+        vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
+        vocoder_output_dir = os.path.join(output_dir, 'vocoder')
+        vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
+        vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
+        os.makedirs(vocoder_mix_dir, exist_ok=True)
+        os.makedirs(vocoder_stems_dir, exist_ok=True)
+        instrumental_output = None
+        vocal_output = None
+        for npy in stage1_output_set:
+            if 'instrumental' in npy:
+                # Process instrumental
+                instrumental_output = process_audio(
+                    npy,
+                    os.path.join(vocoder_stems_dir, 'instrumental.mp3'),
+                    rescale,
+                    argparse.Namespace(**locals()),  # Convert local variables to argparse.Namespace
+                    inst_decoder,
+                    codec_model
+                )
+            else:
+                # Process vocal
+                vocal_output = process_audio(
+                    npy,
+                    os.path.join(vocoder_stems_dir, 'vocal.mp3'),
+                    rescale,
+                    argparse.Namespace(**locals()),  # Convert local variables to argparse.Namespace
+                    vocal_decoder,
+                    codec_model
+                )
+        # mix tracks
         try:
+            mix_output = instrumental_output + vocal_output
+            vocoder_mix = os.path.join(vocoder_mix_dir, os.path.basename(recons_mix))
+            save_audio(mix_output, vocoder_mix, 44100, rescale)
+            print(f"Created mix: {vocoder_mix}")
+        except RuntimeError as e:
             print(e)
+            print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
+        # Post process
+        final_output_path = os.path.join(output_dir, os.path.basename(recons_mix))
+        replace_low_freq_with_energy_matched(
+            a_file=recons_mix,  # 16kHz
+            b_file=vocoder_mix,  # 44.1kHz (written above with sample_rate=44100)
+            c_file=final_output_path,
+            cutoff_freq=5500.0
+        )
+        print("All process Done")
+        return final_output_path
 @spaces.GPU(duration=120)
+def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=10):
     # Execute the command
     try:
         music = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
+                               cuda_idx=0, max_new_tokens=max_new_tokens)
         return music
     except Exception as e:
         gr.Warning("An Error Occured: " + str(e))
     finally:
         print("Temporary files deleted.")
+# Gradio
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")