import gc
import os
import tempfile
import traceback

import gradio as gr
import numpy as np
import requests
import spaces
import torch
import torchaudio

from slicer import Slicer
from infer import (
    load_models,
    load_audio,
    apply_fade,
    process_segment,
    batch_process_segments
)

# Global model state, populated by initialize_models()
svc_model = vocoder = rmvpe = hubert = rms_extractor = spk2idx = dataset_cfg = None
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Default local checkpoint path
DEFAULT_MODEL_PATH = "pretrained/dit-768-12_nanami.ckpt"

# HuggingFace URL used as a fallback when the local checkpoint is missing
HF_MODEL_URL = "https://huggingface.co/Pur1zumu/RIFT-SVC-finetuned/resolve/main/dit-768-12_nanami.ckpt"

# Maximum audio duration in seconds, to avoid out-of-memory errors
MAX_AUDIO_DURATION = 300  # 5 minutes


def initialize_models(model_path=DEFAULT_MODEL_PATH):
    global svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg

    # Always use FP16 by default
    use_fp16 = True

    # Free memory from any previously loaded models
    if svc_model is not None:
        del svc_model
        del vocoder
        del rmvpe
        del hubert
        del rms_extractor
        torch.cuda.empty_cache()
        gc.collect()

    try:
        # Fall back to downloading the checkpoint if it is missing locally
        if not os.path.exists(model_path):
            print(f"Model not found at {model_path}, attempting to download from HuggingFace...")

            # Use a persistent temp path so the download is reused between sessions
            temp_model_path = os.path.join(tempfile.gettempdir(), "RIFT-SVC-model.ckpt")

            # Only download if the model is not already in the temp location
            if not os.path.exists(temp_model_path):
                try:
                    # Stream the checkpoint to disk in 8 KiB chunks
                    response = requests.get(HF_MODEL_URL, stream=True)
                    response.raise_for_status()

                    with open(temp_model_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    print(f"Model downloaded successfully to {temp_model_path}")
                except Exception as e:
                    print(f"Failed to download model: {str(e)}")
                    raise Exception(f"Model not found at {DEFAULT_MODEL_PATH} and download failed: {str(e)}")
            else:
                print(f"Using previously downloaded model from {temp_model_path}")

            model_path = temp_model_path

        svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg = \
            load_models(model_path, device, use_fp16)

        available_speakers = list(spk2idx.keys())
        return available_speakers, f"✅ Model loaded successfully! Available speakers: {', '.join(available_speakers)}"
    except Exception as e:
        error_trace = traceback.format_exc()
        return [], f"❌ Error loading model: {str(e)}\n\nDetails: {error_trace}"


def check_audio_length(audio_path, max_duration=MAX_AUDIO_DURATION):
    """Check whether an audio file is short enough to process safely."""
    try:
        info = torchaudio.info(audio_path)
        duration = info.num_frames / info.sample_rate
        return duration <= max_duration, duration
    except Exception:
        # If the length cannot be determined, try to process the file anyway
        return True, 0
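
# Illustration (not called anywhere in this app): a minimal sketch of the
# linear crossfade that process_with_progress() below uses to stitch
# converted segments together. The tail of the audio already written is
# faded out with np.linspace(1, 0, n) while the head of the incoming
# segment is faded in, so the overlap sums to unity gain. This assumes
# apply_fade() applies a linear ramp, matching the explicit fade-outs in
# the conversion loop; the names here are hypothetical.
def _crossfade_sketch(prev_tail, next_head):
    n = len(prev_tail)
    faded_out = prev_tail * np.linspace(1, 0, n)  # previous segment's tail
    faded_in = next_head * np.linspace(0, 1, n)   # incoming segment's head
    return faded_out + faded_in                   # unity gain across the overlap
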
@spaces.GPU(duration=120)
def process_with_progress(
    progress=gr.Progress(),
    input_audio=None,
    speaker=None,
    key_shift=0,
    infer_steps=32,
    robust_f0=1,
    # Advanced CFG parameters
    ds_cfg_strength=0.1,
    spk_cfg_strength=1.0,
    skip_cfg_strength=0.0,
    cfg_skip_layers=6,
    cfg_rescale=0.7,
    cvec_downsample_rate=2,
    # Slicer parameters
    slicer_threshold=-30.0,
    slicer_min_length=3000,
    slicer_min_interval=100,
    slicer_hop_size=10,
    slicer_max_sil_kept=200,
    # Batch processing
    batch_size=1
):
    global svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg

    # Fixed parameters
    target_loudness = -18.0
    restore_loudness = True
    fade_duration = 20.0  # milliseconds
    sliced_inference = False
    use_fp16 = True  # Always use FP16 by default

    # Input validation
    if input_audio is None:
        return None, "❌ Error: no input audio provided."
    if svc_model is None:
        return None, "❌ Error: model not loaded. Please reload the page or check the model path."
    if speaker is None or speaker not in spk2idx:
        return None, f"❌ Error: invalid speaker selection. Available speakers: {', '.join(spk2idx.keys())}"

    # Reject over-long audio to avoid out-of-memory errors
    is_safe_length, duration = check_audio_length(input_audio)
    if not is_safe_length:
        return None, f"❌ Error: audio is too long ({duration:.1f} s). The maximum allowed duration is {MAX_AUDIO_DURATION} s."

    # Process the audio
    try:
        progress(0, desc="Processing: loading audio...")

        # Convert speaker name to ID
        speaker_id = spk2idx[speaker]

        # Fixed config matching the loaded model
        hop_length = 512
        sample_rate = 44100

        # Load audio
        audio = load_audio(input_audio, sample_rate)

        # Initialize the silence-based slicer
        slicer = Slicer(
            sr=sample_rate,
            threshold=slicer_threshold,
            min_length=slicer_min_length,
            min_interval=slicer_min_interval,
            hop_size=slicer_hop_size,
            max_sil_kept=slicer_max_sil_kept
        )

        progress(0.1, desc="Processing: slicing audio...")

        # Slice the input audio into (start_sample, chunk) segments
        segments_with_pos = slicer.slice(audio)
        if not segments_with_pos:
            return None, "❌ Error: no valid audio segments found in the input file."

        # Fade size in samples (fade_duration is in milliseconds)
        fade_samples = int(fade_duration * sample_rate / 1000)

        # Output buffer, with extra space for potential overlap
        result_audio = np.zeros(len(audio) + fade_samples)

        progress(0.2, desc="Processing: starting conversion...")

        with torch.no_grad():
            if batch_size > 1:
                # Batched processing
                progress_desc = "Processing: batch {0}/{1}"
                processed_segments = batch_process_segments(
                    segments_with_pos, svc_model, vocoder, rmvpe, hubert, rms_extractor,
                    speaker_id, sample_rate, hop_length, device,
                    key_shift, infer_steps,
                    ds_cfg_strength, spk_cfg_strength, skip_cfg_strength,
                    cfg_skip_layers, cfg_rescale, cvec_downsample_rate,
                    target_loudness, restore_loudness,
                    robust_f0, use_fp16, batch_size,
                    progress, progress_desc
                )

                for idx, (start_sample, audio_out, expected_length) in enumerate(processed_segments):
                    # Apply fades
                    if idx > 0:  # Not the first segment
                        audio_out = apply_fade(audio_out.copy(), fade_samples, fade_in=True)
                        result_audio[start_sample:start_sample + fade_samples] *= \
                            np.linspace(1, 0, fade_samples)  # Fade out the previous tail
                    if idx < len(processed_segments) - 1:  # Not the last segment
                        audio_out[-fade_samples:] *= np.linspace(1, 0, fade_samples)  # Fade out

                    # Overlap-add into the result buffer
                    result_audio[start_sample:start_sample + len(audio_out)] += audio_out

                    # Clean up memory every 5 segments
                    if idx % 5 == 0:
                        torch.cuda.empty_cache()
            else:
                # Sequential processing
                for i, (start_sample, chunk) in enumerate(segments_with_pos):
                    segment_progress = 0.2 + (0.7 * (i / len(segments_with_pos)))
                    progress(segment_progress, desc=f"Processing: segment {i+1}/{len(segments_with_pos)}")

                    # Convert the segment
                    audio_out = process_segment(
                        chunk, svc_model, vocoder, rmvpe, hubert, rms_extractor,
                        speaker_id, sample_rate, hop_length, device,
                        key_shift, infer_steps,
                        ds_cfg_strength, spk_cfg_strength, skip_cfg_strength,
                        cfg_skip_layers, cfg_rescale, cvec_downsample_rate,
                        target_loudness, restore_loudness,
                        sliced_inference, robust_f0, use_fp16
                    )

                    # Ensure the output length matches the input chunk
                    expected_length = len(chunk)
                    if len(audio_out) > expected_length:
                        audio_out = audio_out[:expected_length]
                    elif len(audio_out) < expected_length:
                        audio_out = np.pad(audio_out, (0, expected_length - len(audio_out)), 'constant')

                    # Apply fades
                    if i > 0:  # Not the first segment
                        audio_out = apply_fade(audio_out.copy(), fade_samples, fade_in=True)
                        result_audio[start_sample:start_sample + fade_samples] *= \
                            np.linspace(1, 0, fade_samples)  # Fade out the previous tail
                    if i < len(segments_with_pos) - 1:  # Not the last segment
                        audio_out[-fade_samples:] *= np.linspace(1, 0, fade_samples)  # Fade out

                    # Overlap-add into the result buffer
                    result_audio[start_sample:start_sample + len(audio_out)] += audio_out

                    # Clean up memory after each segment
                    torch.cuda.empty_cache()

        progress(0.9, desc="Processing: finalizing audio...")

        # Trim any extra padding
        result_audio = result_audio[:len(audio)]

        # Save the result to a temporary file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            output_path = temp_file.name
        torchaudio.save(output_path, torch.from_numpy(result_audio).unsqueeze(0).float(), sample_rate)

        progress(1.0, desc="Processing complete!")
        batch_text = f"batch size {batch_size}" if batch_size > 1 else "sequential processing"
        return (sample_rate, result_audio), f"✅ Conversion complete! Converted to **{speaker}** with a **{key_shift}** semitone shift. ({batch_text})"

    except RuntimeError as e:
        # Handle CUDA out-of-memory errors
        if "CUDA out of memory" in str(e):
            torch.cuda.empty_cache()
            gc.collect()
            return None, "❌ Error: out of memory. Please try a shorter audio file or fewer inference steps."
        else:
            return None, f"❌ Error during conversion: {str(e)}"
    except Exception as e:
        error_trace = traceback.format_exc()
        return None, f"❌ Error during conversion: {str(e)}\n\nDetails: {error_trace}"
    finally:
        # Clean up memory
        torch.cuda.empty_cache()
        gc.collect()
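
# Headless usage sketch (an assumption for illustration only; the app drives
# process_with_progress() exclusively through the Gradio UI built below).
# A no-op callback stands in for gr.Progress(), and "input.wav" is a
# placeholder path. The behavior of the @spaces.GPU wrapper outside a
# HuggingFace Space is not guaranteed, so this function is never called.
def _headless_convert_sketch():
    speakers, message = initialize_models()
    print(message)
    if not speakers:
        return
    result, status = process_with_progress(
        progress=lambda value, desc=None: None,  # no-op progress callback
        input_audio="input.wav",                 # placeholder input path
        speaker=speakers[0],
        key_shift=0,
    )
    print(status)
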
def create_ui():
    # CSS for styling the interface
    css = """
    .gradio-container {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .container {
        max-width: 1200px;
        margin: auto;
    }
    .footer {
        margin-top: 20px;
        text-align: center;
        font-size: 0.9em;
        color: #666;
    }
    .title {
        text-align: center;
        margin-bottom: 10px;
    }
    .subtitle {
        text-align: center;
        margin-bottom: 20px;
        color: #666;
    }
    .button-primary {
        background-color: #5460DE !important;
    }
    .output-message {
        margin-top: 10px;
        padding: 10px;
        border-radius: 4px;
        background-color: #f8f9fa;
        border-left: 4px solid #5460DE;
    }
    .error-message {
        color: #d62828;
        font-weight: bold;
    }
    .success-message {
        color: #588157;
        font-weight: bold;
    }
    .info-box {
        background-color: #f8f9fa;
        border-left: 4px solid #5460DE;
        padding: 10px;
        margin: 10px 0;
        border-radius: 4px;
    }
    """

    # Initialize models at startup
    available_speakers, init_message = initialize_models()

    with gr.Blocks(css=css, theme=gr.themes.Soft(), title="RIFT-SVC Voice Conversion") as app:
        gr.HTML("""
        <div class="info-box">
            🔗 <b>Want to fine-tune your own speaker?</b> Visit the RIFT-SVC GitHub repository for the complete training and fine-tuning guide.
        </div>
        <div class="info-box">
            🎤 <b>Data source:</b> this demo was trained on roughly 30 minutes of Nanami singing clips quickly scraped from Bilibili; the vocals were separated and used for training directly, with no additional curation.
        </div>
        <div class="info-box">
            📝 <b>Note:</b> for best results, use clean audio with little background noise. The maximum audio length is 5 minutes; shorter test clips are recommended so the platform does not unexpectedly interrupt the task.
        </div>