prismleong committed
Commit 898b100 · 1 Parent(s): 5c61d84
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
- title: RIFT SVC
- emoji: 😻
+ title: RIFT-SVC (七海Nanami demo)
+ emoji: 🎵
  colorFrom: red
  colorTo: yellow
  sdk: gradio
@@ -11,4 +11,4 @@ license: cc-by-nc-sa-4.0
  short_description: https://github.com/Pur1zumu/RIFT-SVC
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
app.py ADDED
@@ -0,0 +1,398 @@
1
+ import numpy as np
2
+ import torch
3
+ import torchaudio
4
+ import gradio as gr
5
+ import tempfile
6
+ import gc
7
+ import traceback
8
+ from slicer import Slicer
9
+
10
+ from infer import (
11
+ load_models,
12
+ load_audio,
13
+ apply_fade,
14
+ process_segment
15
+ )
16
+
17
+ # Global variables for models
18
+ global svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg, device
19
+ svc_model = vocoder = rmvpe = hubert = rms_extractor = spk2idx = dataset_cfg = None
20
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
21
+
22
+ # Set default model path
23
+ DEFAULT_MODEL_PATH = "pretrained/dit-768-12_nanami.ckpt"
24
+
25
+ # Maximum audio duration in seconds to avoid memory issues
26
+ MAX_AUDIO_DURATION = 300 # 5 minutes
27
+
28
+ def initialize_models(model_path=DEFAULT_MODEL_PATH):
29
+ global svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg
30
+
31
+ # Clean up memory before loading models
32
+ if svc_model is not None:
33
+ del svc_model
34
+ del vocoder
35
+ del rmvpe
36
+ del hubert
37
+ del rms_extractor
38
+ torch.cuda.empty_cache()
39
+ gc.collect()
40
+
41
+ try:
42
+ svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg = load_models(model_path, device)
43
+ available_speakers = list(spk2idx.keys())
44
+ return available_speakers, f"✅ 模型加载成功!可用说话人: {', '.join(available_speakers)}"
45
+ except Exception as e:
46
+ error_trace = traceback.format_exc()
47
+ return [], f"❌ 加载模型出错: {str(e)}\n\n详细信息: {error_trace}"
48
+
49
+ def check_audio_length(audio_path, max_duration=MAX_AUDIO_DURATION):
50
+ """Check if audio file is too long to process safely"""
51
+ try:
52
+ info = torchaudio.info(audio_path)
53
+ duration = info.num_frames / info.sample_rate
54
+ return duration <= max_duration, duration
55
+ except Exception:
56
+ # If we can't determine the length, we'll try to process it anyway
57
+ return True, 0
58
+
59
+ def process_with_progress(
60
+ progress=gr.Progress(),
61
+ input_audio=None,
62
+ speaker=None,
63
+ key_shift=0,
64
+ infer_steps=32,
65
+ robust_f0=0,
66
+ # Advanced CFG parameters
67
+ ds_cfg_strength=0.05,
68
+ spk_cfg_strength=1.0,
69
+ skip_cfg_strength=0.0,
70
+ cfg_skip_layers=6,
71
+ cfg_rescale=0.7,
72
+ cvec_downsample_rate=2,
73
+ # Slicer parameters
74
+ slicer_threshold=-30.0,
75
+ slicer_min_length=3000,
76
+ slicer_min_interval=100,
77
+ slicer_hop_size=10,
78
+ slicer_max_sil_kept=200
79
+ ):
80
+ global svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg
81
+
82
+ # Fixed target loudness value
83
+ target_loudness = -18.0
84
+
85
+ # Fixed audio parameters
86
+ restore_loudness = True
87
+ fade_duration = 20.0
88
+ sliced_inference = False
89
+
90
+ # Input validation
91
+ if input_audio is None:
92
+ return None, "❌ 错误: 未提供输入音频。"
93
+
94
+ if svc_model is None:
95
+ return None, "❌ 错误: 模型未加载。请重新加载页面或检查模型路径。"
96
+
97
+ if speaker is None or speaker not in spk2idx:
98
+ return None, f"❌ 错误: 无效的说话人选择。可用说话人: {', '.join(spk2idx.keys())}"
99
+
100
+ # Check audio length to avoid memory issues
101
+ is_safe_length, duration = check_audio_length(input_audio)
102
+ if not is_safe_length:
103
+ return None, f"❌ 错误: 音频过长 ({duration:.1f} 秒)。允许的最大时长为 {MAX_AUDIO_DURATION} 秒。"
104
+
105
+ # Process the audio
106
+ try:
107
+ # Update status message
108
+ progress(0, desc="处理中: 加载音频...")
109
+
110
+ # Convert speaker name to ID
111
+ speaker_id = spk2idx[speaker]
112
+
113
+ # Get config from loaded model
114
+ hop_length = 512
115
+ sample_rate = 44100
116
+
117
+ # Load audio
118
+ audio = load_audio(input_audio, sample_rate)
119
+
120
+ # Initialize Slicer
121
+ slicer = Slicer(
122
+ sr=sample_rate,
123
+ threshold=slicer_threshold,
124
+ min_length=slicer_min_length,
125
+ min_interval=slicer_min_interval,
126
+ hop_size=slicer_hop_size,
127
+ max_sil_kept=slicer_max_sil_kept
128
+ )
129
+
130
+ progress(0.1, desc="处理中: 切分音频...")
131
+ # Slice the input audio
132
+ segments_with_pos = slicer.slice(audio)
133
+
134
+ if not segments_with_pos:
135
+ return None, "❌ 错误: 在输入文件中未找到有效的音频片段。"
136
+
137
+ # Calculate fade size in samples
138
+ fade_samples = int(fade_duration * sample_rate / 1000)
139
+
140
+ # Process segments
141
+ result_audio = np.zeros(len(audio) + fade_samples) # Extra space for potential overlap
142
+
143
+ progress(0.2, desc="处理中: 开始转换...")
144
+
145
+ with torch.no_grad():
146
+ for i, (start_sample, chunk) in enumerate(segments_with_pos):
147
+ segment_progress = 0.2 + (0.7 * (i / len(segments_with_pos)))
148
+ progress(segment_progress, desc=f"处理中: 片段 {i+1}/{len(segments_with_pos)}")
149
+
150
+ # Process the segment
151
+ audio_out = process_segment(
152
+ chunk, svc_model, vocoder, rmvpe, hubert, rms_extractor,
153
+ speaker_id, sample_rate, hop_length, device,
154
+ key_shift, infer_steps, ds_cfg_strength, spk_cfg_strength,
155
+ skip_cfg_strength, cfg_skip_layers, cfg_rescale,
156
+ cvec_downsample_rate, target_loudness, restore_loudness, sliced_inference,
157
+ robust_f0
158
+ )
159
+
160
+ # Ensure consistent length
161
+ expected_length = len(chunk)
162
+ if len(audio_out) > expected_length:
163
+ audio_out = audio_out[:expected_length]
164
+ elif len(audio_out) < expected_length:
165
+ audio_out = np.pad(audio_out, (0, expected_length - len(audio_out)), 'constant')
166
+
167
+ # Apply fades
168
+ if i > 0: # Not first segment
169
+ audio_out = apply_fade(audio_out.copy(), fade_samples, fade_in=True)
170
+ result_audio[start_sample:start_sample + fade_samples] *= \
171
+ np.linspace(1, 0, fade_samples) # Fade out previous
172
+
173
+ if i < len(segments_with_pos) - 1: # Not last segment
174
+ audio_out[-fade_samples:] *= np.linspace(1, 0, fade_samples) # Fade out
175
+
176
+ # Add to result
177
+ result_audio[start_sample:start_sample + len(audio_out)] += audio_out
178
+
179
+ # Clean up memory after each segment
180
+ torch.cuda.empty_cache()
181
+
182
+ progress(0.9, desc="处理中: 完成音频...")
183
+ # Trim any extra padding
184
+ result_audio = result_audio[:len(audio)]
185
+
186
+ # Create a temporary file to save the result
187
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
188
+ output_path = temp_file.name
189
+
190
+ # Save output
191
+ torchaudio.save(output_path, torch.from_numpy(result_audio).unsqueeze(0).float(), sample_rate)
192
+
193
+ progress(1.0, desc="处理完成!")
194
+ return (sample_rate, result_audio), f"✅ 转换完成! 已转换为 **{speaker}** 并调整 **{key_shift}** 个半音。"
195
+
196
+ except RuntimeError as e:
197
+ # Handle CUDA out of memory errors
198
+ if "CUDA out of memory" in str(e):
199
+ # Clean up memory
200
+ torch.cuda.empty_cache()
201
+ gc.collect()
202
+
203
+ return None, f"❌ 错误: 内存不足。请尝试更短的音频文件或减少推理步骤。"
204
+ else:
205
+ return None, f"❌ 转换过程中出错: {str(e)}"
206
+ except Exception as e:
207
+ error_trace = traceback.format_exc()
208
+ return None, f"❌ 转换过程中出错: {str(e)}\n\n详细信息: {error_trace}"
209
+ finally:
210
+ # Clean up memory
211
+ torch.cuda.empty_cache()
212
+ gc.collect()
213
+
214
+ def create_ui():
215
+ # CSS for better styling
216
+ css = """
217
+ .gradio-container {
218
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
219
+ }
220
+ .container {
221
+ max-width: 1200px;
222
+ margin: auto;
223
+ }
224
+ .footer {
225
+ margin-top: 20px;
226
+ text-align: center;
227
+ font-size: 0.9em;
228
+ color: #666;
229
+ }
230
+ .title {
231
+ text-align: center;
232
+ margin-bottom: 10px;
233
+ }
234
+ .subtitle {
235
+ text-align: center;
236
+ margin-bottom: 20px;
237
+ color: #666;
238
+ }
239
+ .button-primary {
240
+ background-color: #5460DE !important;
241
+ }
242
+ .output-message {
243
+ margin-top: 10px;
244
+ padding: 10px;
245
+ border-radius: 4px;
246
+ background-color: #f8f9fa;
247
+ border-left: 4px solid #5460DE;
248
+ }
249
+ .error-message {
250
+ color: #d62828;
251
+ font-weight: bold;
252
+ }
253
+ .success-message {
254
+ color: #588157;
255
+ font-weight: bold;
256
+ }
257
+ .info-box {
258
+ background-color: #f8f9fa;
259
+ border-left: 4px solid #5460DE;
260
+ padding: 10px;
261
+ margin: 10px 0;
262
+ border-radius: 4px;
263
+ }
264
+ """
265
+
266
+ # Initialize models
267
+ available_speakers, init_message = initialize_models()
268
+
269
+ with gr.Blocks(css=css, theme=gr.themes.Soft(), title="RIFT-SVC 声音转换") as app:
270
+ gr.HTML("""
271
+ <div class="title">
272
+ <h1>🎤 RIFT-SVC 歌声音色转换 (七海Nanami demo)</h1>
273
+ </div>
274
+ <div class="subtitle">
275
+ <h3>使用 RIFT-SVC 模型将歌声或语音转换为七海Nanami的音色</h3>
276
+ </div>
277
+ <div class="info-box">
278
+ <p>📝 <strong>注意:</strong> 为获得最佳效果,请使用背景噪音较少的干净音频。最大音频长度为5分钟。</p>
279
+ </div>
280
+ <div class="info-box">
281
+ <p>🔗 <strong>想要微调自己的说话人?</strong> 请访问 <a href="https://github.com/Pur1zumu/RIFT-SVC" target="_blank">RIFT-SVC GitHub 仓库</a> 获取完整的训练和微调指南。</p>
282
+ </div>
283
+ """)
284
+
285
+ with gr.Row():
286
+ # Left column (input parameters)
287
+ with gr.Column(scale=1):
288
+ with gr.Group():
289
+ gr.Markdown("### 📥 输入")
290
+ model_path = gr.Textbox(label="模型路径", value=DEFAULT_MODEL_PATH, interactive=True)
291
+ input_audio = gr.Audio(label="输入音频文件", type="filepath", elem_id="input_audio")
292
+ reload_btn = gr.Button("🔄 重新加载模型", elem_id="reload_btn")
293
+
294
+ with gr.Accordion("⚙️ 基本参数", open=True):
295
+ speaker = gr.Dropdown(choices=available_speakers, label="目标说话人", interactive=True, elem_id="speaker")
296
+ key_shift = gr.Slider(minimum=-12, maximum=12, step=1, value=0, label="音调调整(半音)", elem_id="key_shift")
297
+ infer_steps = gr.Slider(minimum=8, maximum=64, step=1, value=32, label="推理步数", elem_id="infer_steps",
298
+ info="更低的值 = 更快但质量较低,更高的值 = 更慢但质量更好")
299
+ robust_f0 = gr.Radio(choices=[0, 1, 2], value=0, label="音高滤波",
300
+ info="0=无,1=轻度过滤,2=强力过滤(有助于解决断音/破音问题)",
301
+ elem_id="robust_f0")
302
+
303
+ with gr.Accordion("🔬 高级CFG参数", open=True):
304
+ ds_cfg_strength = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.05,
305
+ label="内容向量引导强度",
306
+ info="更高的值可以改善内容保留和咬字清晰度。过高会用力过猛。",
307
+ elem_id="ds_cfg_strength")
308
+ spk_cfg_strength = gr.Slider(minimum=0.0, maximum=2.0, step=0.01, value=1.0,
309
+ label="说话人引导强度",
310
+ info="更高的值可以增强说话人相似度。过高可能导致音色失真。",
311
+ elem_id="spk_cfg_strength")
312
+ skip_cfg_strength = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.0,
313
+ label="层引导强度(实验性功能)",
314
+ info="增强指定层的特征渲染。效果取决于目标层的功能。",
315
+ elem_id="skip_cfg_strength")
316
+ cfg_skip_layers = gr.Number(value=6, label="CFG跳过层(实验性功能)", precision=0,
317
+ info="目标增强层下标",
318
+ elem_id="cfg_skip_layers")
319
+ cfg_rescale = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.7,
320
+ label="CFG重缩放因子",
321
+ info="约束整体引导强度。当引导效果过于强烈时使用调高该值。",
322
+ elem_id="cfg_rescale")
323
+ cvec_downsample_rate = gr.Radio(choices=[1, 2, 4, 8], value=2,
324
+ label="用于反向引导的内容向量下采样率",
325
+ info="更高的值(可能)可以提高内容清晰度。",
326
+ elem_id="cvec_downsample_rate")
327
+
328
+ with gr.Accordion("✂️ 切片参数", open=False):
329
+ slicer_threshold = gr.Slider(minimum=-60.0, maximum=-20.0, step=0.1, value=-30.0,
330
+ label="阈值 (dB)",
331
+ info="静音检测阈值",
332
+ elem_id="slicer_threshold")
333
+ slicer_min_length = gr.Slider(minimum=1000, maximum=10000, step=100, value=3000,
334
+ label="最小长度 (毫秒)",
335
+ info="最小片段长度",
336
+ elem_id="slicer_min_length")
337
+ slicer_min_interval = gr.Slider(minimum=10, maximum=500, step=10, value=100,
338
+ label="最小静音间隔 (毫秒)",
339
+ info="片段之间的最小静音间隔",
340
+ elem_id="slicer_min_interval")
341
+ slicer_hop_size = gr.Slider(minimum=1, maximum=50, step=1, value=10,
342
+ label="跳跃大小 (毫秒)",
343
+ info="分析窗口跳跃大小",
344
+ elem_id="slicer_hop_size")
345
+ slicer_max_sil_kept = gr.Slider(minimum=50, maximum=10000, step=10, value=200,
346
+ label="最大保留静音 (毫秒)",
347
+ info="边界处保留的最大静音",
348
+ elem_id="slicer_max_sil_kept")
349
+
350
+
351
+ # Right column (output)
352
+ with gr.Column(scale=1):
353
+ convert_btn = gr.Button("🎵 转换声音", variant="primary", elem_id="convert_btn")
354
+ gr.Markdown("### 📤 输出")
355
+ output_audio = gr.Audio(label="转换后的音频", elem_id="output_audio", autoplay=False, show_share_button=False)
356
+ output_message = gr.Markdown(init_message, elem_id="output_message", elem_classes="output-message")
357
+
358
+ gr.HTML("""
359
+ <div class="info-box">
360
+ <h4>🔍 快速提示</h4>
361
+ <ul>
362
+ <li><strong>音调调整:</strong> 以半音为单位上调或下调音高。</li>
363
+ <li><strong>推理步骤:</strong> 步骤越多 = 质量越好但速度越慢。</li>
364
+ <li><strong>音高滤波:</strong> 有助于提高具有挑战性的音频中的音高稳定性。</li>
365
+ <li><strong>CFG参数:</strong> 调整转换质量和音色。</li>
366
+ </ul>
367
+ </div>
368
+ """)
369
+
370
+ # Define button click events
371
+ reload_btn.click(
372
+ fn=initialize_models,
373
+ inputs=[model_path],
374
+ outputs=[speaker, output_message]
375
+ )
376
+
377
+ # Updated convert button click event
378
+ convert_btn.click(
379
+ fn=lambda: "⏳ 处理中... 请稍候。",
380
+ inputs=None,
381
+ outputs=output_message,
382
+ queue=False
383
+ ).then(
384
+ fn=process_with_progress,
385
+ inputs=[
386
+ input_audio, speaker, key_shift, infer_steps, robust_f0,
387
+ ds_cfg_strength, spk_cfg_strength, skip_cfg_strength, cfg_skip_layers, cfg_rescale, cvec_downsample_rate,
388
+ slicer_threshold, slicer_min_length, slicer_min_interval, slicer_hop_size, slicer_max_sil_kept
389
+ ],
390
+ outputs=[output_audio, output_message],
391
+ show_progress_on=output_audio
392
+ )
393
+
394
+ return app
395
+
396
+ if __name__ == "__main__":
397
+ app = create_ui()
398
+ app.launch()
infer.py ADDED
@@ -0,0 +1,493 @@
1
+ import click
2
+ import librosa
3
+ import numpy as np
4
+ import pyloudnorm as pyln
5
+ import torch
6
+ import torchaudio
7
+ from pathlib import Path
8
+ from tqdm import tqdm
9
+
10
+ from rift_svc import DiT, RF
11
+ from rift_svc.feature_extractors import HubertModelWithFinalProj, RMSExtractor, get_mel_spectrogram
12
+ from rift_svc.nsf_hifigan import NsfHifiGAN
13
+ from rift_svc.rmvpe import RMVPE
14
+ from rift_svc.utils import linear_interpolate_tensor, post_process_f0, f0_ensemble, f0_ensemble_light, get_f0_pw, get_f0_pm
15
+ from slicer import Slicer
16
+
17
+
18
+ torch.set_grad_enabled(False)
19
+
20
+
21
+ def extract_state_dict(ckpt):
22
+ state_dict = ckpt['state_dict']
23
+ new_state_dict = {}
24
+ for k, v in state_dict.items():
25
+ if k.startswith('model.'):
26
+ new_k = k.replace('model.', '')
27
+ new_state_dict[new_k] = v
28
+ spk2idx = ckpt['hyper_parameters']['cfg']['spk2idx']
29
+ model_cfg = ckpt['hyper_parameters']['cfg']['model']
30
+ dataset_cfg = ckpt['hyper_parameters']['cfg']['dataset']
31
+ return new_state_dict, spk2idx, model_cfg, dataset_cfg
32
+
33
+
34
+ def load_models(model_path, device):
35
+ """Load all required models and return them"""
36
+ click.echo("Loading models...")
37
+
38
+ # Load the conversion model
39
+ ckpt = torch.load(model_path, map_location='cpu')
40
+ state_dict, spk2idx, dit_cfg, dataset_cfg = extract_state_dict(ckpt)
41
+
42
+ transformer = DiT(num_speaker=len(spk2idx), **dit_cfg)
43
+ svc_model = RF(transformer=transformer)
44
+ svc_model.load_state_dict(state_dict)
45
+ svc_model = svc_model.to(device)
46
+ svc_model.eval()
47
+
48
+ # Load additional models
49
+ vocoder = NsfHifiGAN('pretrained/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt').to(device)
50
+ rmvpe = RMVPE(model_path="pretrained/rmvpe/model.pt", hop_length=160, device=device)
51
+ hubert = HubertModelWithFinalProj.from_pretrained("pretrained/content-vec-best").to(device)
52
+ rms_extractor = RMSExtractor().to(device)
53
+
54
+ return svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg
55
+
56
+
57
+ def load_audio(file_path, target_sr):
58
+ """Load and preprocess audio file"""
59
+ click.echo("Loading audio...")
60
+ audio, sr = torchaudio.load(file_path)
61
+ if sr != target_sr:
62
+ audio = torchaudio.functional.resample(audio, sr, target_sr)
63
+
64
+ if len(audio.shape) > 1:
65
+ audio = audio.mean(dim=0, keepdim=True)
66
+
67
+ return audio.numpy().squeeze()
68
+
69
+
70
+ def apply_fade(audio, fade_samples, fade_in=True):
71
+ """Apply fade in/out using half of a Hanning window"""
72
+ fade_window = np.hanning(fade_samples * 2)
73
+ if fade_in:
74
+ fade_curve = fade_window[:fade_samples]
75
+ else:
76
+ fade_curve = fade_window[fade_samples:]
77
+ audio[:fade_samples] *= fade_curve
78
+ return audio
79
+
80
+
81
+ def extract_features(audio_segment, sample_rate, hop_length, rmvpe, hubert, rms_extractor,
82
+ device, key_shift=0, ds_cfg_strength=0.0, cvec_downsample_rate=2, target_loudness=-18.0,
83
+ robust_f0=0):
84
+ """Extract all required features from an audio segment"""
85
+ # Normalize input segment
86
+ meter = pyln.Meter(sample_rate, block_size=0.1)
87
+ original_loudness = meter.integrated_loudness(audio_segment)
88
+ normalized_audio = pyln.normalize.loudness(audio_segment, original_loudness, target_loudness)
89
+
90
+ # Handle potential clipping
91
+ max_amp = np.max(np.abs(normalized_audio))
92
+ if max_amp > 1.0:
93
+ normalized_audio = normalized_audio * (0.99 / max_amp)
94
+
95
+ audio_tensor = torch.from_numpy(normalized_audio).float().unsqueeze(0).to(device)
96
+ audio_16khz = torch.from_numpy(librosa.resample(normalized_audio, orig_sr=sample_rate, target_sr=16000)).float().unsqueeze(0).to(device)
97
+
98
+ # Extract mel spectrogram
99
+ mel = get_mel_spectrogram(
100
+ audio_tensor,
101
+ sampling_rate=sample_rate,
102
+ n_fft=2048,
103
+ num_mels=128,
104
+ hop_size=512,
105
+ win_size=2048,
106
+ fmin=40,
107
+ fmax=16000
108
+ ).transpose(1, 2)
109
+
110
+ # Extract content vector
111
+ cvec = hubert(audio_16khz)["last_hidden_state"].squeeze(0)
112
+ cvec = linear_interpolate_tensor(cvec, mel.shape[1])[None, :]
113
+
114
+ # Create bad_cvec (downsampled) for classifier-free guidance
115
+ if ds_cfg_strength > 0:
116
+ cvec_ds = cvec.clone()
117
+ # Downsample and then interpolate back, similar to dataset.py
118
+ cvec_ds = cvec_ds[0, ::2, :] # Take every other frame
119
+ cvec_ds = linear_interpolate_tensor(cvec_ds, cvec_ds.shape[0]//cvec_downsample_rate)
120
+ cvec_ds = linear_interpolate_tensor(cvec_ds, mel.shape[1])[None, :]
121
+ else:
122
+ cvec_ds = None
123
+
124
+ # Extract f0
125
+ if robust_f0 > 0:
126
+ # Parameters for F0 extraction
127
+ time_step = hop_length / sample_rate
128
+ f0_min = 40
129
+ f0_max = 1100
130
+
131
+ # Extract F0 using multiple methods
132
+ rmvpe_f0 = rmvpe.infer_from_audio(audio_tensor, sample_rate=sample_rate, device=device)
133
+ rmvpe_f0 = post_process_f0(rmvpe_f0, sample_rate, hop_length, mel.shape[1], silence_front=0.0, cut_last=False)
134
+ pw_f0 = get_f0_pw(normalized_audio, sample_rate, time_step, f0_min, f0_max)
135
+ pmac_f0 = get_f0_pm(normalized_audio, sample_rate, time_step, f0_min, f0_max)
136
+
137
+ if robust_f0 == 1:
138
+ # Level 1: Light ensemble that preserves expressiveness
139
+ rms_np = rms_extractor(audio_tensor).squeeze().cpu().numpy()
140
+ f0 = f0_ensemble_light(rmvpe_f0, pw_f0, pmac_f0, rms=rms_np)
141
+ else:
142
+ # Level 2: Strong ensemble with more filtering
143
+ f0 = f0_ensemble(rmvpe_f0, pw_f0, pmac_f0)
144
+ else:
145
+ # Level 0: Use only RMVPE for F0 extraction (original method)
146
+ f0 = rmvpe.infer_from_audio(audio_tensor, sample_rate=sample_rate, device=device)
147
+ f0 = post_process_f0(f0, sample_rate, hop_length, mel.shape[1], silence_front=0.0, cut_last=False)
148
+
149
+ if key_shift != 0:
150
+ f0 = f0 * 2 ** (key_shift / 12)
151
+ f0 = torch.from_numpy(f0).float().to(device)[None, :]
152
+
153
+ # Extract RMS
154
+ rms = rms_extractor(audio_tensor)
155
+
156
+ return mel, cvec, cvec_ds, f0, rms, original_loudness
157
+
158
+
159
+ def run_inference(
160
+ model, mel, cvec, f0, rms, cvec_ds, spk_id,
161
+ infer_steps, ds_cfg_strength, spk_cfg_strength,
162
+ skip_cfg_strength, cfg_skip_layers, cfg_rescale,
163
+ sliced_inference=False
164
+ ):
165
+ """Run the actual inference through the model"""
166
+ if sliced_inference:
167
+ # Use sliced inference for long segments
168
+ sliced_len = 256
169
+ mel_crossfade_len = 8 # Number of frames to crossfade in mel domain
170
+
171
+ # If the segment is shorter than one slice, just process it directly
172
+ if mel.shape[1] <= sliced_len:
173
+ mel_out, _ = model.sample(
174
+ src_mel=mel,
175
+ spk_id=spk_id,
176
+ f0=f0,
177
+ rms=rms,
178
+ cvec=cvec,
179
+ steps=infer_steps,
180
+ bad_cvec=cvec_ds,
181
+ ds_cfg_strength=ds_cfg_strength,
182
+ spk_cfg_strength=spk_cfg_strength,
183
+ skip_cfg_strength=skip_cfg_strength,
184
+ cfg_skip_layers=cfg_skip_layers,
185
+ cfg_rescale=cfg_rescale,
186
+ )
187
+ return mel_out
188
+
189
+ # Create a tensor to hold the full output with crossfading
190
+ full_mel_out = torch.zeros_like(mel)
191
+
192
+ # Process each slice
193
+ for i in range(0, mel.shape[1], sliced_len - mel_crossfade_len):
194
+ # Determine slice boundaries
195
+ start_idx = i
196
+ end_idx = min(i + sliced_len, mel.shape[1])
197
+
198
+ # Skip if we're at the end
199
+ if start_idx >= mel.shape[1]:
200
+ break
201
+
202
+ # Extract slices for this window
203
+ mel_slice = mel[:, start_idx:end_idx, :]
204
+ cvec_slice = cvec[:, start_idx:end_idx, :]
205
+ f0_slice = f0[:, start_idx:end_idx]
206
+ rms_slice = rms[:, start_idx:end_idx]
207
+
208
+ # Slice the bad_cvec if it exists
209
+ cvec_ds_slice = None
210
+ if cvec_ds is not None:
211
+ cvec_ds_slice = cvec_ds[:, start_idx:end_idx, :]
212
+
213
+ # Process with model
214
+ mel_out_slice, _ = model.sample(
215
+ src_mel=mel_slice,
216
+ spk_id=spk_id,
217
+ f0=f0_slice,
218
+ rms=rms_slice,
219
+ cvec=cvec_slice,
220
+ steps=infer_steps,
221
+ bad_cvec=cvec_ds_slice,
222
+ ds_cfg_strength=ds_cfg_strength,
223
+ spk_cfg_strength=spk_cfg_strength,
224
+ skip_cfg_strength=skip_cfg_strength,
225
+ cfg_skip_layers=cfg_skip_layers,
226
+ cfg_rescale=cfg_rescale,
227
+ )
228
+
229
+ # Create crossfade weights
230
+ slice_len = end_idx - start_idx
231
+
232
+ # Apply different strategies depending on position
233
+ if i == 0: # First slice
234
+ # No crossfade at the beginning
235
+ weights = torch.ones((1, slice_len, 1), device=mel.device)
236
+ if i + sliced_len < mel.shape[1]: # If not the last slice too
237
+ # Fade out at the end - use the minimum of slice_len and mel_crossfade_len
238
+ actual_crossfade_len = min(mel_crossfade_len, slice_len)
239
+ if actual_crossfade_len > 0: # Only apply if we have space
240
+ fade_out = torch.linspace(1, 0, actual_crossfade_len, device=mel.device)
241
+ weights[:, -actual_crossfade_len:, :] = fade_out.view(1, -1, 1)
242
+ elif end_idx >= mel.shape[1]: # Last slice
243
+ # Fade in at the beginning - use the minimum of slice_len and mel_crossfade_len
244
+ weights = torch.ones((1, slice_len, 1), device=mel.device)
245
+ actual_crossfade_len = min(mel_crossfade_len, slice_len)
246
+ if actual_crossfade_len > 0: # Only apply if we have space
247
+ fade_in = torch.linspace(0, 1, actual_crossfade_len, device=mel.device)
248
+ weights[:, :actual_crossfade_len, :] = fade_in.view(1, -1, 1)
249
+ else: # Middle slices
250
+ # Crossfade both sides, handling the case where slice_len < 2*mel_crossfade_len
251
+ weights = torch.ones((1, slice_len, 1), device=mel.device)
252
+
253
+ # Determine the actual crossfade length (might be shorter for small slices)
254
+ actual_crossfade_len = min(mel_crossfade_len, slice_len // 2)
255
+ if actual_crossfade_len > 0:
256
+ fade_in = torch.linspace(0, 1, actual_crossfade_len, device=mel.device)
257
+ fade_out = torch.linspace(1, 0, actual_crossfade_len, device=mel.device)
258
+ weights[:, :actual_crossfade_len, :] = fade_in.view(1, -1, 1)
259
+ weights[:, -actual_crossfade_len:, :] = fade_out.view(1, -1, 1)
260
+
261
+ # Apply weights to current slice output
262
+ mel_out_slice = mel_out_slice * weights
263
+
264
+ # Add to the appropriate region of the output
265
+ full_mel_out[:, start_idx:end_idx, :] += mel_out_slice
266
+
267
+ # Return the full crossfaded output
268
+ mel_out = full_mel_out
269
+ else:
270
+ # Process the entire segment at once
271
+ mel_out, _ = model.sample(
272
+ src_mel=mel,
273
+ spk_id=spk_id,
274
+ f0=f0,
275
+ rms=rms,
276
+ cvec=cvec,
277
+ steps=infer_steps,
278
+ bad_cvec=cvec_ds,
279
+ ds_cfg_strength=ds_cfg_strength,
280
+ spk_cfg_strength=spk_cfg_strength,
281
+ skip_cfg_strength=skip_cfg_strength,
282
+ cfg_skip_layers=cfg_skip_layers,
283
+ cfg_rescale=cfg_rescale,
284
+ )
285
+
286
+ return mel_out
287
+
288
+
289
+ def generate_audio(vocoder, mel_out, f0, original_loudness=None, restore_loudness=True):
290
+ """Generate audio from mel spectrogram using vocoder"""
291
+ audio_out = vocoder(mel_out.transpose(1, 2), f0)
292
+ audio_out = audio_out.squeeze().cpu().numpy()
293
+
294
+ if restore_loudness and original_loudness is not None:
295
+ # Restore original loudness
296
+ meter = pyln.Meter(44100, block_size=0.1) # Using default sample rate for vocoder
297
+ audio_out_loudness = meter.integrated_loudness(audio_out)
298
+ audio_out = pyln.normalize.loudness(audio_out, audio_out_loudness, original_loudness)
299
+
300
+ # Handle clipping
301
+ max_amp = np.max(np.abs(audio_out))
302
+ if max_amp > 1.0:
303
+ audio_out = audio_out * (0.99 / max_amp)
304
+
305
+ return audio_out
306
+
307
+
308
+ def process_segment(
309
+ audio_segment,
310
+ svc_model, vocoder, rmvpe, hubert, rms_extractor,
311
+ speaker_id, sample_rate, hop_length, device,
312
+ key_shift=0,
313
+ infer_steps=32,
314
+ ds_cfg_strength=0.0,
315
+ spk_cfg_strength=0.0,
316
+ skip_cfg_strength=0.0,
317
+ cfg_skip_layers=None,
318
+ cfg_rescale=0.7,
319
+ cvec_downsample_rate=2,
320
+ target_loudness=-18.0,
321
+ restore_loudness=True,
322
+ sliced_inference=False,
323
+ robust_f0=0
324
+ ):
325
+ """Process a single audio segment and return the converted audio"""
326
+ # Extract features
327
+ mel, cvec, cvec_ds, f0, rms, original_loudness = extract_features(
328
+ audio_segment, sample_rate, hop_length, rmvpe, hubert, rms_extractor,
329
+ device, key_shift, ds_cfg_strength, cvec_downsample_rate, target_loudness,
330
+ robust_f0
331
+ )
332
+
333
+ # Prepare speaker ID
334
+ spk_id = torch.LongTensor([speaker_id]).to(device)
335
+
336
+ # Run inference
337
+ mel_out = run_inference(
338
+ svc_model, mel, cvec, f0, rms, cvec_ds, spk_id,
339
+ infer_steps, ds_cfg_strength, spk_cfg_strength,
340
+ skip_cfg_strength, cfg_skip_layers, cfg_rescale,
341
+ sliced_inference
342
+ )
343
+
344
+ # Generate audio
345
+ audio_out = generate_audio(
346
+ vocoder, mel_out, f0,
347
+ original_loudness if restore_loudness else None,
348
+ restore_loudness
349
+ )
350
+
351
+ return audio_out
352
+
353
+
354
+ @click.command()
355
+ @click.option('--model', type=click.Path(exists=True), required=True, help='Path to model checkpoint')
356
+ @click.option('--input', type=click.Path(exists=True), required=True, help='Input audio file')
357
+ @click.option('--output', type=click.Path(), required=True, help='Output audio file')
358
+ @click.option('--speaker', type=str, required=True, help='Target speaker')
359
+ @click.option('--key-shift', type=int, default=0, help='Pitch shift in semitones')
360
+ @click.option('--device', type=str, default=None, help='Device to use (cuda/cpu)')
361
+ @click.option('--infer-steps', type=int, default=32, help='Number of inference steps')
362
+ @click.option('--ds-cfg-strength', type=float, default=0.0, help='Downsampled content vector guidance strength')
363
+ @click.option('--spk-cfg-strength', type=float, default=0.0, help='Speaker guidance strength')
364
+ @click.option('--skip-cfg-strength', type=float, default=0.0, help='Skip layer guidance strength')
365
+ @click.option('--cfg-skip-layers', type=int, default=None, help='Layer to skip for classifier-free guidance')
366
+ @click.option('--cfg-rescale', type=float, default=0.7, help='Classifier-free guidance rescale factor')
367
+ @click.option('--cvec-downsample-rate', type=int, default=2, help='Downsampling rate for bad_cvec creation')
368
+ @click.option('--target-loudness', type=float, default=-18.0, help='Target loudness in LUFS for normalization')
369
+ @click.option('--restore-loudness/--no-restore-loudness', default=True, help='Restore loudness to original')
370
+ @click.option('--fade-duration', type=float, default=20.0, help='Fade duration in milliseconds')
371
+ @click.option('--sliced-inference', is_flag=True, default=False, help='Use sliced inference for processing long segments')
372
+ @click.option('--robust-f0', type=int, default=0, help='Level of robust f0 filtering (0=none, 1=light, 2=aggressive)')
373
+ @click.option('--slicer-threshold', type=float, default=-35.0, help='Threshold for audio slicing in dB')
374
+ @click.option('--slicer-min-length', type=int, default=3000, help='Minimum length of audio segments in milliseconds')
375
+ @click.option('--slicer-min-interval', type=int, default=100, help='Minimum interval between audio segments in milliseconds')
376
+ @click.option('--slicer-hop-size', type=int, default=10, help='Hop size for audio slicing in milliseconds')
377
+ @click.option('--slicer-max-sil-kept', type=int, default=300, help='Maximum silence kept in milliseconds')
378
+ def main(
379
+ model,
380
+ input,
381
+ output,
382
+ speaker,
383
+ key_shift,
384
+ device,
385
+ infer_steps,
386
+ ds_cfg_strength,
387
+ spk_cfg_strength,
388
+ skip_cfg_strength,
389
+ cfg_skip_layers,
390
+ cfg_rescale,
391
+ cvec_downsample_rate,
392
+ target_loudness,
393
+ restore_loudness,
394
+ fade_duration,
395
+ sliced_inference,
396
+ robust_f0,
397
+ slicer_threshold,
398
+ slicer_min_length,
399
+ slicer_min_interval,
400
+ slicer_hop_size,
401
+ slicer_max_sil_kept
402
+ ):
403
+ """Convert the voice in an audio file to a target speaker."""
404
+
405
+ # Setup device
406
+ if device is None:
407
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
408
+ device = torch.device(device)
409
+
410
+ # Load models
411
+ svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg = load_models(model, device)
412
+
413
+ try:
414
+ speaker_id = spk2idx[speaker]
415
+ except KeyError:
416
+ raise ValueError(f"Speaker {speaker} not found in the model's speaker list, valid speakers are {spk2idx.keys()}")
417
+
418
+ # Get config from loaded model
419
+ hop_length = 512
420
+ sample_rate = 44100
421
+
422
+ # Load audio
423
+ audio = load_audio(input, sample_rate)
424
+
425
+ # Initialize Slicer
426
+ slicer = Slicer(
427
+ sr=sample_rate,
428
+ threshold=slicer_threshold,
429
+ min_length=slicer_min_length,
430
+ min_interval=slicer_min_interval,
431
+ hop_size=slicer_hop_size,
432
+ max_sil_kept=slicer_max_sil_kept
433
+ )
434
+
435
+ # Step (1): Use slicer to segment the input audio and get positions
436
+ click.echo("Slicing audio...")
437
+ segments_with_pos = slicer.slice(audio) # Now returns list of (start_pos, chunk)
438
+
439
+ if restore_loudness:
440
+ click.echo(f"Will restore loudness to original")
441
+
442
+ # Calculate fade size in samples
443
+ fade_samples = int(fade_duration * sample_rate / 1000)
444
+
445
+ # Process segments
446
+ click.echo(f"Processing {len(segments_with_pos)} segments...")
447
+ result_audio = np.zeros(len(audio) + fade_samples) # Extra space for potential overlap
448
+
449
+ with torch.no_grad():
450
+ for idx, (start_sample, chunk) in enumerate(tqdm(segments_with_pos)):
451
+
452
+ # Process the segment
453
+ audio_out = process_segment(
454
+ chunk, svc_model, vocoder, rmvpe, hubert, rms_extractor,
455
+ speaker_id, sample_rate, hop_length, device,
456
+ key_shift, infer_steps, ds_cfg_strength, spk_cfg_strength,
457
+ skip_cfg_strength, cfg_skip_layers, cfg_rescale,
458
+ cvec_downsample_rate, target_loudness, restore_loudness, sliced_inference,
459
+ robust_f0
460
+ )
461
+
462
+ # Ensure consistent length
463
+ expected_length = len(chunk)
464
+ if len(audio_out) > expected_length:
465
+ audio_out = audio_out[:expected_length]
466
+ elif len(audio_out) < expected_length:
467
+ audio_out = np.pad(audio_out, (0, expected_length - len(audio_out)), 'constant')
468
+
469
+ # Apply fades
470
+ if idx > 0: # Not first segment
471
+ audio_out = apply_fade(audio_out.copy(), fade_samples, fade_in=True)
472
+ result_audio[start_sample:start_sample + fade_samples] *= \
473
+ np.linspace(1, 0, fade_samples) # Fade out previous
474
+
475
+ if idx < len(segments_with_pos) - 1: # Not last segment
476
+ audio_out[-fade_samples:] *= np.linspace(1, 0, fade_samples) # Fade out
477
+
478
+ # Add to result
479
+ result_audio[start_sample:start_sample + len(audio_out)] += audio_out
480
+
481
+ # Trim any extra padding
482
+ result_audio = result_audio[:len(audio)]
483
+
484
+ # Save output
485
+ click.echo("Saving output...")
486
+ output_path = Path(output)
487
+ output_path.parent.mkdir(parents=True, exist_ok=True)
488
+ torchaudio.save(output, torch.from_numpy(result_audio).unsqueeze(0).float(), sample_rate)
489
+ click.echo("Done!")
490
+
491
+
492
+ if __name__ == '__main__':
493
+ main()
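
For reference, a minimal sketch of driving the helpers above programmatically, without the CLI's slicing and crossfading. It assumes the default checkpoint path from app.py and the pretrained assets are in place; `input.wav` and `output.wav` are placeholder file names chosen for illustration.

```python
import torch
import torchaudio

from infer import load_models, load_audio, process_segment

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg = load_models(
    "pretrained/dit-768-12_nanami.ckpt", device)

sample_rate, hop_length = 44100, 512          # fixed values used throughout this repo
audio = load_audio("input.wav", sample_rate)  # mono numpy array at 44.1 kHz
speaker_id = spk2idx[list(spk2idx)[0]]        # or index spk2idx by a specific speaker name

audio_out = process_segment(
    audio, svc_model, vocoder, rmvpe, hubert, rms_extractor,
    speaker_id, sample_rate, hop_length, device,
    key_shift=0, infer_steps=32)

torchaudio.save("output.wav", torch.from_numpy(audio_out).unsqueeze(0).float(), sample_rate)
```

For long inputs, the CLI path above (Slicer segmentation plus fade-based stitching) is the more robust route.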
pretrained/content-vec-best/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
pretrained/content-vec-best/.gitignore ADDED
@@ -0,0 +1 @@
1
+ content-vec-best-legacy-500.pt
pretrained/content-vec-best/README.md ADDED
@@ -0,0 +1,33 @@
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ # Content Vec Best
6
+ Official Repo: [ContentVec](https://github.com/auspicious3000/contentvec)
7
+ This repo brings fairseq ContentVec model to HuggingFace Transformers.
8
+
9
+ ## How to use
10
+ To use this model, you need to define
11
+ ```python
12
+ class HubertModelWithFinalProj(HubertModel):
13
+ def __init__(self, config):
14
+ super().__init__(config)
15
+
16
+ # The final projection layer is only used for backward compatibility.
17
+ # Following https://github.com/auspicious3000/contentvec/issues/6
18
+ # Removing this layer is necessary to achieve the desired outcome.
19
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
20
+ ```
21
+
22
+ and then load the model with
23
+ ```python
24
+ model = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best")
25
+
26
+ x = model(audio)["last_hidden_state"]
27
+ ```
28
+
29
+ ## How to convert
30
+ You need to download the ContentVec_legacy model from the official repo, and then run
31
+ ```bash
32
+ python convert.py
33
+ ```
pretrained/content-vec-best/config.json ADDED
@@ -0,0 +1,71 @@
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModelWithFinalProj"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "sum",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": false,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_norm": "group",
44
+ "feat_proj_dropout": 0.0,
45
+ "feat_proj_layer_norm": true,
46
+ "final_dropout": 0.1,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.1,
49
+ "hidden_size": 768,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.1,
54
+ "mask_feature_length": 10,
55
+ "mask_feature_min_masks": 0,
56
+ "mask_feature_prob": 0.0,
57
+ "mask_time_length": 10,
58
+ "mask_time_min_masks": 2,
59
+ "mask_time_prob": 0.05,
60
+ "model_type": "hubert",
61
+ "num_attention_heads": 12,
62
+ "num_conv_pos_embedding_groups": 16,
63
+ "num_conv_pos_embeddings": 128,
64
+ "num_feat_extract_layers": 7,
65
+ "num_hidden_layers": 12,
66
+ "pad_token_id": 0,
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.27.3",
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 32
71
+ }
pretrained/content-vec-best/convert.py ADDED
@@ -0,0 +1,150 @@
1
+ import torch
2
+ from torch import nn
3
+ from transformers import HubertConfig, HubertModel
4
+ import logging
5
+
6
+ # Ignore fairseq's logger
7
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
8
+ logging.getLogger("torch.distributed.nn.jit.instantiator").setLevel(logging.WARNING)
9
+
10
+ from fairseq import checkpoint_utils
11
+
12
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
13
+ ["content-vec-best-legacy-500.pt"], suffix=""
14
+ )
15
+ model = models[0]
16
+ model.eval()
17
+ model.eval()
18
+
19
+
20
+ class HubertModelWithFinalProj(HubertModel):
21
+ def __init__(self, config):
22
+ super().__init__(config)
23
+
24
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
25
+
26
+
27
+ # Default Config
28
+ hubert = HubertModelWithFinalProj(HubertConfig())
29
+
30
+ # huggingface: fairseq
31
+ mapping = {
32
+ "masked_spec_embed": "mask_emb",
33
+ "encoder.layer_norm.bias": "encoder.layer_norm.bias",
34
+ "encoder.layer_norm.weight": "encoder.layer_norm.weight",
35
+ "encoder.pos_conv_embed.conv.bias": "encoder.pos_conv.0.bias",
36
+ "encoder.pos_conv_embed.conv.weight_g": "encoder.pos_conv.0.weight_g",
37
+ "encoder.pos_conv_embed.conv.weight_v": "encoder.pos_conv.0.weight_v",
38
+ "feature_projection.layer_norm.bias": "layer_norm.bias",
39
+ "feature_projection.layer_norm.weight": "layer_norm.weight",
40
+ "feature_projection.projection.bias": "post_extract_proj.bias",
41
+ "feature_projection.projection.weight": "post_extract_proj.weight",
42
+ "final_proj.bias": "final_proj.bias",
43
+ "final_proj.weight": "final_proj.weight",
44
+ }
45
+
46
+ # Convert encoder
47
+ for layer in range(12):
48
+ for j in ["q", "k", "v"]:
49
+ mapping[
50
+ f"encoder.layers.{layer}.attention.{j}_proj.weight"
51
+ ] = f"encoder.layers.{layer}.self_attn.{j}_proj.weight"
52
+ mapping[
53
+ f"encoder.layers.{layer}.attention.{j}_proj.bias"
54
+ ] = f"encoder.layers.{layer}.self_attn.{j}_proj.bias"
55
+
56
+ mapping[
57
+ f"encoder.layers.{layer}.final_layer_norm.bias"
58
+ ] = f"encoder.layers.{layer}.final_layer_norm.bias"
59
+ mapping[
60
+ f"encoder.layers.{layer}.final_layer_norm.weight"
61
+ ] = f"encoder.layers.{layer}.final_layer_norm.weight"
62
+
63
+ mapping[
64
+ f"encoder.layers.{layer}.layer_norm.bias"
65
+ ] = f"encoder.layers.{layer}.self_attn_layer_norm.bias"
66
+ mapping[
67
+ f"encoder.layers.{layer}.layer_norm.weight"
68
+ ] = f"encoder.layers.{layer}.self_attn_layer_norm.weight"
69
+
70
+ mapping[
71
+ f"encoder.layers.{layer}.attention.out_proj.bias"
72
+ ] = f"encoder.layers.{layer}.self_attn.out_proj.bias"
73
+ mapping[
74
+ f"encoder.layers.{layer}.attention.out_proj.weight"
75
+ ] = f"encoder.layers.{layer}.self_attn.out_proj.weight"
76
+
77
+ mapping[
78
+ f"encoder.layers.{layer}.feed_forward.intermediate_dense.bias"
79
+ ] = f"encoder.layers.{layer}.fc1.bias"
80
+ mapping[
81
+ f"encoder.layers.{layer}.feed_forward.intermediate_dense.weight"
82
+ ] = f"encoder.layers.{layer}.fc1.weight"
83
+
84
+ mapping[
85
+ f"encoder.layers.{layer}.feed_forward.output_dense.bias"
86
+ ] = f"encoder.layers.{layer}.fc2.bias"
87
+ mapping[
88
+ f"encoder.layers.{layer}.feed_forward.output_dense.weight"
89
+ ] = f"encoder.layers.{layer}.fc2.weight"
90
+
91
+ # Convert Conv Layers
92
+ for layer in range(7):
93
+ mapping[
94
+ f"feature_extractor.conv_layers.{layer}.conv.weight"
95
+ ] = f"feature_extractor.conv_layers.{layer}.0.weight"
96
+
97
+ if layer != 0:
98
+ continue
99
+
100
+ mapping[
101
+ f"feature_extractor.conv_layers.{layer}.layer_norm.weight"
102
+ ] = f"feature_extractor.conv_layers.{layer}.2.weight"
103
+ mapping[
104
+ f"feature_extractor.conv_layers.{layer}.layer_norm.bias"
105
+ ] = f"feature_extractor.conv_layers.{layer}.2.bias"
106
+
107
+ hf_keys = set(hubert.state_dict().keys())
108
+ fair_keys = set(model.state_dict().keys())
109
+
110
+ hf_keys -= set(mapping.keys())
111
+ fair_keys -= set(mapping.values())
112
+
113
+ for i, j in zip(sorted(hf_keys), sorted(fair_keys)):
114
+ print(i, j)
115
+
116
+ print(hf_keys, fair_keys)
117
+ print(len(hf_keys), len(fair_keys))
118
+
119
+ # try loading the weights
120
+ new_state_dict = {}
121
+ for k, v in mapping.items():
122
+ new_state_dict[k] = model.state_dict()[v]
123
+
124
+ x = hubert.load_state_dict(new_state_dict, strict=False)
125
+ print(x)
126
+ hubert.eval()
127
+
128
+ with torch.no_grad():
129
+ new_input = torch.randn(1, 16384)
130
+
131
+ result1 = hubert(new_input, output_hidden_states=True)["hidden_states"][9]
132
+ result1 = hubert.final_proj(result1)
133
+
134
+ result2 = model.extract_features(
135
+ **{
136
+ "source": new_input,
137
+ "padding_mask": torch.zeros(1, 16384, dtype=torch.bool),
138
+ # "features_only": True,
139
+ "output_layer": 9,
140
+ }
141
+ )[0]
142
+ result2 = model.final_proj(result2)
143
+
144
+ assert torch.allclose(result1, result2, atol=1e-3)
145
+
146
+ print("Sanity check passed")
147
+
148
+ # Save huggingface model
149
+ hubert.save_pretrained(".")
150
+ print("Saved model")
pretrained/download.py ADDED
@@ -0,0 +1,12 @@
1
+ from huggingface_hub import snapshot_download
2
+
3
+
4
+ if __name__ == "__main__":
5
+ model_path = snapshot_download(
6
+ repo_id="Pur1zumu/RIFT-SVC-modules",
7
+ local_dir='pretrained',
8
+ local_dir_use_symlinks=False, # Don't use symlinks
9
+ local_files_only=False, # Allow downloading new files
10
+ ignore_patterns=["*.git*"], # Ignore git-related files
11
+ resume_download=True # Resume interrupted downloads
12
+ )
pretrained/nsf_hifigan_44.1k_hop512_128bin_2024.02/NOTICE.txt ADDED
@@ -0,0 +1,87 @@
1
+ --- DiffSinger Community Vocoder ---
2
+
3
+ ARCHITECTURE: NSF-HiFiGAN
4
+ RELEASE DATE: 2024-02-19
5
+
6
+ HYPER PARAMETERS:
7
+ - 44100 sample rate
8
+ - 128 mel bins
9
+ - 512 hop size
10
+ - 2048 window size
11
+ - fmin at 40Hz
12
+ - fmax at 16000Hz
13
+
14
+
15
+ NOTICE:
16
+
17
+ All model weights in the [DiffSinger Community Vocoder Project](https://openvpi.github.io/vocoders/), including
18
+ model weights in this directory, are provided by the [OpenVPI Team](https://github.com/openvpi/), under the
19
+ [Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/) license.
20
+
21
+
22
+ ACKNOWLEDGEMENTS:
23
+
24
+ Training data of this vocoder is provided and permitted by the following organizations, societies and individuals:
25
+
26
+ 孙飒 https://www.qfssr.cn
27
+ 赤松_Akamatsu https://www.zhibin.club
28
+ 乐威 https://www.zhibin.club
29
+ 伯添 https://space.bilibili.com/24087011
30
+ 雲宇光 https://space.bilibili.com/660675050
31
+ 橙子言 https://space.bilibili.com/318486464
32
+ 人衣大人 https://space.bilibili.com/2270344
33
+ 玖蝶 https://space.bilibili.com/676771003
34
+ Yuuko
35
+ 白夜零BYL https://space.bilibili.com/1605040503
36
+ 嗷天 https://space.bilibili.com/5675252
37
+ 洛泠羽 https://space.bilibili.com/347373318
38
+ 灰条纹的灰猫君 https://space.bilibili.com/2083633
39
+ 幽寂 https://space.bilibili.com/478860
40
+ 恶魔王女 https://space.bilibili.com/2475098
41
+ AlexYHX 芮晴
42
+ 绮萱 https://y.qq.com/n/ryqq/singer/003HjD6H4aZn1K
43
+ 诗芸 https://y.qq.com/n/ryqq/singer/0005NInj142zm0
44
+ 汐蕾 https://y.qq.com/n/ryqq/singer/0023cWMH1Bq1PJ
45
+ 1262917464
46
+ 炜阳
47
+ 叶卡yolka
48
+ 幸の夏 https://space.bilibili.com/1017297686
49
+ 暮色未量 https://space.bilibili.com/272904686
50
+ 晓寞sama https://space.bilibili.com/3463394
51
+ 没头绪的节操君
52
+ 串串BunC https://space.bilibili.com/95817834
53
+ 落雨 https://space.bilibili.com/1292427
54
+ 长尾巴的翎艾 https://space.bilibili.com/1638666
55
+ 声闻计划 https://space.bilibili.com/392812269
56
+ 唐家大小姐 http://5sing.kugou.com/palmusic/default.html
57
+ 不伊子
58
+ 芸青岩 https://space.bilibili.com/35236775
59
+ 妖橙 https://space.bilibili.com/161975631
60
+ 双桨 https://space.bilibili.com/13245483
61
+ 灵滅 https://space.bilibili.com/276988145
62
+ AlexYHX https://space.bilibili.com/13303439
63
+ 祁唱 https://space.bilibili.com/11256670
64
+ 早稻叽 https://space.bilibili.com/1950658
65
+
66
+ The following public datasets are used:
67
+
68
+ Opencpop https://wenet.org.cn/opencpop/
69
+ CCMUSIC https://ccmusic-database.github.io/index.html
70
+ SingingVoiceDataset http://isophonics.net/SingingVoiceDataset
71
+
72
+ Training machines are provided by:
73
+
74
+ 花儿不哭 https://space.bilibili.com/5760446
75
+
76
+
77
+ TERMS OF REDISTRIBUTIONS:
78
+
79
+ 1. Do not sell this vocoder, or charge any fees from redistributing it, as prohibited by
80
+ the license.
81
+ 2. Include a copy of the CC BY-NC-SA 4.0 license, or a link referring to it.
82
+ 3. Include a copy of this notice, or any other notices informing that this vocoder is
83
+ provided by the OpenVPI Team, that this vocoder is licensed under CC BY-NC-SA 4.0, and
84
+ with a complete acknowledgement list as shown above.
85
+ 4. If you fine-tuned or modified the weights, leave a notice about what has been changed.
86
+ 5. (Optional) Leave a link to the official release page of the vocoder, and tell users
87
+ that other versions and future updates of this vocoder can be obtained from the website.
pretrained/nsf_hifigan_44.1k_hop512_128bin_2024.02/NOTICE.zh-CN.txt ADDED
@@ -0,0 +1,85 @@
1
+ --- DiffSinger 社区声码器 ---
2
+
3
+ 架构:NSF-HiFiGAN
4
+ 发布日期:2024-02-19
5
+
6
+ 超参数:
7
+ - 44100 sample rate
8
+ - 128 mel bins
9
+ - 512 hop size
10
+ - 2048 window size
11
+ - fmin at 40Hz
12
+ - fmax at 16000Hz
13
+
14
+
15
+ 注意事项:
16
+
17
+ [DiffSinger 社区声码器企划](https://openvpi.github.io/vocoders/) 中的所有模型权重,
18
+ 包括此目录下的模型权重,均由 [OpenVPI Team](https://github.com/openvpi/) 提供,并基于
19
+ [Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/)
20
+ 进行许可。
21
+
22
+
23
+ 致谢:
24
+
25
+ 此声码器的训练数据由以下组织、社团和个人提供并许可:
26
+
27
+ 孙飒 https://www.qfssr.cn
28
+ 赤松_Akamatsu https://www.zhibin.club
29
+ 乐威 https://www.zhibin.club
30
+ 伯添 https://space.bilibili.com/24087011
31
+ 雲宇光 https://space.bilibili.com/660675050
32
+ 橙子言 https://space.bilibili.com/318486464
33
+ 人衣大人 https://space.bilibili.com/2270344
34
+ 玖蝶 https://space.bilibili.com/676771003
35
+ Yuuko
36
+ 白夜零BYL https://space.bilibili.com/1605040503
37
+ 嗷天 https://space.bilibili.com/5675252
38
+ 洛泠羽 https://space.bilibili.com/347373318
39
+ 灰条纹的灰猫君 https://space.bilibili.com/2083633
40
+ 幽寂 https://space.bilibili.com/478860
41
+ 恶魔王女 https://space.bilibili.com/2475098
42
+ 芮晴
43
+ 绮萱 https://y.qq.com/n/ryqq/singer/003HjD6H4aZn1K
44
+ 诗芸 https://y.qq.com/n/ryqq/singer/0005NInj142zm0
45
+ 汐蕾 https://y.qq.com/n/ryqq/singer/0023cWMH1Bq1PJ
46
+ 1262917464
47
+ 炜阳
48
+ 叶卡yolka
49
+ 幸の夏 https://space.bilibili.com/1017297686
50
+ 暮色未量 https://space.bilibili.com/272904686
51
+ 晓寞sama https://space.bilibili.com/3463394
52
+ 没头绪的节操君
53
+ 串串BunC https://space.bilibili.com/95817834
54
+ 落雨 https://space.bilibili.com/1292427
55
+ 长尾巴的翎艾 https://space.bilibili.com/1638666
56
+ 声闻计划 https://space.bilibili.com/392812269
57
+ 唐家大小姐 http://5sing.kugou.com/palmusic/default.html
58
+ 不伊子
59
+ 芸青岩 https://space.bilibili.com/35236775
60
+ 妖橙 https://space.bilibili.com/161975631
61
+ 双桨 https://space.bilibili.com/13245483
62
+ 灵滅 https://space.bilibili.com/276988145
63
+ AlexYHX https://space.bilibili.com/13303439
64
+ 祁唱 https://space.bilibili.com/11256670
65
+ 早稻叽 https://space.bilibili.com/1950658
66
+
67
+ 使用了以下公开数据集:
68
+
69
+ Opencpop https://wenet.org.cn/opencpop/
70
+ CCMUSIC https://ccmusic-database.github.io/index.html
71
+ SingingVoiceDataset http://isophonics.net/SingingVoiceDataset
72
+
73
+ 训练算力的提供者如下:
74
+
75
+ 花儿不哭 https://space.bilibili.com/5760446
76
+
77
+
78
+ 二次分发条款:
79
+
80
+ 1. 请勿售卖此声码器或从其二次分发过程中收取任何费用,因为此类行为受到许可证的禁止。
81
+ 2. 请在二次分发文件中包含一份 CC BY-NC-SA 4.0 许可证的副本或指向该许可证的链接。
82
+ 3. 请在二次分发文件中包含这份声明,或以其他形式声明此声码器由 OpenVPI Team 提供并基于 CC BY-NC-SA 4.0 许可,
83
+ 并附带上述完整的致谢名单。
84
+ 4. 如果您微调或修改了权重,请留下一份关于其受到了何种修改的说明。
85
+ 5.(可选)留下一份指向此声码器的官方发布页面的链接,并告知使用者可从该网站获取此声码器的其他版本和未来的更新。
pretrained/nsf_hifigan_44.1k_hop512_128bin_2024.02/config.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "discriminator_periods": [
3
+ 3,
4
+ 5,
5
+ 7,
6
+ 11,
7
+ 17,
8
+ 23,
9
+ 37
10
+ ],
11
+ "resblock": "1",
12
+ "resblock_dilation_sizes": [
13
+ [
14
+ 1,
15
+ 3,
16
+ 5
17
+ ],
18
+ [
19
+ 1,
20
+ 3,
21
+ 5
22
+ ],
23
+ [
24
+ 1,
25
+ 3,
26
+ 5
27
+ ]
28
+ ],
29
+ "resblock_kernel_sizes": [
30
+ 3,
31
+ 7,
32
+ 11
33
+ ],
34
+ "upsample_initial_channel": 512,
35
+ "upsample_kernel_sizes": [
36
+ 16,
37
+ 16,
38
+ 4,
39
+ 4,
40
+ 4
41
+ ],
42
+ "upsample_rates": [
43
+ 8,
44
+ 8,
45
+ 2,
46
+ 2,
47
+ 2
48
+ ],
49
+ "sampling_rate": 44100,
50
+ "num_mels": 128,
51
+ "hop_size": 512,
52
+ "n_fft": 2048,
53
+ "win_size": 2048,
54
+ "fmin": 40,
55
+ "fmax": 16000
56
+ }
pretrained/rmvpe/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,28 @@
1
+ click
2
+ einops
3
+ gradio
4
+ huggingface_hub
5
+ hydra-core
6
+ jaxtyping
7
+ librosa
8
+ matplotlib
9
+ numpy
10
+ omegaconf
11
+ Pillow
12
+ praat-parselmouth
13
+ pyloudnorm
14
+ PyYAML
15
+ pytorch_lightning
16
+ resampy
17
+ schedulefree
18
+ scipy
19
+ soundfile
20
+ tensorboard
21
+ thop
22
+ torch
23
+ torchaudio
24
+ torchdiffeq
25
+ tqdm
26
+ transformers
27
+ wandb
28
+ x_transformers
rift_svc/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from rift_svc.rf import RF
2
+ from rift_svc.dit import DiT
3
+ from rift_svc.lightning_module import RIFTSVCLightningModule
rift_svc/dataset.py ADDED
@@ -0,0 +1,139 @@
1
+ import json
2
+ import os
3
+ import random
4
+ from functools import partial
5
+ from typing import Literal
6
+ import torch
7
+ from torch.nn.utils.rnn import pad_sequence
8
+ from torch.utils.data import Dataset
9
+
10
+ from rift_svc.utils import linear_interpolate_tensor, nearest_interpolate_tensor
11
+
12
+ pt_load = partial(torch.load, weights_only=True, map_location='cpu', mmap=True)
13
+
14
+
15
+ class SVCDataset(Dataset):
16
+ def __init__(
17
+ self,
18
+ data_dir: str,
19
+ meta_info_path: str,
20
+ max_frame_len = 256,
21
+ split = "train",
22
+ use_cvec_downsampled: bool = False,
23
+ cvec_downsample_rate: int = 2,
24
+ ):
25
+ self.data_dir = data_dir
26
+ self.max_frame_len = max_frame_len
27
+
28
+ with open(meta_info_path, 'r', encoding='utf-8') as f:
29
+ meta = json.load(f)
30
+
31
+ speakers = meta["speakers"]
32
+ self.num_speakers = len(speakers)
33
+ self.spk2idx = {spk: idx for idx, spk in enumerate(speakers)}
34
+ self.split = split
35
+ self.samples = meta[f"{split}_audios"]
36
+ self.use_cvec_downsampled = use_cvec_downsampled
37
+ self.cvec_downsample_rate = cvec_downsample_rate
38
+
39
+ def get_frame_len(self, index):
40
+ return self.samples[index]['frame_len']
41
+
42
+ def __len__(self):
43
+ return len(self.samples)
44
+
45
+ def __getitem__(self, index):
46
+
47
+ sample = self.samples[index]
48
+ spk = sample['speaker']
49
+ path = os.path.join(self.data_dir, spk, sample['file_name'])
50
+ spk_id = torch.LongTensor([self.spk2idx[spk]]) # [1]
51
+
52
+ mel = pt_load(path + ".mel.pt").squeeze(0).T
53
+ rms = pt_load(path + ".rms.pt").squeeze(0)
54
+ f0 = pt_load(path + ".f0.pt").squeeze(0)
55
+ cvec = pt_load(path + ".cvec.pt").squeeze(0)
56
+
57
+ cvec = linear_interpolate_tensor(cvec, mel.shape[0])
58
+ if self.use_cvec_downsampled:
59
+ cvec_ds = cvec[::2, :]
60
+ cvec_ds = linear_interpolate_tensor(cvec_ds, cvec_ds.shape[0]//self.cvec_downsample_rate)
61
+ cvec_ds = linear_interpolate_tensor(cvec_ds, mel.shape[0])
62
+
63
+ frame_len = mel.shape[0]
64
+
65
+ if frame_len > self.max_frame_len:
66
+ if self.split == "train":
67
+ # Keep trying until we find a good segment or hit max attempts
68
+ max_attempts = 10
69
+ attempt = 0
70
+ while attempt < max_attempts:
71
+ start = random.randint(0, frame_len - self.max_frame_len)
72
+ end = start + self.max_frame_len
73
+ f0_segment = f0[start:end]
74
+ # Check if more than 90% of f0 values are 0
75
+ zero_ratio = (f0_segment == 0).float().mean().item()
76
+ if zero_ratio < 0.9: # Found a good segment
77
+ break
78
+ attempt += 1
79
+ else:
80
+ start = 0
81
+ end = start + self.max_frame_len
82
+ mel = mel[start:end]
83
+ rms = rms[start:end]
84
+ f0 = f0[start:end]
85
+ cvec = cvec[start:end]
86
+ if self.use_cvec_downsampled:
87
+ cvec_ds = cvec_ds[start:end]
88
+ frame_len = self.max_frame_len
89
+
90
+ result = dict(
91
+ spk_id = spk_id,
92
+ mel = mel,
93
+ rms = rms,
94
+ f0 = f0,
95
+ cvec = cvec,
96
+ frame_len = frame_len
97
+ )
98
+
99
+ if self.use_cvec_downsampled:
100
+ result['cvec_ds'] = cvec_ds
101
+
102
+ return result
103
+
104
+
105
+ def collate_fn(batch):
106
+ spk_ids = [item['spk_id'] for item in batch]
107
+ mels = [item['mel'] for item in batch]
108
+ rmss = [item['rms'] for item in batch]
109
+ f0s = [item['f0'] for item in batch]
110
+ cvecs = [item['cvec'] for item in batch]
111
+ if 'cvec_ds' in batch[0]:
112
+ cvecs_ds = [item['cvec_ds'] for item in batch]
113
+
114
+ frame_lens = [item['frame_len'] for item in batch]
115
+
116
+ # Pad sequences to max length
117
+ mels_padded = pad_sequence(mels, batch_first=True)
118
+ rmss_padded = pad_sequence(rmss, batch_first=True)
119
+ f0s_padded = pad_sequence(f0s, batch_first=True)
120
+ cvecs_padded = pad_sequence(cvecs, batch_first=True)
121
+ if 'cvec_ds' in batch[0]:
122
+ cvecs_ds_padded = pad_sequence(cvecs_ds, batch_first=True)
123
+
124
+ spk_ids = torch.cat(spk_ids)
125
+ frame_len = torch.tensor(frame_lens)
126
+
127
+ result = {
128
+ 'spk_id': spk_ids,
129
+ 'mel': mels_padded,
130
+ 'rms': rmss_padded,
131
+ 'f0': f0s_padded,
132
+ 'cvec': cvecs_padded,
133
+ 'frame_len': frame_len
134
+ }
135
+
136
+ if 'cvec_ds' in batch[0]:
137
+ result['cvec_ds'] = cvecs_ds_padded
138
+
139
+ return result
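A minimal sketch of wiring SVCDataset and collate_fn into a DataLoader; the data_dir and meta_info_path below are placeholders, not paths shipped with this Space:

# Usage sketch only; assumes preprocessed .mel/.rms/.f0/.cvec tensors exist under data_dir.
from torch.utils.data import DataLoader
from rift_svc.dataset import SVCDataset, collate_fn

train_set = SVCDataset(
    data_dir="data",                       # placeholder
    meta_info_path="data/meta_info.json",  # placeholder
    max_frame_len=256,
    split="train",
)
loader = DataLoader(train_set, batch_size=8, shuffle=True, collate_fn=collate_fn)

batch = next(iter(loader))
# Keys follow collate_fn above:
# batch['mel'] -> (B, T, n_mels), batch['f0'] / batch['rms'] -> (B, T),
# batch['spk_id'] -> (B,), batch['frame_len'] -> (B,)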
rift_svc/dit.py ADDED
@@ -0,0 +1,227 @@
1
+ import math
2
+ from typing import Union, List
3
+
4
+ from einops import repeat
5
+ from jaxtyping import Bool, Float, Int
6
+ import torch
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+ from x_transformers.x_transformers import RotaryEmbedding
10
+
11
+ from rift_svc.modules import (
12
+ AdaLayerNormZero_Final,
13
+ DiTBlock,
14
+ TimestepEmbedding,
15
+ LoRALinear,
16
+ )
17
+
18
+ # Conditional embedding for f0, rms, cvec
19
+ class CondEmbedding(nn.Module):
20
+ def __init__(self, cvec_dim: int, cond_dim: int):
21
+ super().__init__()
22
+ self.cvec_dim = cvec_dim
23
+ self.cond_dim = cond_dim
24
+
25
+ self.f0_embed = nn.Linear(1, cond_dim)
26
+ self.rms_embed = nn.Linear(1, cond_dim)
27
+ self.cvec_embed = nn.Linear(cvec_dim, cond_dim)
28
+ self.out = nn.Linear(cond_dim, cond_dim)
29
+
30
+ self.ln_cvec = nn.LayerNorm(cond_dim, elementwise_affine=False, eps=1e-6)
31
+ self.ln = nn.LayerNorm(cond_dim, elementwise_affine=True, eps=1e-6)
32
+
33
+
34
+ def forward(
35
+ self,
36
+ f0: Float[torch.Tensor, "b n"],
37
+ rms: Float[torch.Tensor, "b n"],
38
+ cvec: Float[torch.Tensor, "b n d"],
39
+ ):
40
+ if f0.ndim == 2:
41
+ f0 = f0.unsqueeze(-1)
42
+ if rms.ndim == 2:
43
+ rms = rms.unsqueeze(-1)
44
+
45
+ f0_embed = self.f0_embed(f0 / 1200)
46
+ rms_embed = self.rms_embed(rms)
47
+ cvec_embed = self.ln_cvec(self.cvec_embed(cvec))
48
+
49
+ cond = f0_embed + rms_embed + cvec_embed
50
+ cond = self.ln(self.out(cond))
51
+ return cond
52
+
53
+
54
+ # noised input audio and context mixing embedding
55
+ class InputEmbedding(nn.Module):
56
+ def __init__(self, mel_dim: int, out_dim: int):
57
+ super().__init__()
58
+ self.mel_embed = nn.Linear(mel_dim, out_dim)
59
+ self.proj = nn.Linear(2 * out_dim, out_dim)
60
+ self.ln = nn.LayerNorm(out_dim, elementwise_affine=False, eps=1e-6)
61
+
62
+ def forward(self, x: Float[torch.Tensor, "b n d1"], cond_embed: Float[torch.Tensor, "b n d2"]):
63
+ x = self.mel_embed(x)
64
+ x = torch.cat((x, cond_embed), dim = -1)
65
+ x = self.proj(x)
66
+ x = self.ln(x)
67
+ return x
68
+
69
+
70
+ # backbone using DiT blocks
71
+ class DiT(nn.Module):
72
+ def __init__(self,
73
+ dim: int, depth: int, head_dim: int = 64, dropout: float = 0.0, ff_mult: int = 4,
74
+ n_mel_channels: int = 128, num_speaker: int = 1, cvec_dim: int = 768,
75
+ kernel_size: int = 31, zero_null_spk: bool = False,
76
+ init_std: float = 1):
77
+ super().__init__()
78
+
79
+ self.num_speaker = num_speaker
80
+ self.spk_embed = nn.Embedding(num_speaker, dim)
81
+ self.null_spk_embed = nn.Embedding(1, dim)
82
+ self.tembed = TimestepEmbedding(dim)
83
+ self.cond_embed = CondEmbedding(cvec_dim, dim)
84
+ self.input_embed = InputEmbedding(n_mel_channels, dim)
85
+
86
+ self.rotary_embed = RotaryEmbedding(head_dim)
87
+
88
+ self.dim = dim
89
+ self.depth = depth
90
+ self.transformer_blocks = nn.ModuleList(
91
+ [
92
+ DiTBlock(
93
+ dim = dim,
94
+ head_dim = head_dim,
95
+ ff_mult = ff_mult,
96
+ dropout = dropout,
97
+ kernel_size = kernel_size,
98
+ )
99
+ for _ in range(depth)
100
+ ]
101
+ )
102
+
103
+ self.norm_out = AdaLayerNormZero_Final(dim)
104
+ self.output = nn.Linear(dim, n_mel_channels)
105
+
106
+ self.init_std = init_std
107
+ self.apply(self._init_weights)
108
+ for block in self.transformer_blocks:
109
+ torch.nn.init.constant_(block.attn_norm.proj.weight, 0)
110
+ torch.nn.init.constant_(block.attn_norm.proj.bias, 0)
111
+
112
+ torch.nn.init.constant_(self.norm_out.proj.weight, 0)
113
+ torch.nn.init.constant_(self.norm_out.proj.bias, 0)
114
+ torch.nn.init.constant_(self.output.weight, 0)
115
+ torch.nn.init.constant_(self.output.bias, 0)
116
+
117
+ if zero_null_spk:
118
+ self.null_spk_embed.weight.data.zero_()
119
+ self.null_spk_embed.requires_grad = False
120
+
121
+ def _init_weights(self, module: nn.Module):
122
+ if isinstance(module, nn.Linear):
123
+ fan_out, fan_in = module.weight.shape
124
+ # Spectral parameterization from the [paper](https://arxiv.org/abs/2310.17813).
125
+ init_std = (self.init_std / math.sqrt(fan_in)) * min(1, math.sqrt(fan_out / fan_in))
126
+ torch.nn.init.normal_(module.weight, mean=0.0, std=init_std)
127
+ if module.bias is not None:
128
+ torch.nn.init.zeros_(module.bias)
129
+ elif isinstance(module, nn.Conv1d):
130
+ # weight shape: (out_channels, in_channels/groups, kernel_size)
131
+ fan_out = module.weight.shape[0] # out_channels
132
+ fan_in = module.weight.shape[1] * module.weight.shape[2] # (in_channels/groups) * kernel_size
133
+ init_std = (self.init_std / math.sqrt(fan_in)) * min(1, math.sqrt(fan_out / fan_in))
134
+ torch.nn.init.normal_(module.weight, mean=0.0, std=init_std)
135
+ if module.bias is not None:
136
+ torch.nn.init.zeros_(module.bias)
137
+ elif isinstance(module, nn.Embedding):
138
+ torch.nn.init.normal_(module.weight, mean=0.0, std=self.init_std/math.sqrt(self.dim))
139
+
140
+
141
+ def forward(
142
+ self,
143
+ x: Float[torch.Tensor, "b n d1"], # noised input mel
144
+ spk: Int[torch.Tensor, "b"], # speaker
145
+ f0: Float[torch.Tensor, "b n"],
146
+ rms: Float[torch.Tensor, "b n"],
147
+ cvec: Float[torch.Tensor, "b n d2"],
148
+ time: Float[torch.Tensor, "b"], # time step
149
+ drop_speaker: Union[bool, Bool[torch.Tensor, "b"]] = False,
150
+ mask: Bool[torch.Tensor, "b n"] | None = None,
151
+ skip_layers: Union[int, List[int], None] = None,
152
+ ):
153
+ batch, seq_len = x.shape[0], x.shape[1]
154
+ if time.ndim == 0:
155
+ time = repeat(time, ' -> b', b = batch)
156
+
157
+ if isinstance(drop_speaker, bool):
158
+ drop_speaker = torch.full((batch,), drop_speaker, dtype=torch.bool, device=x.device)
159
+
160
+ spk_embeds = self.spk_embed(spk)
161
+ null_spk_embeds = self.null_spk_embed(torch.zeros_like(spk, dtype=torch.long))
162
+ spk_embeds = torch.where(drop_speaker.unsqueeze(-1), null_spk_embeds, spk_embeds)
163
+
164
+ t = self.tembed(time)
165
+ t = t + spk_embeds
166
+
167
+ cond_embed = self.cond_embed(f0, rms, cvec)
168
+ x = self.input_embed(x, cond_embed)
169
+
170
+ rope = self.rotary_embed.forward_from_seq_len(seq_len)
171
+
172
+ if skip_layers is not None:
173
+ if isinstance(skip_layers, int):
174
+ skip_layers = [skip_layers]
175
+
176
+ for i, block in enumerate(self.transformer_blocks):
177
+ if skip_layers is not None and i in skip_layers:
178
+ continue
179
+ x = block(x, t, mask = mask, rope = rope)
180
+
181
+ x = self.norm_out(x, t)
182
+ output = self.output(x)
183
+
184
+ return output
185
+
186
+
187
+ def apply_lora(self, rank, alpha):
188
+ for n, p in self.named_parameters():
189
+ p.requires_grad = False
190
+ self.spk_embed.weight.requires_grad = True
191
+ # Apply LoRA to k_proj and v_proj in each attention block
192
+ for block in self.transformer_blocks:
193
+ block.attn.k_proj = LoRALinear(block.attn.k_proj, rank, alpha)
194
+ block.attn.v_proj = LoRALinear(block.attn.v_proj, rank, alpha)
195
+
196
+
197
+ def merge_lora(self):
198
+ # Iterate over each transformer block in the DiT backbone
199
+ for block in self.transformer_blocks:
200
+ # Merge for k_proj if it is a LoRALinear instance
201
+ if isinstance(block.attn.k_proj, LoRALinear):
202
+ with torch.no_grad():
203
+ # Compute delta update: B @ A^T
204
+ delta = block.attn.k_proj.B @ block.attn.k_proj.A.T
205
+ # The underlying linear layer has weight of shape (out_features, in_features)
206
+ # and its forward computes x * weight.T
207
+ # Note: delta.T equals A @ B^T, so merging works correctly:
208
+ block.attn.k_proj.linear.weight.add_(delta)
209
+ # Replace the LoRALinear module with the merged linear layer
210
+ block.attn.k_proj = block.attn.k_proj.linear
211
+
212
+ # Merge for v_proj in the same way
213
+ if isinstance(block.attn.v_proj, LoRALinear):
214
+ with torch.no_grad():
215
+ delta = block.attn.v_proj.B @ block.attn.v_proj.A.T
216
+ block.attn.v_proj.linear.weight.add_(delta)
217
+ block.attn.v_proj = block.attn.v_proj.linear
218
+
219
+
220
+ def freeze_adaln_and_tembed(self):
221
+ for p in self.tembed.parameters():
222
+ p.requires_grad = False
223
+ for p in self.norm_out.parameters():
224
+ p.requires_grad = False
225
+ for block in self.transformer_blocks:
226
+ for p in block.attn_norm.parameters():
227
+ p.requires_grad = False
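A minimal forward-pass sketch of the DiT backbone on dummy tensors; the sizes below are illustrative choices, not values read from a config in this diff:

# Illustration only; shapes follow the jaxtyping annotations in forward().
import torch
from rift_svc.dit import DiT

model = DiT(dim=768, depth=12, n_mel_channels=128, num_speaker=1, cvec_dim=768)

b, n = 2, 256
x = torch.randn(b, n, 128)       # noised mel
spk = torch.zeros(b, dtype=torch.long)
f0 = torch.rand(b, n) * 400      # Hz
rms = torch.rand(b, n)
cvec = torch.randn(b, n, 768)
t = torch.rand(b)                # flow time in [0, 1]

out = model(x, spk, f0, rms, cvec, t)
print(out.shape)                 # torch.Size([2, 256, 128])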
rift_svc/feature_extractors.py ADDED
@@ -0,0 +1,144 @@
1
+ import torch
2
+ from torch import nn
3
+ from jaxtyping import Float
4
+ from librosa.filters import mel as librosa_mel_fn
5
+ from transformers import HubertModel
6
+
7
+
8
+ def dynamic_range_compression_torch(
9
+ x: Float[torch.Tensor, "n_mels mel_len"],
10
+ C: float = 1,
11
+ clip_val: float = 1e-5
12
+ ) -> Float[torch.Tensor, "n_mels mel_len"]:
13
+ return torch.log(torch.clamp(x, min=clip_val) * C)
14
+
15
+
16
+ def spectral_normalize_torch(
17
+ magnitudes: Float[torch.Tensor, "n_mels mel_len"]
18
+ ) -> Float[torch.Tensor, "n_mels mel_len"]:
19
+ return dynamic_range_compression_torch(magnitudes)
20
+
21
+
22
+ mel_basis_cache = {}
23
+ hann_window_cache = {}
24
+
25
+
26
+ def get_mel_spectrogram(
27
+ y: Float[torch.Tensor, "n"],
28
+ n_fft: int = 2048,
29
+ num_mels: int = 128,
30
+ sampling_rate: int = 44100,
31
+ hop_size: int = 512,
32
+ win_size: int = 2048,
33
+ fmin: int = 40,
34
+ fmax: int | None = 16000,
35
+ center: bool = False,
36
+ ) -> Float[torch.Tensor, "n_mels mel_len"]:
37
+ """
38
+ Calculate the mel spectrogram of an input signal.
39
+ This function uses slaney norm for the librosa mel filterbank (using librosa.filters.mel) and uses Hann window for STFT (using torch.stft).
40
+
41
+ Args:
42
+ y (torch.Tensor): Input signal with shape (n,).
43
+ n_fft (int, optional): FFT size. Defaults to 1024.
44
+ num_mels (int, optional): Number of mel bins. Defaults to 128.
45
+ sampling_rate (int, optional): Sampling rate of the input signal. Defaults to 44100.
46
+ hop_size (int, optional): Hop size for STFT. Defaults to 256.
47
+ win_size (int, optional): Window size for STFT. Defaults to 1024.
48
+ fmin (int, optional): Minimum frequency for mel filterbank. Defaults to 0.
49
+ fmax (int | None, optional): Maximum frequency for mel filterbank. If None, defaults to sr/2.0. Defaults to None.
50
+ center (bool, optional): Whether to pad the input to center the frames. Defaults to False.
51
+
52
+ Returns:
53
+ torch.Tensor: Mel spectrogram with shape (n_mels, mel_len).
54
+ """
55
+ if torch.min(y) < -1.0:
56
+ print(f"[WARNING] Min value of input waveform signal is {torch.min(y)}")
57
+ if torch.max(y) > 1.0:
58
+ print(f"[WARNING] Max value of input waveform signal is {torch.max(y)}")
59
+
60
+ device = y.device
61
+ key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}"
62
+
63
+ if key not in mel_basis_cache:
64
+ mel = librosa_mel_fn(
65
+ sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
66
+ )
67
+ mel_basis_cache[key] = torch.from_numpy(mel).float().to(device)
68
+ hann_window_cache[key] = torch.hann_window(win_size).to(device)
69
+
70
+ mel_basis = mel_basis_cache[key]
71
+ hann_window = hann_window_cache[key]
72
+
73
+ padding = (n_fft - hop_size) // 2
74
+ y = torch.nn.functional.pad(
75
+ y.unsqueeze(1), (padding, padding), mode="reflect"
76
+ ).squeeze(1)
77
+
78
+ spec = torch.stft(
79
+ y,
80
+ n_fft,
81
+ hop_length=hop_size,
82
+ win_length=win_size,
83
+ window=hann_window,
84
+ center=center,
85
+ pad_mode="reflect",
86
+ normalized=False,
87
+ onesided=True,
88
+ return_complex=True,
89
+ )
90
+ spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
91
+
92
+ mel_spec = torch.matmul(mel_basis, spec)
93
+ mel_spec = spectral_normalize_torch(mel_spec)
94
+
95
+ return mel_spec
96
+
97
+
98
+ class RMSExtractor(nn.Module):
99
+ def __init__(self, hop_length=512, window_length=2048):
100
+ """
101
+ Initializes the RMSExtractor with the specified hop_length.
102
+
103
+ Args:
104
+ hop_length (int): Number of samples between successive frames.
105
+ """
106
+ super(RMSExtractor, self).__init__()
107
+ self.hop_length = hop_length
108
+ self.window_length = window_length
109
+
110
+ def forward(self, inp):
111
+ """
112
+ Extracts RMS energy from the input audio tensor.
113
+
114
+ Args:
115
+ inp (Tensor): Audio tensor of shape (batch, samples).
116
+
117
+ Returns:
118
+ Tensor: RMS energy tensor of shape (batch, frames).
119
+ """
120
+ # Square the audio signal
121
+ audio_squared = inp ** 2
122
+
123
+ # Use the same padding as mel spectrogram
124
+ padding = (self.window_length - self.hop_length) // 2
125
+ audio_padded = torch.nn.functional.pad(
126
+ audio_squared, (padding, padding), mode='reflect'
127
+ )
128
+
129
+ # Unfold to create frames with window_length instead of hop_length
130
+ frames = audio_padded.unfold(1, self.window_length, self.hop_length) # Shape: (batch, frames, window_length)
131
+
132
+ # Compute mean energy per frame
133
+ mean_energy = frames.mean(dim=-1) # Shape: (batch, frames)
134
+
135
+ # Compute RMS by taking square root
136
+ rms = torch.sqrt(mean_energy) # Shape: (batch, frames)
137
+
138
+ return rms
139
+
140
+
141
+ class HubertModelWithFinalProj(HubertModel):
142
+ def __init__(self, config):
143
+ super().__init__(config)
144
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
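A small sketch of running the two feature extractors above on a dummy batch-first waveform:

# Sketch only; roughly one second of low-amplitude noise at 44.1 kHz.
import torch
from rift_svc.feature_extractors import get_mel_spectrogram, RMSExtractor

wav = torch.randn(1, 44100) * 0.01        # (batch, samples), within [-1, 1]
mel = get_mel_spectrogram(wav)            # (batch, 128, mel_len) with the defaults above
rms = RMSExtractor()(wav)                 # (batch, frames), frames == mel_len
print(mel.shape, rms.shape)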
rift_svc/lightning_module.py ADDED
@@ -0,0 +1,389 @@
1
+ import gc
2
+ import os
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import torchaudio
7
+ import wandb
8
+ from functools import partial
9
+ import inspect
10
+
11
+ from pytorch_lightning import LightningModule
12
+
13
+ from rift_svc.metrics import mcd, psnr, si_snr
14
+ from rift_svc.feature_extractors import get_mel_spectrogram
15
+ from rift_svc.nsf_hifigan import NsfHifiGAN
16
+ from rift_svc.utils import draw_mel_specs, l2_grad_norm
17
+
18
+
19
+ class RIFTSVCLightningModule(LightningModule):
20
+ def __init__(
21
+ self,
22
+ model,
23
+ optimizer,
24
+ cfg,
25
+ lr_scheduler=None,
26
+ ):
27
+ super().__init__()
28
+ self.model = model
29
+ self.optimizer = optimizer
30
+ self.lr_scheduler = lr_scheduler
31
+ self.cfg = cfg
32
+ self.eval_sample_steps = cfg['training']['eval_sample_steps']
33
+ self.model.sample = partial(
34
+ self.model.sample,
35
+ steps=self.eval_sample_steps,
36
+ )
37
+ self.log_media_per_steps = cfg['training']['log_media_per_steps']
38
+ self.drop_spk_prob = cfg['training']['drop_spk_prob']
39
+
40
+ self.vocoder = None
41
+ self.save_hyperparameters(ignore=['model', 'optimizer', 'vocoder'])
42
+
43
+ def configure_optimizers(self):
44
+ if self.lr_scheduler is None:
45
+ return self.optimizer
46
+ return {
47
+ "optimizer": self.optimizer,
48
+ "lr_scheduler": {
49
+ "scheduler": self.lr_scheduler,
50
+ "interval": "step",
51
+ }
52
+ }
53
+
54
+ def training_step(self, batch, batch_idx):
55
+ mel = batch['mel']
56
+ spk_id = batch['spk_id']
57
+ f0 = batch['f0']
58
+ rms = batch['rms']
59
+ cvec = batch['cvec']
60
+ frame_len = batch['frame_len']
61
+
62
+ drop_speaker = False
63
+ if self.drop_spk_prob > 0:
64
+ batch_size = spk_id.shape[0]
65
+ num_drop = int(batch_size * self.drop_spk_prob)
66
+ drop_speaker = torch.zeros(batch_size, dtype=torch.bool, device=spk_id.device)
67
+ drop_speaker[:num_drop] = True
68
+ # Randomly shuffle the drop mask
69
+ drop_speaker = drop_speaker[torch.randperm(batch_size)]
70
+
71
+ loss, _ = self.model(
72
+ mel,
73
+ spk_id=spk_id,
74
+ f0=f0,
75
+ rms=rms,
76
+ cvec=cvec,
77
+ drop_speaker=drop_speaker,
78
+ frame_len=frame_len,
79
+ )
80
+
81
+ # Log metrics - compatible with both loggers
82
+ self._log_scalar("train/loss", loss.item(), prog_bar=True)
83
+
84
+ return loss
85
+
86
+ def on_validation_start(self):
87
+ if hasattr(self.optimizer, 'eval'):
88
+ self.optimizer.eval()
89
+ if not self.trainer.is_global_zero:
90
+ return
91
+
92
+ if self.vocoder is None:
93
+ self.vocoder = NsfHifiGAN(
94
+ 'pretrained/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt').to(self.device)
95
+ else:
96
+ self.vocoder = self.vocoder.to(self.device)
97
+
98
+ self.mcd = []
99
+ self.si_snr = []
100
+ self.psnr = []
101
+ self.mse = []
102
+
103
+
104
+ def on_validation_end(self, log=True):
105
+ if hasattr(self.optimizer, 'train'):
106
+ self.optimizer.train()
107
+ if not self.trainer.is_global_zero:
108
+ return
109
+
110
+ if self.vocoder is not None:
111
+ self.vocoder = self.vocoder.cpu()
112
+ gc.collect()
113
+ torch.cuda.empty_cache()
114
+
115
+ metrics = {
116
+ 'val/mcd': np.mean(self.mcd),
117
+ 'val/si_snr': np.mean(self.si_snr),
118
+ 'val/psnr': np.mean(self.psnr),
119
+ 'val/mse': np.mean(self.mse)
120
+ }
121
+
122
+ if log:
123
+ # Log metrics - compatible with both loggers
124
+ for metric_name, metric_value in metrics.items():
125
+ self._log_scalar(metric_name, metric_value)
126
+
127
+
128
+ def validation_step(self, batch, batch_idx, log=True):
129
+ """
130
+ Process validation step and log metrics and media.
131
+
132
+ Args:
133
+ batch: Input batch
134
+ batch_idx: Batch index
135
+ log: Whether to log or not
136
+ """
137
+ # Skip if not the main process or logging is disabled
138
+ if not self.trainer.is_global_zero:
139
+ return
140
+
141
+ # Get step and interval info
142
+ global_step = self.global_step
143
+ log_media_every_n_steps = self.log_media_every_n_steps
144
+
145
+ # Extract input data
146
+ spk_id = batch['spk_id']
147
+ mel_gt = batch['mel']
148
+ rms = batch['rms']
149
+ f0 = batch['f0']
150
+ cvec = batch['cvec']
151
+ frame_len = batch['frame_len']
152
+ cvec_ds = batch.get('cvec_ds', None)
153
+
154
+ # Generate output
155
+ mel_gen, _ = self.model.sample(
156
+ src_mel=mel_gt,
157
+ spk_id=spk_id,
158
+ f0=f0,
159
+ rms=rms,
160
+ cvec=cvec,
161
+ frame_len=frame_len,
162
+ bad_cvec=cvec_ds,
163
+ )
164
+ mel_gen = mel_gen.float()
165
+ mel_gt = mel_gt.float()
166
+
167
+ # Process each sample in the batch
168
+ for i in range(mel_gen.shape[0]):
169
+ sample_idx = batch_idx * mel_gen.shape[0] + i
170
+
171
+ # Generate audio using vocoder
172
+ wav_gen = self.vocoder(mel_gen[i:i+1, :frame_len[i], :].transpose(1, 2), f0[i:i+1, :frame_len[i]])
173
+ wav_gt = self.vocoder(mel_gt[i:i+1, :frame_len[i], :].transpose(1, 2), f0[i:i+1, :frame_len[i]])
174
+ wav_gen = wav_gen.squeeze(0)
175
+ wav_gt = wav_gt.squeeze(0)
176
+
177
+ # Generate mel spectrograms
178
+ mel_gen_i = get_mel_spectrogram(wav_gen).transpose(1, 2)
179
+ mel_gt_i = get_mel_spectrogram(wav_gt).transpose(1, 2)
180
+
181
+ # Clip values to valid range
182
+ mel_min, mel_max = self.model.mel_min, self.model.mel_max
183
+ mel_gen_i = torch.clip(mel_gen_i, min=mel_min, max=mel_max)
184
+ mel_gt_i = torch.clip(mel_gt_i, min=mel_min, max=mel_max)
185
+
186
+ # Calculate metrics
187
+ self.mcd.append(mcd(mel_gen_i, mel_gt_i).cpu().item())
188
+ self.si_snr.append(si_snr(mel_gen_i, mel_gt_i).cpu().item())
189
+ self.psnr.append(psnr(mel_gen_i, mel_gt_i).cpu().item())
190
+ self.mse.append(F.mse_loss(mel_gen_i, mel_gt_i).cpu().item())
191
+
192
+ if log:
193
+ # Create cache directory if it doesn't exist
194
+ os.makedirs('.cache', exist_ok=True)
195
+
196
+ # Log generated audio at specified intervals
197
+ if global_step % log_media_every_n_steps == 0:
198
+ audio_path = f".cache/spk-{spk_id[i].item()}_{sample_idx}_gen.wav"
199
+ torchaudio.save(audio_path, wav_gen.cpu().to(torch.float32), 44100)
200
+ self._log_audio(self.logger, f"val-audio/spk-{spk_id[i].item()}_{sample_idx}-gen", audio_path, global_step)
201
+
202
+ # Log ground truth audio only at the first step
203
+ if global_step == 0:
204
+ gt_audio_path = f".cache/spk-{spk_id[i].item()}_{sample_idx}_gt.wav"
205
+ torchaudio.save(gt_audio_path, wav_gt.cpu().to(torch.float32), 44100)
206
+ self._log_audio(self.logger, f"val-audio/spk-{spk_id[i].item()}_{sample_idx}-gt", gt_audio_path, global_step)
207
+
208
+ # Log mel spectrograms at specified intervals
209
+ if global_step % log_media_every_n_steps == 0:
210
+ # Create mel spectrogram visualization
211
+ data_gt = mel_gt_i.squeeze().T.cpu().numpy()
212
+ data_gen = mel_gen_i.squeeze().T.cpu().numpy()
213
+ data_abs_diff = data_gen - data_gt
214
+ cache_path = f".cache/{sample_idx}_mel.jpg"
215
+ draw_mel_specs(data_gt, data_gen, data_abs_diff, cache_path)
216
+ self._log_image(self.logger, f"val-mel/{sample_idx}_mel", cache_path, global_step)
217
+
218
+ def on_test_start(self):
219
+ self.on_validation_start()
220
+
221
+ def on_test_end(self):
222
+ self.on_validation_end(log=False)
223
+
224
+ def test_step(self, batch, batch_idx):
225
+ self.validation_step(batch, batch_idx, log=False)
226
+
227
+ def on_before_optimizer_step(self, optimizer):
228
+ # Calculate gradient norm
229
+ norm = l2_grad_norm(self.model)
230
+
231
+ # Log gradient norm
232
+ self._log_scalar("train/grad_norm", norm)
233
+
234
+ @property
235
+ def global_step(self):
236
+ return self.trainer.global_step
237
+
238
+ @property
239
+ def log_media_every_n_steps(self):
240
+ if self.log_media_per_steps is not None:
241
+ return self.log_media_per_steps
242
+ if self.save_every_n_steps is None:
243
+ return self.trainer.val_check_interval
244
+ return self.save_every_n_steps
245
+
246
+ @property
247
+ def save_every_n_steps(self):
248
+ for callback in self.trainer.callbacks:
249
+ if hasattr(callback, '_every_n_train_steps'):
250
+ return callback._every_n_train_steps
251
+ return None
252
+
253
+ @property
254
+ def is_using_wandb(self):
255
+ """
256
+ Check if WandB logger is being used.
257
+
258
+ Returns:
259
+ bool: True if WandB logger is being used, False otherwise
260
+ """
261
+ from pytorch_lightning.loggers import WandbLogger
262
+ if isinstance(self.logger, WandbLogger):
263
+ return True
264
+ return False
265
+
266
+ @property
267
+ def is_using_tensorboard(self):
268
+ """
269
+ Check if TensorBoard logger is being used.
270
+
271
+ Returns:
272
+ bool: True if TensorBoard logger is being used, False otherwise
273
+ """
274
+ from pytorch_lightning.loggers import TensorBoardLogger
275
+ if isinstance(self.logger, TensorBoardLogger):
276
+ return True
277
+ return False
278
+
279
+ @property
280
+ def logger_type(self):
281
+ """
282
+ Get a string representation of the logger type.
283
+
284
+ Returns:
285
+ str: 'wandb', 'tensorboard', or 'unknown'
286
+ """
287
+ if self.is_using_wandb:
288
+ return 'wandb'
289
+ elif self.is_using_tensorboard:
290
+ return 'tensorboard'
291
+ else:
292
+ return 'unknown'
293
+
294
+ def state_dict(self, *args, **kwargs):
295
+ # Temporarily store vocoder
296
+ vocoder = self.vocoder
297
+ self.vocoder = None
298
+
299
+ # Get state dict without vocoder
300
+ state = super().state_dict(*args, **kwargs)
301
+
302
+ # Restore vocoder
303
+ self.vocoder = vocoder
304
+ return state
305
+
306
+ # Add helper methods for logging with different logger types
307
+ def _log_scalar(self, name, value, step=None, **kwargs):
308
+ """
309
+ Log a scalar value to the appropriate logger.
310
+
311
+ Args:
312
+ name: Name of the metric
313
+ value: Value of the metric
314
+ step: Step value (defaults to current global step if None)
315
+ **kwargs: Additional arguments to pass to the logger
316
+ """
317
+ if step is None:
318
+ step = self.global_step
319
+
320
+ # Special handling for on_validation_end or on_test_end
321
+ # Get the caller function name to determine if we're in on_validation_end
322
+ caller_frame = inspect.currentframe().f_back
323
+ caller_function = caller_frame.f_code.co_name
324
+
325
+ if caller_function in ['on_validation_end', 'on_test_end']:
326
+ # Use logger.experiment directly as self.log() is not allowed in these hooks
327
+ if self.is_using_wandb:
328
+ self.logger.experiment.log({name: value}, step=step)
329
+ elif self.is_using_tensorboard:
330
+ self.logger.experiment.add_scalar(name, value, step)
331
+ # Add other logger types here if needed
332
+ else:
333
+ # Use PyTorch Lightning's built-in logging system for scalars
334
+ # This handles different logger types automatically
335
+ self.log(name, value, **kwargs)
336
+
337
+ def _log_audio(self, logger, name, file_path, step):
338
+ """
339
+ Log audio to the appropriate logger.
340
+
341
+ Args:
342
+ logger: The logger instance
343
+ name: Name of the audio
344
+ file_path: Path to the audio file
345
+ step: Step value
346
+ """
347
+ try:
348
+ if hasattr(logger, 'experiment') and hasattr(logger.experiment, 'log'):
349
+ # WandbLogger
350
+ import wandb
351
+ logger.experiment.log({
352
+ name: wandb.Audio(file_path, sample_rate=44100)
353
+ }, step=step)
354
+ elif hasattr(logger, 'experiment') and hasattr(logger.experiment, 'add_audio'):
355
+ # TensorBoardLogger
356
+ import soundfile as sf
357
+ audio, sample_rate = sf.read(file_path)
358
+ logger.experiment.add_audio(name, audio, step, sample_rate=44100)
359
+ except Exception as e:
360
+ print(f"Warning: Failed to log audio {name}: {e}")
361
+
362
+ def _log_image(self, logger, name, file_path, step):
363
+ """
364
+ Log an image to the appropriate logger.
365
+
366
+ Args:
367
+ logger: The logger instance
368
+ name: Name of the image
369
+ file_path: Path to the image file
370
+ step: Step value
371
+ """
372
+ try:
373
+ if hasattr(logger, 'experiment') and hasattr(logger.experiment, 'log'):
374
+ # WandbLogger
375
+ import wandb
376
+ logger.experiment.log({
377
+ name: wandb.Image(file_path)
378
+ }, step=step)
379
+ elif hasattr(logger, 'experiment') and hasattr(logger.experiment, 'add_image'):
380
+ # TensorBoardLogger
381
+ import PIL.Image
382
+ import numpy as np
383
+ import torch
384
+ image = PIL.Image.open(file_path)
385
+ image_array = np.array(image)
386
+ image_tensor = torch.from_numpy(image_array).permute(2, 0, 1) # HWC to CHW
387
+ logger.experiment.add_image(name, image_tensor, step)
388
+ except Exception as e:
389
+ print(f"Warning: Failed to log image {name}: {e}")
rift_svc/metrics.py ADDED
@@ -0,0 +1,71 @@
1
+ import torch
2
+
3
+
4
+ def psnr(estimated, target, max_val=None):
5
+ """Calculate Peak Signal-to-Noise Ratio (PSNR)
6
+ Args:
7
+ estimated (torch.Tensor): Estimated mel spectrogram [B, len, n_mel]
8
+ target (torch.Tensor): Target mel spectrogram [B, len, n_mel]
9
+ max_val (float): Maximum value of the signal. If None, uses max of target
10
+ Returns:
11
+ torch.Tensor: PSNR value in dB [B]
12
+ """
13
+ if max_val is None:
14
+ # Use the maximum absolute value between both tensors
15
+ max_val = max(torch.abs(target).max(), torch.abs(estimated).max())
16
+
17
+ # Ensure max_val is not zero
18
+ max_val = max(max_val, torch.finfo(target.dtype).eps)
19
+
20
+ mse = torch.mean((estimated - target) ** 2, dim=(1, 2))
21
+ # Add eps to avoid log of zero
22
+ eps = torch.finfo(target.dtype).eps
23
+ psnr = 20 * torch.log10(max_val + eps) - 10 * torch.log10(mse + eps)
24
+ return psnr
25
+
26
+ def si_snr(estimated, target, eps=1e-8):
27
+ """Calculate Scale-Invariant Signal-to-Noise Ratio (SI-SNR)
28
+ Args:
29
+ estimated (torch.Tensor): Estimated mel spectrogram [B, len, n_mel]
30
+ target (torch.Tensor): Target mel spectrogram [B, len, n_mel]
31
+ eps (float): Small value to avoid division by zero
32
+ Returns:
33
+ torch.Tensor: SI-SNR value in dB [B]
34
+ """
35
+ # Flatten the mel dimension
36
+ estimated = estimated.reshape(estimated.shape[0], -1)
37
+ target = target.reshape(target.shape[0], -1)
38
+
39
+ # Zero-mean normalization
40
+ estimated = estimated - torch.mean(estimated, dim=1, keepdim=True)
41
+ target = target - torch.mean(target, dim=1, keepdim=True)
42
+
43
+ # SI-SNR
44
+ alpha = torch.sum(estimated * target, dim=1, keepdim=True) / (
45
+ torch.sum(target ** 2, dim=1, keepdim=True) + eps)
46
+ target_scaled = alpha * target
47
+
48
+ si_snr = 10 * torch.log10(
49
+ torch.sum(target_scaled ** 2, dim=1) /
50
+ (torch.sum((estimated - target_scaled) ** 2, dim=1) + eps) + eps
51
+ )
52
+ return si_snr
53
+
54
+ def mcd(estimated, target):
55
+ """Calculate Mel-Cepstral Distortion (MCD)
56
+ Args:
57
+ estimated (torch.Tensor): Estimated mel spectrogram [B, len, n_mel]
58
+ target (torch.Tensor): Target mel spectrogram [B, len, n_mel]
59
+ Returns:
60
+ torch.Tensor: MCD value [B], averaged over time steps
61
+ """
62
+ # Convert to log scale
63
+ estimated = torch.log10(torch.clamp(estimated, min=1e-8))
64
+ target = torch.log10(torch.clamp(target, min=1e-8))
65
+
66
+ # Calculate MCD
67
+ diff = estimated - target
68
+ mcd = torch.sqrt(2 * torch.sum(diff ** 2, dim=2)) # [B, len]
69
+ # Average over time dimension
70
+ mcd = mcd.mean(dim=1) # [B]
71
+ return mcd
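A quick sanity check of the three metrics on random mel tensors, following the [B, len, n_mel] convention documented above; identical inputs should give near-zero MCD and large PSNR/SI-SNR:

# Illustration only.
import torch
from rift_svc.metrics import mcd, psnr, si_snr

gen = torch.rand(2, 100, 128)
gt = gen.clone()
print(mcd(gen, gt))     # ~0 per batch element
print(psnr(gen, gt))    # large, since MSE is ~0
print(si_snr(gen, gt))  # large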
rift_svc/modules.py ADDED
@@ -0,0 +1,261 @@
1
+ import math
2
+
3
+ from einops import rearrange
4
+ from jaxtyping import Float, Bool
5
+
6
+ import torch
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+
10
+ from x_transformers.x_transformers import apply_rotary_pos_emb
11
+
12
+
13
+ class LoRALinear(nn.Module):
14
+ def __init__(self, linear, rank, alpha):
15
+ super().__init__()
16
+ self.linear = linear
17
+ self.rank = rank
18
+ self.alpha = alpha
19
+ self.scale = alpha / math.sqrt(rank)
20
+ in_features = linear.in_features
21
+ out_features = linear.out_features
22
+ self.A = nn.Parameter(torch.zeros(in_features, rank))
23
+ self.B = nn.Parameter(torch.zeros(out_features, rank))
24
+ # Initialize LoRA parameters
25
+ nn.init.normal_(self.A, mean=0, std=math.sqrt(self.rank) / self.linear.in_features)
26
+ nn.init.zeros_(self.B)
27
+ # Freeze original linear layer parameters
28
+ self.linear.weight.requires_grad = False
29
+ if self.linear.bias is not None:
30
+ self.linear.bias.requires_grad = False
31
+
32
+ def forward(self, x):
33
+ original_out = self.linear(x)
34
+ lora_out = (x @ self.A) @ self.B.T
35
+ return original_out + lora_out * self.scale
36
+
37
+
38
+ # AdaLayerNormZero
39
+ # return with modulated x for attn input, and params for later mlp modulation
40
+ class AdaLayerNormZero(nn.Module):
41
+ def __init__(self, dim):
42
+ super().__init__()
43
+
44
+ self.silu = nn.SiLU()
45
+ self.proj = nn.Linear(dim, dim * 6)
46
+
47
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
48
+
49
+ def forward(self, x, emb = None):
50
+ emb = self.proj(self.silu(emb))
51
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
52
+
53
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
54
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
55
+
56
+
57
+ # AdaLayerNormZero for final layer
58
+ # return only the modulated x for the attention input, since there is no further MLP modulation
59
+ class AdaLayerNormZero_Final(nn.Module):
60
+ def __init__(self, dim):
61
+ super().__init__()
62
+
63
+ self.silu = nn.SiLU()
64
+ self.proj = nn.Linear(dim, dim * 2)
65
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
66
+
67
+ def forward(self, x, emb):
68
+ emb = self.proj(self.silu(emb))
69
+ scale, shift = torch.chunk(emb, 2, dim=1)
70
+
71
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
72
+ return x
73
+
74
+ # ReLU^2
75
+ class ReLU2(nn.Module):
76
+ def forward(self, x):
77
+ return F.relu(x, inplace=True).square()
78
+
79
+ # FeedForward
80
+ class ConvMLP(nn.Module):
81
+ def __init__(self, dim: int, dim_out: int | None = None, mult: float = 4, dropout: float = 0.0, kernel_size: int = 7):
82
+ super().__init__()
83
+ inner_dim = int(dim * mult)
84
+ dim_out = dim_out if dim_out is not None else dim
85
+
86
+ #self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)
87
+ self.dwconv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=kernel_size//2, groups=dim)
88
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
89
+ self.activation = ReLU2()
90
+ self.dropout = nn.Dropout(dropout)
91
+ self.mlp_proj = nn.Linear(dim, inner_dim)
92
+ self.mlp_out = nn.Linear(inner_dim, dim_out)
93
+
94
+ def forward(self, x):
95
+ x = x.permute(0, 2, 1)
96
+ x = self.dwconv(x)
97
+ x = x.permute(0, 2, 1)
98
+ x = self.norm(x)
99
+ x = self.mlp_proj(x)
100
+ x = self.activation(x)
101
+ x = self.dropout(x)
102
+ x = self.mlp_out(x)
103
+ return x
104
+
105
+
106
+ class Attention(nn.Module):
107
+ def __init__(
108
+ self,
109
+ dim: int,
110
+ head_dim: int = 64,
111
+ dropout: float = 0.0,
112
+ ):
113
+ super().__init__()
114
+
115
+ if not hasattr(F, "scaled_dot_product_attention"):
116
+ raise ImportError("Attention requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
117
+
118
+ self.dim = dim
119
+ assert dim % head_dim == 0
120
+ self.head_dim = head_dim
121
+ self.num_heads = int(dim // head_dim)
122
+ self.inner_dim = dim
123
+ self.dropout = dropout
124
+ self.scale = 1 / dim
125
+
126
+ self.q_proj = nn.Linear(dim, self.inner_dim)
127
+ self.k_proj = nn.Linear(dim, self.inner_dim)
128
+ self.v_proj = nn.Linear(dim, self.inner_dim)
129
+
130
+ self.norm_q = nn.LayerNorm(self.head_dim, elementwise_affine=False, eps=1e-6)
131
+ self.norm_k = nn.LayerNorm(self.head_dim, elementwise_affine=False, eps=1e-6)
132
+
133
+ self.attn_out = nn.Linear(self.inner_dim, dim)
134
+ self.attn_dropout = nn.Dropout(dropout)
135
+
136
+ def forward(
137
+ self,
138
+ x: Float[torch.Tensor, "b n d"],
139
+ mask: Bool[torch.Tensor, "b n"] | None = None,
140
+ rope = None,
141
+ ) -> Float[torch.Tensor, "b n d"]:
142
+ batch_size = x.shape[0]
143
+
144
+ # projections
145
+ query = self.q_proj(x)
146
+ key = self.k_proj(x)
147
+ value = self.v_proj(x)
148
+
149
+ # apply rotary position embedding
150
+ if rope is not None:
151
+ freqs, xpos_scale = rope
152
+ q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale ** -1.) if xpos_scale is not None else (1., 1.)
153
+
154
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
155
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
156
+
157
+ # attention
158
+ inner_dim = key.shape[-1]
159
+ head_dim = inner_dim // self.num_heads
160
+ query = query.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
161
+ key = key.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
162
+ value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
163
+
164
+ query = self.norm_q(query)
165
+ key = self.norm_k(key)
166
+
167
+ # mask
168
+ if mask is not None:
169
+ attn_mask = mask
170
+ attn_mask = rearrange(attn_mask, 'b n -> b 1 1 n')
171
+ attn_mask = attn_mask.expand(batch_size, self.num_heads, query.shape[-2], key.shape[-2])
172
+ else:
173
+ attn_mask = None
174
+
175
+ x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False, scale=self.scale)
176
+ x = x.transpose(1, 2).reshape(batch_size, -1, self.num_heads * head_dim)
177
+ x = x.to(query.dtype)
178
+
179
+ # linear proj and dropout
180
+ x = self.attn_out(x)
181
+ x = self.attn_dropout(x)
182
+
183
+ if mask is not None:
184
+ mask = rearrange(mask, 'b n -> b n 1')
185
+ x = x.masked_fill(~mask, 0.)
186
+
187
+ return x
188
+
189
+
190
+ # DiT Block
191
+ class DiTBlock(nn.Module):
192
+
193
+ def __init__(
194
+ self, dim: int, head_dim: int, ff_mult: float = 4,
195
+ dropout: float = 0.0, kernel_size: int = 31):
196
+ super().__init__()
197
+
198
+ self.attn_norm = AdaLayerNormZero(dim)
199
+ self.attn = Attention(
200
+ dim = dim,
201
+ head_dim = head_dim,
202
+ dropout = dropout,
203
+ )
204
+
205
+ self.mlp_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
206
+ self.mlp = ConvMLP(dim = dim, mult = ff_mult, dropout = dropout, kernel_size=kernel_size)
207
+
208
+ def forward(
209
+ self,
210
+ x: Float[torch.Tensor, "b n d"],
211
+ t: Float[torch.Tensor, "b d"],
212
+ mask: Bool[torch.Tensor, "b n"] | None = None,
213
+ rope: Float[torch.Tensor, "b d"] | None = None,
214
+ ) -> Float[torch.Tensor, "b n d"]:
215
+ # pre-norm & modulation for attention input
216
+ norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
217
+
218
+ # attention
219
+ attn_output = self.attn(x=norm, mask=mask, rope=rope)
220
+
221
+ # process attention output for input x
222
+ x = x + gate_msa.unsqueeze(1) * attn_output
223
+
224
+ norm = self.mlp_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
225
+ mlp_output = self.mlp(norm)
226
+ x = x + gate_mlp.unsqueeze(1) * mlp_output
227
+
228
+ return x
229
+
230
+
231
+ # sinusoidal position embedding
232
+ class SinusPositionEmbedding(nn.Module):
233
+ def __init__(self, dim: int):
234
+ super().__init__()
235
+ self.dim = dim
236
+
237
+ def forward(self, x: Float[torch.Tensor, "b"], scale: float = 1000) -> Float[torch.Tensor, "b d"]:
238
+ device = x.device
239
+ half_dim = self.dim // 2
240
+ emb = math.log(10000) / (half_dim - 1)
241
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
242
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
243
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
244
+ return emb
245
+
246
+
247
+ # time step conditioning embedding
248
+ class TimestepEmbedding(nn.Module):
249
+ def __init__(self, dim: int, freq_embed_dim: int = 256):
250
+ super().__init__()
251
+ self.time2emb = SinusPositionEmbedding(freq_embed_dim)
252
+ self.time_emb = nn.Linear(freq_embed_dim, dim)
253
+ self.act = nn.SiLU()
254
+ self.proj = nn.Linear(dim, dim)
255
+
256
+ def forward(self, timestep: Float[torch.Tensor, "b"]) -> Float[torch.Tensor, "b d"]:
257
+ time = self.time2emb(timestep)
258
+ time = self.time_emb(time)
259
+ time = self.act(time)
260
+ time = self.proj(time)
261
+ return time
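A small sketch of LoRALinear on a plain linear layer, mirroring what apply_lora/merge_lora in dit.py do for the attention k/v projections; rank and alpha here are arbitrary:

# Illustration only.
import torch
from torch import nn
from rift_svc.modules import LoRALinear

base = nn.Linear(768, 768)
lora = LoRALinear(base, rank=16, alpha=16)

x = torch.randn(2, 100, 768)
print(lora(x).shape)                                # torch.Size([2, 100, 768])
print(base.weight.requires_grad)                    # False: wrapped weight is frozen
print(lora.A.requires_grad, lora.B.requires_grad)   # True True: only the low-rank factors train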
rift_svc/nsf_hifigan/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .nvSTFT import STFT
+ from .vocoder import Vocoder, NsfHifiGAN
rift_svc/nsf_hifigan/env.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ import shutil
+ 
+ 
+ class AttrDict(dict):
+     def __init__(self, *args, **kwargs):
+         super(AttrDict, self).__init__(*args, **kwargs)
+         self.__dict__ = self
+ 
+ 
+ def build_env(config, config_name, path):
+     t_path = os.path.join(path, config_name)
+     if config != t_path:
+         os.makedirs(path, exist_ok=True)
+         shutil.copyfile(config, os.path.join(path, config_name))
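AttrDict just exposes dict keys as attributes, which is how the vocoder config is consumed elsewhere (e.g. h.sampling_rate in models.py); a tiny example:

from rift_svc.nsf_hifigan.env import AttrDict

h = AttrDict({"sampling_rate": 44100, "num_mels": 128})
print(h.sampling_rate, h["num_mels"])  # 44100 128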
rift_svc/nsf_hifigan/models.py ADDED
@@ -0,0 +1,427 @@
1
+ import os
2
+ import json
3
+ from .env import AttrDict
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.nn as nn
8
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
9
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
10
+ from .utils import init_weights, get_padding
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+
15
+ def load_model(model_path, device='cuda'):
16
+ h = load_config(model_path)
17
+
18
+ generator = Generator(h).to(device)
19
+
20
+ cp_dict = torch.load(model_path, map_location=device, weights_only=True)
21
+ generator.load_state_dict(cp_dict['generator'])
22
+ generator.eval()
23
+ generator.remove_weight_norm()
24
+ del cp_dict
25
+ return generator, h
26
+
27
+ def load_config(model_path):
28
+ config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
29
+ with open(config_file) as f:
30
+ data = f.read()
31
+
32
+ json_config = json.loads(data)
33
+ h = AttrDict(json_config)
34
+ return h
35
+
36
+
37
+ class ResBlock1(torch.nn.Module):
38
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
39
+ super(ResBlock1, self).__init__()
40
+ self.h = h
41
+ self.convs1 = nn.ModuleList([
42
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
43
+ padding=get_padding(kernel_size, dilation[0]))),
44
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
45
+ padding=get_padding(kernel_size, dilation[1]))),
46
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
47
+ padding=get_padding(kernel_size, dilation[2])))
48
+ ])
49
+ self.convs1.apply(init_weights)
50
+
51
+ self.convs2 = nn.ModuleList([
52
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
53
+ padding=get_padding(kernel_size, 1))),
54
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
55
+ padding=get_padding(kernel_size, 1))),
56
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
57
+ padding=get_padding(kernel_size, 1)))
58
+ ])
59
+ self.convs2.apply(init_weights)
60
+
61
+ def forward(self, x):
62
+ for c1, c2 in zip(self.convs1, self.convs2):
63
+ xt = F.leaky_relu(x, LRELU_SLOPE)
64
+ xt = c1(xt)
65
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
66
+ xt = c2(xt)
67
+ x = xt + x
68
+ return x
69
+
70
+ def remove_weight_norm(self):
71
+ for l in self.convs1:
72
+ remove_weight_norm(l)
73
+ for l in self.convs2:
74
+ remove_weight_norm(l)
75
+
76
+
77
+ class ResBlock2(torch.nn.Module):
78
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
79
+ super(ResBlock2, self).__init__()
80
+ self.h = h
81
+ self.convs = nn.ModuleList([
82
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
83
+ padding=get_padding(kernel_size, dilation[0]))),
84
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
85
+ padding=get_padding(kernel_size, dilation[1])))
86
+ ])
87
+ self.convs.apply(init_weights)
88
+
89
+ def forward(self, x):
90
+ for c in self.convs:
91
+ xt = F.leaky_relu(x, LRELU_SLOPE)
92
+ xt = c(xt)
93
+ x = xt + x
94
+ return x
95
+
96
+ def remove_weight_norm(self):
97
+ for l in self.convs:
98
+ remove_weight_norm(l)
99
+
100
+
101
+ class SineGen(torch.nn.Module):
102
+ """ Definition of sine generator
103
+ SineGen(samp_rate, harmonic_num = 0,
104
+ sine_amp = 0.1, noise_std = 0.003,
105
+ voiced_threshold = 0,
106
+ flag_for_pulse=False)
107
+ samp_rate: sampling rate in Hz
108
+ harmonic_num: number of harmonic overtones (default 0)
109
+ sine_amp: amplitude of sine-waveform (default 0.1)
110
+ noise_std: std of Gaussian noise (default 0.003)
111
+ voiced_threshold: F0 threshold for U/V classification (default 0)
112
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
113
+ Note: when flag_for_pulse is True, the first time step of a voiced
114
+ segment is always sin(np.pi) or cos(0)
115
+ """
116
+
117
+ def __init__(self, samp_rate, harmonic_num=0,
118
+ sine_amp=0.1, noise_std=0.003,
119
+ voiced_threshold=0):
120
+ super(SineGen, self).__init__()
121
+ self.sine_amp = sine_amp
122
+ self.noise_std = noise_std
123
+ self.harmonic_num = harmonic_num
124
+ self.dim = self.harmonic_num + 1
125
+ self.sampling_rate = samp_rate
126
+ self.voiced_threshold = voiced_threshold
127
+
128
+ def _f02uv(self, f0):
129
+ # generate uv signal
130
+ uv = torch.ones_like(f0)
131
+ uv = uv * (f0 > self.voiced_threshold)
132
+ return uv
133
+
134
+ def _f02sine(self, f0, upp):
135
+ """ f0: (batchsize, length, dim)
136
+ where dim indicates fundamental tone and overtones
137
+ """
138
+ rad = f0 / self.sampling_rate * torch.arange(1, upp + 1, device=f0.device)
139
+ rad2 = torch.fmod(rad[..., -1:].float() + 0.5, 1.0) - 0.5
140
+ rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0)
141
+ rad += F.pad(rad_acc, (0, 0, 1, -1))
142
+ rad = rad.reshape(f0.shape[0], -1, 1)
143
+ rad = torch.multiply(rad, torch.arange(1, self.dim + 1, device=f0.device).reshape(1, 1, -1))
144
+ rand_ini = torch.rand(1, 1, self.dim, device=f0.device)
145
+ rand_ini[..., 0] = 0
146
+ rad += rand_ini
147
+ sines = torch.sin(2 * np.pi * rad)
148
+ return sines
149
+
150
+ @torch.no_grad()
151
+ def forward(self, f0, upp):
152
+ """ sine_tensor, uv = forward(f0)
153
+ input F0: tensor(batchsize=1, length, dim=1)
154
+ f0 for unvoiced steps should be 0
155
+ output sine_tensor: tensor(batchsize=1, length, dim)
156
+ output uv: tensor(batchsize=1, length, 1)
157
+ """
158
+ f0 = f0.unsqueeze(-1)
159
+ sine_waves = self._f02sine(f0, upp) * self.sine_amp
160
+ uv = (f0 > self.voiced_threshold).float()
161
+ uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
162
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
163
+ noise = noise_amp * torch.randn_like(sine_waves)
164
+ sine_waves = sine_waves * uv + noise
165
+ return sine_waves
166
+
167
+
168
+ class SourceModuleHnNSF(torch.nn.Module):
169
+ """ SourceModule for hn-nsf
170
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
171
+ add_noise_std=0.003, voiced_threshod=0)
172
+ sampling_rate: sampling_rate in Hz
173
+ harmonic_num: number of harmonic above F0 (default: 0)
174
+ sine_amp: amplitude of sine source signal (default: 0.1)
175
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
176
+ note that amplitude of noise in unvoiced is decided
177
+ by sine_amp
178
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
179
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
180
+ F0_sampled (batchsize, length, 1)
181
+ Sine_source (batchsize, length, 1)
182
+ noise_source (batchsize, length 1)
183
+ uv (batchsize, length, 1)
184
+ """
185
+
186
+ def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
187
+ add_noise_std=0.003, voiced_threshod=0):
188
+ super(SourceModuleHnNSF, self).__init__()
189
+
190
+ self.sine_amp = sine_amp
191
+ self.noise_std = add_noise_std
192
+
193
+ # to produce sine waveforms
194
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
195
+ sine_amp, add_noise_std, voiced_threshod)
196
+
197
+ # to merge source harmonics into a single excitation
198
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
199
+ self.l_tanh = torch.nn.Tanh()
200
+
201
+ def forward(self, x, upp):
202
+ sine_wavs = self.l_sin_gen(x, upp)
203
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
204
+ return sine_merge
205
+
206
+
207
+ class Generator(torch.nn.Module):
208
+ def __init__(self, h):
209
+ super(Generator, self).__init__()
210
+ self.h = h
211
+ self.num_kernels = len(h.resblock_kernel_sizes)
212
+ self.num_upsamples = len(h.upsample_rates)
213
+ self.m_source = SourceModuleHnNSF(
214
+ sampling_rate=h.sampling_rate,
215
+ harmonic_num=8
216
+ )
217
+ self.noise_convs = nn.ModuleList()
218
+ self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
219
+ resblock = ResBlock1 if h.resblock == '1' else ResBlock2
220
+
221
+ self.ups = nn.ModuleList()
222
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
223
+ c_cur = h.upsample_initial_channel // (2 ** (i + 1))
224
+ self.ups.append(weight_norm(
225
+ ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)),
226
+ k, u, padding=(k - u) // 2)))
227
+ if i + 1 < len(h.upsample_rates): #
228
+ stride_f0 = int(np.prod(h.upsample_rates[i + 1:]))
229
+ self.noise_convs.append(Conv1d(
230
+ 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
231
+ else:
232
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
233
+ self.resblocks = nn.ModuleList()
234
+ ch = h.upsample_initial_channel
235
+ for i in range(len(self.ups)):
236
+ ch //= 2
237
+ for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
238
+ self.resblocks.append(resblock(h, ch, k, d))
239
+
240
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
241
+ self.ups.apply(init_weights)
242
+ self.conv_post.apply(init_weights)
243
+ self.upp = int(np.prod(h.upsample_rates))
244
+
245
+ def forward(self, x, f0):
246
+ har_source = self.m_source(f0, self.upp).transpose(1, 2)
247
+ x = self.conv_pre(x)
248
+ for i in range(self.num_upsamples):
249
+ x = F.leaky_relu(x, LRELU_SLOPE)
250
+ x = self.ups[i](x)
251
+ x_source = self.noise_convs[i](har_source)
252
+ x = x + x_source
253
+ xs = None
254
+ for j in range(self.num_kernels):
255
+ if xs is None:
256
+ xs = self.resblocks[i * self.num_kernels + j](x)
257
+ else:
258
+ xs += self.resblocks[i * self.num_kernels + j](x)
259
+ x = xs / self.num_kernels
260
+ x = F.leaky_relu(x)
261
+ x = self.conv_post(x)
262
+ x = torch.tanh(x)
263
+
264
+ return x
265
+
266
+ def remove_weight_norm(self):
267
+ print('Removing weight norm...')
268
+ for l in self.ups:
269
+ remove_weight_norm(l)
270
+ for l in self.resblocks:
271
+ l.remove_weight_norm()
272
+ remove_weight_norm(self.conv_pre)
273
+ remove_weight_norm(self.conv_post)
274
+
275
+
276
+ class DiscriminatorP(torch.nn.Module):
277
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
278
+ super(DiscriminatorP, self).__init__()
279
+ self.period = period
280
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
281
+ self.convs = nn.ModuleList([
282
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
283
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
284
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
285
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
286
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
287
+ ])
288
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
289
+
290
+ def forward(self, x):
291
+ fmap = []
292
+
293
+ # 1d to 2d
294
+ b, c, t = x.shape
295
+ if t % self.period != 0: # pad first
296
+ n_pad = self.period - (t % self.period)
297
+ x = F.pad(x, (0, n_pad), "reflect")
298
+ t = t + n_pad
299
+ x = x.view(b, c, t // self.period, self.period)
300
+
301
+ for l in self.convs:
302
+ x = l(x)
303
+ x = F.leaky_relu(x, LRELU_SLOPE)
304
+ fmap.append(x)
305
+ x = self.conv_post(x)
306
+ fmap.append(x)
307
+ x = torch.flatten(x, 1, -1)
308
+
309
+ return x, fmap
310
+
311
+
312
+ class MultiPeriodDiscriminator(torch.nn.Module):
313
+ def __init__(self, periods=None):
314
+ super(MultiPeriodDiscriminator, self).__init__()
315
+ self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
316
+ self.discriminators = nn.ModuleList()
317
+ for period in self.periods:
318
+ self.discriminators.append(DiscriminatorP(period))
319
+
320
+ def forward(self, y, y_hat):
321
+ y_d_rs = []
322
+ y_d_gs = []
323
+ fmap_rs = []
324
+ fmap_gs = []
325
+ for i, d in enumerate(self.discriminators):
326
+ y_d_r, fmap_r = d(y)
327
+ y_d_g, fmap_g = d(y_hat)
328
+ y_d_rs.append(y_d_r)
329
+ fmap_rs.append(fmap_r)
330
+ y_d_gs.append(y_d_g)
331
+ fmap_gs.append(fmap_g)
332
+
333
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
334
+
335
+
336
+ class DiscriminatorS(torch.nn.Module):
337
+ def __init__(self, use_spectral_norm=False):
338
+ super(DiscriminatorS, self).__init__()
339
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
340
+ self.convs = nn.ModuleList([
341
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
342
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
343
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
344
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
345
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
346
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
347
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
348
+ ])
349
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
350
+
351
+ def forward(self, x):
352
+ fmap = []
353
+ for l in self.convs:
354
+ x = l(x)
355
+ x = F.leaky_relu(x, LRELU_SLOPE)
356
+ fmap.append(x)
357
+ x = self.conv_post(x)
358
+ fmap.append(x)
359
+ x = torch.flatten(x, 1, -1)
360
+
361
+ return x, fmap
362
+
363
+
364
+ class MultiScaleDiscriminator(torch.nn.Module):
365
+ def __init__(self):
366
+ super(MultiScaleDiscriminator, self).__init__()
367
+ self.discriminators = nn.ModuleList([
368
+ DiscriminatorS(use_spectral_norm=True),
369
+ DiscriminatorS(),
370
+ DiscriminatorS(),
371
+ ])
372
+ self.meanpools = nn.ModuleList([
373
+ AvgPool1d(4, 2, padding=2),
374
+ AvgPool1d(4, 2, padding=2)
375
+ ])
376
+
377
+ def forward(self, y, y_hat):
378
+ y_d_rs = []
379
+ y_d_gs = []
380
+ fmap_rs = []
381
+ fmap_gs = []
382
+ for i, d in enumerate(self.discriminators):
383
+ if i != 0:
384
+ y = self.meanpools[i - 1](y)
385
+ y_hat = self.meanpools[i - 1](y_hat)
386
+ y_d_r, fmap_r = d(y)
387
+ y_d_g, fmap_g = d(y_hat)
388
+ y_d_rs.append(y_d_r)
389
+ fmap_rs.append(fmap_r)
390
+ y_d_gs.append(y_d_g)
391
+ fmap_gs.append(fmap_g)
392
+
393
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
394
+
395
+
396
+ def feature_loss(fmap_r, fmap_g):
397
+ loss = 0
398
+ for dr, dg in zip(fmap_r, fmap_g):
399
+ for rl, gl in zip(dr, dg):
400
+ loss += torch.mean(torch.abs(rl - gl))
401
+
402
+ return loss * 2
403
+
404
+
405
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
406
+ loss = 0
407
+ r_losses = []
408
+ g_losses = []
409
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
410
+ r_loss = torch.mean((1 - dr) ** 2)
411
+ g_loss = torch.mean(dg ** 2)
412
+ loss += (r_loss + g_loss)
413
+ r_losses.append(r_loss.item())
414
+ g_losses.append(g_loss.item())
415
+
416
+ return loss, r_losses, g_losses
417
+
418
+
419
+ def generator_loss(disc_outputs):
420
+ loss = 0
421
+ gen_losses = []
422
+ for dg in disc_outputs:
423
+ l = torch.mean((1 - dg) ** 2)
424
+ gen_losses.append(l)
425
+ loss += l
426
+
427
+ return loss, gen_losses
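
For context, a minimal sketch of how the discriminators and loss functions above are typically wired together in a HiFi-GAN-style training step. This is illustrative only and not part of this commit; y and y_hat are assumed to be real and generated waveform tensors of shape [B, 1, T].

mpd = MultiPeriodDiscriminator()
msd = MultiScaleDiscriminator()

def gan_losses(y, y_hat):
    # Discriminator side: score real vs. generated audio, with the generator detached
    y_df_r, y_df_g, _, _ = mpd(y, y_hat.detach())
    y_ds_r, y_ds_g, _, _ = msd(y, y_hat.detach())
    loss_d = discriminator_loss(y_df_r, y_df_g)[0] + discriminator_loss(y_ds_r, y_ds_g)[0]

    # Generator side: adversarial terms plus feature matching on both discriminators
    y_df_r, y_df_g, fmap_f_r, fmap_f_g = mpd(y, y_hat)
    y_ds_r, y_ds_g, fmap_s_r, fmap_s_g = msd(y, y_hat)
    loss_g = (generator_loss(y_df_g)[0] + generator_loss(y_ds_g)[0]
              + feature_loss(fmap_f_r, fmap_f_g) + feature_loss(fmap_s_r, fmap_s_g))
    return loss_d, loss_g
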
rift_svc/nsf_hifigan/nvSTFT.py ADDED
@@ -0,0 +1,124 @@
1
+ import math
2
+ import os
3
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
4
+ import random
5
+ import torch
6
+ import torch.utils.data
7
+ import numpy as np
8
+ import librosa
9
+ from librosa.util import normalize
10
+ from librosa.filters import mel as librosa_mel_fn
11
+ from scipy.io.wavfile import read
12
+ import soundfile as sf
13
+ import torch.nn.functional as F
14
+
15
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
16
+ sampling_rate = None
17
+ try:
18
+ data, sampling_rate = sf.read(full_path, always_2d=True)  # load the audio with soundfile
19
+ except Exception as ex:
20
+ print(f"'{full_path}' failed to load.\nException:")
21
+ print(ex)
22
+ if return_empty_on_exception:
23
+ return [], sampling_rate or target_sr or 48000
24
+ else:
25
+ raise Exception(ex)
26
+
27
+ if len(data.shape) > 1:
28
+ data = data[:, 0]
29
+ assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
30
+
31
+ if np.issubdtype(data.dtype, np.integer): # if audio data is type int
32
+ max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX
33
+ else: # if audio data is type fp32
34
+ max_mag = max(np.amax(data), -np.amin(data))
35
+ max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
36
+
37
+ data = torch.FloatTensor(data.astype(np.float32))/max_mag
38
+
39
+ if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
40
+ return [], sampling_rate or target_sr or 48000
41
+ if target_sr is not None and sampling_rate != target_sr:
42
+ data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
43
+ sampling_rate = target_sr
44
+
45
+ return data, sampling_rate
46
+
47
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
48
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
49
+
50
+ def dynamic_range_decompression(x, C=1):
51
+ return np.exp(x) / C
52
+
53
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
54
+ return torch.log(torch.clamp(x, min=clip_val) * C)
55
+
56
+ def dynamic_range_decompression_torch(x, C=1):
57
+ return torch.exp(x) / C
58
+
59
+ class STFT():
60
+ def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
61
+ self.target_sr = sr
62
+
63
+ self.n_mels = n_mels
64
+ self.n_fft = n_fft
65
+ self.win_size = win_size
66
+ self.hop_length = hop_length
67
+ self.fmin = fmin
68
+ self.fmax = fmax
69
+ self.clip_val = clip_val
70
+ self.mel_basis = {}
71
+ self.hann_window = {}
72
+
73
+ def get_mel(self, y, keyshift=0, speed=1, center=False):
74
+ sampling_rate = self.target_sr
75
+ n_mels = self.n_mels
76
+ n_fft = self.n_fft
77
+ win_size = self.win_size
78
+ hop_length = self.hop_length
79
+ fmin = self.fmin
80
+ fmax = self.fmax
81
+ clip_val = self.clip_val
82
+
83
+ factor = 2 ** (keyshift / 12)
84
+ n_fft_new = int(np.round(n_fft * factor))
85
+ win_size_new = int(np.round(win_size * factor))
86
+ hop_length_new = int(np.round(hop_length * speed))
87
+
88
+ mel_basis_key = str(fmax)+'_'+str(y.device)
89
+ if mel_basis_key not in self.mel_basis:
90
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
91
+ self.mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
92
+
93
+ keyshift_key = str(keyshift)+'_'+str(y.device)
94
+ if keyshift_key not in self.hann_window:
95
+ self.hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
96
+
97
+ pad_left = (win_size_new - hop_length_new) //2
98
+ pad_right = max((win_size_new- hop_length_new + 1) //2, win_size_new - y.size(-1) - pad_left)
99
+ if pad_right < y.size(-1):
100
+ mode = 'reflect'
101
+ else:
102
+ mode = 'constant'
103
+ y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode = mode)
104
+ y = y.squeeze(1)
105
+
106
+ spec = torch.stft(y, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=self.hann_window[keyshift_key],
107
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
108
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
109
+ if keyshift != 0:
110
+ size = n_fft // 2 + 1
111
+ resize = spec.size(1)
112
+ if resize < size:
113
+ spec = F.pad(spec, (0, 0, 0, size-resize))
114
+ spec = spec[:, :size, :] * win_size / win_size_new
115
+ spec = torch.matmul(self.mel_basis[mel_basis_key], spec)
116
+ spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
117
+ return spec
118
+
119
+ def __call__(self, audiopath):
120
+ audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
121
+ spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
122
+ return spect
123
+
124
+ stft = STFT()
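
A brief usage sketch for the STFT helper above. The 44.1 kHz parameter values and the wav path are illustrative placeholders, not values taken from this repository's configs.

stft_44k = STFT(sr=44100, n_mels=128, n_fft=2048, win_size=2048, hop_length=512, fmin=40, fmax=16000)
audio, sr = load_wav_to_torch('input.wav', target_sr=44100)  # hypothetical file
mel = stft_44k.get_mel(audio.unsqueeze(0))                   # [1, n_mels, n_frames], log-compressed
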
rift_svc/nsf_hifigan/utils.py ADDED
@@ -0,0 +1,67 @@
1
+ import glob
2
+ import os
3
+ import matplotlib
4
+ import torch
5
+ from torch.nn.utils import weight_norm
6
+ matplotlib.use("Agg")
7
+ import matplotlib.pylab as plt
8
+
9
+
10
+ def plot_spectrogram(spectrogram):
11
+ fig, ax = plt.subplots(figsize=(10, 2))
12
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
13
+ interpolation='none')
14
+ plt.colorbar(im, ax=ax)
15
+
16
+ fig.canvas.draw()
17
+ plt.close()
18
+
19
+ return fig
20
+
21
+
22
+ def init_weights(m, mean=0.0, std=0.01):
23
+ classname = m.__class__.__name__
24
+ if classname.find("Conv") != -1:
25
+ m.weight.data.normal_(mean, std)
26
+
27
+
28
+ def apply_weight_norm(m):
29
+ classname = m.__class__.__name__
30
+ if classname.find("Conv") != -1:
31
+ weight_norm(m)
32
+
33
+
34
+ def get_padding(kernel_size, dilation=1):
35
+ return int((kernel_size*dilation - dilation)/2)
36
+
37
+
38
+ def load_checkpoint(filepath, device):
39
+ assert os.path.isfile(filepath)
40
+ print("Loading '{}'".format(filepath))
41
+ checkpoint_dict = torch.load(filepath, map_location=device)
42
+ print("Complete.")
43
+ return checkpoint_dict
44
+
45
+
46
+ def save_checkpoint(filepath, obj):
47
+ print("Saving checkpoint to {}".format(filepath))
48
+ torch.save(obj, filepath)
49
+ print("Complete.")
50
+
51
+
52
+ def del_old_checkpoints(cp_dir, prefix, n_models=2):
53
+ pattern = os.path.join(cp_dir, prefix + '????????')
54
+ cp_list = glob.glob(pattern) # get checkpoint paths
55
+ cp_list = sorted(cp_list)# sort by iter
56
+ if len(cp_list) > n_models: # if more than n_models models are found
57
+ for cp in cp_list[:-n_models]:  # delete the oldest models other than the latest n_models
58
+ open(cp, 'w').close()# empty file contents
59
+ os.unlink(cp)# delete file (move to trash when using Colab)
60
+
61
+
62
+ def scan_checkpoint(cp_dir, prefix):
63
+ pattern = os.path.join(cp_dir, prefix + '????????')
64
+ cp_list = glob.glob(pattern)
65
+ if len(cp_list) == 0:
66
+ return None
67
+ return sorted(cp_list)[-1]
rift_svc/nsf_hifigan/vocoder.py ADDED
@@ -0,0 +1,123 @@
1
+ import os
2
+ import yaml
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import numpy as np
7
+ from .nvSTFT import STFT
8
+ from .models import load_model,load_config
9
+ from torchaudio.transforms import Resample
10
+ from jaxtyping import Float
11
+
12
+
13
+ class DotDict(dict):
14
+ def __getattr__(*args):
15
+ val = dict.get(*args)
16
+ return DotDict(val) if type(val) is dict else val
17
+
18
+ __setattr__ = dict.__setitem__
19
+ __delattr__ = dict.__delitem__
20
+
21
+
22
+ def load_model_vocoder(
23
+ model_path,
24
+ device='cpu'):
25
+ config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
26
+ with open(config_file, "r") as config:
27
+ args = yaml.safe_load(config)
28
+ args = DotDict(args)
29
+
30
+ # load vocoder
31
+ vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=device)
32
+
33
+ return vocoder, args
34
+
35
+
36
+ class Vocoder:
37
+ def __init__(self, vocoder_type, vocoder_ckpt, device = None):
38
+ if device is None:
39
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
40
+ self.device = device
41
+
42
+ if vocoder_type == 'nsf-hifigan':
43
+ self.vocoder = NsfHifiGAN(vocoder_ckpt, device = device)
44
+ elif vocoder_type == 'nsf-hifigan-log10':
45
+ self.vocoder = NsfHifiGANLog10(vocoder_ckpt, device = device)
46
+ else:
47
+ raise ValueError(f" [x] Unknown vocoder: {vocoder_type}")
48
+
49
+ self.resample_kernel = {}
50
+ self.vocoder_sample_rate = self.vocoder.sample_rate()
51
+ self.vocoder_hop_size = self.vocoder.hop_size()
52
+ self.dimension = self.vocoder.dimension()
53
+
54
+ def extract(self, audio, sample_rate=0, keyshift=0):
55
+
56
+ # resample
57
+ if sample_rate == self.vocoder_sample_rate or sample_rate == 0:
58
+ audio_res = audio
59
+ else:
60
+ key_str = str(sample_rate)
61
+ if key_str not in self.resample_kernel:
62
+ self.resample_kernel[key_str] = Resample(sample_rate, self.vocoder_sample_rate, lowpass_filter_width = 128).to(self.device)
63
+ audio_res = self.resample_kernel[key_str](audio)
64
+
65
+ # extract
66
+ mel = self.vocoder.extract(audio_res, keyshift=keyshift) # B, n_frames, bins
67
+ return mel
68
+
69
+ def infer(self, mel, f0):
70
+ f0 = f0[:,:mel.size(1),0] # B, n_frames
71
+ audio = self.vocoder(mel, f0)
72
+ return audio
73
+
74
+
75
+ class NsfHifiGAN(torch.nn.Module):
76
+ def __init__(self, model_path, device=None):
77
+ super().__init__()
78
+ if device is None:
79
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
80
+ self.device = device
81
+ self.model_path = model_path
82
+ self.model = None
83
+ self.h = load_config(model_path)
84
+ self.stft = STFT(
85
+ self.h.sampling_rate,
86
+ self.h.num_mels,
87
+ self.h.n_fft,
88
+ self.h.win_size,
89
+ self.h.hop_size,
90
+ self.h.fmin,
91
+ self.h.fmax)
92
+
93
+ def sample_rate(self):
94
+ return self.h.sampling_rate
95
+
96
+ def hop_size(self):
97
+ return self.h.hop_size
98
+
99
+ def dimension(self):
100
+ return self.h.num_mels
101
+
102
+ def extract(self, audio, keyshift=0):
103
+ mel = self.stft.get_mel(audio, keyshift=keyshift).transpose(1, 2) # B, n_frames, bins
104
+ return mel
105
+
106
+ def forward(self, mel: Float[torch.Tensor, "batch bins n_frames"], f0: Float[torch.Tensor, "batch n_frames"]):
107
+ if self.model is None:
108
+ print('| Load HifiGAN: ', self.model_path)
109
+ self.model, self.h = load_model(self.model_path, device=self.device)
110
+ with torch.no_grad():
111
+ audio = self.model(mel, f0)
112
+ return audio
113
+
114
+
115
+ class NsfHifiGANLog10(NsfHifiGAN):
116
+ def forward(self, mel, f0):
117
+ if self.model is None:
118
+ print('| Load HifiGAN: ', self.model_path)
119
+ self.model, self.h = load_model(self.model_path, device=self.device)
120
+ with torch.no_grad():
121
+ c = 0.434294 * mel.transpose(1, 2)
122
+ audio = self.model(c, f0)
123
+ return audio
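
A short usage sketch for the Vocoder wrapper above. The checkpoint path is an example only; audio is assumed to be a [B, n_samples] tensor and f0 a [B, n_frames, 1] tensor on the same device.

vocoder = Vocoder('nsf-hifigan', 'pretrained/nsf_hifigan/model', device='cuda')
mel = vocoder.extract(audio, sample_rate=44100)  # [B, n_frames, n_mels]
wav = vocoder.infer(mel, f0)                     # reconstructed waveform
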
rift_svc/optim.py ADDED
@@ -0,0 +1,103 @@
1
+ from schedulefree import AdamWScheduleFree
2
+ from torch.optim import AdamW
3
+ from torch.optim.lr_scheduler import _LRScheduler
4
+ import math
5
+
6
+
7
+ def get_optimizer(
8
+ optimizer_type, model, lr, betas, weight_decay, warmup_steps,
9
+ lora_training=False, **kwargs):
10
+ from collections import defaultdict
11
+ param_dict = {pn: p for pn, p in model.named_parameters() if p.requires_grad}
12
+ if not lora_training:
13
+ specp_decay_params = defaultdict(list)
14
+ specp_decay_lr = {}
15
+ decay_params = []
16
+ nodecay_params = []
17
+ for n, p in param_dict.items():
18
+ if p.dim() >= 2:
19
+ if n.endswith('out.weight') or n.endswith('proj.weight'):
20
+ fan_out, fan_in = p.shape[-2:]
21
+ fan_ratio = fan_out / fan_in
22
+ specp_decay_params[f"specp_decay_{fan_ratio:.2f}"].append(p)
23
+ specp_decay_lr[f"specp_decay_{fan_ratio:.2f}"] = lr * fan_ratio
24
+ else:
25
+ decay_params.append(p)
26
+ else:
27
+ nodecay_params.append(p)
28
+
29
+ optim_groups = [
30
+ {'params': decay_params, 'weight_decay': weight_decay, 'lr': lr},
31
+ {'params': nodecay_params, 'weight_decay': 0.0, 'lr': lr}
32
+ ] + [
33
+ {'params': params, 'weight_decay': weight_decay, 'lr': specp_decay_lr[group_name]}
34
+ for group_name, params in specp_decay_params.items()
35
+ ]
36
+ else:
37
+ lora_a_or_spk_embed_params = []
38
+ lora_b_params = []
39
+ for n, p in param_dict.items():
40
+ if n.endswith('.A.weight') or n.endswith('.spk_embed.weight'):
41
+ lora_a_or_spk_embed_params.append(p)
42
+ elif n.endswith('.B.weight'):
43
+ lora_b_params.append(p)
44
+ dim = model.transformer.dim
45
+ rank = model.transformer.transformer_blocks[0].attn.k_proj.rank
46
+ optim_groups = [
47
+ {'params': lora_a_or_spk_embed_params, 'weight_decay': weight_decay, 'lr': lr},
48
+ {'params': lora_b_params, 'weight_decay': weight_decay, 'lr': lr*math.sqrt(dim/rank)}
49
+ ]
50
+ if optimizer_type == 'adamwsf':
51
+ optimizer = AdamWScheduleFree(optim_groups, betas=betas, warmup_steps=warmup_steps)
52
+ return optimizer, None
53
+ elif optimizer_type == 'adamw':
54
+ optimizer = AdamW(optim_groups, betas=betas, weight_decay=weight_decay)
55
+ max_steps = kwargs['max_steps']
56
+ min_lr = kwargs.get('min_lr', 0.0)
57
+ lr_scheduler = LinearWarmupDecayLR(optimizer, warmup_steps, max_steps, min_lr=min_lr)
58
+ return optimizer, lr_scheduler
59
+ else:
60
+ raise ValueError(f"Invalid optimizer type: {optimizer_type}")
61
+
62
+
63
+ class LinearWarmupDecayLR(_LRScheduler):
64
+ """
65
+ Linear learning rate scheduler with warmup and minimum lr.
66
+
67
+ During warmup, the LR increases linearly from 0 to the base LR.
68
+ After warmup, the LR decays linearly from the base LR down to min_lr.
69
+
70
+ Args:
71
+ optimizer (Optimizer): Wrapped optimizer.
72
+ warmup_steps (int): Number of steps to linearly increase LR.
73
+ total_steps (int): Total number of steps for training (warmup + decay).
74
+ min_lr (float): Minimum learning rate after decay.
75
+ last_epoch (int): The index of last epoch. Default: -1.
76
+ """
77
+
78
+ def __init__(self, optimizer, warmup_steps, total_steps, min_lr=0.0, last_epoch=-1):
79
+ if total_steps <= warmup_steps:
80
+ raise ValueError(
81
+ "Total steps must be larger than warmup_steps for decay to happen."
82
+ )
83
+ self.warmup_steps = warmup_steps
84
+ self.total_steps = total_steps
85
+ self.min_lr = min_lr
86
+ super(LinearWarmupDecayLR, self).__init__(optimizer, last_epoch)
87
+
88
+ def get_lr(self):
89
+ """Compute learning rate using linear warmup and then linear decay."""
90
+ # Note: self.last_epoch is incremented by the base _LRScheduler.step() before calling get_lr().
91
+ if self.last_epoch < self.warmup_steps:
92
+ # Warmup phase: increase linearly from 0 (or a small value) to base_lr.
93
+ return [
94
+ base_lr * float(self.last_epoch + 1) / float(self.warmup_steps)
95
+ for base_lr in self.base_lrs
96
+ ]
97
+ else:
98
+ # Decay phase: decrease linearly from base_lr to min_lr.
99
+ progress = float(self.last_epoch - self.warmup_steps) / float(self.total_steps - self.warmup_steps)
100
+ return [
101
+ max(base_lr * (1.0 - progress) + self.min_lr * progress, self.min_lr)
102
+ for base_lr in self.base_lrs
103
+ ]
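
A minimal sketch of how get_optimizer and the scheduler above might be driven; the model and hyperparameter values are placeholders.

optimizer, scheduler = get_optimizer(
    'adamw', model, lr=1e-4, betas=(0.9, 0.98), weight_decay=0.01,
    warmup_steps=1000, max_steps=100000, min_lr=1e-6)
# inside the training loop, after loss.backward():
optimizer.step()
if scheduler is not None:  # the 'adamwsf' variant returns no scheduler
    scheduler.step()
optimizer.zero_grad()
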
rift_svc/rf.py ADDED
@@ -0,0 +1,215 @@
1
+ from typing import Union, List, Literal
2
+ from jaxtyping import Bool
3
+ import torch
4
+ from torch import nn
5
+ import torch.nn.functional as F
6
+ import math
7
+ from torchdiffeq import odeint
8
+
9
+ from einops import rearrange
10
+
11
+ from rift_svc.utils import (
12
+ exists,
13
+ lens_to_mask,
14
+ )
15
+
16
+
17
+ def sample_time(time_schedule: Literal['uniform', 'lognorm'], size: int, device: torch.device):
18
+ if time_schedule == 'uniform':
19
+ t = torch.rand((size,), device=device)
20
+ elif time_schedule == 'lognorm':
21
+ # stratified sampling of normals
22
+ # first stratified sample from uniform
23
+ quantiles = torch.linspace(0, 1, size + 1).to(device)
24
+ z = quantiles[:-1] + torch.rand((size,)).to(device) / size
25
+ # now transform to normal
26
+ z = torch.erfinv(2 * z - 1) * math.sqrt(2)
27
+ t = torch.sigmoid(z)
28
+ return t
29
+
30
+
31
+ class RF(nn.Module):
32
+ def __init__(
33
+ self,
34
+ transformer: nn.Module,
35
+ time_schedule: Literal['uniform', 'lognorm'] = 'lognorm',
36
+ odeint_kwargs: dict = dict(
37
+ method='euler'
38
+ ),
39
+ ):
40
+ super().__init__()
41
+
42
+ self.transformer = transformer
43
+ dim = transformer.dim
44
+ self.dim = dim
45
+
46
+ # Sampling related parameters
47
+ self.odeint_kwargs = odeint_kwargs
48
+ self.time_schedule = time_schedule
49
+
50
+ self.mel_min = -12
51
+ self.mel_max = 2
52
+
53
+
54
+ @property
55
+ def device(self):
56
+ return next(self.parameters()).device
57
+
58
+ @torch.no_grad()
59
+ def sample(
60
+ self,
61
+ src_mel: torch.Tensor, # [b n d]
62
+ spk_id: torch.Tensor, # [b]
63
+ f0: torch.Tensor, # [b n]
64
+ rms: torch.Tensor, # [b n]
65
+ cvec: torch.Tensor, # [b n d]
66
+ frame_len: torch.Tensor | None = None, # [b]
67
+ steps: int = 32,
68
+ bad_cvec: torch.Tensor | None = None,
69
+ ds_cfg_strength: float = 0.0,
70
+ spk_cfg_strength: float = 0.0,
71
+ skip_cfg_strength: float = 0.0,
72
+ cfg_skip_layers: Union[int, List[int], None] = None,
73
+ cfg_rescale: float = 0.7,
74
+ ):
75
+ self.eval()
76
+
77
+ batch, mel_seq_len, num_mel_channels = src_mel.shape
78
+ device = src_mel.device
79
+
80
+ if not exists(frame_len):
81
+ frame_len = torch.full((batch,), mel_seq_len, device=device)
82
+
83
+ mask = lens_to_mask(frame_len)
84
+
85
+ # Define the ODE function
86
+ def fn(t, x):
87
+ pred = self.transformer(
88
+ x=x,
89
+ spk=spk_id,
90
+ f0=f0,
91
+ rms=rms,
92
+ cvec=cvec,
93
+ time=t,
94
+ mask=mask
95
+ )
96
+ cfg_flag = (ds_cfg_strength > 1e-5) or (skip_cfg_strength > 1e-5) or (spk_cfg_strength > 1e-5)
97
+ if cfg_rescale > 1e-5 and cfg_flag:
98
+ std_pred = pred.std()
99
+
100
+ if ds_cfg_strength > 1e-5:
101
+ assert exists(bad_cvec), "bad_cvec is required when cfg_strength is greater than 0"
102
+ bad_cvec_pred = self.transformer(
103
+ x=x,
104
+ spk=spk_id,
105
+ f0=f0,
106
+ rms=rms,
107
+ cvec=bad_cvec,
108
+ time=t,
109
+ mask=mask,
110
+ skip_layers=cfg_skip_layers
111
+ )
112
+
113
+ pred = pred + (pred - bad_cvec_pred) * ds_cfg_strength
114
+
115
+ if skip_cfg_strength > 1e-5:
116
+ skip_pred = self.transformer(
117
+ x=x,
118
+ spk=spk_id,
119
+ f0=f0,
120
+ rms=rms,
121
+ cvec=cvec,
122
+ time=t,
123
+ mask=mask,
124
+ skip_layers=cfg_skip_layers
125
+ )
126
+
127
+ pred = pred + (pred - skip_pred) * skip_cfg_strength
128
+
129
+ if spk_cfg_strength > 1e-5:
130
+ null_spk_pred = self.transformer(
131
+ x=x,
132
+ spk=spk_id,
133
+ f0=f0,
134
+ rms=rms,
135
+ cvec=cvec,
136
+ time=t,
137
+ mask=mask,
138
+ drop_speaker=True
139
+ )
140
+
141
+ pred = pred + (pred - null_spk_pred) * spk_cfg_strength
142
+
143
+ if cfg_rescale > 1e-5 and cfg_flag:
144
+ std_cfg = pred.std()
145
+ pred_rescaled = pred * (std_pred / std_cfg)
146
+ pred = cfg_rescale * pred_rescaled + (1 - cfg_rescale) * pred
147
+
148
+ return pred
149
+
150
+ # Noise input
151
+ y0 = torch.randn(batch, mel_seq_len, num_mel_channels, device=self.device)
152
+ # mask out the padded tokens
153
+ y0 = y0.masked_fill(~mask.unsqueeze(-1), 0)
154
+
155
+ t_start = 0
156
+ t = torch.linspace(t_start, 1, steps, device=self.device)
157
+
158
+ trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
159
+
160
+ sampled = trajectory[-1]
161
+ out = self.denorm_mel(sampled)
162
+ out = torch.where(mask.unsqueeze(-1), out, src_mel)
163
+
164
+ return out, trajectory
165
+
166
+ def forward(
167
+ self,
168
+ mel: torch.Tensor, # mel
169
+ spk_id: torch.Tensor, # [b]
170
+ f0: torch.Tensor, # [b n]
171
+ rms: torch.Tensor, # [b n]
172
+ cvec: torch.Tensor, # [b n d]
173
+ frame_len: torch.Tensor | None = None,
174
+ drop_speaker: Union[bool, Bool[torch.Tensor, "b"]] = False,
175
+ ):
176
+ batch, seq_len, dtype, device = *mel.shape[:2], mel.dtype, self.device
177
+
178
+ # Handle lengths and masks
179
+ if not exists(frame_len):
180
+ frame_len = torch.full((batch,), seq_len, device=device)
181
+
182
+ mask = lens_to_mask(frame_len, length=seq_len) # Typically padded to max length in batch
183
+
184
+ x1 = self.norm_mel(mel)
185
+ x0 = torch.randn_like(x1)
186
+
187
+ # uniform time steps sampling
188
+ time = sample_time(self.time_schedule, batch, self.device)
189
+
190
+ t = rearrange(time, 'b -> b 1 1')
191
+ xt = (1 - t) * x0 + t * x1
192
+ flow = x1 - x0
193
+
194
+ pred = self.transformer(
195
+ x=xt,
196
+ spk=spk_id,
197
+ f0=f0,
198
+ rms=rms,
199
+ cvec=cvec,
200
+ time=time,
201
+ drop_speaker=drop_speaker,
202
+ mask=mask
203
+ )
204
+
205
+ # Flow matching loss
206
+ loss = F.mse_loss(pred, flow, reduction='none')
207
+ loss = loss[mask]
208
+
209
+ return loss.mean(), pred
210
+
211
+ def norm_mel(self, mel: torch.Tensor):
212
+ return (mel - self.mel_min) / (self.mel_max - self.mel_min) * 2 - 1
213
+
214
+ def denorm_mel(self, mel: torch.Tensor):
215
+ return (mel + 1) / 2 * (self.mel_max - self.mel_min) + self.mel_min
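
A compact sketch of how the RF wrapper above is used for training and inference. All tensors are placeholders with the shapes noted in the signatures, and transformer stands for the conditioned DiT backbone this class expects.

rf = RF(transformer)

# training: flow-matching loss between noise and the normalized target mel
loss, _ = rf(mel, spk_id=spk_id, f0=f0, rms=rms, cvec=cvec, frame_len=frame_len)
loss.backward()

# inference: integrate the ODE from noise, conditioned on the source features
gen_mel, _ = rf.sample(src_mel, spk_id, f0, rms, cvec, steps=32)
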
rift_svc/rmvpe/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .constants import *
2
+ from .model import E2E, E2E0
3
+ from .utils import to_local_average_f0, to_viterbi_f0
4
+ from .inference import RMVPE
5
+ from .spec import MelSpectrogram
rift_svc/rmvpe/constants.py ADDED
@@ -0,0 +1,9 @@
1
+ SAMPLE_RATE = 16000
2
+
3
+ N_CLASS = 360
4
+
5
+ N_MELS = 128
6
+ MEL_FMIN = 30
7
+ MEL_FMAX = 8000
8
+ WINDOW_LENGTH = 1024
9
+ CONST = 1997.3794084376191
rift_svc/rmvpe/deepunet.py ADDED
@@ -0,0 +1,189 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from .constants import N_MELS
4
+
5
+
6
+ class ConvBlockRes(nn.Module):
7
+ def __init__(self, in_channels, out_channels, momentum=0.01):
8
+ super(ConvBlockRes, self).__init__()
9
+ self.conv = nn.Sequential(
10
+ nn.Conv2d(in_channels=in_channels,
11
+ out_channels=out_channels,
12
+ kernel_size=(3, 3),
13
+ stride=(1, 1),
14
+ padding=(1, 1),
15
+ bias=False),
16
+ nn.BatchNorm2d(out_channels, momentum=momentum),
17
+ nn.ReLU(),
18
+
19
+ nn.Conv2d(in_channels=out_channels,
20
+ out_channels=out_channels,
21
+ kernel_size=(3, 3),
22
+ stride=(1, 1),
23
+ padding=(1, 1),
24
+ bias=False),
25
+ nn.BatchNorm2d(out_channels, momentum=momentum),
26
+ nn.ReLU(),
27
+ )
28
+ if in_channels != out_channels:
29
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
30
+ self.is_shortcut = True
31
+ else:
32
+ self.is_shortcut = False
33
+
34
+ def forward(self, x):
35
+ if self.is_shortcut:
36
+ return self.conv(x) + self.shortcut(x)
37
+ else:
38
+ return self.conv(x) + x
39
+
40
+
41
+ class ResEncoderBlock(nn.Module):
42
+ def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
43
+ super(ResEncoderBlock, self).__init__()
44
+ self.n_blocks = n_blocks
45
+ self.conv = nn.ModuleList()
46
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
47
+ for i in range(n_blocks - 1):
48
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
49
+ self.kernel_size = kernel_size
50
+ if self.kernel_size is not None:
51
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
52
+
53
+ def forward(self, x):
54
+ for i in range(self.n_blocks):
55
+ x = self.conv[i](x)
56
+ if self.kernel_size is not None:
57
+ return x, self.pool(x)
58
+ else:
59
+ return x
60
+
61
+
62
+ class ResDecoderBlock(nn.Module):
63
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
64
+ super(ResDecoderBlock, self).__init__()
65
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
66
+ self.n_blocks = n_blocks
67
+ self.conv1 = nn.Sequential(
68
+ nn.ConvTranspose2d(in_channels=in_channels,
69
+ out_channels=out_channels,
70
+ kernel_size=(3, 3),
71
+ stride=stride,
72
+ padding=(1, 1),
73
+ output_padding=out_padding,
74
+ bias=False),
75
+ nn.BatchNorm2d(out_channels, momentum=momentum),
76
+ nn.ReLU(),
77
+ )
78
+ self.conv2 = nn.ModuleList()
79
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
80
+ for i in range(n_blocks-1):
81
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
82
+
83
+ def forward(self, x, concat_tensor):
84
+ x = self.conv1(x)
85
+ x = torch.cat((x, concat_tensor), dim=1)
86
+ for i in range(self.n_blocks):
87
+ x = self.conv2[i](x)
88
+ return x
89
+
90
+
91
+ class Encoder(nn.Module):
92
+ def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
93
+ super(Encoder, self).__init__()
94
+ self.n_encoders = n_encoders
95
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
96
+ self.layers = nn.ModuleList()
97
+ self.latent_channels = []
98
+ for i in range(self.n_encoders):
99
+ self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum))
100
+ self.latent_channels.append([out_channels, in_size])
101
+ in_channels = out_channels
102
+ out_channels *= 2
103
+ in_size //= 2
104
+ self.out_size = in_size
105
+ self.out_channel = out_channels
106
+
107
+ def forward(self, x):
108
+ concat_tensors = []
109
+ x = self.bn(x)
110
+ for i in range(self.n_encoders):
111
+ _, x = self.layers[i](x)
112
+ concat_tensors.append(_)
113
+ return x, concat_tensors
114
+
115
+
116
+ class Intermediate(nn.Module):
117
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
118
+ super(Intermediate, self).__init__()
119
+ self.n_inters = n_inters
120
+ self.layers = nn.ModuleList()
121
+ self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum))
122
+ for i in range(self.n_inters-1):
123
+ self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum))
124
+
125
+ def forward(self, x):
126
+ for i in range(self.n_inters):
127
+ x = self.layers[i](x)
128
+ return x
129
+
130
+
131
+ class Decoder(nn.Module):
132
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
133
+ super(Decoder, self).__init__()
134
+ self.layers = nn.ModuleList()
135
+ self.n_decoders = n_decoders
136
+ for i in range(self.n_decoders):
137
+ out_channels = in_channels // 2
138
+ self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum))
139
+ in_channels = out_channels
140
+
141
+ def forward(self, x, concat_tensors):
142
+ for i in range(self.n_decoders):
143
+ x = self.layers[i](x, concat_tensors[-1-i])
144
+ return x
145
+
146
+
147
+ class TimbreFilter(nn.Module):
148
+ def __init__(self, latent_rep_channels):
149
+ super(TimbreFilter, self).__init__()
150
+ self.layers = nn.ModuleList()
151
+ for latent_rep in latent_rep_channels:
152
+ self.layers.append(ConvBlockRes(latent_rep[0], latent_rep[0]))
153
+
154
+ def forward(self, x_tensors):
155
+ out_tensors = []
156
+ for i, layer in enumerate(self.layers):
157
+ out_tensors.append(layer(x_tensors[i]))
158
+ return out_tensors
159
+
160
+
161
+ class DeepUnet(nn.Module):
162
+ def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
163
+ super(DeepUnet, self).__init__()
164
+ self.encoder = Encoder(in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels)
165
+ self.intermediate = Intermediate(self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks)
166
+ self.tf = TimbreFilter(self.encoder.latent_channels)
167
+ self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks)
168
+
169
+ def forward(self, x):
170
+ x, concat_tensors = self.encoder(x)
171
+ x = self.intermediate(x)
172
+ concat_tensors = self.tf(concat_tensors)
173
+ x = self.decoder(x, concat_tensors)
174
+ return x
175
+
176
+
177
+ class DeepUnet0(nn.Module):
178
+ def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
179
+ super(DeepUnet0, self).__init__()
180
+ self.encoder = Encoder(in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels)
181
+ self.intermediate = Intermediate(self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks)
182
+ self.tf = TimbreFilter(self.encoder.latent_channels)
183
+ self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks)
184
+
185
+ def forward(self, x):
186
+ x, concat_tensors = self.encoder(x)
187
+ x = self.intermediate(x)
188
+ x = self.decoder(x, concat_tensors)
189
+ return x
rift_svc/rmvpe/inference.py ADDED
@@ -0,0 +1,51 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torchaudio.transforms import Resample
5
+ from .constants import *
6
+ from .model import E2E0, E2E
7
+ from .spec import MelSpectrogram
8
+ from .utils import to_local_average_f0, to_viterbi_f0
9
+
10
+ class RMVPE:
11
+ def __init__(self, model_path, hop_length=160, device='cpu'):
12
+ self.resample_kernel = {}
13
+ model = E2E0(4, 1, (2, 2))
14
+ ckpt = torch.load(model_path, weights_only=True)
15
+ model.load_state_dict(ckpt['model'], strict=False)
16
+ model.eval()
17
+ self.model = model
18
+ self.mel_extractor = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX)
19
+ self.resample_kernel = {}
20
+ self.model = self.model.to(device)
21
+ self.mel_extractor = self.mel_extractor.to(device)
22
+
23
+ def mel2hidden(self, mel):
24
+ with torch.no_grad():
25
+ n_frames = mel.shape[-1]
26
+ mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
27
+ hidden = self.model(mel)
28
+ return hidden[:, :n_frames]
29
+
30
+ def decode(self, hidden, thred=0.03, use_viterbi=False):
31
+ if use_viterbi:
32
+ f0 = to_viterbi_f0(hidden, thred=thred)
33
+ else:
34
+ f0 = to_local_average_f0(hidden, thred=thred)
35
+ return f0
36
+
37
+ def infer_from_audio(self, audio, sample_rate=16000, device=None, thred=0.03, use_viterbi=False):
38
+ #audio = torch.from_numpy(audio).float().unsqueeze(0).to(device)
39
+ if sample_rate == 16000:
40
+ audio_res = audio
41
+ else:
42
+ key_str = str(sample_rate)
43
+ if key_str not in self.resample_kernel:
44
+ self.resample_kernel[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
45
+ self.resample_kernel[key_str] = self.resample_kernel[key_str].to(device)
46
+ audio_res = self.resample_kernel[key_str](audio)
47
+
48
+ mel = self.mel_extractor(audio_res, center=True)
49
+ hidden = self.mel2hidden(mel)
50
+ f0 = self.decode(hidden, thred=thred, use_viterbi=use_viterbi)
51
+ return f0
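
A short usage sketch for the RMVPE wrapper above. The checkpoint path is an example, and wav is assumed to be a float32 numpy waveform.

rmvpe = RMVPE('pretrained/rmvpe/model.pt', hop_length=160, device='cuda')
audio = torch.from_numpy(wav).float().unsqueeze(0).to('cuda')  # [1, n_samples]
f0 = rmvpe.infer_from_audio(audio, sample_rate=44100, device='cuda', thred=0.03)  # per-frame f0 (numpy)
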
rift_svc/rmvpe/model.py ADDED
@@ -0,0 +1,60 @@
1
+ import torch
2
+ from torch import nn
3
+ from .deepunet import DeepUnet, DeepUnet0
4
+ from .constants import *
5
+ from .spec import MelSpectrogram
6
+ from .seq import BiGRU
7
+
8
+
9
+ class E2E(nn.Module):
10
+ def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1,
11
+ en_out_channels=16):
12
+ super(E2E, self).__init__()
13
+ self.unet = DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
14
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
15
+ if n_gru:
16
+ self.fc = nn.Sequential(
17
+ BiGRU(3 * N_MELS, 256, n_gru),
18
+ nn.Linear(512, N_CLASS),
19
+ nn.Dropout(0.25),
20
+ nn.Sigmoid()
21
+ )
22
+ else:
23
+ self.fc = nn.Sequential(
24
+ nn.Linear(3 * N_MELS, N_CLASS),
25
+ nn.Dropout(0.25),
26
+ nn.Sigmoid()
27
+ )
28
+
29
+ def forward(self, mel):
30
+ mel = mel.transpose(-1, -2).unsqueeze(1)
31
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
32
+ x = self.fc(x)
33
+ return x
34
+
35
+
36
+ class E2E0(nn.Module):
37
+ def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1,
38
+ en_out_channels=16):
39
+ super(E2E0, self).__init__()
40
+ self.unet = DeepUnet0(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
41
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
42
+ if n_gru:
43
+ self.fc = nn.Sequential(
44
+ BiGRU(3 * N_MELS, 256, n_gru),
45
+ nn.Linear(512, N_CLASS),
46
+ nn.Dropout(0.25),
47
+ nn.Sigmoid()
48
+ )
49
+ else:
50
+ self.fc = nn.Sequential(
51
+ nn.Linear(3 * N_MELS, N_CLASS),
52
+ nn.Dropout(0.25),
53
+ nn.Sigmoid()
54
+ )
55
+
56
+ def forward(self, mel):
57
+ mel = mel.transpose(-1, -2).unsqueeze(1)
58
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
59
+ x = self.fc(x)
60
+ return x
rift_svc/rmvpe/seq.py ADDED
@@ -0,0 +1,20 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ class BiGRU(nn.Module):
5
+ def __init__(self, input_features, hidden_features, num_layers):
6
+ super(BiGRU, self).__init__()
7
+ self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True)
8
+
9
+ def forward(self, x):
10
+ return self.gru(x)[0]
11
+
12
+
13
+ class BiLSTM(nn.Module):
14
+ def __init__(self, input_features, hidden_features, num_layers):
15
+ super(BiLSTM, self).__init__()
16
+ self.lstm = nn.LSTM(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True)
17
+
18
+ def forward(self, x):
19
+ return self.lstm(x)[0]
20
+
rift_svc/rmvpe/spec.py ADDED
@@ -0,0 +1,66 @@
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn.functional as F
4
+ from librosa.filters import mel
5
+
6
+ class MelSpectrogram(torch.nn.Module):
7
+ def __init__(
8
+ self,
9
+ n_mel_channels,
10
+ sampling_rate,
11
+ win_length,
12
+ hop_length,
13
+ n_fft=None,
14
+ mel_fmin=0,
15
+ mel_fmax=None,
16
+ clamp = 1e-5
17
+ ):
18
+ super().__init__()
19
+ n_fft = win_length if n_fft is None else n_fft
20
+ self.hann_window = {}
21
+ mel_basis = mel(
22
+ sr=sampling_rate,
23
+ n_fft=n_fft,
24
+ n_mels=n_mel_channels,
25
+ fmin=mel_fmin,
26
+ fmax=mel_fmax,
27
+ htk=True)
28
+ mel_basis = torch.from_numpy(mel_basis).float()
29
+ self.register_buffer("mel_basis", mel_basis)
30
+ self.n_fft = win_length if n_fft is None else n_fft
31
+ self.hop_length = hop_length
32
+ self.win_length = win_length
33
+ self.sampling_rate = sampling_rate
34
+ self.n_mel_channels = n_mel_channels
35
+ self.clamp = clamp
36
+
37
+ def forward(self, audio, keyshift=0, speed=1, center=True):
38
+ factor = 2 ** (keyshift / 12)
39
+ n_fft_new = int(np.round(self.n_fft * factor))
40
+ win_length_new = int(np.round(self.win_length * factor))
41
+ hop_length_new = int(np.round(self.hop_length * speed))
42
+
43
+ keyshift_key = str(keyshift)+'_'+str(audio.device)
44
+ if keyshift_key not in self.hann_window:
45
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device)
46
+
47
+ fft = torch.stft(
48
+ audio,
49
+ n_fft=n_fft_new,
50
+ hop_length=hop_length_new,
51
+ win_length=win_length_new,
52
+ window=self.hann_window[keyshift_key],
53
+ center=center,
54
+ return_complex=True)
55
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
56
+
57
+ if keyshift != 0:
58
+ size = self.n_fft // 2 + 1
59
+ resize = magnitude.size(1)
60
+ if resize < size:
61
+ magnitude = F.pad(magnitude, (0, 0, 0, size-resize))
62
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
63
+
64
+ mel_output = torch.matmul(self.mel_basis, magnitude)
65
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
66
+ return log_mel_spec
rift_svc/rmvpe/utils.py ADDED
@@ -0,0 +1,142 @@
1
+ import sys
2
+ import numpy as np
3
+ import librosa
4
+ import torch
5
+ from functools import reduce
6
+ from .constants import *
7
+ from torch.nn.modules.module import _addindent
8
+
9
+
10
+ def cycle(iterable):
11
+ while True:
12
+ for item in iterable:
13
+ yield item
14
+
15
+
16
+ def summary(model, file=sys.stdout):
17
+ def repr(model):
18
+ # We treat the extra repr like the sub-module, one item per line
19
+ extra_lines = []
20
+ extra_repr = model.extra_repr()
21
+ # empty string will be split into list ['']
22
+ if extra_repr:
23
+ extra_lines = extra_repr.split('\n')
24
+ child_lines = []
25
+ total_params = 0
26
+ for key, module in model._modules.items():
27
+ mod_str, num_params = repr(module)
28
+ mod_str = _addindent(mod_str, 2)
29
+ child_lines.append('(' + key + '): ' + mod_str)
30
+ total_params += num_params
31
+ lines = extra_lines + child_lines
32
+
33
+ for name, p in model._parameters.items():
34
+ if hasattr(p, 'shape'):
35
+ total_params += reduce(lambda x, y: x * y, p.shape)
36
+
37
+ main_str = model._get_name() + '('
38
+ if lines:
39
+ # simple one-liner info, which most builtin Modules will use
40
+ if len(extra_lines) == 1 and not child_lines:
41
+ main_str += extra_lines[0]
42
+ else:
43
+ main_str += '\n ' + '\n '.join(lines) + '\n'
44
+
45
+ main_str += ')'
46
+ if file is sys.stdout:
47
+ main_str += ', \033[92m{:,}\033[0m params'.format(total_params)
48
+ else:
49
+ main_str += ', {:,} params'.format(total_params)
50
+ return main_str, total_params
51
+
52
+ string, count = repr(model)
53
+ if file is not None:
54
+ if isinstance(file, str):
55
+ file = open(file, 'w')
56
+ print(string, file=file)
57
+ file.flush()
58
+
59
+ return count
60
+
61
+
62
+ def to_local_average_cents(salience, center=None, thred=0.03):
63
+ """
64
+ find the weighted average cents near the argmax bin
65
+ """
66
+
67
+ if not hasattr(to_local_average_cents, 'cents_mapping'):
68
+ # the bin number-to-cents mapping
69
+ to_local_average_cents.cents_mapping = (
70
+ 20 * np.arange(N_CLASS) + CONST)
71
+
72
+ if salience.ndim == 1:
73
+ if center is None:
74
+ center = int(np.argmax(salience))
75
+ start = max(0, center - 4)
76
+ end = min(len(salience), center + 5)
77
+ salience = salience[start:end]
78
+ product_sum = np.sum(
79
+ salience * to_local_average_cents.cents_mapping[start:end])
80
+ weight_sum = np.sum(salience)
81
+ return product_sum / weight_sum if np.max(salience) > thred else 0
82
+ if salience.ndim == 2:
83
+ return np.array([to_local_average_cents(salience[i, :], None, thred) for i in
84
+ range(salience.shape[0])])
85
+
86
+ raise Exception("label should be either 1d or 2d ndarray")
87
+
88
+ def to_viterbi_cents(salience, thred=0.03):
89
+ # Create viterbi transition matrix
90
+ if not hasattr(to_viterbi_cents, 'transition'):
91
+ xx, yy = np.meshgrid(range(N_CLASS), range(N_CLASS))
92
+ transition = np.maximum(30 - abs(xx - yy), 0)
93
+ transition = transition / transition.sum(axis=1, keepdims=True)
94
+ to_viterbi_cents.transition = transition
95
+
96
+ # Convert to probability
97
+ prob = salience.T
98
+ prob = prob / prob.sum(axis=0)
99
+
100
+ # Perform viterbi decoding
101
+ path = librosa.sequence.viterbi(prob, to_viterbi_cents.transition).astype(np.int64)
102
+
103
+ return np.array([to_local_average_cents(salience[i, :], path[i], thred) for i in
104
+ range(len(path))])
105
+
106
+ def to_local_average_f0(hidden, center=None, thred=0.03):
107
+ idx = torch.arange(N_CLASS, device=hidden.device)[None, None, :] # [B=1, T=1, N]
108
+ idx_cents = idx * 20 + CONST # [B=1, N]
109
+ if center is None:
110
+ center = torch.argmax(hidden, dim=2, keepdim=True) # [B, T, 1]
111
+ start = torch.clip(center - 4, min=0) # [B, T, 1]
112
+ end = torch.clip(center + 5, max=N_CLASS) # [B, T, 1]
113
+ idx_mask = (idx >= start) & (idx < end) # [B, T, N]
114
+ weights = hidden * idx_mask # [B, T, N]
115
+ product_sum = torch.sum(weights * idx_cents, dim=2) # [B, T]
116
+ weight_sum = torch.sum(weights, dim=2) # [B, T]
117
+ cents = product_sum / (weight_sum + (weight_sum == 0)) # avoid dividing by zero, [B, T]
118
+ f0 = 10 * 2 ** (cents / 1200)
119
+ uv = hidden.max(dim=2)[0] < thred # [B, T]
120
+ f0 = f0 * ~uv
121
+ return f0.squeeze(0).cpu().numpy()
122
+
123
+ def to_viterbi_f0(hidden, thred=0.03):
124
+ # Create viterbi transition matrix
125
+ if not hasattr(to_viterbi_cents, 'transition'):
126
+ xx, yy = np.meshgrid(range(N_CLASS), range(N_CLASS))
127
+ transition = np.maximum(30 - abs(xx - yy), 0)
128
+ transition = transition / transition.sum(axis=1, keepdims=True)
129
+ to_viterbi_cents.transition = transition
130
+
131
+ # Convert to probability
132
+ prob = hidden.squeeze(0).cpu().numpy()
133
+ prob = prob.T
134
+ prob = prob / prob.sum(axis=0)
135
+
136
+ # Perform viterbi decoding
137
+ path = librosa.sequence.viterbi(prob, to_viterbi_cents.transition).astype(np.int64)
138
+ center = torch.from_numpy(path).unsqueeze(0).unsqueeze(-1).to(hidden.device)
139
+
140
+ return to_local_average_f0(hidden, center=center, thred=thred)
141
+
142
+
rift_svc/utils.py ADDED
@@ -0,0 +1,364 @@
1
+ import io
2
+ import os
3
+ import random
4
+ import time
5
+ from typing import Any
6
+
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from jaxtyping import Bool, Int
12
+ from PIL import Image
13
+ from pytorch_lightning.callbacks import TQDMProgressBar
14
+ import parselmouth as pm
15
+ import librosa
16
+ import pyworld as pw
17
+
18
+
19
+ def seed_everything(seed: int = 0):
20
+ random.seed(seed)
21
+ os.environ['PYTHONHASHSEED'] = str(seed)
22
+ torch.manual_seed(seed)
23
+ torch.cuda.manual_seed(seed)
24
+ torch.cuda.manual_seed_all(seed)
25
+ torch.backends.cudnn.deterministic = True
26
+ torch.backends.cudnn.benchmark = False
27
+
28
+ # helpers
29
+
30
+ def exists(v: Any) -> bool:
31
+ return v is not None
32
+
33
+ def default(v: Any, d: Any) -> Any:
34
+ return v if exists(v) else d
35
+
36
+
37
+ def draw_mel_specs(gt: np.ndarray, gen: np.ndarray, diff: np.ndarray, cache_path: str):
38
+ vmin = min(gt.min(), gen.min())
39
+ vmax = max(gt.max(), gen.max())
40
+
41
+ # Create figure with space for colorbar
42
+ fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(20, 15), sharex=True, gridspec_kw={'hspace': 0})
43
+
44
+ # Plot all spectrograms with the same scale
45
+ im1 = ax1.imshow(gt, origin='lower', aspect='auto', vmin=vmin, vmax=vmax)
46
+ ax1.set_ylabel('GT', fontsize=14)
47
+ ax1.set_xticks([])
48
+
49
+ im2 = ax2.imshow(gen, origin='lower', aspect='auto', vmin=vmin, vmax=vmax)
50
+ ax2.set_ylabel('Gen', fontsize=14)
51
+ ax2.set_xticks([])
52
+
53
+ # Find symmetric limits for difference plot
54
+ diff_abs_max = max(abs(diff.min()), abs(diff.max()))
55
+
56
+ im3 = ax3.imshow(diff, origin='lower', aspect='auto',
57
+ cmap='RdBu_r', # Red-White-Blue colormap (reversed)
58
+ vmin=-diff_abs_max, vmax=diff_abs_max)
59
+ ax3.set_ylabel('Diff', fontsize=14)
60
+
61
+ fig.colorbar(im1, ax=[ax1, ax2], location='right', label='Magnitude')
62
+ fig.colorbar(im3, ax=[ax3], location='right', label='Difference')
63
+
64
+ buf = io.BytesIO()
65
+ plt.savefig(buf, format='png', bbox_inches='tight')
66
+ plt.close()
67
+ buf.seek(0)
68
+
69
+ # Open with PIL and save as compressed JPEG
70
+ img = Image.open(buf)
71
+ img = img.convert('RGB')
72
+ img.save(cache_path, 'JPEG', quality=85, optimize=True)
73
+ buf.close()
74
+
75
+
76
+
77
+ # tensor helpers
78
+
79
+ def lens_to_mask(
80
+ t: Int[torch.Tensor, "b"],
81
+ length: int | None = None
82
+ ) -> Bool[torch.Tensor, "b n"]:
83
+
84
+ if not exists(length):
85
+ length = t.amax()
86
+
87
+ seq = torch.arange(length, device = t.device)
88
+ return seq < t[..., None]
89
+
90
+
91
+ def l2_grad_norm(model: torch.nn.Module):
92
+ return torch.cat([p.grad.data.flatten() for p in model.parameters() if p.grad is not None]).norm(2)
93
+
94
+
95
+ def nearest_interpolate_tensor(tensor, new_size):
96
+ # Add two dummy dimensions to make it [1, 1, n, d]
97
+ tensor = tensor.unsqueeze(0).unsqueeze(0)
98
+
99
+ # Interpolate
100
+ interpolated = F.interpolate(tensor, size=(new_size, tensor.shape[-1]), mode='nearest')
101
+
102
+ # Remove the dummy dimensions
103
+ interpolated = interpolated.squeeze(0).squeeze(0)
104
+
105
+ return interpolated
106
+
107
+
108
+ def linear_interpolate_tensor(tensor, new_size):
109
+ # Assumes input tensor shape is [n, d]
110
+ # Rearrange tensor to shape [1, d, n] to prepare for linear interpolation
111
+ tensor = tensor.transpose(0, 1).unsqueeze(0)
112
+
113
+ # Interpolate along the length dimension (last dimension) using linear interpolation.
114
+ # align_corners=True preserves the boundary values; adjust this flag if needed.
115
+ interpolated = F.interpolate(tensor, size=new_size, mode='linear', align_corners=True)
116
+
117
+ # Restore the tensor to shape [new_size, d]
118
+ return interpolated.squeeze(0).transpose(0, 1)
119
+
120
+
121
+ # f0 helpers
122
+
123
+ def post_process_f0(f0, sample_rate, hop_length, n_frames, silence_front=0.0, cut_last=True):
124
+ """
125
+ Post-process the extracted f0 to align with Mel spectrogram frames.
126
+
127
+ Args:
128
+ f0 (numpy.ndarray): Extracted f0 array.
129
+ sample_rate (int): Sample rate of the audio.
130
+ hop_length (int): Hop length used during processing.
131
+ n_frames (int): Total number of frames (for alignment).
132
+ silence_front (float): Seconds of silence to remove from the front.
133
+
134
+ Returns:
135
+ numpy.ndarray: Processed f0 array aligned with Mel spectrogram frames.
136
+ """
137
+ # Calculate number of frames to skip based on silence_front
138
+ start_frame = int(silence_front * sample_rate / hop_length)
139
+ real_silence_front = start_frame * hop_length / sample_rate
140
+ # Assuming silence_front has been handled during RMVPE inference if needed
141
+
142
+ # Handle unvoiced frames by interpolation
143
+ uv = f0 == 0
144
+ if np.any(~uv):
145
+ f0_interp = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
146
+ f0[uv] = f0_interp
147
+ else:
148
+ # If no voiced frames, set all to zero
149
+ f0 = np.zeros_like(f0)
150
+
151
+ # Align with hop_length frames
152
+ origin_time = 0.01 * np.arange(len(f0)) # Placeholder: Adjust based on RMVPE's timing
153
+ target_time = hop_length / sample_rate * np.arange(n_frames - start_frame)
154
+ f0 = np.interp(target_time, origin_time, f0)
155
+ uv = np.interp(target_time, origin_time, uv.astype(float)) > 0.5
156
+ f0[uv] = 0
157
+
158
+ # Pad the silence_front if needed
159
+ f0 = np.pad(f0, (start_frame, 0), mode='constant')
160
+
161
+ if cut_last:
162
+ return f0[:-1]
163
+ else:
164
+ return f0
165
+
166
+ # pyworld
167
+ def get_f0_pw(audio, sr, time_step, f0_min, f0_max):
168
+ pw_pre_f0, times = pw.dio(
169
+ audio.astype(np.double), sr,
170
+ f0_floor=f0_min, f0_ceil=f0_max,
171
+ frame_period=time_step*1000) # raw pitch extractor
172
+ pw_post_f0 = pw.stonemask(audio.astype(np.double), pw_pre_f0, times, sr) # pitch refinement
173
+ pw_post_f0[pw_post_f0==0] = np.nan
174
+ pw_post_f0 = slide_nanmedian(pw_post_f0, 3)
175
+ return pw_post_f0
176
+
177
+ # parselmouth
178
+ def get_f0_pm(audio, sr, time_step, f0_min, f0_max):
179
+ pmac_pitch = pm.Sound(audio, sampling_frequency=sr).to_pitch_ac(
180
+ time_step=time_step, voicing_threshold=0.6,
181
+ pitch_floor=f0_min, pitch_ceiling=f0_max,
182
+ very_accurate=True, octave_jump_cost=0.5)
183
+ pmac_f0 = pmac_pitch.selected_array['frequency']
184
+ pmac_f0[pmac_f0==0] = np.nan
185
+ pmac_f0 = slide_nanmedian(pmac_f0, 3)
186
+ return pmac_f0
187
+
188
+ from numba import njit
189
+ @njit
190
+ def slide_nanmedian(signals=np.array([]), win_length=3):
191
+ """Filters a sequence, ignoring nan values
192
+
193
+ Arguments
194
+ signals (numpy.ndarray (shape=(time)))
195
+ The signals to filter
196
+ win_length
197
+ The size of the analysis window
198
+
199
+ Returns
200
+ filtered (numpy.ndarray (shape=(time)))
201
+ """
202
+ # Output buffer
203
+ filtered = np.empty_like(signals)
204
+
205
+ # Loop over frames
206
+ for i in range(signals.shape[0]):
207
+
208
+ # Get analysis window bounds
209
+ start = max(0, i - win_length // 2)
210
+ end = min(signals.shape[0], i + win_length // 2 + 1)
211
+
212
+ # Apply filter to window
213
+ filtered[i] = np.nanmedian(signals[start:end])
214
+
215
+ return filtered
216
+
217
+
218
+ def f0_ensemble(rmvpe_f0, pw_f0, pmac_f0):
219
+ trunc_len = len(rmvpe_f0)
220
+ pw_f0 = pw_f0[:trunc_len]
221
+ # pad pmac_f0
222
+ pmac_f0 = np.concatenate(
223
+ [pmac_f0, np.full(len(pw_f0)-len(pmac_f0), np.nan, dtype=pmac_f0.dtype)])
224
+
225
+ stack_f0 = np.stack([pw_f0, pmac_f0, rmvpe_f0], axis=0)
226
+
227
+ meadian_f0 = np.nanmedian(stack_f0, axis=0)
228
+ nan_nums = np.sum(np.isnan(stack_f0), axis=0)
229
+ meadian_f0[nan_nums>=2] = np.nan
230
+
231
+ slide_meadian_f0 = slide_nanmedian(meadian_f0, 41)
232
+
233
+ f0_dev = np.abs(meadian_f0-slide_meadian_f0)
234
+ meadian_f0[f0_dev>96] = slide_meadian_f0[f0_dev>96]
235
+
236
+ nan1_f0_min = np.nanmin(stack_f0[:, nan_nums==1], axis=0)
237
+ nan1_f0_max = np.nanmax(stack_f0[:, nan_nums==1], axis=0)
238
+
239
+ nan1_f0 = np.where(
240
+ np.abs(nan1_f0_min-slide_meadian_f0[nan_nums==1])<np.abs(nan1_f0_max-slide_meadian_f0[nan_nums==1]),
241
+ nan1_f0_min, nan1_f0_max)
242
+ meadian_f0[nan_nums==1] = nan1_f0
243
+
244
+ meadian_f0 = slide_nanmedian(meadian_f0, 3)
245
+ meadian_f0[nan_nums>=2] = np.nan
246
+ meadian_f0[np.isnan(meadian_f0)] = 0
247
+
248
+ return meadian_f0
249
+
250
+
251
+ def f0_ensemble_light(rmvpe_f0, pw_f0, pmac_f0, rms=None, rms_threshold=0.05):
252
+ """
253
+ A lighter version of f0 ensemble that preserves RMVPE's expressiveness.
254
+ Only applies corrections when RMVPE shows abnormalities.
255
+
256
+ Args:
257
+ rmvpe_f0 (numpy.ndarray): F0 from RMVPE
258
+ pw_f0 (numpy.ndarray): F0 from WORLD
259
+ pmac_f0 (numpy.ndarray): F0 from Parselmouth
260
+ rms (numpy.ndarray, optional): RMS energy values, used to detect voiced segments
261
+ rms_threshold (float, optional): Threshold for RMS to consider a segment as voiced
262
+
263
+ Returns:
264
+ numpy.ndarray: Corrected F0 values
265
+ """
266
+ trunc_len = len(rmvpe_f0)
267
+ pw_f0 = pw_f0[:trunc_len]
268
+
269
+ # Pad pmac_f0 if needed
270
+ pmac_f0 = np.concatenate(
271
+ [pmac_f0, np.full(max(0, len(pw_f0)-len(pmac_f0)), np.nan, dtype=pmac_f0.dtype)])
272
+
273
+ # Create a copy of rmvpe_f0 to preserve most of its values
274
+ corrected_f0 = rmvpe_f0.copy()
275
+
276
+ # Stack all F0 values
277
+ stack_f0 = np.stack([pw_f0, pmac_f0, rmvpe_f0], axis=0)
278
+
279
+ # Count non-NaN values for each frame
280
+ valid_count = np.sum(~np.isnan(stack_f0), axis=0)
281
+
282
+ # Identify frames where RMVPE shows zero but other methods detect pitch
283
+ zero_rmvpe_mask = (rmvpe_f0 == 0)
284
+
285
+ # For frames where RMVPE is zero but at least one other method has a valid F0
286
+ # and there's voice activity (if RMS is provided)
287
+ other_methods_valid = ((~np.isnan(pw_f0) & (pw_f0 > 0)) |
288
+ (~np.isnan(pmac_f0) & (pmac_f0 > 0)))
289
+
290
+ correction_mask = zero_rmvpe_mask & other_methods_valid
291
+
292
+ # If RMS is provided, only correct frames with voice activity
293
+ if rms is not None:
294
+ voice_activity = rms > rms_threshold
295
+ correction_mask = correction_mask & voice_activity
296
+
297
+ # For frames needing correction, use median of available values
298
+ if np.any(correction_mask):
299
+ # For each frame needing correction, calculate median of non-NaN values
300
+ for i in np.where(correction_mask)[0]:
301
+ valid_values = stack_f0[:, i][~np.isnan(stack_f0[:, i]) & (stack_f0[:, i] > 0)]
302
+ if len(valid_values) > 0:
303
+ corrected_f0[i] = np.median(valid_values)
304
+
305
+ # Handle any remaining NaN values
306
+ corrected_f0[np.isnan(corrected_f0)] = 0
307
+
308
+ return corrected_f0
309
+
310
+
311
+ # progress bar helper
312
+
313
+ class CustomProgressBar(TQDMProgressBar):
314
+ def __init__(self):
315
+ super().__init__()
316
+ self.start_time = None
317
+ self.step_start_time = None
318
+ self.total_steps = None
319
+
320
+ def on_train_start(self, trainer, pl_module):
321
+ super().on_train_start(trainer, pl_module)
322
+ self.start_time = time.time()
323
+ self.total_steps = trainer.max_steps
324
+
325
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
326
+ super().on_train_batch_end(trainer, pl_module, outputs, batch, batch_idx)
327
+
328
+ current_step = trainer.global_step
329
+ total_steps = self.total_steps
330
+
331
+ # Calculate elapsed time since training started
332
+ elapsed_time = time.time() - self.start_time
333
+
334
+ # Estimate average step time and remaining time
335
+ average_step_time = elapsed_time / current_step if current_step > 0 else 0
336
+ remaining_steps = total_steps - current_step
337
+ remaining_time = average_step_time * remaining_steps if total_steps > 0 else 0
338
+
339
+ # Format times with no leading zeros for hours
340
+ def format_time(seconds):
341
+ hours = int(seconds // 3600)
342
+ minutes = int((seconds % 3600) // 60)
343
+ seconds = int(seconds % 60)
344
+ return f"{hours}:{minutes:02d}:{seconds:02d}"
345
+
346
+ elapsed_time_str = format_time(elapsed_time)
347
+ remaining_time_str = format_time(remaining_time)
348
+
349
+ # Update the progress bar with loss, elapsed time, remaining time, and remaining steps
350
+ self.train_progress_bar.set_postfix({
351
+ "loss": f"{outputs['loss'].item():.4f}",
352
+ "elapsed_time": elapsed_time_str + "/" + remaining_time_str,
353
+ "remaining_steps": str(remaining_steps) + "/" + str(total_steps)
354
+ })
355
+
356
+
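As a quick sanity check on the arithmetic above (hypothetical numbers): with trainer.max_steps = 100000 and global_step = 2000 reached after 1000 s of training, average_step_time is 0.5 s, so the postfix would read elapsed_time as 0:16:40/13:36:40 and remaining_steps as 98000/100000.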
357
+ # state dict helpers
358
+
359
+ def load_state_dict(model, state_dict, strict=False):
360
+ """Load state dict while handling 'model.' prefix"""
361
+ if any(k.startswith('model.') for k in state_dict.keys()):
362
+ # Remove 'model.' prefix
363
+ state_dict = {k.replace('model.', ''): v for k, v in state_dict.items()}
364
+ return model.load_state_dict(state_dict, strict=strict)
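With the load_state_dict helper above in scope, a minimal usage sketch (the Linear layers are placeholders for the real network; the prefixed dict mimics how a Lightning checkpoint looks when the network is wrapped as `self.model`):

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
# Simulate a Lightning-style state dict in which every key carries a leading "model." prefix.
prefixed = {f"model.{k}": v for k, v in net.state_dict().items()}

fresh = nn.Linear(4, 2)
load_state_dict(fresh, prefixed)             # the helper strips the prefix before loading
assert torch.equal(fresh.weight, net.weight)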
slicer.py ADDED
@@ -0,0 +1,252 @@
1
+ import logging
2
+ import warnings
3
+
4
+ import librosa
5
+
6
+ warnings.filterwarnings('ignore')
7
+
8
+ # Configure module-level logging
9
+ logging.basicConfig(level=logging.INFO)
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class Slicer:
14
+ def __init__(self,
15
+ sr: int,
16
+ threshold: float = -30.,
17
+ min_length: int = 3000,
18
+ min_interval: int = 100,
19
+ hop_size: int = 20,
20
+ max_sil_kept: int = 5000):
21
+ if not min_length >= min_interval >= hop_size:
22
+ raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
23
+ if not max_sil_kept >= hop_size:
24
+ raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
25
+ min_interval = sr * min_interval / 1000
26
+ self.sr = sr
27
+ self.threshold = 10 ** (threshold / 20.)
28
+ self.hop_size = round(sr * hop_size / 1000)
29
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
30
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
31
+ self.min_interval = round(min_interval / self.hop_size)
32
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
33
+
34
+
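For orientation (illustrative figures, not part of the original file): with the defaults above at sr = 44100, the -30 dB threshold becomes a linear RMS of 10 ** (-30 / 20) ≈ 0.0316, hop_size = 20 ms is 882 samples per frame, min_interval = 100 ms is 5 frames, min_length = 3000 ms is 150 frames, and max_sil_kept = 5000 ms is 250 frames.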
35
+ def _apply_slice(self, waveform, begin, end):
36
+ if len(waveform.shape) > 1:
37
+ return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
38
+ else:
39
+ return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
40
+
41
+
42
+ def slice(self, waveform):
43
+ if len(waveform.shape) > 1:
44
+ samples = librosa.to_mono(waveform)
45
+ else:
46
+ samples = waveform
47
+ if samples.shape[0] <= self.min_length * self.hop_size:  # min_length is in frames, so convert back to samples
48
+ # Return the entire audio as a single chunk
49
+ return [(0, waveform)]
50
+
51
+ rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
52
+ sil_tags = []
53
+ silence_start = None
54
+ clip_start = 0
55
+ for i, rms in enumerate(rms_list):
56
+ # Keep looping while frame is silent.
57
+ if rms < self.threshold:
58
+ # Record start of silent frames.
59
+ if silence_start is None:
60
+ silence_start = i
61
+ continue
62
+ # Keep looping while frame is not silent and silence start has not been recorded.
63
+ if silence_start is None:
64
+ continue
65
+ # Clear recorded silence start if interval is not enough or clip is too short
66
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
67
+ need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
68
+ if not is_leading_silence and not need_slice_middle:
69
+ silence_start = None
70
+ continue
71
+ # Need slicing. Record the range of silent frames to be removed.
72
+ if i - silence_start <= self.max_sil_kept:
73
+ pos = rms_list[silence_start: i + 1].argmin() + silence_start
74
+ if silence_start == 0:
75
+ sil_tags.append((0, pos))
76
+ else:
77
+ sil_tags.append((pos, pos))
78
+ clip_start = pos
79
+ elif i - silence_start <= self.max_sil_kept * 2:
80
+ pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
81
+ pos += i - self.max_sil_kept
82
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
83
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
84
+ if silence_start == 0:
85
+ sil_tags.append((0, pos_r))
86
+ clip_start = pos_r
87
+ else:
88
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
89
+ clip_start = max(pos_r, pos)
90
+ else:
91
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
92
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
93
+ if silence_start == 0:
94
+ sil_tags.append((0, pos_r))
95
+ else:
96
+ sil_tags.append((pos_l, pos_r))
97
+ clip_start = pos_r
98
+ silence_start = None
99
+
100
+ # Deal with trailing silence.
101
+ total_frames = rms_list.shape[0]
102
+ if silence_start is not None and total_frames - silence_start >= self.min_interval:
103
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
104
+ pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
105
+ sil_tags.append((pos, total_frames + 1))
106
+
107
+ # Apply and return slices.
108
+ if len(sil_tags) == 0:
109
+ # Return the entire audio as a single chunk if no silence detected
110
+ return [(0, waveform)]
111
+
112
+ # Extract non-silence chunks
113
+ non_silence_chunks = []
114
+
115
+ # Add first non-silence chunk if it exists
116
+ if sil_tags[0][0] > 0:
117
+ start_pos = 0
118
+ end_frame = sil_tags[0][0]
119
+ chunk = self._apply_slice(waveform, 0, end_frame)
120
+ non_silence_chunks.append((start_pos, chunk))
121
+
122
+ # Add middle non-silence chunks
123
+ for i in range(1, len(sil_tags)):
124
+ start_frame = sil_tags[i-1][1]
125
+ end_frame = sil_tags[i][0]
126
+ if start_frame < end_frame: # Only add if there's actual non-silence content
127
+ start_pos = start_frame * self.hop_size
128
+ chunk = self._apply_slice(waveform, start_frame, end_frame)
129
+ non_silence_chunks.append((start_pos, chunk))
130
+
131
+ # Add last non-silence chunk if it exists
132
+ if sil_tags[-1][1] * self.hop_size < samples.shape[0]:  # compare against the sample count (len(waveform) is the channel count for stereo input)
133
+ start_frame = sil_tags[-1][1]
134
+ start_pos = start_frame * self.hop_size
135
+ chunk = self._apply_slice(waveform, start_frame, total_frames)
136
+ non_silence_chunks.append((start_pos, chunk))
137
+
138
+ for i, (start_pos, chunk) in enumerate(non_silence_chunks):
139
+ # Calculate start and end times in seconds
140
+ start_time_sec = start_pos / self.sr
141
+ end_time_sec = start_time_sec + (len(chunk) if len(chunk.shape) == 1 else chunk.shape[1]) / self.sr
142
+ duration_sec = end_time_sec - start_time_sec
143
+
144
+ # Format start and end times as mm:ss
145
+ start_min, start_sec = divmod(start_time_sec, 60)
146
+ end_min, end_sec = divmod(end_time_sec, 60)
147
+
148
+ # Log the information
149
+ logger.info(f"Chunk {i}: Start={int(start_min):02d}:{start_sec:05.2f}, End={int(end_min):02d}:{end_sec:05.2f}, Duration={duration_sec:.2f}s")
150
+
151
+ return non_silence_chunks
152
+
153
+
154
+
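A minimal sketch of driving the class directly from Python, assuming the module is importable as slicer and using a synthetic signal instead of a real recording (sample rate and tone layout are arbitrary):

import numpy as np
from slicer import Slicer

sr = 44100
t = lambda sec: np.arange(int(sec * sr)) / sr
# 3 s of a 440 Hz tone, 1.5 s of near-silence, then 3 s of tone again.
audio = np.concatenate([
    0.5 * np.sin(2 * np.pi * 440 * t(3.0)),
    0.0005 * np.random.randn(int(1.5 * sr)),
    0.5 * np.sin(2 * np.pi * 440 * t(3.0)),
]).astype(np.float32)

slicer_obj = Slicer(sr=sr, threshold=-30.0, min_length=2000, min_interval=300, hop_size=20, max_sil_kept=500)
for start_sample, chunk in slicer_obj.slice(audio):
    print(f"chunk at {start_sample / sr:.2f} s, {len(chunk) / sr:.2f} s long")
# Expected: one chunk per tone burst, with the long pause in between removed.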
155
+ def main():
156
+ import os.path
157
+ from argparse import ArgumentParser
158
+ import librosa
159
+ import soundfile
160
+ from pathlib import Path
161
+
162
+ parser = ArgumentParser()
163
+ parser.add_argument('audio', type=str, help='The audio file or directory to be sliced')
164
+ parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
165
+ parser.add_argument('--db_thresh', type=float, required=False, default=-30,
166
+ help='The dB threshold for silence detection')
167
+ parser.add_argument('--min_length', type=int, required=False, default=3000,
168
+ help='The minimum milliseconds required for each sliced audio clip')
169
+ parser.add_argument('--min_interval', type=int, required=False, default=100,
170
+ help='The minimum milliseconds for a silence part to be sliced')
171
+ parser.add_argument('--hop_size', type=int, required=False, default=20,
172
+ help='Frame length in milliseconds')
173
+ parser.add_argument('--max_sil_kept', type=int, required=False, default=5000,
174
+ help='The maximum silence length kept around the sliced clip, presented in milliseconds')
175
+ args = parser.parse_args()
176
+
177
+ # Determine if the input is a file or directory
178
+ audio_path = Path(args.audio)
179
+ is_directory = audio_path.is_dir()
180
+
181
+ # Prepare output directory
182
+ out = args.out
183
+ if out is None:
184
+ if is_directory:
185
+ out = os.path.abspath(args.audio)
186
+ else:
187
+ out = os.path.dirname(os.path.abspath(args.audio))
188
+
189
+ if not os.path.exists(out):
190
+ os.makedirs(out)
191
+
192
+ # Audio file extensions to process
193
+ audio_extensions = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
194
+
195
+ # Process a single file or all files in a directory
196
+ if is_directory:
197
+ logger.info(f"Processing all audio files in directory: {args.audio}")
198
+ audio_files = []
199
+ for ext in audio_extensions:
200
+ audio_files.extend(list(audio_path.glob(f'*{ext}')))
201
+
202
+ if not audio_files:
203
+ logger.warning(f"No audio files found in {args.audio}")
204
+ return
205
+
206
+ logger.info(f"Found {len(audio_files)} audio files to process")
207
+ for audio_file in audio_files:
208
+ process_audio_file(audio_file, out, args)
209
+ else:
210
+ # Process a single audio file
211
+ logger.info(f"Processing single audio file: {args.audio}")
212
+ process_audio_file(audio_path, out, args)
213
+
214
+ def process_audio_file(audio_file, out_dir, args):
215
+ """Process a single audio file with the given parameters"""
216
+ import os.path
217
+ import librosa
218
+ import soundfile
219
+
220
+ try:
221
+ logger.info(f"Loading audio file: {audio_file}")
222
+ audio, sr = librosa.load(str(audio_file), sr=None, mono=False)
223
+
224
+ slicer = Slicer(
225
+ sr=sr,
226
+ threshold=args.db_thresh,
227
+ min_length=args.min_length,
228
+ min_interval=args.min_interval,
229
+ hop_size=args.hop_size,
230
+ max_sil_kept=args.max_sil_kept
231
+ )
232
+
233
+ # Get non-silence chunks with their positions
234
+ chunks_with_pos = slicer.slice(audio)
235
+
236
+ file_basename = os.path.basename(str(audio_file)).rsplit('.', maxsplit=1)[0]
237
+ logger.info(f"Saving {len(chunks_with_pos)} non-silence audio chunks from {file_basename}...")
238
+
239
+ for i, (pos, chunk) in enumerate(chunks_with_pos):
240
+ if len(chunk.shape) > 1:
241
+ chunk = chunk.T
242
+
243
+ output_file = os.path.join(out_dir, f'{file_basename}_{i}_pos_{pos}.wav')
244
+ soundfile.write(output_file, chunk, sr)
245
+
246
+ logger.info(f"Finished processing {audio_file}")
247
+ except Exception as e:
248
+ logger.error(f"Error processing {audio_file}: {str(e)}")
249
+
250
+
251
+ if __name__ == '__main__':
252
+ main()
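For completeness, the entry point above is driven from the command line roughly as follows (paths are placeholders; only the positional audio argument is required, and the remaining flags fall back to the argparse defaults shown above): python slicer.py vocals.wav --out sliced/ --db_thresh -40 --min_interval 300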