import click
import librosa
import numpy as np
import pyloudnorm as pyln
import torch
import torchaudio
from pathlib import Path
from tqdm import tqdm
from torch.amp import autocast

from rift_svc import DiT, RF
from rift_svc.feature_extractors import HubertModelWithFinalProj, RMSExtractor, get_mel_spectrogram
from rift_svc.nsf_hifigan import NsfHifiGAN
from rift_svc.rmvpe import RMVPE
from rift_svc.utils import (
    linear_interpolate_tensor,
    post_process_f0,
    f0_ensemble,
    f0_ensemble_light,
    get_f0_pw,
    get_f0_pm,
)
from slicer import Slicer

torch.set_grad_enabled(False)


def extract_state_dict(ckpt):
    state_dict = ckpt['state_dict']
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('model.'):
            new_k = k.replace('model.', '')
            new_state_dict[new_k] = v
    spk2idx = ckpt['hyper_parameters']['cfg']['spk2idx']
    model_cfg = ckpt['hyper_parameters']['cfg']['model']
    dataset_cfg = ckpt['hyper_parameters']['cfg']['dataset']
    return new_state_dict, spk2idx, model_cfg, dataset_cfg


def load_models(model_path, device, use_fp16=True):
    """Load all required models and return them."""
    click.echo("Loading models...")

    # Load the conversion model
    ckpt = torch.load(model_path, map_location='cpu')
    state_dict, spk2idx, dit_cfg, dataset_cfg = extract_state_dict(ckpt)
    transformer = DiT(num_speaker=len(spk2idx), **dit_cfg)
    svc_model = RF(transformer=transformer)
    svc_model.load_state_dict(state_dict)
    svc_model = svc_model.to(device)

    # Convert to half precision (float16) if specified and not running on CPU
    if use_fp16 and device.type != 'cpu':
        svc_model = svc_model.half()
    svc_model.eval()

    # Load the auxiliary models
    vocoder = NsfHifiGAN('pretrained/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt').to(device)
    rmvpe = RMVPE(model_path="pretrained/rmvpe/model.pt", hop_length=160, device=device)
    hubert = HubertModelWithFinalProj.from_pretrained("pretrained/content-vec-best").to(device)
    rms_extractor = RMSExtractor().to(device)

    # Convert the auxiliary models to half precision as well; the RMVPE model
    # is handled separately as it may have a custom implementation
    if use_fp16 and device.type != 'cpu':
        vocoder = vocoder.half()
        hubert = hubert.half()
        rms_extractor = rms_extractor.half()

    return svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg


def load_audio(file_path, target_sr):
    """Load an audio file, resample to target_sr, and downmix to mono."""
    click.echo("Loading audio...")
    audio, sr = torchaudio.load(file_path)
    if sr != target_sr:
        audio = torchaudio.functional.resample(audio, sr, target_sr)
    if len(audio.shape) > 1:
        audio = audio.mean(dim=0, keepdim=True)
    return audio.numpy().squeeze()


def apply_fade(audio, fade_samples, fade_in=True):
    """Apply a fade in/out using half of a Hanning window."""
    fade_window = np.hanning(fade_samples * 2)
    if fade_in:
        audio[:fade_samples] *= fade_window[:fade_samples]
    else:
        # The fade-out half of the window applies to the tail of the segment
        audio[-fade_samples:] *= fade_window[fade_samples:]
    return audio
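# Note: in `main` below, `apply_fade` fades in the head of each new segment
# while the tail already written to the output buffer is faded out with a
# matching ramp, so the overlap-add at segment seams avoids audible clicks.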
def extract_features(audio_segment, sample_rate, hop_length, rmvpe, hubert, rms_extractor,
                     device, key_shift=0, ds_cfg_strength=0.0, cvec_downsample_rate=2,
                     target_loudness=-18.0, robust_f0=0, use_fp16=True):
    """Extract all required features from an audio segment."""
    # Normalize the input segment to the target loudness
    meter = pyln.Meter(sample_rate)
    original_loudness = meter.integrated_loudness(audio_segment)
    normalized_audio = pyln.normalize.loudness(audio_segment, original_loudness, target_loudness)

    # Handle potential clipping
    max_amp = np.max(np.abs(normalized_audio))
    if max_amp > 1.0:
        normalized_audio = normalized_audio * (0.99 / max_amp)

    audio_tensor = torch.from_numpy(normalized_audio).float().unsqueeze(0).to(device)
    audio_16khz = torch.from_numpy(
        librosa.resample(normalized_audio, orig_sr=sample_rate, target_sr=16000)
    ).float().unsqueeze(0).to(device)

    # Convert to half precision if specified and not running on CPU
    if use_fp16 and device.type != 'cpu':
        audio_tensor = audio_tensor.half()
        audio_16khz = audio_16khz.half()

    # Extract the mel spectrogram
    mel = get_mel_spectrogram(
        audio_tensor,
        sampling_rate=sample_rate,
        n_fft=2048,
        num_mels=128,
        hop_size=512,
        win_size=2048,
        fmin=40,
        fmax=16000
    ).transpose(1, 2)

    # Extract the content vector
    device_type = 'cuda' if device.type == 'cuda' else 'cpu'
    with autocast(device_type=device_type, enabled=use_fp16):
        cvec = hubert(audio_16khz)["last_hidden_state"].squeeze(0)
    cvec = linear_interpolate_tensor(cvec, mel.shape[1])[None, :]

    # Create bad_cvec (downsampled) for classifier-free guidance
    if ds_cfg_strength > 0:
        cvec_ds = cvec.clone()
        # Downsample and then interpolate back, similar to dataset.py
        cvec_ds = cvec_ds[0, ::2, :]  # Take every other frame
        cvec_ds = linear_interpolate_tensor(cvec_ds, cvec_ds.shape[0] // cvec_downsample_rate)
        cvec_ds = linear_interpolate_tensor(cvec_ds, mel.shape[1])[None, :]
    else:
        cvec_ds = None

    # Extract f0
    if robust_f0 > 0:
        # Parameters for F0 extraction
        time_step = hop_length / sample_rate
        f0_min = 40
        f0_max = 1100

        # Extract F0 using multiple methods
        with autocast(device_type=device_type, enabled=use_fp16):
            rmvpe_f0 = rmvpe.infer_from_audio(audio_tensor, sample_rate=sample_rate, device=device)
        rmvpe_f0 = post_process_f0(rmvpe_f0, sample_rate, hop_length, mel.shape[1],
                                   silence_front=0.0, cut_last=False)
        pw_f0 = get_f0_pw(normalized_audio, sample_rate, time_step, f0_min, f0_max)
        pmac_f0 = get_f0_pm(normalized_audio, sample_rate, time_step, f0_min, f0_max)

        if robust_f0 == 1:
            # Level 1: light ensemble that preserves expressiveness
            with autocast(device_type=device_type, enabled=use_fp16):
                rms_np = rms_extractor(audio_tensor).squeeze().cpu().numpy()
            f0 = f0_ensemble_light(rmvpe_f0, pw_f0, pmac_f0, rms=rms_np)
        else:
            # Level 2: strong ensemble with more filtering
            f0 = f0_ensemble(rmvpe_f0, pw_f0, pmac_f0)
    else:
        # Level 0: use only RMVPE for F0 extraction (the original method)
        with autocast(device_type=device_type, enabled=use_fp16):
            f0 = rmvpe.infer_from_audio(audio_tensor, sample_rate=sample_rate, device=device)
        f0 = post_process_f0(f0, sample_rate, hop_length, mel.shape[1],
                             silence_front=0.0, cut_last=False)

    # Apply the pitch shift in semitones
    if key_shift != 0:
        f0 = f0 * 2 ** (key_shift / 12)
    f0 = torch.from_numpy(f0).float().to(device)[None, :]

    # Extract RMS
    rms = rms_extractor(audio_tensor)

    return mel, cvec, cvec_ds, f0, rms, original_loudness
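# All conditioning features returned by `extract_features` are aligned to the
# mel frame grid (mel.shape[1] frames): the content vector is linearly
# interpolated up from HuBERT's ~50 frames/s, and f0/RMS are post-processed to
# the same length. At 44.1 kHz with hop 512, one frame covers about 11.6 ms.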
def run_inference(
    model, mel, cvec, f0, rms, cvec_ds, spk_id, infer_steps,
    ds_cfg_strength, spk_cfg_strength, skip_cfg_strength, cfg_skip_layers,
    cfg_rescale, sliced_inference=False, use_fp16=True, frame_lengths=None
):
    """Run the actual inference through the model."""
    device_type = 'cuda' if mel.device.type == 'cuda' else 'cpu'

    if frame_lengths is not None:
        # Use batch inference with per-sample frame lengths
        with autocast(device_type=device_type, enabled=use_fp16):
            mel_out, _ = model.sample(
                src_mel=mel,
                spk_id=spk_id,
                f0=f0,
                rms=rms,
                cvec=cvec,
                steps=infer_steps,
                bad_cvec=cvec_ds,
                ds_cfg_strength=ds_cfg_strength,
                spk_cfg_strength=spk_cfg_strength,
                skip_cfg_strength=skip_cfg_strength,
                cfg_skip_layers=cfg_skip_layers,
                cfg_rescale=cfg_rescale,
                frame_len=frame_lengths,
            )
        return mel_out
    elif sliced_inference:
        # Use sliced inference for long segments
        sliced_len = 256
        mel_crossfade_len = 8  # Number of frames to crossfade in the mel domain

        # If the segment is shorter than one slice, just process it directly
        if mel.shape[1] <= sliced_len:
            with autocast(device_type=device_type, enabled=use_fp16):
                mel_out, _ = model.sample(
                    src_mel=mel,
                    spk_id=spk_id,
                    f0=f0,
                    rms=rms,
                    cvec=cvec,
                    steps=infer_steps,
                    bad_cvec=cvec_ds,
                    ds_cfg_strength=ds_cfg_strength,
                    spk_cfg_strength=spk_cfg_strength,
                    skip_cfg_strength=skip_cfg_strength,
                    cfg_skip_layers=cfg_skip_layers,
                    cfg_rescale=cfg_rescale,
                )
            return mel_out

        # Create a tensor to hold the full output with crossfading
        full_mel_out = torch.zeros_like(mel)

        # Process each slice
        for i in range(0, mel.shape[1], sliced_len - mel_crossfade_len):
            # Determine slice boundaries
            start_idx = i
            end_idx = min(i + sliced_len, mel.shape[1])

            # Skip if we're at the end
            if start_idx >= mel.shape[1]:
                break

            # Extract slices for this window
            mel_slice = mel[:, start_idx:end_idx, :]
            cvec_slice = cvec[:, start_idx:end_idx, :]
            f0_slice = f0[:, start_idx:end_idx]
            rms_slice = rms[:, start_idx:end_idx]

            # Slice the bad_cvec if it exists
            cvec_ds_slice = None
            if cvec_ds is not None:
                cvec_ds_slice = cvec_ds[:, start_idx:end_idx, :]

            # Process with the model, using mixed precision if enabled
            with autocast(device_type=device_type, enabled=use_fp16):
                mel_out_slice, _ = model.sample(
                    src_mel=mel_slice,
                    spk_id=spk_id,
                    f0=f0_slice,
                    rms=rms_slice,
                    cvec=cvec_slice,
                    steps=infer_steps,
                    bad_cvec=cvec_ds_slice,
                    ds_cfg_strength=ds_cfg_strength,
                    spk_cfg_strength=spk_cfg_strength,
                    skip_cfg_strength=skip_cfg_strength,
                    cfg_skip_layers=cfg_skip_layers,
                    cfg_rescale=cfg_rescale,
                )

            # Create crossfade weights
            slice_len = end_idx - start_idx

            # Apply different strategies depending on position
            if i == 0:  # First slice
                # No crossfade at the beginning
                weights = torch.ones((1, slice_len, 1), device=mel.device)
                if i + sliced_len < mel.shape[1]:  # If not also the last slice
                    # Fade out at the end - use the minimum of slice_len and mel_crossfade_len
                    actual_crossfade_len = min(mel_crossfade_len, slice_len)
                    if actual_crossfade_len > 0:  # Only apply if we have space
                        fade_out = torch.linspace(1, 0, actual_crossfade_len, device=mel.device)
                        weights[:, -actual_crossfade_len:, :] = fade_out.view(1, -1, 1)
            elif end_idx >= mel.shape[1]:  # Last slice
                # Fade in at the beginning - use the minimum of slice_len and mel_crossfade_len
                weights = torch.ones((1, slice_len, 1), device=mel.device)
                actual_crossfade_len = min(mel_crossfade_len, slice_len)
                if actual_crossfade_len > 0:  # Only apply if we have space
                    fade_in = torch.linspace(0, 1, actual_crossfade_len, device=mel.device)
                    weights[:, :actual_crossfade_len, :] = fade_in.view(1, -1, 1)
            else:  # Middle slice
                # Crossfade both ends
                weights = torch.ones((1, slice_len, 1), device=mel.device)
                # Fade in at the beginning
                if mel_crossfade_len > 0:  # Only apply if we have space
                    fade_in = torch.linspace(0, 1, mel_crossfade_len, device=mel.device)
                    weights[:, :mel_crossfade_len, :] = fade_in.view(1, -1, 1)
                # Fade out at the end
                if mel_crossfade_len > 0:  # Only apply if we have space
                    fade_out = torch.linspace(1, 0, mel_crossfade_len, device=mel.device)
                    weights[:, -mel_crossfade_len:, :] = fade_out.view(1, -1, 1)

            # Apply the weighted update to the output
            full_mel_out[:, start_idx:end_idx, :] += weights * mel_out_slice

        # Return the full crossfaded output
        mel_out = full_mel_out
    else:
        # Process the entire segment at once, with mixed precision if enabled
        with autocast(device_type=device_type, enabled=use_fp16):
            mel_out, _ = model.sample(
                src_mel=mel,
                spk_id=spk_id,
                f0=f0,
                rms=rms,
                cvec=cvec,
                steps=infer_steps,
                bad_cvec=cvec_ds,
                ds_cfg_strength=ds_cfg_strength,
                spk_cfg_strength=spk_cfg_strength,
                skip_cfg_strength=skip_cfg_strength,
                cfg_skip_layers=cfg_skip_layers,
                cfg_rescale=cfg_rescale,
            )

    return mel_out
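# Note on the sliced path above: consecutive slices overlap by
# `mel_crossfade_len` frames, and the linear fade-out of one slice is the
# exact complement of the next slice's fade-in (torch.linspace(1, 0, n) and
# torch.linspace(0, 1, n) sum to 1 elementwise), so the overlap-add into
# `full_mel_out` keeps unit gain across every slice boundary.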
def generate_audio(vocoder, mel_out, f0, original_loudness=None, restore_loudness=True, use_fp16=True):
    """Generate audio from a mel spectrogram using the vocoder."""
    # Use mixed precision for vocoder inference if enabled
    device_type = 'cuda' if mel_out.device.type == 'cuda' else 'cpu'
    with autocast(device_type=device_type, enabled=use_fp16):
        audio_out = vocoder(mel_out.transpose(1, 2), f0)
    audio_out = audio_out.squeeze().cpu().numpy()

    if restore_loudness and original_loudness is not None:
        # Restore the original loudness
        meter = pyln.Meter(44100)
        audio_out_loudness = meter.integrated_loudness(audio_out)
        audio_out = pyln.normalize.loudness(audio_out, audio_out_loudness, original_loudness)

        # Handle clipping
        max_amp = np.max(np.abs(audio_out))
        if max_amp > 1.0:
            audio_out = audio_out * (0.99 / max_amp)

    return audio_out


def process_segment(
    audio_segment, svc_model, vocoder, rmvpe, hubert, rms_extractor, speaker_id,
    sample_rate, hop_length, device, key_shift=0, infer_steps=32,
    ds_cfg_strength=0.0, spk_cfg_strength=0.0, skip_cfg_strength=0.0,
    cfg_skip_layers=None, cfg_rescale=0.7, cvec_downsample_rate=2,
    target_loudness=-18.0, restore_loudness=True, sliced_inference=False,
    robust_f0=0, use_fp16=True
):
    """Process a single audio segment and return the converted audio."""
    # Extract features
    mel, cvec, cvec_ds, f0, rms, original_loudness = extract_features(
        audio_segment, sample_rate, hop_length, rmvpe, hubert, rms_extractor, device,
        key_shift, ds_cfg_strength, cvec_downsample_rate, target_loudness,
        robust_f0, use_fp16
    )

    # Prepare the speaker ID as a tensor
    spk_id = torch.LongTensor([speaker_id]).to(device)

    # Run inference to generate the output mel spectrogram
    mel_out = run_inference(
        model=svc_model,
        mel=mel,
        cvec=cvec,
        f0=f0,
        rms=rms,
        cvec_ds=cvec_ds,
        spk_id=spk_id,
        infer_steps=infer_steps,
        ds_cfg_strength=ds_cfg_strength,
        spk_cfg_strength=spk_cfg_strength,
        skip_cfg_strength=skip_cfg_strength,
        cfg_skip_layers=cfg_skip_layers,
        cfg_rescale=cfg_rescale,
        sliced_inference=sliced_inference,
        use_fp16=use_fp16
    )

    # Generate audio
    audio_out = generate_audio(
        vocoder, mel_out, f0,
        original_loudness if restore_loudness else None,
        restore_loudness, use_fp16
    )

    return audio_out


def pad_tensor_to_length(tensor, length):
    """Pad a tensor to the specified length along the sequence dimension (dim=1)."""
    curr_len = tensor.shape[1]
    if curr_len >= length:
        return tensor
    pad_len = length - curr_len
    if tensor.dim() == 2:
        padding = (0, pad_len)
    elif tensor.dim() == 3:
        padding = (0, 0, 0, pad_len)
    else:
        raise ValueError(f"Unsupported tensor dimension: {tensor.dim()}")
    padded = torch.nn.functional.pad(tensor, padding, "constant", 0)
    return padded
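# Shape illustration for `pad_tensor_to_length` (lengths are examples):
#   f0 of shape (1, 812)       -> (1, max_length)       via padding (0, pad_len)
#   mel of shape (1, 812, 128) -> (1, max_length, 128)  via padding (0, 0, 0, pad_len)
# torch.nn.functional.pad consumes the padding tuple from the last dimension
# backwards, so (0, 0, 0, pad_len) leaves the feature dimension untouched and
# zero-pads the frame dimension on the right.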
def batch_process_segments(
    segments_with_pos, svc_model, vocoder, rmvpe, hubert, rms_extractor, speaker_id,
    sample_rate, hop_length, device, key_shift=0, infer_steps=32,
    ds_cfg_strength=0.0, spk_cfg_strength=0.0, skip_cfg_strength=0.0,
    cfg_skip_layers=None, cfg_rescale=0.7, cvec_downsample_rate=2,
    target_loudness=-18.0, restore_loudness=True, robust_f0=0, use_fp16=True,
    batch_size=1, gr_progress=None, progress_desc=None
):
    """Process audio segments in batches for faster inference."""
    # Fall back to sequential processing for batch_size <= 1
    if batch_size <= 1:
        results = []
        for i, (start_sample, chunk) in enumerate(tqdm(segments_with_pos, desc="Processing segments")):
            if gr_progress is not None:
                gr_progress(0.2 + (0.7 * (i / len(segments_with_pos))),
                            desc=progress_desc.format(i + 1, len(segments_with_pos)))
            audio_out = process_segment(
                chunk, svc_model, vocoder, rmvpe, hubert, rms_extractor, speaker_id,
                sample_rate, hop_length, device, key_shift, infer_steps,
                ds_cfg_strength, spk_cfg_strength, skip_cfg_strength, cfg_skip_layers,
                cfg_rescale, cvec_downsample_rate, target_loudness, restore_loudness,
                # Pass the trailing options by keyword so robust_f0 is not
                # misread as process_segment's sliced_inference parameter
                robust_f0=robust_f0, use_fp16=use_fp16
            )
            results.append((start_sample, audio_out, len(chunk)))
        return results

    # Sort segments by length, keeping track of the original order for reassembly
    sorted_with_idx = sorted(enumerate(segments_with_pos), key=lambda x: len(x[1][1]))
    sorted_segments = []
    original_indices = []
    for orig_idx, (pos, chunk) in sorted_with_idx:
        original_indices.append(orig_idx)
        sorted_segments.append((pos, chunk))

    batched_segments = [sorted_segments[i:i + batch_size]
                        for i in range(0, len(sorted_segments), batch_size)]

    all_results = []
    for batch_idx, batch in enumerate(tqdm(batched_segments, desc="Processing batches")):
        if gr_progress is not None:
            gr_progress(0.2 + (0.7 * (batch_idx / len(batched_segments))),
                        desc=progress_desc.format(batch_idx + 1, len(batched_segments)))

        batch_start_samples = [pos for pos, _ in batch]
        batch_chunks = [chunk for _, chunk in batch]
        batch_lengths = [len(chunk) for chunk in batch_chunks]

        # Extract features for every chunk in the batch
        batch_features = []
        for chunk in batch_chunks:
            mel, cvec, cvec_ds, f0, rms, original_loudness = extract_features(
                chunk, sample_rate, hop_length, rmvpe, hubert, rms_extractor, device,
                key_shift, ds_cfg_strength, cvec_downsample_rate, target_loudness,
                robust_f0, use_fp16
            )
            batch_features.append({
                'mel': mel,
                'cvec': cvec,
                'cvec_ds': cvec_ds,
                'f0': f0,
                'rms': rms,
                'original_loudness': original_loudness,
                'length': mel.shape[1]
            })

        # Pad all features to the longest sequence in the batch
        max_length = max(feat['length'] for feat in batch_features)

        padded_mels = []
        padded_cvecs = []
        padded_f0s = []
        padded_rmss = []
        frame_lengths = []
        original_loudness_values = []
        if ds_cfg_strength > 0:
            padded_cvec_ds = []

        for feat in batch_features:
            curr_len = feat['length']
            frame_lengths.append(curr_len)
            padded_mels.append(pad_tensor_to_length(feat['mel'], max_length))
            padded_cvecs.append(pad_tensor_to_length(feat['cvec'], max_length))
            padded_f0s.append(pad_tensor_to_length(feat['f0'], max_length))
            padded_rmss.append(pad_tensor_to_length(feat['rms'], max_length))
            if ds_cfg_strength > 0:
                padded_cvec_ds.append(pad_tensor_to_length(feat['cvec_ds'], max_length))
            original_loudness_values.append(feat['original_loudness'])

        batched_mel = torch.cat(padded_mels, dim=0)
        batched_cvec = torch.cat(padded_cvecs, dim=0)
        batched_f0 = torch.cat(padded_f0s, dim=0)
        batched_rms = torch.cat(padded_rmss, dim=0)
        if ds_cfg_strength > 0:
            batched_cvec_ds = torch.cat(padded_cvec_ds, dim=0)
        else:
            batched_cvec_ds = None

        frame_lengths = torch.tensor(frame_lengths, device=device)
        batch_spk_id = torch.LongTensor([speaker_id] * len(batch)).to(device)

        with torch.no_grad():
            mel_out = run_inference(
                model=svc_model,
                mel=batched_mel,
                cvec=batched_cvec,
                f0=batched_f0,
                rms=batched_rms,
                cvec_ds=batched_cvec_ds,
                spk_id=batch_spk_id,
                infer_steps=infer_steps,
                ds_cfg_strength=ds_cfg_strength,
                spk_cfg_strength=spk_cfg_strength,
                skip_cfg_strength=skip_cfg_strength,
                cfg_skip_layers=cfg_skip_layers,
                cfg_rescale=cfg_rescale,
                frame_lengths=frame_lengths,
                use_fp16=use_fp16
            )
            with autocast(device_type='cuda' if device.type == 'cuda' else 'cpu', enabled=use_fp16):
                audio_out = vocoder(mel_out.transpose(1, 2), batched_f0)

        for i in range(len(batch)):
            expected_audio_length = batch_lengths[i]
            curr_audio = audio_out[i].squeeze().cpu().numpy()
            if len(curr_audio) > expected_audio_length:
                curr_audio = curr_audio[:expected_audio_length]
            elif len(curr_audio) < expected_audio_length:
                curr_audio = np.pad(curr_audio, (0, expected_audio_length - len(curr_audio)), 'constant')

            if restore_loudness:
                # Use a short block size so very short segments can still be measured
                meter = pyln.Meter(44100, block_size=0.1)
                curr_loudness = meter.integrated_loudness(curr_audio)
                curr_audio = pyln.normalize.loudness(curr_audio, curr_loudness, original_loudness_values[i])
                max_amp = np.max(np.abs(curr_audio))
                if max_amp > 1.0:
                    curr_audio = curr_audio * (0.99 / max_amp)

            expected_length = batch_lengths[i]
            all_results.append((batch_idx, i, batch_start_samples[i], curr_audio,
                                expected_length, original_indices[batch_size * batch_idx + i]))

    # Restore the original segment order
    all_results.sort(key=lambda x: x[5])
    return [(pos, audio, length) for _, _, pos, audio, length, _ in all_results]
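# Design note: segments are sorted by length before batching so that each batch
# groups similarly sized chunks, minimizing the zero-padding added by
# `pad_tensor_to_length` (and the compute wasted on padded frames). The original
# segment order is restored at the end by sorting on the saved indices.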
@click.command()
@click.option('--model', type=click.Path(exists=True), required=True, help='Path to model checkpoint')
@click.option('--input', type=click.Path(exists=True), required=True, help='Input audio file')
@click.option('--output', type=click.Path(), required=True, help='Output audio file')
@click.option('--speaker', type=str, required=True, help='Target speaker')
@click.option('--key-shift', type=int, default=0, help='Pitch shift in semitones')
@click.option('--device', type=str, default=None, help='Device to use (cuda/cpu)')
@click.option('--infer-steps', type=int, default=32, help='Number of inference steps')
@click.option('--ds-cfg-strength', type=float, default=0.0, help='Downsampled content vector guidance strength')
@click.option('--spk-cfg-strength', type=float, default=0.0, help='Speaker guidance strength')
@click.option('--skip-cfg-strength', type=float, default=0.0, help='Skip layer guidance strength')
@click.option('--cfg-skip-layers', type=int, default=None, help='Layer to skip for classifier-free guidance')
@click.option('--cfg-rescale', type=float, default=0.7, help='Classifier-free guidance rescale factor')
@click.option('--cvec-downsample-rate', type=int, default=2, help='Downsampling rate for bad_cvec creation')
@click.option('--target-loudness', type=float, default=-18.0, help='Target loudness in LUFS for normalization')
@click.option('--restore-loudness/--no-restore-loudness', default=True, help='Restore loudness to the original level')
@click.option('--fade-duration', type=float, default=20.0, help='Fade duration in milliseconds')
@click.option('--sliced-inference', is_flag=True, default=False, help='Use sliced inference for processing long segments')
@click.option('--robust-f0', type=int, default=0, help='Level of robust f0 filtering (0=none, 1=light, 2=aggressive)')
@click.option('--slicer-threshold', type=float, default=-30.0, help='Threshold for audio slicing in dB')
@click.option('--slicer-min-length', type=int, default=3000, help='Minimum length of audio segments in milliseconds')
@click.option('--slicer-min-interval', type=int, default=100, help='Minimum interval between audio segments in milliseconds')
@click.option('--slicer-hop-size', type=int, default=10, help='Hop size for audio slicing in milliseconds')
@click.option('--slicer-max-sil-kept', type=int, default=200, help='Maximum silence kept in milliseconds')
@click.option('--use-fp16/--no-use-fp16', default=True, help='Use float16 precision for faster inference')
@click.option('--batch-size', type=int, default=1, help='Batch size for parallel inference')
def main(
    model, input, output, speaker, key_shift, device, infer_steps,
    ds_cfg_strength, spk_cfg_strength, skip_cfg_strength, cfg_skip_layers, cfg_rescale,
    cvec_downsample_rate, target_loudness, restore_loudness, fade_duration,
    sliced_inference, robust_f0, slicer_threshold, slicer_min_length,
    slicer_min_interval, slicer_hop_size, slicer_max_sil_kept, use_fp16, batch_size
):
    """Convert the voice in an audio file to a target speaker."""
    # Set up the device
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    # Load models
    svc_model, vocoder, rmvpe, hubert, rms_extractor, spk2idx, dataset_cfg = \
        load_models(model, device, use_fp16)

    try:
        speaker_id = spk2idx[speaker]
    except KeyError:
        raise ValueError(
            f"Speaker {speaker} not found in the model's speaker list, "
            f"valid speakers are {list(spk2idx.keys())}"
        )

    # Audio parameters expected by the loaded model (44.1 kHz, hop 512)
    hop_length = 512
    sample_rate = 44100

    # Load audio
    audio = load_audio(input, sample_rate)

    # Initialize the Slicer
    slicer = Slicer(
        sr=sample_rate,
        threshold=slicer_threshold,
        min_length=slicer_min_length,
        min_interval=slicer_min_interval,
        hop_size=slicer_hop_size,
        max_sil_kept=slicer_max_sil_kept
    )

    # Step 1: use the slicer to segment the input audio and get positions
    click.echo("Slicing audio...")
    segments_with_pos = slicer.slice(audio)  # Returns a list of (start_pos, chunk) tuples

    if restore_loudness:
        click.echo("Will restore loudness to original")

    # Calculate the fade size in samples
    fade_samples = int(fade_duration * sample_rate / 1000)

    # Process segments
    if batch_size > 1:
        click.echo(f"Processing {len(segments_with_pos)} segments with batch size {batch_size}...")
        result_audio = np.zeros(len(audio) + fade_samples)  # Extra space for potential overlap
        with torch.no_grad():
            processed_segments = batch_process_segments(
                segments_with_pos, svc_model, vocoder, rmvpe, hubert, rms_extractor, speaker_id,
                sample_rate, hop_length, device, key_shift, infer_steps,
                ds_cfg_strength, spk_cfg_strength, skip_cfg_strength, cfg_skip_layers,
                cfg_rescale, cvec_downsample_rate, target_loudness, restore_loudness,
                robust_f0, use_fp16, batch_size
            )

        for idx, (start_sample, audio_out, expected_length) in enumerate(processed_segments):
            # Apply fades
            if idx > 0:  # Not the first segment
                audio_out = apply_fade(audio_out.copy(), fade_samples, fade_in=True)
                result_audio[start_sample:start_sample + fade_samples] *= \
                    np.linspace(1, 0, fade_samples)  # Fade out the previous segment
            if idx < len(processed_segments) - 1:  # Not the last segment
                audio_out[-fade_samples:] *= np.linspace(1, 0, fade_samples)  # Fade out

            # Add to the result
            result_audio[start_sample:start_sample + len(audio_out)] += audio_out
    else:
        # Original processing path, optionally using sliced inference
        click.echo(f"Processing {len(segments_with_pos)} segments...")
        result_audio = np.zeros(len(audio) + fade_samples)  # Extra space for potential overlap
        with torch.no_grad():
            for idx, (start_sample, chunk) in enumerate(tqdm(segments_with_pos)):
                # Process the segment
                audio_out = process_segment(
                    chunk, svc_model, vocoder, rmvpe, hubert, rms_extractor, speaker_id,
                    sample_rate, hop_length, device, key_shift, infer_steps,
                    ds_cfg_strength, spk_cfg_strength, skip_cfg_strength, cfg_skip_layers,
                    cfg_rescale, cvec_downsample_rate, target_loudness, restore_loudness,
                    sliced_inference, robust_f0, use_fp16
                )

                # Ensure a consistent length
                expected_length = len(chunk)
                if len(audio_out) > expected_length:
                    audio_out = audio_out[:expected_length]
                elif len(audio_out) < expected_length:
                    audio_out = np.pad(audio_out, (0, expected_length - len(audio_out)), 'constant')

                # Apply fades
                if idx > 0:  # Not the first segment
                    audio_out = apply_fade(audio_out.copy(), fade_samples, fade_in=True)
                    result_audio[start_sample:start_sample + fade_samples] *= \
                        np.linspace(1, 0, fade_samples)  # Fade out the previous segment
                if idx < len(segments_with_pos) - 1:  # Not the last segment
                    audio_out[-fade_samples:] *= np.linspace(1, 0, fade_samples)  # Fade out

                # Add to the result
                result_audio[start_sample:start_sample + len(audio_out)] += audio_out
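    # Both branches above overlap-add segments into `result_audio` with linear
    # crossfades of `fade_samples` at every seam; `fade_duration` is given in
    # milliseconds, so the default 20 ms corresponds to 882 samples at 44.1 kHz.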
    # Trim any extra padding
    result_audio = result_audio[:len(audio)]

    # Save output
    click.echo("Saving output...")
    output_path = Path(output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # result_audio is float64 (np.zeros default); convert to float32 for saving
    torchaudio.save(output, torch.from_numpy(result_audio).float().unsqueeze(0), sample_rate)
    click.echo("Done!")


if __name__ == '__main__':
    main()
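# Example invocation (script path, checkpoint, and speaker name are illustrative):
#   python infer.py --model ckpts/svc.ckpt --input vocals.wav --output out.wav \
#       --speaker singer01 --key-shift 0 --infer-steps 32 --batch-size 4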