Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| import torchaudio | |
| import spaces | |
| from tortoise.api import TextToSpeech | |
| from tortoise.utils.audio import load_audio | |
# Shared Tortoise-TTS model handle. It starts out empty so that importing this
# module stays cheap; the synthesis function fills it in on first use.
tts_model = None

# Generated WAV files are written under ./outputs, so make sure it exists
# (exist_ok avoids an error when the directory is already there).
os.makedirs("outputs", exist_ok=True)
# Synchronous synthesis entry point.
# NOTE(review): the file imports `spaces`, but no GPU decorator is applied to
# this function in the visible code -- confirm whether `@spaces.GPU` is
# intended here or is applied elsewhere.
def _generate_speech_gpu(text, voice_preset="random", voice_file_path=None):
    """Synthesize ``text`` with Tortoise-TTS and save the result as a WAV file.

    The module-level ``tts_model`` is created on the first call and reused by
    later calls.

    Args:
        text: Non-empty string to speak.
        voice_preset: Either a Tortoise quality preset (``"ultra_fast"``,
            ``"fast"``, ``"standard"``, ``"high_quality"``) or a voice name
            such as the default ``"random"``. A voice name is resolved with
            ``tortoise.utils.audio.load_voice`` and synthesis then uses the
            ``"fast"`` quality preset.
        voice_file_path: Optional path to a reference audio clip. When the
            file exists it is used as the voice, and ``voice_preset`` is only
            consulted for the quality setting.

    Returns:
        ``(output_filename, (24000, audio))`` where ``audio`` is the generated
        waveform with its leading dimension squeezed, moved to the CPU.

    Raises:
        ValueError: If ``text`` is not a non-empty string.
        Exception: Any error raised while loading the model or voice,
            synthesizing, or saving is printed and re-raised unchanged.
    """
    global tts_model

    import hashlib

    # Fail fast, before the (expensive) model is loaded.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("text must be a non-empty string")

    quality_presets = ("ultra_fast", "fast", "standard", "high_quality")
    sample_rate = 24000

    try:
        # Initialize the model if not already initialized
        if tts_model is None:
            print("Initializing Tortoise-TTS model...")
            tts_model = TextToSpeech(use_deepspeed=torch.cuda.is_available())
            device = next(tts_model.autoregressive.parameters()).device
            print(f"Model initialized. Using device: {device}")

        # `tts_with_preset` indexes a fixed table of quality presets, so a
        # voice name like "random" (or None) must never be passed as `preset`.
        preset_is_quality = voice_preset in quality_presets
        quality = voice_preset if preset_is_quality else "fast"

        voice_samples = None
        conditioning_latents = None
        if voice_file_path and os.path.exists(voice_file_path):
            print(f"Loading voice from {voice_file_path}")
            # load_audio returns one tensor (not a tuple); the model expects a
            # list of reference clips.
            voice_samples = [load_audio(voice_file_path, 22050)]
            voice_key = f"file:{voice_file_path}"
        elif voice_preset and not preset_is_quality:
            # Function-scope import keeps this block self-contained.
            from tortoise.utils.audio import load_voice

            voice_samples, conditioning_latents = load_voice(voice_preset)
            voice_key = f"voice:{voice_preset}"
        else:
            # No voice requested: leave both inputs as None.
            voice_key = "voice:random"

        # Generate speech
        print(f"Generating speech for text: {text[:50]}...")

        # Built-in hash() of a str is salted per process and `% 10000`
        # collides easily; a content digest gives a stable, collision-resistant
        # name that also distinguishes voice and quality.
        digest_source = "\0".join((quality, voice_key, text))
        digest = hashlib.sha256(digest_source.encode("utf-8")).hexdigest()[:16]
        output_filename = f"outputs/tts_output_{digest}.wav"

        gen = tts_model.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=quality,
        )

        # Squeeze/move once and reuse for both saving and returning.
        audio = gen.squeeze(0).cpu()

        # Save the generated audio
        torchaudio.save(output_filename, audio, sample_rate)
        print(f"Speech generated and saved to {output_filename}")

        # Return the filename and audio data
        return output_filename, (sample_rate, audio)
    except Exception as e:
        print(f"Error generating speech: {str(e)}")
        raise
# Coroutine entry point that forwards to the synchronous synthesis function.
async def generate_speech(text, voice_preset="random", voice_file_path=None):
    """Awaitable pass-through to ``_generate_speech_gpu``.

    All three arguments are forwarded unchanged and the wrapped function's
    return value is handed back as-is; any exception it raises propagates.
    The inner call is an ordinary synchronous call, so this coroutine does not
    yield control while it runs.
    """
    result = _generate_speech_gpu(
        text,
        voice_preset=voice_preset,
        voice_file_path=voice_file_path,
    )
    return result