"""Gradio demo app for Maya1 emotional text-to-speech.

Builds a small web UI (preset picker, voice description, text, sampling
settings) and wires it to the Maya1 vLLM pipeline, which is loaded lazily
on the first generation request.
"""

import asyncio
import io
import os
import sys
import tempfile
import traceback
import wave

import gradio as gr

# Make the local ``maya1`` package importable when run from the repo root.
sys.path.insert(0, '.')

# Mock spaces module for local testing
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Stand-in for the ``spaces`` package when it is not installed."""

        @staticmethod
        def GPU(func=None, **_kwargs):
            """No-op decorator usable as ``@GPU`` or ``@GPU(...)``."""
            if func is None:
                # Called with keyword options: return the real decorator.
                return lambda wrapped: wrapped
            return func

    spaces = SpacesMock()

from maya1.model_loader import Maya1Model
from maya1.pipeline import Maya1Pipeline
from maya1.prompt_builder import Maya1PromptBuilder
from maya1.snac_decoder import SNACDecoder
from maya1.constants import AUDIO_SAMPLE_RATE

# Preset characters (2 realistic + 2 creative)
# NOTE(review): the example texts look like they originally contained inline
# emotion tags that were stripped from this copy of the file (note the
# trailing space in the "Singer" example) -- confirm against the upstream app.
PRESET_CHARACTERS = {
    "Male American": {
        "description": "Male voice in their 30s with american accent",
        "example_text": "Hello world this is amazing I love it",
    },
    "Female British": {
        "description": "Female voice in their 20s with british accent",
        "example_text": "Welcome everyone let me tell you something incredible",
    },
    "Robot": {
        "description": "Creative, ai_machine_voice character. Male voice with robotic timbre",
        "example_text": "System initialized processing data computation complete",
    },
    "Singer": {
        "description": "Creative character. Female voice with smooth timbre",
        "example_text": "Listen to this la la la beautiful melody ",
    },
}

# Preset shown when the UI first loads (first key of PRESET_CHARACTERS).
_DEFAULT_PRESET = next(iter(PRESET_CHARACTERS))

# Markdown shown at the top of the page. Kept at column 0 so rendering does
# not depend on the Markdown component dedenting its input.
_INTRO_MARKDOWN = """
# Maya1 - Open Source Emotional Text-to-Speech

**The best open source voice AI model with emotions!**

Generate realistic and expressive speech with natural language voice design.
Choose a preset character or create your own custom voice.

[Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
"""

# NOTE(review): the 13 code spans below are empty in this copy of the file;
# the emotion tag names appear to have been stripped. Restore them from the
# model card -- they are deliberately not guessed here.
_EMOTIONS_MARKDOWN = """
### Supported Emotions

`` `` `` `` `` `` `` `` `` `` `` `` ``
"""

# Global pipeline variables
model = None
prompt_builder = None
snac_decoder = None
pipeline = None
models_loaded = False


def load_models():
    """Load Maya1 vLLM model and pipeline (runs once).

    Populates the module-level ``model``, ``prompt_builder``,
    ``snac_decoder`` and ``pipeline`` globals and sets ``models_loaded``.
    Subsequent calls return immediately.
    """
    global model, prompt_builder, snac_decoder, pipeline, models_loaded

    if models_loaded:
        return

    # Imported here rather than at module import time, as in the original.
    import torch

    # Ensure CUDA is available for HF Spaces
    if not torch.cuda.is_available():
        print("Warning: CUDA not available, using CPU")
        device = "cpu"
    else:
        device = "cuda"
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")

    # Set environment variable for vLLM (only if the user has not set it).
    os.environ.setdefault("VLLM_USE_V1", "0")

    print("Loading Maya1 model with vLLM...")
    model = Maya1Model(
        model_path="maya-research/maya1",
        dtype="bfloat16",
        max_model_len=8192,
        gpu_memory_utilization=0.85,
    )

    print("Initializing prompt builder...")
    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)

    print("Loading SNAC decoder...")
    snac_decoder = SNACDecoder(
        device=device,
        enable_batching=False,
    )

    print("Initializing pipeline...")
    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)

    models_loaded = True
    print("Models loaded successfully!")


def preset_selected(preset_name):
    """Update description and text when preset is selected.

    Returns:
        A ``(description, example_text)`` tuple for the chosen preset, or
        two empty strings when ``preset_name`` is not a known preset.
    """
    if preset_name in PRESET_CHARACTERS:
        char = PRESET_CHARACTERS[preset_name]
        return char["description"], char["example_text"]
    return "", ""


def _resolve_description(preset_name, description):
    """Pick the voice description: the user's text, else the preset's."""
    if description and description.strip():
        return description
    preset = PRESET_CHARACTERS.get(preset_name) if preset_name else None
    return preset["description"] if preset else ""


def _run_pipeline(description, text, temperature, max_tokens):
    """Run the async pipeline to completion on a private event loop."""
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        return loop.run_until_complete(
            pipeline.generate_speech(
                description=description,
                text=text,
                temperature=temperature,
                top_p=0.9,
                max_tokens=max_tokens,
                repetition_penalty=1.1,
                seed=None,
            )
        )
    finally:
        # Close the loop even when generation raises, so it does not leak.
        loop.close()


def _write_wav_file(pcm_bytes, sample_rate):
    """Write PCM bytes as a mono 16-bit WAV temp file and return its path."""
    # delete=False: the file must outlive this function so Gradio can read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        with wave.open(tmp, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_bytes)
    return tmp.name


@spaces.GPU
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """Generate emotional speech from description and text using vLLM.

    Args:
        preset_name: Selected preset key; its description is used only when
            ``description`` is empty.
        description: Free-form voice description from the UI.
        text: Text to synthesize.
        temperature: Sampling temperature forwarded to the pipeline.
        max_tokens: Token budget forwarded to the pipeline.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, message)``
        when validation or generation fails. Exceptions are caught and
        reported through the status message rather than raised.
    """
    try:
        # A description typed by the user wins; the preset is the fallback,
        # so editing the description box actually changes the voice.
        description = _resolve_description(preset_name, description)

        # Validate inputs before paying for model loading.
        if not description or not text or not text.strip():
            return None, "Error: Please provide both description and text!"

        # Load models if not already loaded
        load_models()

        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

        # Generate audio using vLLM pipeline (async wrapper)
        audio_bytes = _run_pipeline(description, text, temperature, max_tokens)

        if not audio_bytes:
            return None, "Error: Audio generation failed. Try different text or increase max_tokens."

        # The Audio output takes a file path, not a file-like object, so
        # the WAV is written to disk and its path is returned.
        wav_path = _write_wav_file(audio_bytes, AUDIO_SAMPLE_RATE)

        # Calculate duration: 2 bytes per sample, one channel.
        duration = len(audio_bytes) // 2 / AUDIO_SAMPLE_RATE

        status_msg = f"Generated {duration:.2f}s of emotional speech!"

        return wav_path, status_msg

    except Exception as e:
        # Top-level UI boundary: report the failure in the status box.
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg


# Create Gradio interface
with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Character Selection")

            preset_dropdown = gr.Dropdown(
                choices=list(PRESET_CHARACTERS.keys()),
                label="Preset Characters",
                value=_DEFAULT_PRESET,
                info="Quick pick from 4 preset characters"
            )

            gr.Markdown("### Voice Design")

            description_input = gr.Textbox(
                label="Voice Description",
                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                lines=3,
                value=PRESET_CHARACTERS[_DEFAULT_PRESET]["description"]
            )

            # NOTE(review): the example tags in this placeholder appear to
            # have been stripped from this copy of the file -- confirm.
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text with tags like , , ...",
                lines=4,
                value=PRESET_CHARACTERS[_DEFAULT_PRESET]["example_text"]
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.1,
                    label="Temperature",
                    info="Lower = more stable, Higher = more creative"
                )

                max_tokens_slider = gr.Slider(
                    minimum=100,
                    maximum=2048,
                    value=500,
                    step=50,
                    label="Max Tokens",
                    info="More tokens = longer audio"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### Generated Audio")

            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False
            )

            status_output = gr.Textbox(
                label="Status",
                lines=3,
                interactive=False
            )

            gr.Markdown(_EMOTIONS_MARKDOWN)

    # Event handlers
    preset_dropdown.change(
        fn=preset_selected,
        inputs=[preset_dropdown],
        outputs=[description_input, text_input]
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
        outputs=[audio_output, status_output]
    )

if __name__ == "__main__":
    demo.launch()