import gradio as gr
import os
import tempfile
import ffmpeg  # provided by the ffmpeg-python package
import json
from huggingface_hub import InferenceClient  # replaces the deprecated InferenceApi
from typing import List, Dict, Tuple
# 🔹 Constants
MODEL_NAME: str = "ivrit-ai/faster-whisper-v2-d4"
TRANSLATION_MODEL_NAME: str = "dicta-il/dictalm2.0-GGUF"
TEMP_DIR: str = tempfile.gettempdir()

# 🔹 Hugging Face Inference API clients
ASR_CLIENT = InferenceClient(model=MODEL_NAME)
TRANSLATION_CLIENT = InferenceClient(model=TRANSLATION_MODEL_NAME)
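
# The serverless Inference API generally cannot run CTranslate2/faster-whisper
# checkpoints such as ivrit-ai/faster-whisper-v2-d4, which is a likely cause of
# this Space's runtime error. A minimal local fallback sketch, assuming the
# optional faster-whisper package is installed (not wired into the UI below):
def transcribe_locally(audio_path: str) -> List[Dict[str, object]]:
    """Runs the checkpoint locally with faster-whisper, returning timestamped segments."""
    from faster_whisper import WhisperModel  # local import: optional dependency
    model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8")
    segments, _info = model.transcribe(audio_path)
    return [{"start": s.start, "end": s.end, "text": s.text} for s in segments]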

def convert_audio(audio_path: str) -> str:
    """Converts an audio file to 16 kHz WAV format for compatibility."""
    converted_path = os.path.join(TEMP_DIR, "converted.wav")
    (
        ffmpeg
        .input(audio_path)
        .output(converted_path, format="wav", ar="16000")
        .run(overwrite_output=True, quiet=True)
    )
    return converted_path
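
# Equivalent CLI invocation, for reference (WhatsApp voice notes are typically
# Opus audio in an .ogg/.opus container):
#
#   ffmpeg -y -i input.opus -ar 16000 converted.wav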

def transcribe_audio(file: str, translate: bool) -> Tuple[str, str]:
    """Transcribes audio and optionally translates it via the Hugging Face
    Inference API. Returns the playable audio path and the transcription
    with a JSON subtitle listing appended."""
    audio_path = file if file.endswith(".wav") else convert_audio(file)
    result = ASR_CLIENT.automatic_speech_recognition(audio_path)
    # Timestamped chunks (each with .text and a [start, end] .timestamps pair)
    # are only present when the serving backend returns them.
    segments = getattr(result, "chunks", None) or []
    subtitles: List[Dict[str, object]] = []
    transcribed_text: str = ""
    for segment in segments:
        hebrew_text = segment.text
        start_time, end_time = segment.timestamps
        eng_translation = ""
        if translate:
            # NOTE: dicta-il/dictalm2.0-GGUF is a GGUF chat model; this call
            # assumes an endpoint serving it under the translation task.
            eng_translation = TRANSLATION_CLIENT.translation(hebrew_text).translation_text
        subtitles.append({
            "start": start_time,
            "end": end_time,
            "text": hebrew_text,
            "translation": eng_translation if translate else None,
        })
        transcribed_text += f"{hebrew_text} "
    if not transcribed_text:
        transcribed_text = result.text  # no chunks: fall back to the full transcript
    combined = transcribed_text.strip() + "\n\n" + json.dumps(subtitles, ensure_ascii=False, indent=2)
    return audio_path, combined
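
# Hypothetical helper (not part of the original app): renders the subtitle
# dicts built above as an .srt string, the usual next step once segments
# carry start/end timestamps.
def subtitles_to_srt(subtitles: List[Dict[str, object]]) -> str:
    def fmt(t: float) -> str:
        ms = int(round(t * 1000))
        h, rem = divmod(ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02}:{m:02}:{s:02},{ms:03}"

    blocks = []
    for i, sub in enumerate(subtitles, start=1):
        blocks.append(f"{i}\n{fmt(sub['start'])} --> {fmt(sub['end'])}\n{sub['text']}\n")
    return "\n".join(blocks)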

# 🔹 Inject WebGPU-capable JavaScript via `gr.HTML()`
webgpu_script = """
<script type="module">
// transformers.js v3 (@huggingface/transformers) is required for WebGPU;
// the older @xenova/transformers v2 builds run on WASM only.
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@latest';

let asr;

async function loadModel() {
    // Assumption: in-browser Whisper needs an ONNX export, so the multilingual
    // onnx-community/whisper-large-v3-turbo conversion stands in for
    // openai/whisper-large-v3, which has no browser build.
    asr = await pipeline(
        "automatic-speech-recognition",
        "onnx-community/whisper-large-v3-turbo",
        { device: "webgpu" }
    );
    console.log("WebGPU ASR model loaded.");
}

async function transcribe(audioFile) {
    if (!asr) {
        console.error("Model not loaded yet.");
        return;
    }
    // The pipeline expects a URL (or decoded audio), not a File object.
    const url = URL.createObjectURL(audioFile);
    const result = await asr(url);
    document.getElementById("output").innerText = result.text;
}

document.getElementById("upload").addEventListener("change", (event) => {
    const file = event.target.files[0];
    if (file) transcribe(file);
});

loadModel();
</script>
<input type="file" id="upload" accept="audio/*">
<p id="output">Transcription will appear here.</p>
"""

# 🔹 Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# WhatShutup: Transcribe WhatsApp Voice Messages with WebGPU Support")
    webgpu_component = gr.HTML(webgpu_script)

    # Gradio 4 renamed source= to sources=; the old kwarg raises a TypeError.
    audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
    translate_checkbox = gr.Checkbox(label="Translate to English?", value=False)

    with gr.Row():
        audio_player = gr.Audio(type="filepath", label="Playback", interactive=False)
        transcript_output = gr.Textbox(label="Transcription & Subtitles", lines=10)

    submit_btn = gr.Button("Transcribe")
    # transcribe_audio returns (audio_path, text), matching these two outputs;
    # the original returned a JSON string first, which cannot feed gr.Audio.
    submit_btn.click(
        transcribe_audio,
        inputs=[audio_input, translate_checkbox],
        outputs=[audio_player, transcript_output],
    )

demo.launch()
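
# Assumed requirements.txt for this Space (package list inferred from the
# imports above; versions illustrative):
#
#   gradio>=4.0
#   huggingface_hub>=0.23
#   ffmpeg-python
#   faster-whisper  # only needed for the local fallback sketch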