Loren committed on
Commit ef9c7f9 · verified · 1 parent(s): 6645fa0

Upload 2 files

Files changed (2):
  1. app.py +466 -238
  2. requirements.txt +9 -7
app.py CHANGED
@@ -1,238 +1,466 @@
-import gradio as gr
-import torch
-from transformers import AutoProcessor, VoxtralForConditionalGeneration
-import spaces
-
-#### Functions
-
-@spaces.GPU
-def process_transcript(language: str, audio_path: str) -> str:
-    """Process the audio file to return its transcription.
-
-    Args:
-        language: The language of the audio.
-        audio_path: The path to the audio file.
-
-    Returns:
-        The transcribed text of the audio.
-    """
-
-    if audio_path is None:
-        return "Please provide some input audio: either upload an audio file or use the microphone."
-    else:
-        id_language = dict_languages[language]
-        inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
-        inputs = inputs.to(device, dtype=torch.bfloat16)
-        outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
-        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-        return decoded_outputs[0]
-###
-
-@spaces.GPU
-def process_translate(language: str, audio_path: str) -> str:
-    if audio_path is None:
-        return "Please provide some input audio: either upload an audio file or use the microphone."
-    else:
-        conversation = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "path": audio_path,
-                    },
-                    {"type": "text", "text": "Translate this in "+language},
-                ],
-            }
-        ]
-
-        inputs = processor.apply_chat_template(conversation)
-        inputs = inputs.to(device, dtype=torch.bfloat16)
-
-        outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
-        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-        return decoded_outputs[0]
-###
-
-@spaces.GPU
-def process_chat(question: str, audio_path: str) -> str:
-    if audio_path is None:
-        return "Please provide some input audio: either upload an audio file or use the microphone."
-    else:
-        conversation = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "audio",
-                        "path": audio_path,
-                    },
-                    {"type": "text", "text": question},
-                ],
-            }
-        ]
-
-        inputs = processor.apply_chat_template(conversation)
-        inputs = inputs.to(device, dtype=torch.bfloat16)
-
-        outputs = model.generate(**inputs, max_new_tokens=500)
-        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-        return decoded_outputs[0]
-###
-
-def disable_buttons():
-    return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
-
-def enable_buttons():
-    return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
-###
-
-### Initializations
-
-MAX_TOKENS = 32000
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"*** Device: {device}")
-model_name = 'mistralai/Voxtral-Mini-3B-2507'
-
-processor = AutoProcessor.from_pretrained(model_name)
-model = VoxtralForConditionalGeneration.from_pretrained(model_name,
-                                                        torch_dtype=torch.bfloat16,
-                                                        device_map=device)
-# Supported languages
-dict_languages = {"English": "en",
-                  "French": "fr",
-                  "German": "de",
-                  "Spanish": "es",
-                  "Italian": "it",
-                  "Portuguese": "pt",
-                  "Dutch": "nl",
-                  "Hindi": "hi"}
-
-
-#### Gradio interface
-with gr.Blocks(title="Voxtral") as voxtral:
-    gr.Markdown("# **Voxtral Mini Evaluation**")
-    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
-capabilities while retaining best-in-class text performance.
-#### It excels at speech transcription, translation and audio understanding.""")
-
-    with gr.Accordion("🔎 More on Voxtral", open=False):
-        gr.Markdown("""## **Key Features:**
-
-#### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
-##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
-##### - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
-##### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
-##### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
-##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
-##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
-
-
-    gr.Markdown("### **1. Upload an audio file, record via microphone, or select a demo file:**")
-    gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
-
-    with gr.Row():
-        sel_audio = gr.Audio(sources=["upload", "microphone"], type="filepath",
-                             label="Set an audio file to process it:")
-        example = [["mapo_tofu.mp3"]]
-        gr.Examples(
-            examples=example,
-            inputs=sel_audio,
-            outputs=None,
-            fn=None,
-            cache_examples=False,
-            run_on_click=False
-        )
-
-    with gr.Row():
-        gr.Markdown("### **2. Choose one of theese tasks:**")
-
-    with gr.Row():
-        with gr.Column():
-            with gr.Accordion("📝 Transcription", open=True):
-                sel_language = gr.Dropdown(
-                    choices=list(dict_languages.keys()),
-                    value="English",
-                    label="Select the language of the audio file:"
-                )
-                submit_transcript = gr.Button("Extract transcription", variant="primary")
-                text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
-
-        with gr.Column():
-            with gr.Accordion("🔁 Translation", open=True):
-                sel_translate_language = gr.Dropdown(
-                    choices=list(dict_languages.keys()),
-                    value="English",
-                    label="Select the language for translation:"
-                )
-
-                submit_translate = gr.Button("Translate audio file", variant="primary")
-                text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
-
-        with gr.Column():
-            with gr.Accordion("🤖 Ask audio file", open=True):
-                question_chat = gr.Textbox(label="Enter your question about audio file:", placeholder="Enter your question about audio file")
-                submit_chat = gr.Button("Ask audio file", variant="primary")
-                example_chat = [["What is the subject of this audio file?"], ["Quels sont les ingrédients ?"]]
-                gr.Examples(
-                    examples=example_chat,
-                    inputs=question_chat,
-                    outputs=None,
-                    fn=None,
-                    cache_examples=False,
-                    run_on_click=False
-                )
-                text_chat = gr.Textbox(label="💬 Model answer", lines=10)
-
-    ### Processing
-
-    # Transcription
-    submit_transcript.click(
-        disable_buttons,
-        outputs=[submit_transcript, submit_translate, submit_chat],
-        trigger_mode="once",
-    ).then(
-        fn=process_transcript,
-        inputs=[sel_language, sel_audio],
-        outputs=text_transcript
-    ).then(
-        enable_buttons,
-        outputs=[submit_transcript, submit_translate, submit_chat],
-    )
-
-    # Translation
-    submit_translate.click(
-        disable_buttons,
-        outputs=[submit_transcript, submit_translate, submit_chat],
-        trigger_mode="once",
-    ).then(
-        fn=process_translate,
-        inputs=[sel_translate_language, sel_audio],
-        outputs=text_translate
-    ).then(
-        enable_buttons,
-        outputs=[submit_transcript, submit_translate, submit_chat],
-    )
-
-    # Chat
-    submit_chat.click(
-        disable_buttons,
-        outputs=[submit_transcript, submit_translate, submit_chat],
-        trigger_mode="once",
-    ).then(
-        fn=process_chat,
-        inputs=[question_chat, sel_audio],
-        outputs=text_chat
-    ).then(
-        enable_buttons,
-        outputs=[submit_transcript, submit_translate, submit_chat],
-    )
-
-### Launch the app
-
-if __name__ == "__main__":
-    voxtral.queue().launch()
+import gradio as gr
+import torch
+from transformers import AutoProcessor, VoxtralForConditionalGeneration
+from pydub import AudioSegment
+from pydub.silence import split_on_silence, detect_silence
+import yt_dlp
+import requests
+import validators
+from urllib.parse import urlparse
+import subprocess
+import os
+import re
+import glob
+import spaces
+
+#### Functions
+
+@spaces.GPU
+def process_transcript(language: str, audio_path: str) -> str:
+    """Process the audio file to return its transcription.
+
+    Args:
+        language: The language of the audio.
+        audio_path: The path to the audio file.
+
+    Returns:
+        The transcribed text of the audio.
+    """
+
+    if audio_path is None:
+        return "Please provide some input audio: either upload an audio file or use the microphone."
+    else:
+        id_language = dict_languages[language]
+        # Note: 'apply_transcrition_request' is the method name as spelled in the transformers Voxtral release
+        inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
+        inputs = inputs.to(device, dtype=torch.bfloat16)
+        outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
+        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+        return decoded_outputs[0]
+###
+
+@spaces.GPU
+def process_translate(language: str, audio_path: str) -> str:
+    if audio_path is None:
+        return "Please provide some input audio: either upload an audio file or use the microphone."
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "audio",
+                    "path": audio_path,
+                },
+                {"type": "text", "text": "Translate this into " + language},
+            ],
+        }
+    ]
+
+    inputs = processor.apply_chat_template(conversation)
+    inputs = inputs.to(device, dtype=torch.bfloat16)
+
+    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
+    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+    return decoded_outputs[0]
+###
+
+@spaces.GPU
+def process_chat(question: str, audio_path: str) -> str:
+    if audio_path is None:
+        return "Please provide some input audio: either upload an audio file or use the microphone."
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "audio",
+                    "path": audio_path,
+                },
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    inputs = processor.apply_chat_template(conversation)
+    inputs = inputs.to(device, dtype=torch.bfloat16)
+
+    outputs = model.generate(**inputs, max_new_tokens=500)
+    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+    return decoded_outputs[0]
+###
+
+def disable_buttons():
+    return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
+
+def enable_buttons():
+    return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
+###
+
+def secure_download_from_url(url: str):
+    """
+    Validates a URL and downloads the file if it is an authorized media type.
+    Returns the path of the downloaded file or an error message.
+    """
+    # Step 1: Validate the URL format
+    if not validators.url(url):
+        return None, None, gr.Markdown("❌ **Error:** The provided URL is invalid.")
+
+    try:
+        # Step 2: Send a HEAD request to check the headers without downloading the content.
+        # allow_redirects=True to follow redirects to the final file location.
+        # timeout to avoid blocking requests.
+        response = requests.head(url, allow_redirects=True, timeout=10)
+
+        # Check if the request was successful (status code 2xx)
+        response.raise_for_status()
+
+        # Step 3: Validate the content type (MIME type)
+        content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
+        if content_type not in ALLOWED_MIME_TYPES:
+            error_message = (
+                f"❌ **Error:** The file type is not allowed.\n"
+                f" - **Type detected:** `{content_type}`\n"
+                f" - **Allowed types:** Audio and Video only."
+            )
+            return None, None, gr.Markdown(error_message)
+
+        # Step 4: Validate the file size
+        content_length = response.headers.get('Content-Length')
+        if content_length and int(content_length) > MAX_FILE_SIZE:
+            error_message = (
+                f"❌ **Error:** The file is too large.\n"
+                f" - **File size:** {int(content_length) / 1024 / 1024:.2f} MB\n"
+                f" - **Maximum allowed size:** {MAX_FILE_SIZE / 1024 / 1024:.2f} MB"
+            )
+            return None, None, gr.Markdown(error_message)
+
+        # Step 5: Secure streaming download
+        with requests.get(url, stream=True, timeout=20) as r:
+            r.raise_for_status()
+
+            # Extract the file name from the URL
+            parsed_url = urlparse(url)
+            filename = os.path.basename(parsed_url.path)
+            if not filename:  # If the URL ends with a '/'
+                filename = "downloaded_media_file"
+
+            filepath = os.path.join(DOWNLOAD_DIR, filename)
+
+            # --- Step 6: Download the audio ---
+            # Write the file in chunks to avoid overloading memory
+            with open(filepath, 'wb') as f:
+                downloaded_size = 0
+                for chunk in r.iter_content(chunk_size=8192):
+                    downloaded_size += len(chunk)
+                    if downloaded_size > MAX_FILE_SIZE:
+                        os.remove(filepath)  # Remove the partial file
+                        return None, None, gr.Markdown("❌ **Error:** The file exceeds the maximum allowed size during download.")
+                    f.write(chunk)
+
+        # --- Step 7: Convert to WAV using pydub ---
+        audio_file = AudioSegment.from_file(filepath)
+        audio_file.export("audio_file.wav", format="wav")
+
+        # --- Step 8: Clean up the download directory ---
+        try:
+            files = glob.glob(os.path.join(DOWNLOAD_DIR, "*"))
+            for f in files:
+                os.remove(f)
+        except OSError:
+            pass
+
+        success_message = "✅ **Success!** File downloaded and saved."
+
+        # Return the file path and a success message.
+        return "audio_file.wav", "audio_file.wav", gr.Markdown(success_message)
+
+    except requests.exceptions.RequestException as e:
+        # Handle network errors (timeout, DNS, connection refused, etc.)
+        return None, None, gr.Markdown(f"❌ **Network error:** Unable to reach URL. Details: {e}")
+    except Exception as e:
+        # Handle other potential errors
+        return None, None, gr.Markdown(f"❌ **Unexpected error:** {e}")
+###
+
+def secure_download_youtube_audio(url: str):
+    """
+    Returns the path of the downloaded file or an error message.
+    """
+    # --- Step 1: Validate URL format with a regex ---
+    youtube_regex = re.compile(
+        r'^(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/'
+        r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})')
+    if not youtube_regex.match(url):
+        return None, None, gr.Markdown(f"❌ **Error:** The URL '{url}' does not appear to be a valid YouTube URL.")
+
+    try:
+        # --- Step 2: Check video availability ---
+        ydl_info_opts = {'quiet': True, 'skip_download': True}
+        try:
+            with yt_dlp.YoutubeDL(ydl_info_opts) as ydl:
+                info = ydl.extract_info(url, download=False)
+        except yt_dlp.utils.DownloadError as e:
+            return None, None, gr.Markdown(f"❌ **Error:** The video at URL '{url}' is unavailable ({str(e)})")
+
+        # --- Step 3: Select the best audio format ---
+        formats = [f for f in info['formats'] if f.get('acodec') != 'none']
+        if not formats:
+            return None, None, gr.Markdown("❌ **Error:** No audio stream was found for this video.")
+
+        formats.sort(key=lambda f: f.get('abr') or 0, reverse=True)
+        best_audio_format = formats[0]
+
+        # --- Step 4: Check the file size BEFORE downloading ---
+        filesize = best_audio_format.get('filesize') or best_audio_format.get('filesize_approx')
+        if filesize is None:
+            print("Could not determine file size before downloading.")
+            filesize = 1  # permissive fallback: let the download proceed
+
+        if filesize > MAX_FILE_SIZE:
+            return None, None, gr.Markdown(
+                f"❌ **Error:** The file is too large.\n"
+                f" - **File size:** {filesize / 1024 / 1024:.2f} MB\n"
+                f" - **Maximum allowed size:** {MAX_FILE_SIZE / 1024 / 1024:.2f} MB"
+            )
+
+        # --- Step 5: Download & convert directly to WAV ---
+        ydl_opts = {
+            'quiet': True,
+            'format': best_audio_format['format_id'],
+            'outtmpl': "audio_file",  # extension is added by the ffmpeg postprocessor
+            'postprocessors': [{
+                'key': 'FFmpegExtractAudio',
+                'preferredcodec': 'wav',
+                'preferredquality': '192',
+            }],
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+
+        success_message = "✅ **Success!** Audio extracted and saved."
+        return "audio_file.wav", "audio_file.wav", gr.Markdown(success_message)
+
+    except FileNotFoundError:
+        return None, None, gr.Markdown("❌ **Error:** FFmpeg not found. Please ensure it is installed and in your system's PATH.")
+    except Exception as e:
+        return None, None, gr.Markdown(f"❌ **Error:** An unexpected error occurred: {e}")
+###
+
+def voice_extract_demucs():
+    """
+    Runs Demucs on audio_file.wav and returns the path of the extracted vocals.
+    """
+    try:
+        cmd = [
+            "demucs",
+            "--two-stems=vocals",
+            "--out", "demucs",
+            "audio_file.wav"
+        ]
+        subprocess.run(cmd, check=True)
+        voice_path = os.path.join("demucs", "htdemucs", "audio_file", "vocals.wav")
+        success_message = "✅ **Success!** Voice extracted."
+        return voice_path, voice_path, gr.Markdown(success_message)
+    except Exception as e:
+        return None, None, gr.Markdown(f"❌ **Error:** An unexpected error occurred: {e}")
+
+### Initializations
+
+MAX_TOKENS = 32000
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"*** Device: {device}")
+model_name = 'mistralai/Voxtral-Mini-3B-2507'
+
+processor = AutoProcessor.from_pretrained(model_name)
+model = VoxtralForConditionalGeneration.from_pretrained(model_name,
+                                                        torch_dtype=torch.bfloat16,
+                                                        device_map=device)
+# Supported languages
+dict_languages = {"English": "en",
+                  "French": "fr",
+                  "German": "de",
+                  "Spanish": "es",
+                  "Italian": "it",
+                  "Portuguese": "pt",
+                  "Dutch": "nl",
+                  "Hindi": "hi"}
+
+# Whitelist of allowed MIME types for audio and video
+ALLOWED_MIME_TYPES = {
+    # Audio
+    'audio/mpeg', 'audio/wav', 'audio/wave', 'audio/x-wav', 'audio/x-pn-wav',
+    'audio/ogg', 'audio/vorbis', 'audio/aac', 'audio/mp4', 'audio/flac',
+    'audio/x-flac', 'audio/opus', 'audio/webm',
+    # Video
+    'video/mp4', 'video/mpeg', 'video/ogg', 'video/webm', 'video/quicktime',
+    'video/x-msvideo', 'video/x-matroska'
+}
+
+# Maximum allowed file size (in bytes), e.g. 1 GB
+MAX_FILE_SIZE = 1 * 1024 * 1024 * 1024  # 1 GB
+
+# Directory where the files will be saved
+DOWNLOAD_DIR = "downloaded_files"
+if not os.path.exists(DOWNLOAD_DIR):
+    os.makedirs(DOWNLOAD_DIR)
+
+
+#### Gradio interface
+with gr.Blocks(title="Voxtral") as voxtral:
+    gr.Markdown("# **Voxtral Mini Evaluation**")
+    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
+capabilities while retaining best-in-class text performance.
+#### It excels at speech transcription, translation and audio understanding.""")
+
+    with gr.Accordion("🔎 More on Voxtral", open=False):
+        gr.Markdown("""## **Key Features:**
+
+#### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
+##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
+##### - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
+##### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
+##### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
+##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
+##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
+
+
+    gr.Markdown("### **1. Choose the audio:**")
+
+    with gr.Row():
+        with gr.Tabs():
+            with gr.Tab("From recording or file upload"):
+                gr.Markdown("### **Upload an audio file, record via microphone, or select a demo file:**")
+                gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
+                sel_audio = gr.Audio(sources=["upload", "microphone"], type="filepath",
+                                     label="Set an audio file to process it:")
+                example = [["mapo_tofu.mp3"]]
+                gr.Examples(
+                    examples=example,
+                    inputs=sel_audio,
+                    outputs=None,
+                    fn=None,
+                    cache_examples=False,
+                    run_on_click=False
+                )
+                status_output = gr.Markdown()
+                voice_button = gr.Button("Extract voice (if noisy environment)")
+                voice_button.click(
+                    fn=voice_extract_demucs,
+                    outputs=[sel_audio, sel_audio, status_output])
+
+            with gr.Tab("From file URL (audio or video file)"):
+                gr.Markdown("### **Enter the URL of the file (mp3, wav, mp4, ...):**")
+                sel_audio = gr.State()
+                url_input = gr.Textbox(label="URL (MP3 or MP4 file)",
+                                       placeholder="https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4")
+                download_button = gr.Button("Check and upload", variant="primary")
+                input_audio = gr.Audio()
+                status_output = gr.Markdown()
+                download_button.click(
+                    fn=secure_download_from_url,
+                    inputs=url_input,
+                    outputs=[input_audio, sel_audio, status_output]
+                )
+                voice_button = gr.Button("Extract voice (if noisy environment)")
+                voice_button.click(
+                    fn=voice_extract_demucs,
+                    outputs=[input_audio, sel_audio, status_output])
+
+            with gr.Tab("From YouTube URL"):
+                gr.Markdown("### **Enter the URL of the YouTube video:**")
+                sel_audio = gr.State()
+                url_input = gr.Textbox(label="YouTube URL",
+                                       placeholder="https://www.youtube.com/...")
+                download_button = gr.Button("Check and upload", variant="primary")
+                input_audio = gr.Audio()
+                status_output = gr.Markdown()
+                download_button.click(
+                    fn=secure_download_youtube_audio,
+                    inputs=url_input,
+                    outputs=[input_audio, sel_audio, status_output]
+                )
+                voice_button = gr.Button("Extract voice (if noisy environment)")
+                voice_button.click(
+                    fn=voice_extract_demucs,
+                    outputs=[input_audio, sel_audio, status_output])
+
+    with gr.Row():
+        gr.Markdown("### **2. Choose one of these tasks:**")
+
+    with gr.Row():
+        with gr.Column():
+            with gr.Accordion("📝 Transcription", open=True):
+                sel_language = gr.Dropdown(
+                    choices=list(dict_languages.keys()),
+                    value="English",
+                    label="Select the language of the audio file:"
+                )
+                submit_transcript = gr.Button("Extract transcription", variant="primary")
+                text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
+
+        with gr.Column():
+            with gr.Accordion("🔁 Translation", open=True):
+                sel_translate_language = gr.Dropdown(
+                    choices=list(dict_languages.keys()),
+                    value="English",
+                    label="Select the language for translation:"
+                )
+
+                submit_translate = gr.Button("Translate audio file", variant="primary")
+                text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
+
+        with gr.Column():
+            with gr.Accordion("🤖 Ask audio file", open=True):
+                question_chat = gr.Textbox(label="Enter your question about audio file:", placeholder="Enter your question about audio file")
+                submit_chat = gr.Button("Ask audio file", variant="primary")
+                text_chat = gr.Textbox(label="💬 Model answer", lines=10)
+
+    ### Processing
+
+    # Transcription
+    submit_transcript.click(
+        disable_buttons,
+        outputs=[submit_transcript, submit_translate, submit_chat],
+        trigger_mode="once",
+    ).then(
+        fn=process_transcript,
+        inputs=[sel_language, sel_audio],
+        outputs=text_transcript
+    ).then(
+        enable_buttons,
+        outputs=[submit_transcript, submit_translate, submit_chat],
+    )
+
+    # Translation
+    submit_translate.click(
+        disable_buttons,
+        outputs=[submit_transcript, submit_translate, submit_chat],
+        trigger_mode="once",
+    ).then(
+        fn=process_translate,
+        inputs=[sel_translate_language, sel_audio],
+        outputs=text_translate
+    ).then(
+        enable_buttons,
+        outputs=[submit_transcript, submit_translate, submit_chat],
+    )
+
+    # Chat
+    submit_chat.click(
+        disable_buttons,
+        outputs=[submit_transcript, submit_translate, submit_chat],
+        trigger_mode="once",
+    ).then(
+        fn=process_chat,
+        inputs=[question_chat, sel_audio],
+        outputs=text_chat
+    ).then(
+        enable_buttons,
+        outputs=[submit_transcript, submit_translate, submit_chat],
+    )
+
+### Launch the app
+
+if __name__ == "__main__":
+    voxtral.queue().launch()
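
For reference, the core transcription path that the new app.py relies on can be exercised outside Gradio with a minimal sketch like the one below. It mirrors the calls in process_transcript; "sample.wav" is a placeholder file, and apply_transcrition_request is the method name as spelled in the transformers Voxtral release:

    import torch
    from transformers import AutoProcessor, VoxtralForConditionalGeneration

    model_name = "mistralai/Voxtral-Mini-3B-2507"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    processor = AutoProcessor.from_pretrained(model_name)
    model = VoxtralForConditionalGeneration.from_pretrained(
        model_name, torch_dtype=torch.bfloat16, device_map=device
    )

    # Build a transcription request for an English audio file ("sample.wav" is a placeholder)
    inputs = processor.apply_transcrition_request(
        language="en", audio="sample.wav", model_id=model_name
    )
    inputs = inputs.to(device, dtype=torch.bfloat16)

    outputs = model.generate(**inputs, max_new_tokens=32000)
    # Decode only the newly generated tokens, skipping the audio/text prompt
    text = processor.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
    )
    print(text[0])
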
requirements.txt CHANGED
@@ -1,7 +1,11 @@
-mistral-common
-git+https://github.com/huggingface/transformers
-gradio
-torch
-accelerate
-librosa
-gradio_modal
+mistral-common
+git+https://github.com/huggingface/transformers
+gradio==5.39.0
+pydub
+requests
+torch
+accelerate
+librosa
+validators
+yt-dlp
+demucs
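
One caveat on these requirements: pydub and yt-dlp's FFmpegExtractAudio postprocessor both rely on an ffmpeg binary, which pip does not install from this list. A small startup check such as the sketch below (a hypothetical helper, not part of this commit) would let the app fail fast with a clear message:

    import shutil

    def check_ffmpeg() -> None:
        """Fail fast if the ffmpeg binary is missing from PATH."""
        if shutil.which("ffmpeg") is None:
            raise RuntimeError(
                "ffmpeg not found: install it system-wide "
                "(e.g. via the Space's packages.txt) before running the app."
            )

    check_ffmpeg()
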