show TTS into Expression tab

app.py CHANGED
@@ -466,10 +466,11 @@ def audionar_tts(text=None,
     }
 
     if text and text.strip():
-
-
-
-
+
+        if lang not in language_names:
+
+            speech_audio = _styletts2(text=text,  # Eng.
+                                      ref_s='wav/' + lang + '.wav')
 
     else:  # VITS
 
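For orientation, a hedged sketch of the dispatch this hunk introduces: the single dropdown value is treated as a StyleTTS2 reference voice unless it names a known language, in which case the VITS branch runs. _styletts2 and language_names are the names used in the diff; vits_tts is a hypothetical stand-in for the VITS branch.

    # A sketch, not app.py itself; vits_tts is hypothetical.
    def synthesize(text, choice, language_names):
        if choice not in language_names:
            # the choice is a voice name: StyleTTS2 conditioned on a reference wav
            return _styletts2(text=text, ref_s='wav/' + choice + '.wav')
        # the choice is a language name: multilingual VITS
        return vits_tts(text, lang=choice)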
@@ -555,7 +556,7 @@ def audionar_tts(text=None,
     wavfile = '_vits_.wav'
     audiofile.write(wavfile, final_audio, 16000)
 
-    return wavfile
+    return wavfile, wavfile  # the file twice: audio output plus state for the emotion-recognition tab
 
 
 # -- EXPRESSIO
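Why return the file twice: a Gradio click handler with two outputs can feed an Audio player and a State in a single call. A minimal self-contained sketch under that assumption (component names here are illustrative, not from app.py):

    import gradio as gr

    def tts_stub(text):
        wavfile = '_vits_.wav'   # placeholder; the real code writes audio here first
        return wavfile, wavfile  # one copy for the player, one kept as state

    with gr.Blocks() as demo:
        txt = gr.Textbox()
        player = gr.Audio(type="filepath")
        last_file = gr.State()
        gr.Button("Generate").click(tts_stub, inputs=txt, outputs=[player, last_file])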
@@ -1338,10 +1339,6 @@ def _stylett2(text='Hallov worlds Far over the',
 
     return speech_audio
 
-def update_selected_voice(voice_filename):
-    return 'wav/' + voice_filename + '.wav'
-
-
 description = (
     "Estimate **age**, **gender**, and **expression** "
     "of the speaker contained in an audio file or microphone recording. \n"
@@ -1353,8 +1350,21 @@ description = (
     "recognises the expression dimensions arousal, dominance, and valence. "
 )
 
+# =============
 
 with gr.Blocks(theme='huggingface') as demo:
+    # This state holds the generated TTS file path
+    tts_file = gr.State(value=None)
+    # This state holds the list of examples, including the generated one
+    audio_examples_state = gr.State(
+        value=[
+            ["wav/female-46-neutral.wav"],
+            ["wav/female-20-happy.wav"],
+            ["wav/male-60-angry.wav"],
+            ["wav/male-27-sad.wav"],
+        ]
+    )
+
     with gr.Tab(label="TTS"):
         with gr.Row():
             text_input = gr.Textbox(
@@ -1363,13 +1373,10 @@ with gr.Blocks(theme='huggingface') as demo:
                 lines=4,
                 value="Farover the misty mountains cold too dungeons deep and caverns old.",
             )
-            # Unified dropdown for both voices and languages
-            # You'll need to handle the logic to determine if it's a voice or a language
-            # based on the selection. A single list of choices is used here.
             choice_dropdown = gr.Dropdown(
                 choices=language_names + VOICES,
                 label="Select Voice or Language",
-                value=VOICES[0]
+                value=VOICES[0]
             )
             soundscape_input = gr.Textbox(
                 lines=1,
@@ -1384,31 +1391,45 @@ with gr.Blocks(theme='huggingface') as demo:
 
         output_audio = gr.Audio(label="TTS Output")
 
+        def generate_and_update_state(text, choice, soundscape, kv, current_examples):
+            # Call the TTS; audionar_tts now returns the path twice, so keep one copy
+            audio_path, _ = audionar_tts(text, choice, soundscape, kv)
+
+            # Append the new audio path to the existing list of examples
+            updated_examples = current_examples + [[audio_path]]
+
+            # Return the generated audio path for the output and the updated list for the state
+            return audio_path, updated_examples
+
         generate_button.click(
-            fn=
-            inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
-            outputs=output_audio
+            fn=generate_and_update_state,
+            inputs=[text_input, choice_dropdown, soundscape_input, kv_input, audio_examples_state],
+            outputs=[output_audio, audio_examples_state]
         )
+
     with gr.Tab(label="Speech Analysis"):
         with gr.Row():
             with gr.Column():
                 gr.Markdown(description)
-                input = gr.Audio(
+                input_audio_analysis = gr.Audio(
                     sources=["upload", "microphone"],
                     type="filepath",
                     label="Audio input",
-                    min_length=0.025,
+                    min_length=0.025,
                 )
-
-
-
-
-                        "wav/
-                        "wav/
+
+                # The gr.Examples component that will be dynamically updated
+                audio_examples = gr.Examples(
+                    examples=[
+                        ["wav/female-46-neutral.wav"],
+                        ["wav/female-20-happy.wav"],
+                        ["wav/male-60-angry.wav"],
+                        ["wav/male-27-sad.wav"],
                     ],
-                    [
+                    inputs=[input_audio_analysis],
                     label="Examples from CREMA-D, ODbL v1.0 license",
                 )
+
                 gr.Markdown("Only the first two seconds of the audio will be processed.")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
@@ -1417,6 +1438,19 @@ with gr.Blocks(theme='huggingface') as demo:
                 output_expression = gr.Image(label="Expression")
 
         outputs = [output_age, output_gender, output_expression]
-        submit_btn.click(recognize, input, outputs)
 
-        demo.launch(debug=True)
+        # Update the examples from the state; gr.Examples has no update() method,
+        # so refresh the gr.Dataset it builds internally
+        def load_examples_from_state(examples_list):
+            return gr.Dataset(samples=examples_list)
+
+        # demo.load fires on page load, so newly generated examples appear after a reload
+        demo.load(
+            fn=load_examples_from_state,
+            inputs=audio_examples_state,
+            outputs=audio_examples.dataset,
+            queue=False,
+        )
+
+        submit_btn.click(recognize, input_audio_analysis, outputs)
+
+        demo.launch(debug=True)
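The examples refresh is the subtle part of this commit: gr.Examples is a helper class rather than a component, so it cannot be wired as an event output or updated via an update() method. A minimal sketch of the workaround, assuming Gradio 4.x, where returning a gr.Dataset re-renders the Dataset that gr.Examples creates internally (paths and names here are illustrative):

    import gradio as gr

    with gr.Blocks() as demo:
        audio_in = gr.Audio(type="filepath")
        samples_state = gr.State([["wav/female-46-neutral.wav"]])
        examples = gr.Examples(examples=[["wav/female-46-neutral.wav"]],
                               inputs=[audio_in])

        def refresh(samples):
            # returning a gr.Dataset instance updates the rendered example rows
            return gr.Dataset(samples=samples)

        # fires on page load; samples appended to the state show up after a reload
        demo.load(refresh, inputs=samples_state, outputs=examples.dataset)

    demo.launch()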