show TTS into Expression tab

app.py CHANGED
@@ -466,10 +466,11 @@ def audionar_tts(text=None,
     }
 
     if text and text.strip():
-
-
-
-
+
+        if lang not in language_names:
+
+            speech_audio = _styletts2(text=text,  # Eng.
+                                      ref_s='wav/' + lang + '.wav')
 
     else:  # VITS
 
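For orientation, a hedged sketch of the dispatch this hunk introduces: the single dropdown value is treated as a StyleTTS2 reference voice unless it names a known language, in which case the VITS branch runs. _styletts2 and language_names are the names used in the diff; vits_tts is a hypothetical stand-in for the VITS branch.

    # A sketch, not app.py itself; vits_tts is hypothetical.
    def synthesize(text, choice, language_names):
        if choice not in language_names:
            # the choice is a voice name: StyleTTS2 conditioned on a reference wav
            return _styletts2(text=text, ref_s='wav/' + choice + '.wav')
        # the choice is a language name: multilingual VITS
        return vits_tts(text, lang=choice)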
@@ -555,7 +556,7 @@ def audionar_tts(text=None,
     wavfile = '_vits_.wav'
     audiofile.write(wavfile, final_audio, 16000)
 
-    return wavfile
+    return wavfile, wavfile  # the file twice: audio output plus state for the emotion-recognition tab
 
 
 # -- EXPRESSIO
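Why return the file twice: a Gradio click handler with two outputs can feed an Audio player and a State in a single call. A minimal self-contained sketch under that assumption (component names here are illustrative, not from app.py):

    import gradio as gr

    def tts_stub(text):
        wavfile = '_vits_.wav'   # placeholder; the real code writes audio here first
        return wavfile, wavfile  # one copy for the player, one kept as state

    with gr.Blocks() as demo:
        txt = gr.Textbox()
        player = gr.Audio(type="filepath")
        last_file = gr.State()
        gr.Button("Generate").click(tts_stub, inputs=txt, outputs=[player, last_file])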
@@ -1338,10 +1339,6 @@ def _stylett2(text='Hallov worlds Far over the',
 
     return speech_audio
 
-def update_selected_voice(voice_filename):
-    return 'wav/' + voice_filename + '.wav'
-
-
 description = (
     "Estimate **age**, **gender**, and **expression** "
     "of the speaker contained in an audio file or microphone recording. \n"
@@ -1353,8 +1350,21 @@ description = (
     "recognises the expression dimensions arousal, dominance, and valence. "
 )
 
+# =============
 
 with gr.Blocks(theme='huggingface') as demo:
+    # This state holds the generated TTS file path
+    tts_file = gr.State(value=None)
+    # This state holds the list of examples, including the generated one
+    audio_examples_state = gr.State(
+        value=[
+            ["wav/female-46-neutral.wav"],
+            ["wav/female-20-happy.wav"],
+            ["wav/male-60-angry.wav"],
+            ["wav/male-27-sad.wav"],
+        ]
+    )
+
     with gr.Tab(label="TTS"):
         with gr.Row():
             text_input = gr.Textbox(
@@ -1363,13 +1373,10 @@ with gr.Blocks(theme='huggingface') as demo:
                 lines=4,
                 value="Farover the misty mountains cold too dungeons deep and caverns old.",
             )
-            # Unified dropdown for both voices and languages
-            # You'll need to handle the logic to determine if it's a voice or a language
-            # based on the selection. A single list of choices is used here.
             choice_dropdown = gr.Dropdown(
                 choices=language_names + VOICES,
                 label="Select Voice or Language",
-                value=VOICES[0]
+                value=VOICES[0]
             )
             soundscape_input = gr.Textbox(
                 lines=1,
@@ -1384,31 +1391,45 @@ with gr.Blocks(theme='huggingface') as demo:
 
         output_audio = gr.Audio(label="TTS Output")
 
+        def generate_and_update_state(text, choice, soundscape, kv, current_examples):
+            # Call the TTS; audionar_tts now returns the path twice, so keep one copy
+            audio_path, _ = audionar_tts(text, choice, soundscape, kv)
+
+            # Append the new audio path to the existing list of examples
+            updated_examples = current_examples + [[audio_path]]
+
+            # Return the generated audio path for the output and the updated list for the state
+            return audio_path, updated_examples
+
         generate_button.click(
-            fn=
-            inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
-            outputs=output_audio
+            fn=generate_and_update_state,
+            inputs=[text_input, choice_dropdown, soundscape_input, kv_input, audio_examples_state],
+            outputs=[output_audio, audio_examples_state]
         )
+
     with gr.Tab(label="Speech Analysis"):
         with gr.Row():
             with gr.Column():
                 gr.Markdown(description)
-                input = gr.Audio(
+                input_audio_analysis = gr.Audio(
                     sources=["upload", "microphone"],
                     type="filepath",
                     label="Audio input",
-                    min_length=0.025,
+                    min_length=0.025,
                 )
-
-
-
-
-                        "wav/
-                        "wav/
+
+                # The gr.Examples component that will be dynamically updated
+                audio_examples = gr.Examples(
+                    examples=[
+                        ["wav/female-46-neutral.wav"],
+                        ["wav/female-20-happy.wav"],
+                        ["wav/male-60-angry.wav"],
+                        ["wav/male-27-sad.wav"],
                     ],
-                    [
+                    inputs=[input_audio_analysis],
                     label="Examples from CREMA-D, ODbL v1.0 license",
                 )
+
                 gr.Markdown("Only the first two seconds of the audio will be processed.")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
@@ -1417,6 +1438,19 @@ with gr.Blocks(theme='huggingface') as demo:
                 output_expression = gr.Image(label="Expression")
 
         outputs = [output_age, output_gender, output_expression]
-        submit_btn.click(recognize, input, outputs)
 
-        demo.launch(debug=True)
+        # Update the examples from the state; gr.Examples has no update() method,
+        # so refresh the gr.Dataset it builds internally
+        def load_examples_from_state(examples_list):
+            return gr.Dataset(samples=examples_list)
+
+        # demo.load fires on page load, so newly generated examples appear after a reload
+        demo.load(
+            fn=load_examples_from_state,
+            inputs=audio_examples_state,
+            outputs=audio_examples.dataset,
+            queue=False,
+        )
+
+        submit_btn.click(recognize, input_audio_analysis, outputs)
+
+        demo.launch(debug=True)
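The examples refresh is the subtle part of this commit: gr.Examples is a helper class rather than a component, so it cannot be wired as an event output or updated via an update() method. A minimal sketch of the workaround, assuming Gradio 4.x, where returning a gr.Dataset re-renders the Dataset that gr.Examples creates internally (paths and names here are illustrative):

    import gradio as gr

    with gr.Blocks() as demo:
        audio_in = gr.Audio(type="filepath")
        samples_state = gr.State([["wav/female-46-neutral.wav"]])
        examples = gr.Examples(examples=[["wav/female-46-neutral.wav"]],
                               inputs=[audio_in])

        def refresh(samples):
            # returning a gr.Dataset instance updates the rendered example rows
            return gr.Dataset(samples=samples)

        # fires on page load; samples appended to the state show up after a reload
        demo.load(refresh, inputs=samples_state, outputs=examples.dataset)

    demo.launch()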