add debug on processed audio
app.py CHANGED
@@ -109,7 +109,7 @@ def tts_audio_output(message: str, voice: str = 'af_heart'):
         audio_chunks.append(audio)
     if audio_chunks:
         audio_concat = np.concatenate(audio_chunks)
-        # Return as tuple (sample_rate, numpy_array) for gr.Audio (
+        # Return as tuple (sample_rate, numpy_array) for gr.Audio (using 24000 Hz)
         return (24000, audio_concat)
     else:
         return None
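For context on the contract this hunk relies on: a `gr.Audio` output with `type="numpy"` plays back a `(sample_rate, np.ndarray)` tuple directly, which is why `tts_audio_output` concatenates its chunks and returns `(24000, audio_concat)`. A minimal self-contained sketch of that tuple contract (the tone generator is illustrative, not part of app.py):

```python
# Minimal sketch, not from app.py: returning (sample_rate, np.ndarray)
# to a gr.Audio(type="numpy") output, mirroring tts_audio_output above.
import numpy as np
import gradio as gr

def make_tone():
    sr = 24000  # same rate the app returns for its TTS chunks
    t = np.linspace(0, 1.0, sr, endpoint=False)
    # Two "chunks" concatenated, like the audio_chunks list in the app.
    chunks = [np.sin(2 * np.pi * 440 * t), np.sin(2 * np.pi * 660 * t)]
    return (sr, np.concatenate(chunks).astype(np.float32))

demo_tone = gr.Interface(fn=make_tone, inputs=None, outputs=gr.Audio(type="numpy"))
# demo_tone.launch()
```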
@@ -128,27 +128,43 @@ def transcribe_audio(audio_input):
     return result["text"]
 
 # ------------------- Main Processing Function -------------------
-@spaces.GPU  # Decorate
+@spaces.GPU  # Decorate to run on GPU when processing
 def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
-    #
+    # Choose the input and prepare debug audio
     if mode == "Microphone" and mic_audio is not None:
-        transcription = transcribe_audio(mic_audio)
+        chosen_audio = mic_audio
+        transcription = transcribe_audio(chosen_audio)
     elif mode == "Text" and text_input:
         transcription = text_input
+        chosen_audio = None
     elif mode == "File" and file_audio is not None:
-        transcription = transcribe_audio(file_audio)
+        chosen_audio = file_audio
+        transcription = transcribe_audio(chosen_audio)
     else:
-        return "請提供語音或文字輸入", "", None
+        return "請提供語音或文字輸入", "", None, None
+
+    # For debugging: process chosen_audio similarly to transcribe_audio to ensure mono.
+    debug_audio = None
+    if chosen_audio is not None:
+        if isinstance(chosen_audio, tuple):
+            audio_array = chosen_audio[1]
+            sample_rate = chosen_audio[0]
+        else:
+            audio_array = chosen_audio
+            sample_rate = 16000  # default fallback sample rate
+        if audio_array.ndim > 1:
+            audio_array = np.mean(audio_array, axis=-1)
+        debug_audio = (sample_rate, audio_array)
 
     # Classify the transcribed or provided text.
     if available_models[model_choice] == "qwen":
         classification = predict_with_qwen(transcription)
     else:
         classification = predict_intent(transcription, available_models[model_choice])
-    # Generate TTS message and audio.
+    # Generate TTS message and corresponding audio.
     tts_msg = get_tts_message(classification)
     tts_audio = tts_audio_output(tts_msg)
-    return transcription, classification, tts_audio
+    return transcription, classification, tts_audio, debug_audio
 
 # ------------------- Gradio Blocks Interface Setup -------------------
 with gr.Blocks() as demo:
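The new debug branch duplicates the mono/sample-rate normalization inline so the UI can show exactly what reaches the Whisper pipeline: unpack the `(sample_rate, array)` tuple that `gr.Audio(type="numpy")` produces, fall back to 16000 Hz for a bare array, and average channels down to mono. The same logic as a standalone helper, a sketch only (`normalize_audio` is a hypothetical name, not in app.py):

```python
# Hypothetical helper, not in app.py: the same mono normalization the new
# debug path performs inline, factored out so it can be reused or unit tested.
import numpy as np

def normalize_audio(audio, fallback_sr=16000):
    """Return (sample_rate, mono_array) from a gr.Audio(type="numpy") value."""
    if audio is None:
        return None
    if isinstance(audio, tuple):      # gr.Audio(type="numpy") yields (sr, array)
        sample_rate, audio_array = audio
    else:                             # bare array: assume a fallback sample rate
        sample_rate, audio_array = fallback_sr, audio
    if audio_array.ndim > 1:          # (samples, channels) -> mono by channel mean
        audio_array = np.mean(audio_array, axis=-1)
    return (sample_rate, audio_array)

# e.g. in classify_intent: debug_audio = normalize_audio(chosen_audio)
```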
@@ -160,16 +176,16 @@ with gr.Blocks() as demo:
     mode = gr.Radio(choices=["Microphone", "Text", "File"], label="選擇輸入模式", value="Microphone")
 
     with gr.Row():
-        # Three input components:
+        # Three input components: microphone, text, and file upload.
         mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
         file_audio = gr.Audio(sources=["upload"], type="numpy", label="上傳語音檔案")
 
-    # Initially, only the microphone input is visible
+    # Initially, only the microphone input is visible.
     text_input.visible = False
     file_audio.visible = False
 
-    # Change event for mode selection to toggle visibility
+    # Change event for mode selection to toggle visibility.
     def update_visibility(selected_mode):
         if selected_mode == "Microphone":
             return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
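This hunk shows `update_visibility` but not the event wiring, which sits outside the diff context. A self-contained sketch of the toggle pattern, assuming the usual `mode.change(...)` hookup (component names mirror app.py but the snippet is illustrative):

```python
# Standalone sketch of the visibility-toggle pattern used above.
import gradio as gr

with gr.Blocks() as toggle_demo:
    mode = gr.Radio(["Microphone", "Text", "File"], value="Microphone")
    mic_audio = gr.Audio(sources=["microphone"], visible=True)
    text_input = gr.Textbox(visible=False)
    file_audio = gr.Audio(sources=["upload"], visible=False)

    def update_visibility(selected_mode):
        # One gr.update per output component, in the same order as `outputs`.
        return (gr.update(visible=selected_mode == "Microphone"),
                gr.update(visible=selected_mode == "Text"),
                gr.update(visible=selected_mode == "File"))

    mode.change(update_visibility, inputs=mode,
                outputs=[mic_audio, text_input, file_audio])
```

Note the sketch passes `visible=False` to the constructors rather than mutating `.visible` afterwards, as app.py does; both set only the initial render state, and later changes must go through `gr.update`.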
@@ -192,10 +208,12 @@ with gr.Blocks() as demo:
     classification_output = gr.Textbox(label="意圖判斷結果")
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
+    with gr.Row():
+        debug_audio_output = gr.Audio(type="numpy", label="Debug: 傳送到 Whisper Pipeline 的音訊")
 
-    # Button event triggers the classification.
+    # Button event triggers the classification.
     classify_btn.click(fn=classify_intent,
                        inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
-                       outputs=[transcription_output, classification_output, tts_output])
+                       outputs=[transcription_output, classification_output, tts_output, debug_audio_output])
 
 demo.launch()
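One constraint ties this hunk to the earlier one: a Blocks click handler must return as many values as it has `outputs`, on every code path, which is why each `return` in `classify_intent`, including the early-exit branch, grew from three to four values. A minimal illustration of the rule (component names hypothetical):

```python
# Minimal sketch of the arity rule: four outputs require four return values
# on every path; returning None leaves that output empty.
import gradio as gr

with gr.Blocks() as arity_demo:
    btn = gr.Button("Run")
    text_a = gr.Textbox()
    text_b = gr.Textbox()
    audio_a = gr.Audio(type="numpy")
    audio_b = gr.Audio(type="numpy")
    btn.click(fn=lambda: ("transcription", "label", None, None),
              inputs=None, outputs=[text_a, text_b, audio_a, audio_b])
```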