Spaces:

Luigi
/

dinercall-intent-demo

Paused

App Files Files Community

Luigi committed on Apr 12

Commit

1bdb3dd

1 Parent(s): 6f6f431

add audio file input

Browse files

Files changed (1) hide show

app.py +23 -18

app.py CHANGED Viewed

@@ -129,12 +129,14 @@ def transcribe_audio(audio_input):
 # ------------------- Main Processing Function -------------------
 @spaces.GPU  # Decorate with ZeroGPU to run on GPU when processing
-def classify_intent(mode, audio_input, text_input, model_choice):
-    # Determine input based on explicit mode.
-    if mode == "Microphone" and audio_input is not None:
-        transcription = transcribe_audio(audio_input)
     elif mode == "Text" and text_input:
         transcription = text_input
     else:
         return "請提供語音或文字輸入", "", None
@@ -151,28 +153,31 @@ def classify_intent(mode, audio_input, text_input, model_choice):
 # ------------------- Gradio Blocks Interface Setup -------------------
 with gr.Blocks() as demo:
     gr.Markdown("## 🍽️ 餐廳訂位意圖識別")
-    gr.Markdown("錄音或輸入文字，自動判斷是否具有訂位意圖。")
     with gr.Row():
-        # Input Mode Selector
-        mode = gr.Radio(choices=["Microphone", "Text"], label="選擇輸入模式", value="Microphone")
     with gr.Row():
-        # Audio and Text inputs – only one will be visible based on mode selection.
-        # Use gr.Audio type "numpy" for in-memory capture.
-        audio_input = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
-    # Initially, only the microphone input is visible.
     text_input.visible = False
-    # Change event for mode selection to toggle visibility.
     def update_visibility(selected_mode):
         if selected_mode == "Microphone":
-            return gr.update(visible=True), gr.update(visible=False)
-        else:
-            return gr.update(visible=False), gr.update(visible=True)
-    mode.change(fn=update_visibility, inputs=mode, outputs=[audio_input, text_input])
     with gr.Row():
         model_dropdown = gr.Dropdown(choices=list(available_models.keys()),
@@ -188,9 +193,9 @@ with gr.Blocks() as demo:
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
-    # Button event triggers the classification.
     classify_btn.click(fn=classify_intent,
-                       inputs=[mode, audio_input, text_input, model_dropdown],
                        outputs=[transcription_output, classification_output, tts_output])
 demo.launch()

 # ------------------- Main Processing Function -------------------
 @spaces.GPU  # Decorate with ZeroGPU to run on GPU when processing
+def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
+    # Determine input based on mode.
+    if mode == "Microphone" and mic_audio is not None:
+        transcription = transcribe_audio(mic_audio)
     elif mode == "Text" and text_input:
         transcription = text_input
+    elif mode == "File" and file_audio is not None:
+        transcription = transcribe_audio(file_audio)
     else:
         return "請提供語音或文字輸入", "", None
 # ------------------- Gradio Blocks Interface Setup -------------------
 with gr.Blocks() as demo:
     gr.Markdown("## 🍽️ 餐廳訂位意圖識別")
+    gr.Markdown("錄音、上傳語音檔案或輸入文字，自動判斷是否具有訂位意圖。")
     with gr.Row():
+        # Input Mode Selector with three options.
+        mode = gr.Radio(choices=["Microphone", "Text", "File"], label="選擇輸入模式", value="Microphone")
     with gr.Row():
+        # Three input components: one for microphone, one for file upload, and one for text.
+        mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
+        file_audio = gr.Audio(sources=["upload"], type="numpy", label="上傳語音檔案")
+    # Initially, only the microphone input is visible; hide text and file inputs.
     text_input.visible = False
+    file_audio.visible = False
+    # Change event for mode selection to toggle visibility of the three inputs.
     def update_visibility(selected_mode):
         if selected_mode == "Microphone":
+            return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+        elif selected_mode == "Text":
+            return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
+        else:  # File
+            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
+    mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
     with gr.Row():
         model_dropdown = gr.Dropdown(choices=list(available_models.keys()),
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
+    # Button event triggers the classification. Note that we now pass five inputs.
     classify_btn.click(fn=classify_intent,
+                       inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
                        outputs=[transcription_output, classification_output, tts_output])
 demo.launch()