Spaces:
Paused
Paused
remove audio debug zone
Browse files
app.py
CHANGED
|
@@ -135,34 +135,15 @@ def transcribe_audio(audio_input):
|
|
| 135 |
# ------------------- Main Processing Function -------------------
|
| 136 |
@spaces.GPU # Decorate to run on GPU when processing
|
| 137 |
def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
|
| 138 |
-
# Determine input
|
| 139 |
if mode == "Microphone" and mic_audio is not None:
|
| 140 |
-
chosen_audio = mic_audio
|
| 141 |
-
transcription = transcribe_audio(chosen_audio)
|
| 142 |
elif mode == "Text" and text_input:
|
| 143 |
transcription = text_input
|
| 144 |
-
chosen_audio = None
|
| 145 |
elif mode == "File" and file_audio is not None:
|
| 146 |
-
chosen_audio = file_audio
|
| 147 |
-
transcription = transcribe_audio(chosen_audio)
|
| 148 |
else:
|
| 149 |
-
return "請提供語音或文字輸入", "", None, None
|
| 150 |
-
|
| 151 |
-
# For debugging: prepare debug audio.
|
| 152 |
-
debug_audio = None
|
| 153 |
-
if chosen_audio is not None:
|
| 154 |
-
if isinstance(chosen_audio, str):
|
| 155 |
-
# For file input, read using soundfile to get raw audio.
|
| 156 |
-
audio_array, sample_rate = sf.read(chosen_audio)
|
| 157 |
-
if audio_array.ndim > 1:
|
| 158 |
-
audio_array = np.mean(audio_array, axis=-1)
|
| 159 |
-
debug_audio = (sample_rate, audio_array)
|
| 160 |
-
elif isinstance(chosen_audio, tuple):
|
| 161 |
-
audio_array = chosen_audio[1]
|
| 162 |
-
sample_rate = chosen_audio[0]
|
| 163 |
-
if audio_array.ndim > 1:
|
| 164 |
-
audio_array = np.mean(audio_array, axis=-1)
|
| 165 |
-
debug_audio = (sample_rate, audio_array)
|
| 166 |
|
| 167 |
# Classify the transcribed or provided text.
|
| 168 |
if available_models[model_choice] == "qwen":
|
|
@@ -172,7 +153,7 @@ def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
|
|
| 172 |
# Generate TTS message and corresponding audio.
|
| 173 |
tts_msg = get_tts_message(classification)
|
| 174 |
tts_audio = tts_audio_output(tts_msg)
|
| 175 |
-
return transcription, classification, tts_audio, debug_audio
|
| 176 |
|
| 177 |
# ------------------- Gradio Blocks Interface Setup -------------------
|
| 178 |
with gr.Blocks() as demo:
|
|
@@ -187,7 +168,7 @@ with gr.Blocks() as demo:
|
|
| 187 |
# Three input components: microphone, text, and file upload.
|
| 188 |
mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
|
| 189 |
text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
|
| 190 |
-
# For file input, use 'filepath' so Whisper pipeline
|
| 191 |
file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
|
| 192 |
|
| 193 |
# Initially, only the microphone input is visible.
|
|
@@ -201,7 +182,7 @@ with gr.Blocks() as demo:
|
|
| 201 |
elif selected_mode == "Text":
|
| 202 |
return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
| 203 |
else: # File
|
| 204 |
-
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
|
| 205 |
mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
|
| 206 |
|
| 207 |
with gr.Row():
|
|
@@ -217,12 +198,10 @@ with gr.Blocks() as demo:
|
|
| 217 |
classification_output = gr.Textbox(label="意圖判斷結果")
|
| 218 |
with gr.Row():
|
| 219 |
tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
|
| 220 |
-
with gr.Row():
|
| 221 |
-
debug_audio_output = gr.Audio(type="numpy", label="Debug: 傳送到 Whisper Pipeline 的音訊")
|
| 222 |
|
| 223 |
# Button event triggers the classification.
|
| 224 |
classify_btn.click(fn=classify_intent,
|
| 225 |
inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
|
| 226 |
-
outputs=[transcription_output, classification_output, tts_output, debug_audio_output])
|
| 227 |
|
| 228 |
demo.launch()
|
|
|
|
| 135 |
# ------------------- Main Processing Function -------------------
|
| 136 |
@spaces.GPU # Decorate to run on GPU when processing
|
| 137 |
def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
|
| 138 |
+
# Determine input based on selected mode.
|
| 139 |
if mode == "Microphone" and mic_audio is not None:
|
| 140 |
+
transcription = transcribe_audio(mic_audio)
|
|
|
|
| 141 |
elif mode == "Text" and text_input:
|
| 142 |
transcription = text_input
|
|
|
|
| 143 |
elif mode == "File" and file_audio is not None:
|
| 144 |
+
transcription = transcribe_audio(file_audio)
|
|
|
|
| 145 |
else:
|
| 146 |
+
return "請提供語音或文字輸入", "", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
# Classify the transcribed or provided text.
|
| 149 |
if available_models[model_choice] == "qwen":
|
|
|
|
| 153 |
# Generate TTS message and corresponding audio.
|
| 154 |
tts_msg = get_tts_message(classification)
|
| 155 |
tts_audio = tts_audio_output(tts_msg)
|
| 156 |
+
return transcription, classification, tts_audio
|
| 157 |
|
| 158 |
# ------------------- Gradio Blocks Interface Setup -------------------
|
| 159 |
with gr.Blocks() as demo:
|
|
|
|
| 168 |
# Three input components: microphone, text, and file upload.
|
| 169 |
mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
|
| 170 |
text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
|
| 171 |
+
# For file input, use 'filepath' so Whisper pipeline handles conversion.
|
| 172 |
file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
|
| 173 |
|
| 174 |
# Initially, only the microphone input is visible.
|
|
|
|
| 182 |
elif selected_mode == "Text":
|
| 183 |
return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
| 184 |
else: # File
|
| 185 |
+
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
|
| 186 |
mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
|
| 187 |
|
| 188 |
with gr.Row():
|
|
|
|
| 198 |
classification_output = gr.Textbox(label="意圖判斷結果")
|
| 199 |
with gr.Row():
|
| 200 |
tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
|
|
|
|
|
|
|
| 201 |
|
| 202 |
# Button event triggers the classification.
|
| 203 |
classify_btn.click(fn=classify_intent,
|
| 204 |
inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
|
| 205 |
+
outputs=[transcription_output, classification_output, tts_output])
|
| 206 |
|
| 207 |
demo.launch()
|