add debug on processed audio
app.py CHANGED
@@ -109,7 +109,7 @@ def tts_audio_output(message: str, voice: str = 'af_heart'):
         audio_chunks.append(audio)
     if audio_chunks:
         audio_concat = np.concatenate(audio_chunks)
-        # Return as tuple (sample_rate, numpy_array) for gr.Audio (
+        # Return as tuple (sample_rate, numpy_array) for gr.Audio (using 24000 Hz)
         return (24000, audio_concat)
     else:
         return None
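For context on the contract this hunk relies on: a `gr.Audio` output with `type="numpy"` plays back a `(sample_rate, np.ndarray)` tuple directly, which is why `tts_audio_output` concatenates its chunks and returns `(24000, audio_concat)`. A minimal self-contained sketch of that tuple contract (the tone generator is illustrative, not part of app.py):

```python
# Minimal sketch, not from app.py: returning (sample_rate, np.ndarray)
# to a gr.Audio(type="numpy") output, mirroring tts_audio_output above.
import numpy as np
import gradio as gr

def make_tone():
    sr = 24000  # same rate the app returns for its TTS chunks
    t = np.linspace(0, 1.0, sr, endpoint=False)
    # Two "chunks" concatenated, like the audio_chunks list in the app.
    chunks = [np.sin(2 * np.pi * 440 * t), np.sin(2 * np.pi * 660 * t)]
    return (sr, np.concatenate(chunks).astype(np.float32))

demo_tone = gr.Interface(fn=make_tone, inputs=None, outputs=gr.Audio(type="numpy"))
# demo_tone.launch()
```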
@@ -128,27 +128,43 @@ def transcribe_audio(audio_input):
     return result["text"]
 
 # ------------------- Main Processing Function -------------------
-@spaces.GPU  # Decorate
+@spaces.GPU  # Decorate to run on GPU when processing
 def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
-    #
+    # Choose the input and prepare debug audio
     if mode == "Microphone" and mic_audio is not None:
-        transcription = transcribe_audio(mic_audio)
+        chosen_audio = mic_audio
+        transcription = transcribe_audio(chosen_audio)
     elif mode == "Text" and text_input:
         transcription = text_input
+        chosen_audio = None
     elif mode == "File" and file_audio is not None:
-        transcription = transcribe_audio(file_audio)
+        chosen_audio = file_audio
+        transcription = transcribe_audio(chosen_audio)
     else:
-        return "請提供語音或文字輸入", "", None
+        return "請提供語音或文字輸入", "", None, None
+
+    # For debugging: process chosen_audio similarly to transcribe_audio to ensure mono.
+    debug_audio = None
+    if chosen_audio is not None:
+        if isinstance(chosen_audio, tuple):
+            audio_array = chosen_audio[1]
+            sample_rate = chosen_audio[0]
+        else:
+            audio_array = chosen_audio
+            sample_rate = 16000  # default fallback sample rate
+        if audio_array.ndim > 1:
+            audio_array = np.mean(audio_array, axis=-1)
+        debug_audio = (sample_rate, audio_array)
 
     # Classify the transcribed or provided text.
     if available_models[model_choice] == "qwen":
         classification = predict_with_qwen(transcription)
     else:
         classification = predict_intent(transcription, available_models[model_choice])
-    # Generate TTS message and audio.
+    # Generate TTS message and corresponding audio.
     tts_msg = get_tts_message(classification)
     tts_audio = tts_audio_output(tts_msg)
-    return transcription, classification, tts_audio
+    return transcription, classification, tts_audio, debug_audio
 
 # ------------------- Gradio Blocks Interface Setup -------------------
 with gr.Blocks() as demo:
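The new debug branch duplicates the mono/sample-rate normalization inline so the UI can show exactly what reaches the Whisper pipeline: unpack the `(sample_rate, array)` tuple that `gr.Audio(type="numpy")` produces, fall back to 16000 Hz for a bare array, and average channels down to mono. The same logic as a standalone helper, a sketch only (`normalize_audio` is a hypothetical name, not in app.py):

```python
# Hypothetical helper, not in app.py: the same mono normalization the new
# debug path performs inline, factored out so it can be reused or unit tested.
import numpy as np

def normalize_audio(audio, fallback_sr=16000):
    """Return (sample_rate, mono_array) from a gr.Audio(type="numpy") value."""
    if audio is None:
        return None
    if isinstance(audio, tuple):      # gr.Audio(type="numpy") yields (sr, array)
        sample_rate, audio_array = audio
    else:                             # bare array: assume a fallback sample rate
        sample_rate, audio_array = fallback_sr, audio
    if audio_array.ndim > 1:          # (samples, channels) -> mono by channel mean
        audio_array = np.mean(audio_array, axis=-1)
    return (sample_rate, audio_array)

# e.g. in classify_intent: debug_audio = normalize_audio(chosen_audio)
```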
@@ -160,16 +176,16 @@ with gr.Blocks() as demo:
     mode = gr.Radio(choices=["Microphone", "Text", "File"], label="選擇輸入模式", value="Microphone")
 
     with gr.Row():
-        # Three input components:
+        # Three input components: microphone, text, and file upload.
         mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
         file_audio = gr.Audio(sources=["upload"], type="numpy", label="上傳語音檔案")
 
-    # Initially, only the microphone input is visible
+    # Initially, only the microphone input is visible.
     text_input.visible = False
     file_audio.visible = False
 
-    # Change event for mode selection to toggle visibility
+    # Change event for mode selection to toggle visibility.
     def update_visibility(selected_mode):
         if selected_mode == "Microphone":
             return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
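This hunk shows `update_visibility` but not the event wiring, which sits outside the diff context. A self-contained sketch of the toggle pattern, assuming the usual `mode.change(...)` hookup (component names mirror app.py but the snippet is illustrative):

```python
# Standalone sketch of the visibility-toggle pattern used above.
import gradio as gr

with gr.Blocks() as toggle_demo:
    mode = gr.Radio(["Microphone", "Text", "File"], value="Microphone")
    mic_audio = gr.Audio(sources=["microphone"], visible=True)
    text_input = gr.Textbox(visible=False)
    file_audio = gr.Audio(sources=["upload"], visible=False)

    def update_visibility(selected_mode):
        # One gr.update per output component, in the same order as `outputs`.
        return (gr.update(visible=selected_mode == "Microphone"),
                gr.update(visible=selected_mode == "Text"),
                gr.update(visible=selected_mode == "File"))

    mode.change(update_visibility, inputs=mode,
                outputs=[mic_audio, text_input, file_audio])
```

Note the sketch passes `visible=False` to the constructors rather than mutating `.visible` afterwards, as app.py does; both set only the initial render state, and later changes must go through `gr.update`.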
@@ -192,10 +208,12 @@ with gr.Blocks() as demo:
     classification_output = gr.Textbox(label="意圖判斷結果")
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
+    with gr.Row():
+        debug_audio_output = gr.Audio(type="numpy", label="Debug: 傳送到 Whisper Pipeline 的音訊")
 
-    # Button event triggers the classification.
+    # Button event triggers the classification.
     classify_btn.click(fn=classify_intent,
                        inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
-                       outputs=[transcription_output, classification_output, tts_output])
+                       outputs=[transcription_output, classification_output, tts_output, debug_audio_output])
 
 demo.launch()
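One constraint ties this hunk to the earlier one: a Blocks click handler must return as many values as it has `outputs`, on every code path, which is why each `return` in `classify_intent`, including the early-exit branch, grew from three to four values. A minimal illustration of the rule (component names hypothetical):

```python
# Minimal sketch of the arity rule: four outputs require four return values
# on every path; returning None leaves that output empty.
import gradio as gr

with gr.Blocks() as arity_demo:
    btn = gr.Button("Run")
    text_a = gr.Textbox()
    text_b = gr.Textbox()
    audio_a = gr.Audio(type="numpy")
    audio_b = gr.Audio(type="numpy")
    btn.click(fn=lambda: ("transcription", "label", None, None),
              inputs=None, outputs=[text_a, text_b, audio_a, audio_b])
```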