add audio preprocessing to make sure ASR input is single-channel audio
app.py CHANGED
@@ -121,6 +121,9 @@ def transcribe_audio(audio_input):
         audio_array = audio_input[1]
     else:
         audio_array = audio_input
+    # Ensure input is mono by averaging channels if necessary.
+    if audio_array.ndim > 1:
+        audio_array = np.mean(audio_array, axis=-1)
     result = whisper_pipe(audio_array)
     return result["text"]
 
@@ -156,8 +159,7 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         # Audio and Text inputs – only one will be visible based on mode selection.
-        #
-        # which helps release the recording resource faster.
+        # Use gr.Audio type "numpy" for in-memory capture.
         audio_input = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
 
@@ -186,7 +188,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
 
-    # Button event triggers the classification.
+    # Button event triggers the classification.
     classify_btn.click(fn=classify_intent,
                        inputs=[mode, audio_input, text_input, model_dropdown],
                        outputs=[transcription_output, classification_output, tts_output])
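For reference, here is a self-contained sketch of the preprocessing path this commit adds, assuming `whisper_pipe` is a Hugging Face `transformers` ASR pipeline and `np` is NumPy. The model name and the int16 normalisation step are illustrative assumptions and not part of the commit.

```python
import numpy as np
from transformers import pipeline

# Illustrative checkpoint; the Space's actual model may differ.
whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")

def transcribe_audio(audio_input):
    # gr.Audio(type="numpy") returns a (sample_rate, samples) tuple for recordings.
    if isinstance(audio_input, tuple):
        audio_array = audio_input[1]
    else:
        audio_array = audio_input
    # Stereo recordings arrive as (samples, channels); average the channel axis
    # so Whisper receives single-channel audio, as in the commit.
    if audio_array.ndim > 1:
        audio_array = np.mean(audio_array, axis=-1)
    # Extra safety not in the commit: microphone capture is usually int16, while
    # the feature extractor expects a float waveform, so normalise to [-1, 1].
    if np.issubdtype(audio_array.dtype, np.integer):
        audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
    # A bare array is assumed to already be at the model's sampling rate (16 kHz for Whisper).
    result = whisper_pipe(audio_array)
    return result["text"]
```

If the browser records at a different sample rate, passing `{"raw": audio_array, "sampling_rate": sample_rate}` to the pipeline instead of the bare array should let it resample before feature extraction.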