Luigi committed
Commit 6f6f431 · 1 Parent(s): d7367c4

Add audio preprocessing to ensure the ASR input is single-channel audio

Files changed (1): app.py (+5 -3)
app.py CHANGED
@@ -121,6 +121,9 @@ def transcribe_audio(audio_input):
         audio_array = audio_input[1]
     else:
         audio_array = audio_input
+    # Ensure input is mono by averaging channels if necessary.
+    if audio_array.ndim > 1:
+        audio_array = np.mean(audio_array, axis=-1)
     result = whisper_pipe(audio_array)
     return result["text"]
 
@@ -156,8 +159,7 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         # Audio and Text inputs – only one will be visible based on mode selection.
-        # Changed gr.Audio type from "filepath" to "numpy" to capture audio in memory,
-        # which helps release the recording resource faster.
+        # Use gr.Audio type "numpy" for in-memory capture.
         audio_input = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
 
@@ -186,7 +188,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
 
-    # Button event triggers the classification. Gradio will show a spinner during processing.
+    # Button event triggers the classification.
    classify_btn.click(fn=classify_intent,
                       inputs=[mode, audio_input, text_input, model_dropdown],
                       outputs=[transcription_output, classification_output, tts_output])
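
For readers following along, below is a minimal self-contained sketch of the patched transcribe_audio flow. It assumes whisper_pipe is a transformers ASR pipeline (the checkpoint name is a placeholder; the actual setup in app.py is not part of this diff) and that gr.Audio with type="numpy" delivers a (sample_rate, data) tuple, which is Gradio's documented behavior for that type.

import numpy as np
from transformers import pipeline

# Placeholder checkpoint; the model actually used by app.py is not shown in this diff.
whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

def transcribe_audio(audio_input):
    # gr.Audio(type="numpy") returns a (sample_rate, data) tuple; unpack the array.
    if isinstance(audio_input, tuple):
        audio_array = audio_input[1]
    else:
        audio_array = audio_input
    # Stereo microphones produce shape (samples, channels); Whisper expects mono,
    # so average across the channel axis when needed.
    if audio_array.ndim > 1:
        audio_array = np.mean(audio_array, axis=-1)
    # As in the patch, the raw array is passed straight to the pipeline; this
    # assumes it already matches the sampling rate the model expects.
    result = whisper_pipe(audio_array)
    return result["text"]

The ndim check makes the downmix a no-op for recordings that are already mono (1-D arrays), so the same code path handles both cases.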