make whisper pipeline read file instead of numpy array
app.py
CHANGED
@@ -118,21 +118,24 @@ def tts_audio_output(message: str, voice: str = 'af_heart'):
 
 def transcribe_audio(audio_input):
     whisper_pipe = load_whisper_pipeline()
-    #
-    if isinstance(audio_input, tuple):
+    # For file input, audio_input is a filepath string.
+    if isinstance(audio_input, str):
+        result = whisper_pipe(audio_input)
+        return result["text"]
+    # For microphone input, Gradio returns a tuple (sample_rate, audio_array).
+    elif isinstance(audio_input, tuple):
         audio_array = audio_input[1]
+        if audio_array.ndim > 1:
+            audio_array = np.mean(audio_array, axis=-1)
+        result = whisper_pipe(audio_array)
+        return result["text"]
     else:
-        audio_array = audio_input
-    # Ensure input is mono by averaging channels if necessary.
-    if audio_array.ndim > 1:
-        audio_array = np.mean(audio_array, axis=-1)
-    result = whisper_pipe(audio_array)
-    return result["text"]
+        return ""
 
 # ------------------- Main Processing Function -------------------
 @spaces.GPU  # Decorate to run on GPU when processing
 def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
-    #
+    # Determine input and choose appropriately.
     if mode == "Microphone" and mic_audio is not None:
         chosen_audio = mic_audio
         transcription = transcribe_audio(chosen_audio)

@@ -145,18 +148,21 @@ def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
     else:
         return "請提供語音或文字輸入", "", None, None
 
-    # For debugging:
+    # For debugging: prepare debug audio.
     debug_audio = None
     if chosen_audio is not None:
-        if isinstance(chosen_audio, tuple):
+        if isinstance(chosen_audio, str):
+            # For file input, read using soundfile to get raw audio.
+            audio_array, sample_rate = sf.read(chosen_audio)
+            if audio_array.ndim > 1:
+                audio_array = np.mean(audio_array, axis=-1)
+            debug_audio = (sample_rate, audio_array)
+        elif isinstance(chosen_audio, tuple):
             audio_array = chosen_audio[1]
             sample_rate = chosen_audio[0]
-        …
-        …
-        …
-        if audio_array.ndim > 1:
-            audio_array = np.mean(audio_array, axis=-1)
-        debug_audio = (sample_rate, audio_array)
+            if audio_array.ndim > 1:
+                audio_array = np.mean(audio_array, axis=-1)
+            debug_audio = (sample_rate, audio_array)
 
     # Classify the transcribed or provided text.
     if available_models[model_choice] == "qwen":

@@ -181,7 +187,8 @@ with gr.Blocks() as demo:
     # Three input components: microphone, text, and file upload.
     mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
     text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
-    file_audio = gr.Audio(sources=["upload"], type="numpy", label="上傳語音檔案")
+    # For file input, use 'filepath' so the Whisper pipeline gets the file and can convert it internally.
+    file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
 
     # Initially, only the microphone input is visible.
     text_input.visible = False
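A minimal usage sketch, not part of the commit, of how the two input shapes reach the updated transcribe_audio(): with type="filepath" the upload component hands the app a path string, which the Whisper pipeline can decode and resample itself, while the microphone component with type="numpy" still hands it a (sample_rate, ndarray) tuple. Here "sample.wav" is a placeholder file, and transcribe_audio / load_whisper_pipeline are assumed to be the functions from app.py shown above.

    # Sketch only: "sample.wav" is a placeholder; transcribe_audio and
    # load_whisper_pipeline come from app.py as in the diff above.
    import soundfile as sf

    # File-upload branch: the filepath string goes straight to the pipeline.
    print(transcribe_audio("sample.wav"))

    # Microphone branch: mimic Gradio's (sample_rate, ndarray) tuple.
    audio, sr = sf.read("sample.wav", dtype="float32")
    if audio.ndim > 1:
        audio = audio.mean(axis=-1)  # downmix to mono, as app.py does
    print(transcribe_audio((sr, audio)))

    # Caveat: a bare ndarray is assumed by the transformers ASR pipeline to already
    # be at the model's expected sampling rate (16 kHz for Whisper). Passing a dict
    # makes the rate explicit so the pipeline can resample if needed.
    pipe = load_whisper_pipeline()
    print(pipe({"raw": audio, "sampling_rate": sr})["text"])

The filepath branch sidesteps that sampling-rate concern, since the pipeline decodes and resamples the file itself.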