fix tts and improve tts-related ergonomics
- app.py: +78 -14
- requirements.txt: +4 -1
app.py
CHANGED
@@ -11,8 +11,15 @@ from pathlib import Path
 from faster_whisper import WhisperModel
 from huggingface_hub import hf_hub_download
 
+# Additional imports for TTS and audio auto-play
+import numpy as np
+import io
+import soundfile as sf
+from kokoro import KPipeline
+import base64
+
 llm_repo_id = "Qwen/Qwen2.5-7B-Instruct-GGUF"
-llm_filename="qwen2.5-7b-instruct-q2_k.gguf"
+llm_filename = "qwen2.5-7b-instruct-q2_k.gguf"
 asr_repo_id = "Luigi/whisper-small-zh_tw-ct2"
 
 llm_model_path = hf_hub_download(repo_id=llm_repo_id, filename=llm_filename)
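For context on the hunk above: hf_hub_download fetches the GGUF file once and returns its cached local path on every later call, so llm_model_path is an ordinary filesystem path that llama.cpp can open. A minimal sketch using the repo and filename from this diff:

```python
from huggingface_hub import hf_hub_download

# First call downloads the file; later calls return the cached path immediately.
llm_model_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
    filename="qwen2.5-7b-instruct-q2_k.gguf",
)
print(llm_model_path)  # somewhere under ~/.cache/huggingface/hub/
```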
@@ -40,7 +47,8 @@ def load_transformers_model(model_id):
 
 @st.cache_resource
 def load_outlines_model():
-    model = outlines_llama_cpp(
+    model = outlines_llama_cpp(
+        model_path=llm_model_path,
         n_ctx=1024,
         n_threads=2,
         n_threads_batch=2,
@@ -48,36 +56,33 @@ def load_outlines_model():
         n_gpu_layers=0,
         use_mlock=False,
         use_mmap=True,
-        verbose=False,
+        verbose=False,
+    )
     return model
 
 def predict_with_llm(text):
     model = load_outlines_model()
-
     prompt = f"""
-You are an expert in classification of
+You are an expert in classification of restaurant customers' messages.
 
-I'm going to provide you with a message from a
-You have to classify it in one of the
+I'm going to provide you with a message from a restaurant customer.
+You have to classify it in one of the following two intents:
 
 RESERVATION: Inquiries and requests highly related to table reservations and seating 與訂位與座位安排相關的詢問與請求
 NOT_RESERVATION: All other messages that do not involve table booking or reservations 所有非訂位或預約類的其他留言
 
 Please reply with *only* the name of the intent labels in a JSON object like:
-{{
+{{"result": "RESERVATION"}} or {{"result": "NOT_RESERVATION"}}
 
 Here is the message to classify: {text}
 """.strip()
-
     classifier = choice(model, ["RESERVATION", "NOT_RESERVATION"])
     prediction = classifier(prompt)
-
     if prediction == "RESERVATION":
         return "📞 訂位意圖 (Reservation intent)"
     elif prediction == "NOT_RESERVATION":
         return "❌ 無訂位意圖 (Not Reservation intent)"
 
-# Standard Transformers classifier
 def predict_intent(text, model_id):
     tokenizer, model = load_transformers_model(model_id)
     inputs = tokenizer(text, return_tensors="pt")
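The predict_with_llm path above leans on outlines' constrained decoding: choice() builds a guide that restricts generation to the listed strings, so the model can only answer RESERVATION or NOT_RESERVATION no matter how the customer phrases the message. A standalone sketch, assuming the outlines==0.0.36 API pinned in requirements.txt (where models.llamacpp forwards its keyword arguments to llama_cpp.Llama):

```python
from outlines import models, generate

# Assumes the GGUF file is already on disk (see the hf_hub_download note above).
model = models.llamacpp("qwen2.5-7b-instruct-q2_k.gguf", n_ctx=1024, n_gpu_layers=0)
classify = generate.choice(model, ["RESERVATION", "NOT_RESERVATION"])

print(classify("Can I book a table for two at 7pm?"))  # expected: RESERVATION
print(classify("The noodles were too salty."))         # expected: NOT_RESERVATION
```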
@@ -90,14 +95,65 @@ def predict_intent(text, model_id):
     else:
         return f"❌ 無訂位意圖 (Not Reservation intent)(訂位信心度 Confidence: {confidence:.2%})"
 
-# Clean README
 def load_clean_readme(path="README.md"):
     text = Path(path).read_text(encoding="utf-8")
     text = re.sub(r"(?s)^---.*?---", "", text).strip()
     text = re.sub(r"^# .*?\n+", "", text)
     return text
 
-#
+# ---- TTS Integration using kokoro KPipeline ----
+
+@st.cache_resource
+def get_tts_pipeline():
+    # Instantiate and cache the KPipeline for TTS.
+    # Adjust lang_code as needed; here we set it to "zh" for Chinese.
+    return KPipeline(lang_code="zh")
+
+def get_tts_message(intent_result):
+    """
+    Determine the TTS message based on the classification result.
+    Reservation intent returns one message; all others, another.
+    """
+    if "訂位意圖" in intent_result and "無" not in intent_result:
+        return "稍後您將會從簡訊收到訂位連結"
+    else:
+        return "我們將會將您的回饋傳達給負責人,謝謝您"
+
+def play_tts_message(message, voice='af_heart'):
+    """
+    Synthesize speech using kokoro's KPipeline and return audio bytes in WAV format.
+    The pipeline returns a generator yielding tuples; the audio chunks are concatenated.
+    """
+    pipeline = get_tts_pipeline()
+    generator = pipeline(message, voice=voice)
+    audio_chunks = []
+    for i, (gs, ps, audio) in enumerate(generator):
+        audio_chunks.append(audio)
+    if audio_chunks:
+        audio_concat = np.concatenate(audio_chunks)
+    else:
+        audio_concat = np.array([])
+    wav_buffer = io.BytesIO()
+    # Using a sample rate of 24000 as in the example.
+    sf.write(wav_buffer, audio_concat, 24000, format="WAV")
+    wav_buffer.seek(0)
+    return wav_buffer.read()
+
+def play_audio_auto(audio_data, mime="audio/wav"):
+    """
+    Auto-plays the audio by creating an HTML audio element with the autoplay attribute.
+    """
+    audio_base64 = base64.b64encode(audio_data).decode()
+    audio_html = f'''
+        <audio controls autoplay style="width: 100%;">
+            <source src="data:{mime};base64,{audio_base64}" type="{mime}">
+            Your browser does not support the audio element.
+        </audio>
+    '''
+    st.markdown(audio_html, unsafe_allow_html=True)
+
+# ---- App UI ----
+
 st.title("🍽️ 餐廳訂位意圖識別")
 st.markdown("語音或輸入文字,自動判斷是否具有訂位意圖。")
 
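The kokoro flow added in this hunk can be exercised outside Streamlit. A sketch under the same assumptions the diff itself makes (KPipeline yields (graphemes, phonemes, audio) tuples and emits 24 kHz audio):

```python
import io

import numpy as np
import soundfile as sf
from kokoro import KPipeline

pipeline = KPipeline(lang_code="zh")  # Mandarin front end, as in the app
chunks = [audio for _, _, audio in pipeline("稍後您將會從簡訊收到訂位連結", voice="af_heart")]

buf = io.BytesIO()
sf.write(buf, np.concatenate(chunks), 24000, format="WAV")  # 24 kHz, matching the app
with open("tts_check.wav", "wb") as f:
    f.write(buf.getvalue())
```

As for play_audio_auto: the base64 data-URI <audio autoplay> element is presumably there because st.audio offers no autoplay in the Streamlit version this Space targets; embedding the WAV inline lets the browser start playback as soon as the element renders (subject to browser autoplay policies).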
@@ -110,7 +166,6 @@ audio = mic_recorder(start_prompt="開始錄音", stop_prompt="停止錄音", ju
 if audio:
     st.success("錄音完成!")
     st.audio(audio["bytes"], format="audio/wav")
-
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
         tmpfile.write(audio["bytes"])
         tmpfile_path = tmpfile.name
@@ -131,6 +186,11 @@ if audio:
     else:
         result = predict_intent(transcription, model_id)
         st.success(result)
+        tts_text = get_tts_message(result)
+        # Show the TTS message text on the page
+        st.info(f"TTS 語音內容: {tts_text}")
+        audio_message = play_tts_message(tts_text)
+        play_audio_auto(audio_message, mime="audio/wav")
 
 text_input = st.text_input("✍️ 或手動輸入語句")
 
@@ -141,6 +201,10 @@ if text_input and st.button("🚀 送出"):
     else:
         result = predict_intent(text_input, model_id)
         st.success(result)
+        tts_text = get_tts_message(result)
+        st.info(f"TTS 語音內容: {tts_text}")
+        audio_message = play_tts_message(tts_text)
+        play_audio_auto(audio_message, mime="audio/wav")
 
 with st.expander("ℹ️ 說明文件 / 使用說明 (README)", expanded=False):
     readme_md = load_clean_readme()
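One observation on the wiring: the same TTS tail now runs in both the ASR branch (+186) and the text-input branch (+201). A hypothetical helper, not part of this commit, would keep the two in sync:

```python
def announce(result):
    # Shared tail for both input paths: show the TTS text, synthesize, auto-play.
    tts_text = get_tts_message(result)
    st.info(f"TTS 語音內容: {tts_text}")
    play_audio_auto(play_tts_message(tts_text), mime="audio/wav")
```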
requirements.txt
CHANGED
@@ -7,4 +7,7 @@ faster-whisper
 soundfile
 outlines[llamacpp]==0.0.36 # issue beyond 0.0.36 https://github.com/dottxt-ai/outlines/issues/820
 numpy>=1.24,<2.0
-llama-cpp-python
+llama-cpp-python
+kokoro
+ordered-set
+cn2an