Spaces:

tianzhechu
/

Book-QA-Chat

Sleeping

App Files Files Community

tianzhechu committed on Aug 11

Commit

82b5e45

verified ·

1 Parent(s): 84f58ea

Upload 3 files

Browse files

Files changed (3) hide show

README.md +31 -1
app.py +93 -9
requirements.txt +5 -1

README.md CHANGED Viewed

@@ -9,4 +9,34 @@ app_file: app.py
 pinned: false
 ---
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

 pinned: false
 ---
+An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+## Local Transformers mode with alternate tokenizer
+If the target model repository does not include a tokenizer, you can instruct the app to run locally with `transformers` and use a tokenizer from another repository.
+Environment variables:
+- `MODEL_ID` (optional): model repo to load. Defaults to `tianzhechu/BookQA-7B-Instruct`.
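+- `TOKENIZER_ID` (optional): tokenizer repo to use locally (e.g., a base model's tokenizer). Defaults to `Qwen/Qwen2.5-0.5B-Instruct`. Whenever it is non-empty, the app switches to a local `transformers` backend and streams tokens from your machine, so local mode is on by default; set it to an empty string to use the remote Inference API instead.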
+- `USE_LOCAL_TRANSFORMERS` (optional): set to `1` to force local mode even without `TOKENIZER_ID`.
+Install extra dependencies:
+```bash
+pip install -r requirements.txt
+```
+Run with an alternate tokenizer (example):
+```bash
+export MODEL_ID=tianzhechu/BookQA-7B-Instruct
+export TOKENIZER_ID=TheBaseModel/TokenizerRepo
+python app.py
+```
+Notes:
+- Local inference will download and load the model weights via `transformers` and may require significant memory.
+- If the tokenizer exposes a chat template, it is applied automatically. Otherwise a simple fallback template is used.
+- You'll need a compatible version of `torch` installed for your platform. If the default pip install fails, follow the official install instructions for your OS/GPU.

app.py CHANGED Viewed

@@ -1,10 +1,38 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 def respond(
@@ -27,17 +55,73 @@ def respond(
     response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
         temperature=temperature,
         top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
 """

+import os
+import threading
 import gradio as gr
 from huggingface_hub import InferenceClient
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
+os.environ["HF_HOME"] = "/tmp/huggingface"  # NOTE(review): assigned after `huggingface_hub` is imported above; confirm the library still honors it
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
+MODEL_ID = os.getenv("MODEL_ID", "tianzhechu/BookQA-7B-Instruct")
+TOKENIZER_ID = os.getenv("TOKENIZER_ID", "Qwen/Qwen2.5-0.5B-Instruct")  # Tokenizer repo for local mode; the non-empty default turns local mode on unless TOKENIZER_ID is set to ""
+USE_LOCAL_TRANSFORMERS = bool(TOKENIZER_ID) or os.getenv("USE_LOCAL_TRANSFORMERS") == "1"
+# Remote inference client; None whenever local mode is active
+client = None if USE_LOCAL_TRANSFORMERS else InferenceClient(MODEL_ID)
+# Local model/tokenizer, populated lazily by _ensure_local_model_loaded()
+local_model = None
+local_tokenizer = None
+def _ensure_local_model_loaded():
+    """Load the local tokenizer and model into module globals on first use.
+
+    The tokenizer is read from ``TOKENIZER_ID`` when that is set. Otherwise
+    it falls back to ``MODEL_ID`` so that forcing local mode with
+    ``USE_LOCAL_TRANSFORMERS=1`` alone works for model repos that ship
+    their own tokenizer. Once both globals are populated, later calls
+    return immediately.
+
+    Raises:
+        RuntimeError: if ``TOKENIZER_ID`` is empty and no tokenizer could be
+            loaded from ``MODEL_ID``.
+    """
+    global local_model, local_tokenizer
+    if local_model is not None and local_tokenizer is not None:
+        return
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    tokenizer_repo = TOKENIZER_ID or MODEL_ID
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo, use_fast=True)
+    except (OSError, ValueError) as err:
+        if TOKENIZER_ID:
+            # An explicitly configured tokenizer failed: surface the real error.
+            raise
+        raise RuntimeError(
+            "Local transformers backend requires TOKENIZER_ID to be set to a tokenizer repo."
+        ) from err
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
+    # Assign both globals only after both loads succeed, so the module is
+    # never left half-initialized.
+    local_tokenizer, local_model = tokenizer, model
 def respond(
     response = ""
+    if not USE_LOCAL_TRANSFORMERS:
+        for message in client.chat_completion(
+            messages,
+            max_tokens=max_tokens,
+            stream=True,
+            temperature=temperature,
+            top_p=top_p,
+        ):
+            token = message.choices[0].delta.content
+            if token:
+                response += token
+            yield response
+        return
+    # Local generation using transformers with an alternate tokenizer
+    _ensure_local_model_loaded()
+    try:
+        from transformers import TextIteratorStreamer
+    except Exception as e:
+        raise RuntimeError(
+            "transformers TextIteratorStreamer is required for local streaming; ensure transformers is installed."
+        ) from e
+    # Use chat template if available; otherwise fall back to a simple concatenation
+    try:
+        prompt_text = local_tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+    except Exception:
+        convo_parts = []
+        for m in messages:
+            role = m.get("role", "user")
+            content = m.get("content", "")
+            if role == "system":
+                convo_parts.append(f"<system>\n{content}\n</system>")
+            elif role == "assistant":
+                convo_parts.append(f"<assistant>\n{content}\n</assistant>")
+            else:
+                convo_parts.append(f"<user>\n{content}\n</user>")
+        prompt_text = "\n".join(convo_parts) + "\n<assistant>\n"
+    inputs = local_tokenizer(prompt_text, return_tensors="pt")
+    streamer = TextIteratorStreamer(
+        local_tokenizer, skip_prompt=True, skip_special_tokens=True
+    )
+    generate_kwargs = dict(
+        inputs=inputs.input_ids,
+        attention_mask=inputs.get("attention_mask"),
+        max_new_tokens=max_tokens,
+        do_sample=temperature > 0,
         temperature=temperature,
         top_p=top_p,
+        streamer=streamer,
+    )
+    thread = threading.Thread(target=local_model.generate, kwargs=generate_kwargs)
+    thread.start()
+    for new_text in streamer:
+        if new_text:
+            response += new_text
+            yield response
 """

requirements.txt CHANGED Viewed

	@@ -1 +1,5 @@
1	- huggingface_hub==0.25.2

+huggingface_hub==0.25.2
+gradio==5.0.1
+transformers>=4.38.0
+torch>=2.1.0
+transformers>=4.38.0