Spaces:

docilio
/

3nhance

Sleeping

App Files Files Community

Tiago Caldeira commited on Aug 4

Commit

6b5a511

1 Parent(s): 7c94b16

different approach using unsloth model

Browse files

Files changed (2) hide show

__app.py +66 -0
app.py +27 -30

__app.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import torch
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+import textwrap
+model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Load model in full precision on CPU — no bitsandbytes
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="cpu",            # Force CPU
+    torch_dtype=torch.float32,   # Use FP32 to ensure CPU compatibility
+)
+model.eval()
+# Helper to format response nicely
+def print_response(text: str) -> str:
+    return "\n".join(textwrap.fill(line, 100) for line in text.split("\n"))
+# Inference function for Gradio
+def predict_text(system_prompt: str, user_prompt: str) -> str:
+    messages = [
+        {"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]},
+        {"role": "user", "content": [{"type": "text", "text": user_prompt.strip()}]},
+    ]
+    inputs = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt"
+    ).to("cpu")
+    input_len = inputs["input_ids"].shape[-1]
+    with torch.inference_mode():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=300,
+            do_sample=False,
+            use_cache=False  # Important for CPU compatibility
+        )
+    generated = output[0][input_len:]
+    decoded = tokenizer.decode(generated, skip_special_tokens=True)
+    return print_response(decoded)
+# Gradio UI
+demo = gr.Interface(
+    fn=predict_text,
+    inputs=[
+        gr.Textbox(lines=2, label="System Prompt", value="You are a helpful assistant."),
+        gr.Textbox(lines=4, label="User Prompt", placeholder="Ask something..."),
+    ],
+    outputs=gr.Textbox(label="Gemma 3n Response"),
+    title="Gemma 3n Chat (CPU-friendly)",
+    description="Lightweight CPU-only chatbot using a quantized Gemma 3n model.",
+)
+if __name__ == "__main__":
+    demo.launch()

app.py CHANGED Viewed

@@ -1,66 +1,63 @@
 import torch
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
-import textwrap
 model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-# Load model in full precision on CPU — no bitsandbytes
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    device_map="cpu",            # Force CPU
-    torch_dtype=torch.float32,   # Use FP32 to ensure CPU compatibility
 )
 model.eval()
-# Helper to format response nicely
-def print_response(text: str) -> str:
-    return "\n".join(textwrap.fill(line, 100) for line in text.split("\n"))
-# Inference function for Gradio
-def predict_text(system_prompt: str, user_prompt: str) -> str:
     messages = [
-        {"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]},
-        {"role": "user", "content": [{"type": "text", "text": user_prompt.strip()}]},
     ]
     inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
         return_tensors="pt"
     ).to("cpu")
     input_len = inputs["input_ids"].shape[-1]
     with torch.inference_mode():
-        output = model.generate(
-            **inputs,
-            max_new_tokens=300,
             do_sample=False,
-            use_cache=False  # Important for CPU compatibility
         )
-    generated = output[0][input_len:]
-    decoded = tokenizer.decode(generated, skip_special_tokens=True)
-    return print_response(decoded)
 # Gradio UI
 demo = gr.Interface(
-    fn=predict_text,
-    inputs=[
-        gr.Textbox(lines=2, label="System Prompt", value="You are a helpful assistant."),
-        gr.Textbox(lines=4, label="User Prompt", placeholder="Ask something..."),
-    ],
     outputs=gr.Textbox(label="Gemma 3n Response"),
-    title="Gemma 3n Chat (CPU-friendly)",
-    description="Lightweight CPU-only chatbot using a quantized Gemma 3n model.",
 )
 if __name__ == "__main__":
     demo.launch()

 import torch
 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
 model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Load model on CPU
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype=torch.float32,
 )
+model.to("cpu")
 model.eval()
+# Minimal generation: single user prompt, static system
+def generate_response(user_prompt):
     messages = [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": "You are a helpful assistant."}]
+        },
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": user_prompt.strip()}]
+        }
     ]
     inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         return_tensors="pt"
     ).to("cpu")
     input_len = inputs["input_ids"].shape[-1]
     with torch.inference_mode():
+        outputs = model.generate(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=100,
             do_sample=False,
+            use_cache=False
         )
+    generated_tokens = outputs[0][input_len:]
+    decoded = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+    return decoded.strip()
 # Gradio UI
 demo = gr.Interface(
+    fn=generate_response,
+    inputs=gr.Textbox(lines=3, label="Enter your question"),
     outputs=gr.Textbox(label="Gemma 3n Response"),
+    title="🧪 Simple Gemma 3n Demo (CPU)",
+    description="Test the Gemma 3n model with minimal output. Max 100 tokens.",
 )
 if __name__ == "__main__":
     demo.launch()