import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "ValiantLabs/Qwen3-4B-Thinking-2507-Esper3.1"

# Load model & tokenizer once at startup
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)


def ask_question(prompt):
    """Generate a response (thinking trace + final content) from the Qwen3 model."""
    try:
        # Build the chat prompt with the model's chat template, keeping thinking mode on
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True  # thinking mode
        )
        inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # Sample a completion; 4096 new tokens leaves room for long reasoning traces
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
            temperature=0.7,
            do_sample=True
        )
        # Keep only the newly generated tokens (drop the prompt)
        output_ids = generated_ids[0][len(inputs.input_ids[0]):].tolist()

        # Find the end of the thinking section (token id 151668 == </think>)
        try:
            index = len(output_ids) - output_ids[::-1].index(151668)
        except ValueError:
            index = 0

        thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
        content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
        return thinking_content, content
    except Exception as e:
        return f"⚠️ Error: {e}", ""


# --- Gradio UI ---
with gr.Blocks(title="Qwen3 Thinking Chat") as demo:
    gr.Markdown("## 🧠 Qwen3-4B-Thinking — Ask Anything")
    gr.Markdown(
        "This demo uses **ValiantLabs/Qwen3-4B-Thinking-2507-Esper3.1**, "
        "a reasoning model that shows its internal 'thinking' trace before giving the final answer."
    )
    with gr.Row():
        prompt_box = gr.Textbox(
            label="Ask your question",
            placeholder="e.g. Explain how quantum entanglement works.",
            lines=3
        )
    with gr.Row():
        think_output = gr.Textbox(label="🧩 Thinking process", lines=10)
        final_output = gr.Textbox(label="💬 Final answer", lines=10)
    ask_btn = gr.Button("🚀 Generate Answer")
    ask_btn.click(
        fn=ask_question,
        inputs=prompt_box,
        outputs=[think_output, final_output]
    )

demo.launch()