rahul7star committed
Commit f2fce5e · verified · 1 Parent(s): 6f21ce1

Create app_cpu.py

Files changed (1): app_cpu.py (+182, -0)
app_cpu.py ADDED
@@ -0,0 +1,182 @@
import os
import time
import logging
import re
import gradio as gr
from huggingface_hub import snapshot_download

# ============================================================
# 1️⃣ Model auto-download during app load
# ============================================================

DEFAULT_MODEL_PATH = os.environ.get("MODEL_OUTPUT_PATH", "PromptEnhancer/PromptEnhancer-32B")

print(f"🔄 Checking local model at startup: {DEFAULT_MODEL_PATH}")
local_model_dir = snapshot_download(repo_id=DEFAULT_MODEL_PATH)
print(f"✅ Model downloaded and cached at: {local_model_dir}")

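# Note (added): snapshot_download resolves into the shared Hugging Face hub
# cache (controlled by the HF_HOME / HF_HUB_CACHE environment variables) and
# skips files that are already present, so restarts are cheap. A hedged
# sketch, if a fixed on-disk path is preferred over the cache:
#   local_model_dir = snapshot_download(repo_id=DEFAULT_MODEL_PATH, local_dir="./model")
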
# ============================================================
# 2️⃣ Helper utils
# ============================================================

# qwen_vl_utils is optional; this text-only app works with a no-op fallback
# that reports no image or video inputs.
try:
    from qwen_vl_utils import process_vision_info
except Exception:
    def process_vision_info(messages):
        return None, None

def replace_single_quotes(text):
    # Rewrite 'quoted' spans to "quoted" and normalize curly single quotes
    # to curly double quotes.
    pattern = r"\B'([^']*)'\B"
    replaced_text = re.sub(pattern, r'"\1"', text)
    replaced_text = replaced_text.replace("’", "”").replace("‘", "“")
    return replaced_text

def _str_to_dtype(dtype_str):
    # Whitelist the dtypes the UI offers; anything else falls back to float32.
    if dtype_str in ("bfloat16", "float16", "float32"):
        return dtype_str
    return "float32"

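# Example (added for illustration):
#   replace_single_quotes("it's not 'fine'")  returns  it's not "fine"
# The \B anchors reject quotes that sit flush against word characters, so the
# apostrophe inside "it's" is left alone while 'fine' becomes "fine".
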
# ============================================================
# 3️⃣ CPU inference function
# ============================================================

def cpu_predict(model_path, torch_dtype, prompt_cot, sys_prompt, temperature, max_new_tokens):
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

    if not logging.getLogger(__name__).handlers:
        logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    dtype = {
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }.get(torch_dtype, torch.float32)

    # Force CPU
    device = "cpu"

    logger.info("🔧 Loading model to CPU...")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=dtype,
        device_map={"": device},  # CPU-only mapping
        attn_implementation="sdpa",
    )
    processor = AutoProcessor.from_pretrained(model_path)

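    # Note (added): a 32B-parameter model needs about 4 bytes/param in
    # float32, i.e. on the order of 128 GB of RAM for the weights alone;
    # bfloat16 halves that. Reloading the model on every call is also the
    # dominant cost here; caching it at module level would amortize it.
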
    org_prompt_cot = prompt_cot
    user_prompt_format = sys_prompt + "\n" + org_prompt_cot
    messages = [{"role": "user", "content": [{"type": "text", "text": user_prompt_format}]}]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    logger.info("🧠 Running generation on CPU...")
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        # Honor the Temperature slider: sample when temperature > 0, greedy
        # otherwise (generate() ignores temperature/top_k/top_p without sampling).
        do_sample=float(temperature) > 0,
        top_k=5,
        top_p=0.9,
    )

    # Drop the prompt tokens so only newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    output_res = output_text[0]

    # The model is expected to wrap its chain of thought in <think>...</think>;
    # keep only the rewritten prompt after the closing tag, falling back to the
    # original prompt if the tags are missing or malformed.
    try:
        assert output_res.count("think>") == 2
        new_prompt = output_res.split("think>")[-1].lstrip("\n")
        new_prompt = replace_single_quotes(new_prompt)
    except Exception:
        new_prompt = org_prompt_cot

    return new_prompt, ""

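# Usage sketch (added; the argument values are hypothetical): cpu_predict can
# be smoke-tested without the UI, e.g.
#   enhanced, err = cpu_predict(
#       model_path=local_model_dir,
#       torch_dtype="float32",
#       prompt_cot="a cat on a chair",
#       sys_prompt="Rewrite the prompt with a chain of thought:",
#       temperature=0.0,
#       max_new_tokens=256,
#   )
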
# ============================================================
# 4️⃣ Gradio interface
# ============================================================

def run_single(prompt, sys_prompt, temperature, max_new_tokens, torch_dtype, state):
    if not prompt.strip():
        return "", "Please enter a prompt first.", state

    t0 = time.time()
    try:
        new_prompt, err = cpu_predict(
            model_path=local_model_dir,
            torch_dtype=_str_to_dtype(torch_dtype),
            prompt_cot=prompt,
            sys_prompt=sys_prompt,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
        )
        dt = time.time() - t0
        msg = f"Elapsed: {dt:.2f}s"
        if err:  # cpu_predict currently always returns err == ""
            msg = f"{err} ({msg})"
        return new_prompt, msg, state
    except Exception as e:
        return "", f"Inference failed: {e}", state

# ============================================================
# 5️⃣ UI
# ============================================================

test_list_zh = [
    # "Third-person view: a race car speeds along a city circuit; a minimap in
    # the top-left with the current ranking below it, and a dashboard in the
    # bottom-right showing the current speed."
    "第三人称视角,赛车在城市赛道上飞驰,左上角是小地图,地图下面是当前名次,右下角仪表盘显示当前速度。",
]
test_list_en = [
    "Create a painting depicting a 30-year-old white-collar worker on a business trip by plane.",
]

with gr.Blocks(title="Prompt Enhancer (CPU Mode)") as demo:
    gr.Markdown("## 🧩 Prompt Enhancer (CPU Mode — model preloaded)")
    with gr.Row():
        sys_prompt = gr.Textbox(
            label="System Prompt",
            # i.e. "Based on the user's input, generate a chain-of-thought and rewrite the prompt:"
            value="请根据用户的输入,生成思考过程的思维链并改写提示词:",
            lines=3,
        )
        temperature = gr.Slider(0, 1, value=0.1, step=0.05, label="Temperature")
        max_new_tokens = gr.Slider(16, 4096, value=2048, step=16, label="Max New Tokens")
        torch_dtype = gr.Dropdown(["float32", "float16", "bfloat16"], value="float32", label="torch_dtype")

    state = gr.State(value=None)

    with gr.Tab("Inference"):
        with gr.Row():
            with gr.Column(scale=2):
                prompt = gr.Textbox(label="Input Prompt", lines=6, placeholder="Paste the prompt to rewrite here...")
                run_btn = gr.Button("Generate Rewrite", variant="primary")
                gr.Examples(examples=test_list_zh + test_list_en, inputs=prompt)
            with gr.Column(scale=3):
                out_text = gr.Textbox(label="Rewritten Result", lines=10)
                out_info = gr.Markdown("✅ Model loaded on CPU.")

    run_btn.click(
        run_single,
        inputs=[prompt, sys_prompt, temperature, max_new_tokens, torch_dtype, state],
        outputs=[out_text, out_info, state],
    )

if __name__ == "__main__":
    demo.launch(show_error=True, share=True)
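
A note on the final launch line: share=True opens a public Gradio tunnel, which is rarely wanted for a slow CPU-only box. A minimal, more conservative sketch (not part of the commit; same demo object, with a queue so CPU-bound requests run one at a time instead of overlapping):

    demo.queue(max_size=4)        # hold extra requests in a queue
    demo.launch(show_error=True)  # local only; add share=True explicitly if a tunnel is wanted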