Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files Files

Kalpokoch committed on Aug 18

Commit

b5cf3d3

verified ·

1 Parent(s): f55e2f6

Update app/app.py

Browse files

Files changed (1) hide show

app/app.py +141 -74

app/app.py CHANGED Viewed

@@ -4,6 +4,8 @@ import asyncio
 import logging
 import uuid
 import re
 from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel
 from llama_cpp import Llama
@@ -22,21 +24,28 @@ class RequestIdAdapter(logging.LoggerAdapter):
 logger = logging.getLogger("app")
 # -----------------------------
-# ✅ Configuration
 # -----------------------------
 DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
 CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "90"))
 RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
 TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))
 TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "1"))
 # -----------------------------
 # ✅ Initialize FastAPI App
 # -----------------------------
 app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.1.0")
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
     request_id = str(uuid.uuid4())
@@ -67,19 +76,26 @@ except Exception as e:
     db_ready = False
 # -----------------------------
-# ✅ Load TinyLlama GGUF Model
 # -----------------------------
 logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=4096,
-        n_threads=4,
-        n_batch=512,
         use_mlock=True,
-        verbose=False
     )
-    logger.info("GGUF model loaded successfully.")
     model_ready = True
 except Exception as e:
     logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
@@ -100,6 +116,49 @@ class Feedback(BaseModel):
     feedback: str
     comment: str | None = None
 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
@@ -108,30 +167,21 @@ def get_logger_adapter(request: Request):
 @app.get("/")
 async def root():
-    return {"status": "✅ Server is running."}
 @app.get("/health")
 async def health_check():
     status = {
         "status": "ok",
         "database_status": "ready" if db_ready else "error",
-        "model_status": "ready" if model_ready else "error"
     }
     if not db_ready or not model_ready:
         raise HTTPException(status_code=503, detail=status)
     return status
-async def generate_llm_response(prompt: str, request_id: str):
-    loop = asyncio.get_running_loop()
-    response = await loop.run_in_executor(
-        None,
-        lambda: llm(prompt, max_tokens=2048, stop=["###", "Question:", "Context:", "</s>"], temperature=0.05, echo=False)
-    )
-    answer = response["choices"][0]["text"].strip()
-    if not answer:
-        raise ValueError("Empty response from LLM")
-    return answer
 @app.post("/chat")
 async def chat(query: Query, request: Request):
     adapter = get_logger_adapter(request)
@@ -142,9 +192,9 @@ async def chat(query: Query, request: Request):
     if question_lower in greeting_keywords:
         adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
         intro_message = (
-            "Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
-            "My purpose is to help you find accurate information and answer questions based on this specific dataset. "
-            "I am currently running on a CPU-based environment. How can I assist you with the DoP policy today?"
         )
         return {
             "request_id": getattr(request.state, 'request_id', 'N/A'),
@@ -159,75 +209,86 @@ async def chat(query: Query, request: Request):
     adapter.info(f"Received query: '{query.question}'")
-    # 1. Search Vector DB
-    search_results = db.search(query.question, top_k=TOP_K_SEARCH)
-    if not search_results:
-        adapter.warning("No relevant context found in vector DB.")
-        return {
-            "question": query.question,
-            "context_used": "No relevant context found.",
-            "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
-        }
-    scores = [f"{result['relevance_score']:.4f}" for result in search_results]
-    adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
-    # 2. Prepare Context
-    context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
-    context = "\n---\n".join(context_chunks)
-    # 3. Build Prompt with Separator Instruction
-    prompt = f"""<|system|>
 You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
 Your task is to answer the user's question based ONLY on the provided context.
 - **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
 - **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
 </s>
 <|user|>
 ### Relevant Context:
 ```
 {context}
 ```
 ### Question:
 {query.question}
 </s>
 <|assistant|>
-### Detailed Answer:
 """
-    # 4. Generate Response
-    answer = "An error occurred while processing your request."
-    try:
-        adapter.info("Sending prompt to LLM for generation...")
-        raw_answer = await asyncio.wait_for(
-            generate_llm_response(prompt, request.state.request_id),
-            timeout=LLM_TIMEOUT_SECONDS
-        )
-        adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
-        # --- POST-PROCESSING LOGIC ---
-        # Check if the model used the pipe separator, indicating a list.
-        if '|' in raw_answer:
-            adapter.info("Pipe separator found. Formatting response as a bulleted list.")
-            # Split the string into a list of items
-            items = raw_answer.split('|')
-            # Clean up each item and format it as a bullet point
-            cleaned_items = [f"* {item.strip()}" for item in items if item.strip()]
-            # Join them back together with newlines
-            answer = "\n".join(cleaned_items)
-        else:
-            # If no separator, use the answer as is.
-            answer = raw_answer
-    except asyncio.TimeoutError:
-        adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
-        answer = "Sorry, the request took too long to process. Please try again with a simpler question."
     except Exception as e:
-        adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
-        answer = "Sorry, an unexpected error occurred while generating a response."
     adapter.info(f"Final answer prepared. Returning to client.")
     return {
@@ -251,3 +312,9 @@ async def collect_feedback(feedback: Feedback, request: Request):
     }
     adapter.info(json.dumps(feedback_log))
     return {"status": "✅ Feedback recorded. Thank you!"}

 import logging
 import uuid
 import re
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from fastapi import FastAPI, HTTPException, Request
 from pydantic import BaseModel
 from llama_cpp import Llama
 logger = logging.getLogger("app")
 # -----------------------------
+# ✅ Configuration - Optimized for CPU
 # -----------------------------
 DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
 CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
 MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "60"))  # Reduced timeout
 RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
 TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))
 TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "1"))
+# CPU Optimization settings
+CPU_COUNT = mp.cpu_count()
+logger.info(f"Detected {CPU_COUNT} CPU cores")
 # -----------------------------
 # ✅ Initialize FastAPI App
 # -----------------------------
 app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.1.0")
+# Thread pool for async operations
+thread_executor = ThreadPoolExecutor(max_workers=CPU_COUNT * 2)
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
     request_id = str(uuid.uuid4())
     db_ready = False
 # -----------------------------
+# ✅ Load TinyLlama GGUF Model - Optimized
 # -----------------------------
 logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
+        n_ctx=2048,  # Reduced context window for speed
+        n_threads=CPU_COUNT,  # Use all CPU cores
+        n_batch=256,  # Optimized batch size
         use_mlock=True,
+        verbose=False,
+        # Additional optimizations
+        n_gpu_layers=0,  # Force CPU only
+        rope_scaling_type=-1,  # -1 = unspecified: keep the model's own RoPE scaling (0 would disable it)
+        use_mmap=True,  # Enable memory mapping
+        low_vram=False,  # We're on CPU
+        # CPU-specific optimizations
+        numa=True,  # Enable NUMA awareness if available
     )
+    logger.info("GGUF model loaded successfully with CPU optimizations.")
     model_ready = True
 except Exception as e:
     logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
     feedback: str
     comment: str | None = None
+# -----------------------------
+# ✅ Optimized LLM Generation
+# -----------------------------
+# A single llama_cpp.Llama instance must not be driven by several threads at
+# once, and n_threads already spreads one generation across the CPU cores.
+# A dedicated one-worker pool therefore runs generations strictly one at a
+# time. Its worker thread is joined automatically at interpreter exit.
+_llm_executor = ThreadPoolExecutor(max_workers=1)
+async def generate_llm_response(prompt: str, request_id: str) -> str:
+    """Run one completion on the shared model without blocking the event loop.
+
+    Args:
+        prompt: Fully formatted prompt text passed to the model.
+        request_id: ID of the originating request; kept for caller
+            compatibility and not used by the generation itself.
+
+    Returns:
+        The generated text with surrounding whitespace stripped.
+
+    Raises:
+        ValueError: If the model returns an empty completion.
+    """
+    loop = asyncio.get_running_loop()
+    def generate_response():
+        return llm(
+            prompt,
+            max_tokens=1024,  # Cap on generated tokens to bound latency
+            stop=["###", "Question:", "Context:", "</s>", "\n\n###"],
+            temperature=0.05,
+            echo=False,
+            # Sampling settings
+            repeat_penalty=1.1,
+            top_p=0.9,
+            top_k=40,
+            typical_p=1.0,
+            mirostat_mode=0,  # Mirostat sampling disabled
+        )
+    # Generations queue up on the one-worker pool. If the awaiting task is
+    # cancelled (e.g. by a caller-side timeout) while the job is still queued,
+    # the job is cancelled too and never reaches the model.
+    response = await loop.run_in_executor(_llm_executor, generate_response)
+    answer = response["choices"][0]["text"].strip()
+    if not answer:
+        raise ValueError("Empty response from LLM")
+    return answer
+# -----------------------------
+# ✅ Optimized Search Function
+# -----------------------------
+async def perform_optimized_search(query_text: str):
+    """Run the vector-DB lookup for *query_text* on the shared thread pool.
+
+    Returns whatever ``db.search`` produces for the top ``TOP_K_SEARCH`` hits;
+    the call runs in a worker thread so the event loop stays responsive.
+    """
+    return await asyncio.get_running_loop().run_in_executor(
+        thread_executor,
+        lambda: db.search(query_text, top_k=TOP_K_SEARCH),
+    )
 # -----------------------------
 # ✅ Endpoints
 # -----------------------------
 @app.get("/")
 async def root():
+    """Liveness endpoint: confirm the server is up and report the detected core count."""
+    payload = {"status": "✅ Server is running.", "cpu_cores": CPU_COUNT}
+    return payload
 @app.get("/health")
 async def health_check():
+    """Report readiness of the vector database and the model.
+
+    Returns:
+        The status payload (HTTP 200) when both components are ready.
+
+    Raises:
+        HTTPException: 503 carrying the same payload as ``detail`` when the
+            database or the model failed to initialise.
+    """
+    healthy = bool(db_ready and model_ready)
     status = {
+        # The overall status follows the component states, so a 503 body no
+        # longer claims "ok" next to a component reported as "error".
+        "status": "ok" if healthy else "error",
         "database_status": "ready" if db_ready else "error",
+        "model_status": "ready" if model_ready else "error",
+        "cpu_cores": CPU_COUNT,
+        "optimization": "enabled"
     }
+    if not healthy:
         raise HTTPException(status_code=503, detail=status)
     return status
 @app.post("/chat")
 async def chat(query: Query, request: Request):
     adapter = get_logger_adapter(request)
     if question_lower in greeting_keywords:
         adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
         intro_message = (
+            f"Hello! I am an AI assistant specifically trained on NEEPCO's Delegation of Powers (DoP) policy document. "
+            f"My purpose is to help you find accurate information and answer questions based on this specific dataset. "
+            f"I am currently running optimized on a {CPU_COUNT}-core CPU environment. How can I assist you with the DoP policy today?"
         )
         return {
             "request_id": getattr(request.state, 'request_id', 'N/A'),
     adapter.info(f"Received query: '{query.question}'")
+    try:
+        # 1. Search the vector DB in a worker thread (awaited immediately; nothing else runs in parallel)
+        search_task = perform_optimized_search(query.question)
+        search_results = await search_task
+        if not search_results:
+            adapter.warning("No relevant context found in vector DB.")
+            return {
+                "request_id": request.state.request_id,
+                "question": query.question,
+                "context_used": "No relevant context found.",
+                "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing."
+            }
+        scores = [f"{result['relevance_score']:.4f}" for result in search_results]
+        adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
+        # 2. Prepare Context (limit context size for faster processing)
+        context_chunks = [result['text'] for result in search_results[:TOP_K_CONTEXT]]
+        context = "\n---\n".join(context_chunks)
+        # Truncate context if too long for faster processing
+        max_context_length = 1500  # Reduced for faster inference
+        if len(context) > max_context_length:
+            context = context[:max_context_length] + "..."
+        # 3. Build optimized prompt
+        prompt = f"""<|system|>
 You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
 Your task is to answer the user's question based ONLY on the provided context.
 - **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
 - **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
+- **Brevity Rule:** Keep your answer concise and to the point.
 </s>
 <|user|>
 ### Relevant Context:
 ```
 {context}
 ```
 ### Question:
 {query.question}
 </s>
 <|assistant|>
+### Answer:
 """
+        # 4. Generate Response with timeout
+        answer = "An error occurred while processing your request."
+        try:
+            adapter.info("Sending prompt to LLM for generation...")
+            raw_answer = await asyncio.wait_for(
+                generate_llm_response(prompt, request.state.request_id),
+                timeout=LLM_TIMEOUT_SECONDS
+            )
+            adapter.info(f"LLM generation successful. Raw response: {raw_answer[:100]}...")
+            # --- POST-PROCESSING LOGIC ---
+            # Check if the model used the pipe separator, indicating a list.
+            if '|' in raw_answer:
+                adapter.info("Pipe separator found. Formatting response as a bulleted list.")
+                # Split the string into a list of items
+                items = raw_answer.split('|')
+                # Clean up each item and format it as a bullet point
+                cleaned_items = [f"• {item.strip()}" for item in items if item.strip()]
+                # Join them back together with newlines
+                answer = "\n".join(cleaned_items)
+            else:
+                # If no separator, use the answer as is.
+                answer = raw_answer
+        except asyncio.TimeoutError:
+            adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
+            answer = "Sorry, the request took too long to process. Please try again with a simpler question."
+        except Exception as e:
+            adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
+            answer = "Sorry, an unexpected error occurred while generating a response."
     except Exception as e:
+        adapter.error(f"An unexpected error occurred: {e}", exc_info=True)
+        answer = "Sorry, an unexpected error occurred. Please try again."
     adapter.info(f"Final answer prepared. Returning to client.")
     return {
     }
     adapter.info(json.dumps(feedback_log))
     return {"status": "✅ Feedback recorded. Thank you!"}
+# Graceful shutdown
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Release the shared thread pool when the application shuts down."""
+    # wait=True blocks here until work already submitted to the pool has finished.
+    # NOTE(review): newer FastAPI releases recommend lifespan handlers over
+    # on_event — confirm against the pinned FastAPI version.
+    thread_executor.shutdown(wait=True)
+    logger.info("Application shutdown complete.")