import subprocess subprocess.run("pip install llama-cpp-python==0.3.15", shell=True, check=True) import gradio as gr import hopsworks from sentence_transformers import SentenceTransformer from llama_cpp import Llama import faiss import numpy as np import os import json import yaml from dotenv import load_dotenv # 1. Load Environment Variables & Validation load_dotenv() HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY") if not HOPSWORKS_API_KEY: raise ValueError("HOPSWORKS_API_KEY not found in environment variables.") # Load models configuration with open("models_config.json", "r") as f: models_config = json.load(f) # Load RAG prompt configuration with open("prompts/rag_prompt.yml", "r") as f: prompt_config = yaml.safe_load(f) # Global variable to store the current LLM llm = None print("Initializing embeddings and connecting to Hopsworks...") try: embeddings = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY) fs = project.get_feature_store() book_fg = fs.get_feature_group("book_embeddings", version=1) df = book_fg.read() if df.empty: raise ValueError("Feature group 'book_embeddings' is empty.") texts = df['text'].tolist() raw_embeddings = [emb if isinstance(emb, list) else emb.tolist() for emb in df['embedding']] embedding_vectors = np.array(raw_embeddings, dtype='float32') dimension = embedding_vectors.shape[1] index = faiss.IndexFlatIP(dimension) faiss.normalize_L2(embedding_vectors) index.add(embedding_vectors) print("Embeddings and FAISS index initialized.") except Exception as e: print(f"Critical Error during initialization: {e}") index = None # Function to load a model dynamically def load_model(repo_name, model_name, progress=gr.Progress()): global llm try: progress(0, desc="Initializing...") # Find the repository repo = next((r for r in models_config["repositories"] if r["name"] == repo_name), None) if not repo: return f"Error: Repository '{repo_name}' not found in config." # Find the model within the repository model = next((m for m in repo["models"] if m["name"] == model_name), None) if not model: return f"Error: Model '{model_name}' not found in repository." print(f"Loading model: {model['name']}...") print(f"Repo: {repo['repo_id']}, File: {model['filename']}") progress(0.3, desc=f"Downloading/Loading {model['name']}...") llm = Llama.from_pretrained( repo_id=repo["repo_id"], filename=model["filename"], n_ctx=2048, n_threads=4, n_gpu_layers=-1, verbose=False ) progress(1.0, desc="Complete!") return f"✅ Model '{model_name}' loaded successfully!" except Exception as e: llm = None return f"❌ Error loading model: {str(e)}" def retrieve_context(query, k=None): if index is None: return "Error: Search index not initialized." # Use k from prompt config if not specified if k is None: k = prompt_config["rag"]["num_retrieved_chunks"] query_embedding = embeddings.encode(query).astype('float32').reshape(1, -1) faiss.normalize_L2(query_embedding) distances, indices = index.search(query_embedding, k) retrieved_texts = [] for i in indices[0]: if 0 <= i < len(texts): retrieved_texts.append(texts[i]) # Use separator from prompt config separator = prompt_config["rag"]["context_separator"] print(f"Retrieved {len(retrieved_texts)} context chunks for the query.") print("Similarities:", distances) return separator.join(retrieved_texts) def respond(message, history): """ Generator function for streaming response. gr.ChatInterface passes 'message' and 'history' automatically. """ if llm is None: yield "System Error: Models failed to load. Check console logs." return # Retrieve context using config settings context = retrieve_context(message) # Build prompt from template prompt = prompt_config["template"].format( context=context, question=message ) # Get generation parameters from config gen_params = prompt_config["generation"] output = llm( prompt, max_tokens=gen_params["max_tokens"], temperature=gen_params["temperature"], stop=gen_params["stop_sequences"], stream=True ) partial_message = "" for chunk in output: text_chunk = chunk["choices"][0]["text"] partial_message += text_chunk yield partial_message with gr.Blocks(title="Hopsworks RAG ChatBot") as demo: gr.Markdown("