Spaces:

schmuelling
/

hopsworks_chat

Sleeping

App Files Files Community

Sebastian Schmülling committed 12 days ago

Commit

afd7f5e

0 Parent(s):

working RAG demo

Browse files

Files changed (6) hide show

.gitignore +9 -0
.gradio/certificate.pem +31 -0
app.py +126 -0
images/hopsworks_image.jpeg +0 -0
index_book.ipynb +335 -0
requirements.txt +14 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,9 @@

+# Model weights and source documents are large binaries; keep them out of git.
+*.gguf
+*.pdf
+# Python / Jupyter / virtualenv artifacts.
+*.pyc
+__pycache__/
+.env
+.ipynb_checkpoints/
+venv/
+.DS_Store
+.content
+# index_book.ipynb reads its PDF from "content/"; the ".content" entry above
+# does not match that directory, so ignore it explicitly as well.
+# NOTE(review): confirm whether ".content" was a typo for "content/".
+content/

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

app.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import gradio as gr
+import hopsworks
+from sentence_transformers import SentenceTransformer
+from llama_cpp import Llama
+import faiss
+import numpy as np
+import os
+from dotenv import load_dotenv
+# 1. Load Environment Variables & Validation
+load_dotenv()
+HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
+MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "your-username/your-model-repo")
+MODEL_FILENAME = os.getenv("MODEL_FILENAME", "model.gguf")
+if not HOPSWORKS_API_KEY:
+    raise ValueError("HOPSWORKS_API_KEY not found in environment variables.")
+print("Initializing models and connecting to Hopsworks...")
+try:
+    embeddings = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+    project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
+    fs = project.get_feature_store()
+    book_fg = fs.get_feature_group("book_embeddings", version=1)
+    df = book_fg.read()
+    if df.empty:
+        raise ValueError("Feature group 'book_embeddings' is empty.")
+    texts = df['text'].tolist()
+    raw_embeddings = [emb if isinstance(emb, list) else emb.tolist() for emb in df['embedding']]
+    embedding_vectors = np.array(raw_embeddings, dtype='float32')
+    dimension = embedding_vectors.shape[1]
+    index = faiss.IndexFlatIP(dimension)
+    faiss.normalize_L2(embedding_vectors)
+    index.add(embedding_vectors)
+    llm = Llama.from_pretrained(
+        repo_id=MODEL_REPO_ID,
+        filename=MODEL_FILENAME,
+        n_ctx=2048,
+        n_threads=4,
+        n_gpu_layers=-1,
+        verbose=False
+    )
+    print("Initialization complete.")
+except Exception as e:
+    print(f"Critical Error during initialization: {e}")
+    llm = None
+    index = None
+def retrieve_context(query, k=3):
+    if index is None:
+        return "Error: Search index not initialized."
+    query_embedding = embeddings.encode(query).astype('float32').reshape(1, -1)
+    faiss.normalize_L2(query_embedding)
+    distances, indices = index.search(query_embedding, k)
+    retrieved_texts = []
+    for i in indices[0]:
+        if 0 <= i < len(texts):
+            retrieved_texts.append(texts[i])
+    return "\n\n".join(retrieved_texts)
+def respond(message, history):
+    """
+    Generator function for streaming response.
+    gr.ChatInterface passes 'message' and 'history' automatically.
+    """
+    if llm is None:
+        yield "System Error: Models failed to load. Check console logs."
+        return
+    context = retrieve_context(message, k=3)
+    prompt = f"""Use the following context to answer the question. If you don't know the answer, say you don't know.
+Context:
+{context}
+Question: {message}
+Answer:"""
+    output = llm(
+        prompt,
+        max_tokens=256,
+        temperature=0.7,
+        stop=["Question:", "\n\n"],
+        stream=True
+    )
+    partial_message = ""
+    for chunk in output:
+        text_chunk = chunk["choices"][0]["text"]
+        partial_message += text_chunk
+        yield partial_message
+with gr.Blocks(title="Hopsworks RAG ChatBot") as demo:
+    with gr.Row():
+        #gr.Image("images/hopsworks_image.jpeg", height=80, width=80, show_label=False, container=False)
+        gr.Markdown("<h1>Hopsworks RAG ChatBot</h1>")
+    chat_interface = gr.ChatInterface(
+        fn=respond,
+        chatbot=gr.Chatbot(height=500),
+        textbox=gr.Textbox(placeholder="Ask a question about your Hopsworks...", container=False, scale=7),
+        examples=["What is the main topic of the documents?", "Summarize the key points."],
+        cache_examples=False,
+    )
+if __name__ == "__main__":
+    demo.launch(share=True)

images/hopsworks_image.jpeg ADDED Viewed

index_book.ipynb ADDED Viewed

	@@ -0,0 +1,335 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/envs/rag_llm/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import hopsworks\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from langchain_docling import DoclingLoader\n",
+    "from langchain_docling.loader import ExportType\n",
+    "from docling.chunking import HybridChunker\n",
+    "\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "PDF_PATH = \"content/Building+Machine+Learning+Systems+with+a+Feature+Store.pdf\"\n",
+    "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "EXPORT_TYPE = ExportType.DOC_CHUNKS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-12-02 19:43:33,611 INFO: detected formats: [<InputFormat.PDF: 'pdf'>]\n",
+      "2025-12-02 19:43:33,861 INFO: Going to convert document batch...\n",
+      "2025-12-02 19:43:33,863 INFO: Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7\n",
+      "2025-12-02 19:43:33,913 WARNING: The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.\n",
+      "2025-12-02 19:43:33,914 INFO: Loading plugin 'docling_defaults'\n",
+      "2025-12-02 19:43:33,926 INFO: Registered picture descriptions: ['vlm', 'api']\n",
+      "2025-12-02 19:43:33,981 WARNING: The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.\n",
+      "2025-12-02 19:43:33,982 INFO: Loading plugin 'docling_defaults'\n",
+      "2025-12-02 19:43:34,010 INFO: Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']\n",
+      "2025-12-02 19:43:42,281 INFO: Auto OCR model selected ocrmac.\n",
+      "2025-12-02 19:43:42,299 WARNING: The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.\n",
+      "2025-12-02 19:43:42,299 INFO: Loading plugin 'docling_defaults'\n",
+      "2025-12-02 19:43:42,323 INFO: Registered layout engines: ['docling_layout_default', 'docling_experimental_table_crops_layout']\n",
+      "2025-12-02 19:43:42,347 INFO: Accelerator device: 'mps'\n",
+      "2025-12-02 19:43:57,889 WARNING: The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.\n",
+      "2025-12-02 19:43:57,907 INFO: Loading plugin 'docling_defaults'\n",
+      "2025-12-02 19:43:57,919 INFO: Registered table structure engines: ['docling_tableformer']\n",
+      "2025-12-02 19:44:40,325 INFO: Accelerator device: 'mps'\n",
+      "2025-12-02 19:44:41,261 INFO: Processing document Building+Machine+Learning+Systems+with+a+Feature+Store.pdf\n",
+      "2025-12-02 19:51:45,276 INFO: Finished converting document Building+Machine+Learning+Systems+with+a+Feature+Store.pdf in 491.52 sec.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Token indices sequence length is longer than the specified maximum sequence length for this model (1143 > 512). Running this sequence through the model will result in indexing errors\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded 1333 document chunks\n"
+     ]
+    }
+   ],
+   "source": [
+    "loader = DoclingLoader(\n",
+    "    file_path=PDF_PATH,\n",
+    "    export_type=EXPORT_TYPE,\n",
+    "    chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),\n",
+    ")\n",
+    "\n",
+    "docs = loader.load()\n",
+    "print(f\"Loaded {len(docs)} document chunks\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "page_content='Praise for Building Machine Learning Systems with a Feature Store\n",
+      "It' s easy to be lost in quality metrics land and forget about the crucial systems aspect to ML. Jim does a great job explaining those aspects and gives a lot of practical tips on how to survive a long deployment.\n",
+      "-Hannes Mühleisen, cocreator of DuckDB\n",
+      "Building machine learning systems in production has historically involved a lot of black magic and undocumented learnings. Jim Dowling is doing a great service to ML practitioners by sharing the best practices and putting together clear step-by-step guide.' metadata={'source': 'content/Building+Machine+Learning+Systems+with+a+Feature+Store.pdf', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/7', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 97.75, 't': 162.01999999999998, 'r': 432.0, 'b': 126.02999999999997, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 213]}]}, {'self_ref': '#/texts/8', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 264.75, 't': 122.13, 'r': 432.0, 'b': 110.03200000000004, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 38]}]}, {'self_ref': '#/texts/9', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 2, 'bbox': {'l': 81.2, 't': 608.02, 'r': 432.0, 'b': 572.03, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 256]}]}], 'headings': ['Praise for Building Machine Learning Systems with a Feature Store'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 2591788756701469466, 'filename': 'Building+Machine+Learning+Systems+with+a+Feature+Store.pdf'}}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(docs[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Created 1333 splits\n",
+      "Sample: Praise for Building Machine Learning Systems with a Feature Store\n",
+      "I witnessed the rise of feature st...\n"
+     ]
+    }
+   ],
+   "source": [
+    "if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
+    "    splits = docs\n",
+    "else:\n",
+    "    from langchain_text_splitters import MarkdownHeaderTextSplitter\n",
+    "    splitter = MarkdownHeaderTextSplitter(\n",
+    "        headers_to_split_on=[\n",
+    "            (\"#\", \"Header_1\"),\n",
+    "            (\"##\", \"Header_2\"),\n",
+    "            (\"###\", \"Header_3\"),\n",
+    "        ],\n",
+    "    )\n",
+    "    splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]\n",
+    "\n",
+    "print(f\"Created {len(splits)} splits\")\n",
+    "print(f\"Sample: {splits[0].page_content[:100]}...\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-12-02 19:52:07,229 INFO: Use pytorch device_name: mps\n",
+      "2025-12-02 19:52:07,232 INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2\n"
+     ]
+    }
+   ],
+   "source": [
+    "embeddings = SentenceTransformer(EMBED_MODEL_ID)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Batches: 100%|██████████| 42/42 [00:18<00:00,  2.31it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Created 1333 embeddings\n"
+     ]
+    }
+   ],
+   "source": [
+    "texts = [split.page_content for split in splits]\n",
+    "metadatas = [split.metadata for split in splits]\n",
+    "\n",
+    "vectors = embeddings.encode(texts, show_progress_bar=True, batch_size=32)\n",
+    "print(f\"Created {len(vectors)} embeddings\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-12-02 19:52:44,050 INFO: Initializing external client\n",
+      "2025-12-02 19:52:44,064 INFO: Base URL: https://c.app.hopsworks.ai:443\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "UserWarning: The installed hopsworks client version 4.4.2 may not be compatible with the connected Hopsworks backend version 4.2.2. \n",
+      "To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-12-02 19:52:47,302 INFO: Python Engine initialized.\n",
+      "\n",
+      "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1271977\n"
+     ]
+    }
+   ],
+   "source": [
+    "project = hopsworks.login()\n",
+    "fs = project.get_feature_store()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Created dataframe with 1333 rows\n"
+     ]
+    }
+   ],
+   "source": [
+    "data = []\n",
+    "for i, (text, vector, metadata) in enumerate(zip(texts, vectors, metadatas)):\n",
+    "    data.append({\n",
+    "        'id': i,\n",
+    "        'text': text,\n",
+    "        'page': metadata.get('page', metadata.get('page_number', 0)),\n",
+    "        'embedding': vector\n",
+    "    })\n",
+    "\n",
+    "df = pd.DataFrame(data)\n",
+    "print(f\"Created dataframe with {len(df)} rows\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Feature Group created successfully, explore it at \n",
+      "https://c.app.hopsworks.ai:443/p/1271977/fs/1258579/fg/1790385\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Uploading Dataframe: 100.00% |██████████| Rows 1333/1333 | Elapsed Time: 00:01 | Remaining Time: 00:00\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Launching job: book_embeddings_2_offline_fg_materialization\n",
+      "Job started successfully, you can follow the progress at \n",
+      "https://c.app.hopsworks.ai:443/p/1271977/jobs/named/book_embeddings_2_offline_fg_materialization/executions\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(Job('book_embeddings_2_offline_fg_materialization', 'SPARK'), None)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "book_fg = fs.get_or_create_feature_group(\n",
+    "    name=\"book_embeddings\",\n",
+    "    version=2,\n",
+    "    primary_key=[\"id\"],\n",
+    "    description=\"Book text chunks with embeddings\"\n",
+    ")\n",
+    "\n",
+    "book_fg.insert(df)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "rag_llm",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+# Runtime dependencies for app.py and the index_book.ipynb indexing notebook.
+gradio
+langchain
+langchain-community
+langchain-docling
+sentence-transformers
+# NOTE(review): the notebook run in this commit logged a warning that client
+# 4.4.2 may not be compatible with the 4.2.2 backend and suggested
+# 'pip install hopsworks==4.2.*' - confirm this pin against the backend in use.
+hopsworks[python] == 4.4.*
+llama-cpp-python
+python-dotenv
+langchain-text-splitters
+faiss-cpu
+numpy
+pandas