Sebastian Schmülling committed on
Commit
afd7f5e
·
0 Parent(s):

working RAG demo

Browse files
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ *.gguf
2
+ *.pdf
3
+ *.pyc
4
+ __pycache__/
5
+ .env
6
+ .ipynb_checkpoints/
7
+ venv/
8
+ .DS_Store
9
+ .content
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import hopsworks
3
+ from sentence_transformers import SentenceTransformer
4
+ from llama_cpp import Llama
5
+ import faiss
6
+ import numpy as np
7
+ import os
8
+ from dotenv import load_dotenv
9
+
10
# 1. Load environment variables and validate required configuration.
load_dotenv()

HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "your-username/your-model-repo")
MODEL_FILENAME = os.getenv("MODEL_FILENAME", "model.gguf")
# Version of the "book_embeddings" feature group to read. The default (1) is
# the value that used to be hard-coded here.
# NOTE(review): index_book.ipynb writes version 2 of this feature group;
# set FEATURE_GROUP_VERSION=2 if that is the data the app should serve.
FEATURE_GROUP_VERSION = int(os.getenv("FEATURE_GROUP_VERSION", "1"))

if not HOPSWORKS_API_KEY:
    raise ValueError("HOPSWORKS_API_KEY not found in environment variables.")


print("Initializing models and connecting to Hopsworks...")

# Pre-declare every global the request handlers read, so that a failure part
# way through initialization leaves defined placeholders instead of names that
# raise NameError later.
embeddings = None
texts = []
index = None
llm = None

try:
    # 2. Embedding model used to encode incoming queries.
    embeddings = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # 3. Read the stored text chunks and their embeddings from Hopsworks.
    project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
    fs = project.get_feature_store()
    book_fg = fs.get_feature_group("book_embeddings", version=FEATURE_GROUP_VERSION)

    df = book_fg.read()

    if df.empty:
        raise ValueError("Feature group 'book_embeddings' is empty.")

    texts = df['text'].tolist()
    # Each cell may be a plain list or an array-like exposing .tolist();
    # convert everything to lists before building one float32 matrix.
    raw_embeddings = [emb if isinstance(emb, list) else emb.tolist() for emb in df['embedding']]
    embedding_vectors = np.array(raw_embeddings, dtype='float32')

    # 4. Build an inner-product index over L2-normalized vectors.
    dimension = embedding_vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)

    faiss.normalize_L2(embedding_vectors)
    index.add(embedding_vectors)

    # 5. Load the GGUF language model.
    llm = Llama.from_pretrained(
        repo_id=MODEL_REPO_ID,
        filename=MODEL_FILENAME,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=-1,
        verbose=False
    )

    print("Initialization complete.")

except Exception as e:
    # Keep the module importable so the UI can report the failure; the
    # handlers check for None before using these objects.
    print(f"Critical Error during initialization: {e}")
    llm = None
    index = None
60
+
61
def retrieve_context(query, k=3):
    """Return the stored text chunks closest to ``query``.

    The query is encoded with the module-level ``embeddings`` model,
    L2-normalized, and searched against the module-level ``index``.

    Args:
        query: The user's question.
        k: Number of neighbours to request from the index.

    Returns:
        The matching chunks joined by blank lines, or an error message
        string when the index has not been initialized.
    """
    if index is None:
        return "Error: Search index not initialized."

    query_vec = embeddings.encode(query).astype('float32').reshape(1, -1)
    # Normalized in place, matching how the stored vectors were normalized.
    faiss.normalize_L2(query_vec)

    _scores, neighbor_ids = index.search(query_vec, k)

    # Keep only ids that point at a real entry in ``texts``; anything out of
    # range (including negative ids) is dropped.
    total = len(texts)
    hits = [texts[pos] for pos in neighbor_ids[0] if 0 <= pos < total]

    return "\n\n".join(hits)
76
+
77
def respond(message, history):
    """Stream an answer to ``message`` for ``gr.ChatInterface``.

    Looks up context for the message, builds a prompt around it, and yields
    the growing answer text after every chunk the model produces.

    Args:
        message: The user's latest chat message.
        history: Prior conversation supplied by ``gr.ChatInterface``; it is
            not used when building the prompt.

    Yields:
        The answer accumulated so far, or a single error message when the
        model is not loaded.
    """
    if llm is None:
        yield "System Error: Models failed to load. Check console logs."
        return

    context = retrieve_context(message, k=3)

    prompt = f"""Use the following context to answer the question. If you don't know the answer, say you don't know.

Context:
{context}

Question: {message}

Answer:"""

    token_stream = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        stop=["Question:", "\n\n"],
        stream=True
    )

    answer_so_far = ""
    for piece in token_stream:
        delta = piece["choices"][0]["text"]
        answer_so_far = answer_so_far + delta
        # Yield the whole text each time, not just the new delta.
        yield answer_so_far
111
+
112
# Assemble the Gradio UI: a title row followed by a chat interface that is
# driven by respond().
with gr.Blocks(title="Hopsworks RAG ChatBot") as demo:
    with gr.Row():
        # Logo is currently disabled:
        # gr.Image("images/hopsworks_image.jpeg", height=80, width=80, show_label=False, container=False)
        gr.Markdown("<h1>Hopsworks RAG ChatBot</h1>")

    # Components are created in the same order as before (chatbot, then
    # textbox) and handed to ChatInterface.
    chat_window = gr.Chatbot(height=500)
    question_box = gr.Textbox(placeholder="Ask a question about your Hopsworks...", container=False, scale=7)
    sample_questions = ["What is the main topic of the documents?", "Summarize the key points."]

    chat_interface = gr.ChatInterface(
        fn=respond,
        chatbot=chat_window,
        textbox=question_box,
        examples=sample_questions,
        cache_examples=False,
    )

# Launch with a public share link only when run as a script.
if __name__ == "__main__":
    demo.launch(share=True)
images/hopsworks_image.jpeg ADDED
index_book.ipynb ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/opt/anaconda3/envs/rag_llm/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import os\n",
19
+ "import hopsworks\n",
20
+ "from sentence_transformers import SentenceTransformer\n",
21
+ "import numpy as np\n",
22
+ "import pandas as pd\n",
23
+ "from langchain_docling import DoclingLoader\n",
24
+ "from langchain_docling.loader import ExportType\n",
25
+ "from docling.chunking import HybridChunker\n",
26
+ "\n",
27
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 2,
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "PDF_PATH = \"content/Building+Machine+Learning+Systems+with+a+Feature+Store.pdf\"\n",
37
+ "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
38
+ "EXPORT_TYPE = ExportType.DOC_CHUNKS"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 3,
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "2025-12-02 19:43:33,611 INFO: detected formats: [<InputFormat.PDF: 'pdf'>]\n",
51
+ "2025-12-02 19:43:33,861 INFO: Going to convert document batch...\n",
52
+ "2025-12-02 19:43:33,863 INFO: Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7\n",
53
+ "2025-12-02 19:43:33,913 WARNING: The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.\n",
54
+ "2025-12-02 19:43:33,914 INFO: Loading plugin 'docling_defaults'\n",
55
+ "2025-12-02 19:43:33,926 INFO: Registered picture descriptions: ['vlm', 'api']\n",
56
+ "2025-12-02 19:43:33,981 WARNING: The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.\n",
57
+ "2025-12-02 19:43:33,982 INFO: Loading plugin 'docling_defaults'\n",
58
+ "2025-12-02 19:43:34,010 INFO: Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']\n",
59
+ "2025-12-02 19:43:42,281 INFO: Auto OCR model selected ocrmac.\n",
60
+ "2025-12-02 19:43:42,299 WARNING: The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.\n",
61
+ "2025-12-02 19:43:42,299 INFO: Loading plugin 'docling_defaults'\n",
62
+ "2025-12-02 19:43:42,323 INFO: Registered layout engines: ['docling_layout_default', 'docling_experimental_table_crops_layout']\n",
63
+ "2025-12-02 19:43:42,347 INFO: Accelerator device: 'mps'\n",
64
+ "2025-12-02 19:43:57,889 WARNING: The plugin langchain_docling will not be loaded because Docling is being executed with allow_external_plugins=false.\n",
65
+ "2025-12-02 19:43:57,907 INFO: Loading plugin 'docling_defaults'\n",
66
+ "2025-12-02 19:43:57,919 INFO: Registered table structure engines: ['docling_tableformer']\n",
67
+ "2025-12-02 19:44:40,325 INFO: Accelerator device: 'mps'\n",
68
+ "2025-12-02 19:44:41,261 INFO: Processing document Building+Machine+Learning+Systems+with+a+Feature+Store.pdf\n",
69
+ "2025-12-02 19:51:45,276 INFO: Finished converting document Building+Machine+Learning+Systems+with+a+Feature+Store.pdf in 491.52 sec.\n"
70
+ ]
71
+ },
72
+ {
73
+ "name": "stderr",
74
+ "output_type": "stream",
75
+ "text": [
76
+ "Token indices sequence length is longer than the specified maximum sequence length for this model (1143 > 512). Running this sequence through the model will result in indexing errors\n"
77
+ ]
78
+ },
79
+ {
80
+ "name": "stdout",
81
+ "output_type": "stream",
82
+ "text": [
83
+ "Loaded 1333 document chunks\n"
84
+ ]
85
+ }
86
+ ],
87
+ "source": [
88
+ "loader = DoclingLoader(\n",
89
+ " file_path=PDF_PATH,\n",
90
+ " export_type=EXPORT_TYPE,\n",
91
+ " chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),\n",
92
+ ")\n",
93
+ "\n",
94
+ "docs = loader.load()\n",
95
+ "print(f\"Loaded {len(docs)} document chunks\")"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 11,
101
+ "metadata": {},
102
+ "outputs": [
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "page_content='Praise for Building Machine Learning Systems with a Feature Store\n",
108
+ "It' s easy to be lost in quality metrics land and forget about the crucial systems aspect to ML. Jim does a great job explaining those aspects and gives a lot of practical tips on how to survive a long deployment.\n",
109
+ "-Hannes Mühleisen, cocreator of DuckDB\n",
110
+ "Building machine learning systems in production has historically involved a lot of black magic and undocumented learnings. Jim Dowling is doing a great service to ML practitioners by sharing the best practices and putting together clear step-by-step guide.' metadata={'source': 'content/Building+Machine+Learning+Systems+with+a+Feature+Store.pdf', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/7', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 97.75, 't': 162.01999999999998, 'r': 432.0, 'b': 126.02999999999997, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 213]}]}, {'self_ref': '#/texts/8', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 264.75, 't': 122.13, 'r': 432.0, 'b': 110.03200000000004, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 38]}]}, {'self_ref': '#/texts/9', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 2, 'bbox': {'l': 81.2, 't': 608.02, 'r': 432.0, 'b': 572.03, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 256]}]}], 'headings': ['Praise for Building Machine Learning Systems with a Feature Store'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 2591788756701469466, 'filename': 'Building+Machine+Learning+Systems+with+a+Feature+Store.pdf'}}}\n"
111
+ ]
112
+ }
113
+ ],
114
+ "source": [
115
+ "print(docs[1])"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 4,
121
+ "metadata": {},
122
+ "outputs": [
123
+ {
124
+ "name": "stdout",
125
+ "output_type": "stream",
126
+ "text": [
127
+ "Created 1333 splits\n",
128
+ "Sample: Praise for Building Machine Learning Systems with a Feature Store\n",
129
+ "I witnessed the rise of feature st...\n"
130
+ ]
131
+ }
132
+ ],
133
+ "source": [
134
+ "if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
135
+ " splits = docs\n",
136
+ "else:\n",
137
+ " from langchain_text_splitters import MarkdownHeaderTextSplitter\n",
138
+ " splitter = MarkdownHeaderTextSplitter(\n",
139
+ " headers_to_split_on=[\n",
140
+ " (\"#\", \"Header_1\"),\n",
141
+ " (\"##\", \"Header_2\"),\n",
142
+ " (\"###\", \"Header_3\"),\n",
143
+ " ],\n",
144
+ " )\n",
145
+ " splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]\n",
146
+ "\n",
147
+ "print(f\"Created {len(splits)} splits\")\n",
148
+ "print(f\"Sample: {splits[0].page_content[:100]}...\")"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 5,
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "name": "stdout",
158
+ "output_type": "stream",
159
+ "text": [
160
+ "2025-12-02 19:52:07,229 INFO: Use pytorch device_name: mps\n",
161
+ "2025-12-02 19:52:07,232 INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2\n"
162
+ ]
163
+ }
164
+ ],
165
+ "source": [
166
+ "embeddings = SentenceTransformer(EMBED_MODEL_ID)"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 6,
172
+ "metadata": {},
173
+ "outputs": [
174
+ {
175
+ "name": "stderr",
176
+ "output_type": "stream",
177
+ "text": [
178
+ "Batches: 100%|██████████| 42/42 [00:18<00:00, 2.31it/s]\n"
179
+ ]
180
+ },
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "Created 1333 embeddings\n"
186
+ ]
187
+ }
188
+ ],
189
+ "source": [
190
+ "texts = [split.page_content for split in splits]\n",
191
+ "metadatas = [split.metadata for split in splits]\n",
192
+ "\n",
193
+ "vectors = embeddings.encode(texts, show_progress_bar=True, batch_size=32)\n",
194
+ "print(f\"Created {len(vectors)} embeddings\")"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 7,
200
+ "metadata": {},
201
+ "outputs": [
202
+ {
203
+ "name": "stdout",
204
+ "output_type": "stream",
205
+ "text": [
206
+ "2025-12-02 19:52:44,050 INFO: Initializing external client\n",
207
+ "2025-12-02 19:52:44,064 INFO: Base URL: https://c.app.hopsworks.ai:443\n"
208
+ ]
209
+ },
210
+ {
211
+ "name": "stderr",
212
+ "output_type": "stream",
213
+ "text": [
214
+ "\n",
215
+ "\n",
216
+ "UserWarning: The installed hopsworks client version 4.4.2 may not be compatible with the connected Hopsworks backend version 4.2.2. \n",
217
+ "To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'\n"
218
+ ]
219
+ },
220
+ {
221
+ "name": "stdout",
222
+ "output_type": "stream",
223
+ "text": [
224
+ "2025-12-02 19:52:47,302 INFO: Python Engine initialized.\n",
225
+ "\n",
226
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1271977\n"
227
+ ]
228
+ }
229
+ ],
230
+ "source": [
231
+ "project = hopsworks.login()\n",
232
+ "fs = project.get_feature_store()"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 8,
238
+ "metadata": {},
239
+ "outputs": [
240
+ {
241
+ "name": "stdout",
242
+ "output_type": "stream",
243
+ "text": [
244
+ "Created dataframe with 1333 rows\n"
245
+ ]
246
+ }
247
+ ],
248
+ "source": [
249
+ "data = []\n",
250
+ "for i, (text, vector, metadata) in enumerate(zip(texts, vectors, metadatas)):\n",
251
+ " data.append({\n",
252
+ " 'id': i,\n",
253
+ " 'text': text,\n",
254
+ " 'page': metadata.get('page', metadata.get('page_number', 0)),\n",
255
+ " 'embedding': vector\n",
256
+ " })\n",
257
+ "\n",
258
+ "df = pd.DataFrame(data)\n",
259
+ "print(f\"Created dataframe with {len(df)} rows\")"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 9,
265
+ "metadata": {},
266
+ "outputs": [
267
+ {
268
+ "name": "stdout",
269
+ "output_type": "stream",
270
+ "text": [
271
+ "Feature Group created successfully, explore it at \n",
272
+ "https://c.app.hopsworks.ai:443/p/1271977/fs/1258579/fg/1790385\n"
273
+ ]
274
+ },
275
+ {
276
+ "name": "stderr",
277
+ "output_type": "stream",
278
+ "text": [
279
+ "Uploading Dataframe: 100.00% |██████████| Rows 1333/1333 | Elapsed Time: 00:01 | Remaining Time: 00:00\n"
280
+ ]
281
+ },
282
+ {
283
+ "name": "stdout",
284
+ "output_type": "stream",
285
+ "text": [
286
+ "Launching job: book_embeddings_2_offline_fg_materialization\n",
287
+ "Job started successfully, you can follow the progress at \n",
288
+ "https://c.app.hopsworks.ai:443/p/1271977/jobs/named/book_embeddings_2_offline_fg_materialization/executions\n"
289
+ ]
290
+ },
291
+ {
292
+ "data": {
293
+ "text/plain": [
294
+ "(Job('book_embeddings_2_offline_fg_materialization', 'SPARK'), None)"
295
+ ]
296
+ },
297
+ "execution_count": 9,
298
+ "metadata": {},
299
+ "output_type": "execute_result"
300
+ }
301
+ ],
302
+ "source": [
303
+ "book_fg = fs.get_or_create_feature_group(\n",
304
+ " name=\"book_embeddings\",\n",
305
+ " version=2,\n",
306
+ " primary_key=[\"id\"],\n",
307
+ " description=\"Book text chunks with embeddings\"\n",
308
+ ")\n",
309
+ "\n",
310
+ "book_fg.insert(df)"
311
+ ]
312
+ }
313
+ ],
314
+ "metadata": {
315
+ "kernelspec": {
316
+ "display_name": "rag_llm",
317
+ "language": "python",
318
+ "name": "python3"
319
+ },
320
+ "language_info": {
321
+ "codemirror_mode": {
322
+ "name": "ipython",
323
+ "version": 3
324
+ },
325
+ "file_extension": ".py",
326
+ "mimetype": "text/x-python",
327
+ "name": "python",
328
+ "nbconvert_exporter": "python",
329
+ "pygments_lexer": "ipython3",
330
+ "version": "3.11.14"
331
+ }
332
+ },
333
+ "nbformat": 4,
334
+ "nbformat_minor": 2
335
+ }
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ langchain-community
4
+ langchain-docling
5
+ sentence-transformers
6
+ hopsworks[python] == 4.4.*
7
+ llama-cpp-python
8
+ python-dotenv
9
+ langchain-text-splitters
10
+ faiss-cpu
11
+ numpy
12
+ pandas
13
+
14
+