Kalpokoch committed
Commit fd37461 · 1 Parent(s): ade6daa

Implemented dynamic DB build and other app/docker changes

Files changed (4)
  1. .gitattributes +0 -35
  2. Dockerfile +7 -4
  3. app/app.py +11 -10
  4. app/policy_vector_db.py +47 -20
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile CHANGED
@@ -14,8 +14,11 @@ ENV TRANSFORMERS_CACHE=/app/.cache \
    HF_HOME=/app/.cache
RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache

- # Ensure ChromaDB can write its persistent database
- RUN mkdir -p /data/policy_vector_db && chmod -R 777 /data/policy_vector_db
+ # --- NEW: Copy the pre-built vector database ---
+ # Create the directory for the DB inside the container
+ RUN mkdir -p /app/vector_database && chmod -R 777 /app/vector_database
+ # Copy the contents of your local 'vector_database' into the container
+ COPY vector_database/ /app/vector_database/

# Copy only the requirements file to leverage Docker cache
COPY requirements.txt .
@@ -23,11 +26,11 @@ COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

- # Copy the rest of your application code
+ # Copy the rest of your application code (app/, processed_chunks.json, README.md, etc.)
COPY . .

# Expose the port the app runs on
EXPOSE 7860

# Command to run the FastAPI application
- CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]
+ CMD ["uvicorn", "app.app:app", "--host", "0.0.0.0", "--port", "7860"]
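
Note: a quick way to confirm that the baked-in database is actually readable at the path created above is to run a small check inside the built image. This is a minimal sketch, not part of the commit; it assumes the image was built with the COPY above and that chromadb is listed in requirements.txt:

import chromadb

# Open the copied database and confirm the expected collection is present and populated.
client = chromadb.PersistentClient(path="/app/vector_database")
collection = client.get_collection(name="neepco_dop_policies")
print("Chunks in baked-in DB:", collection.count())
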
app/app.py CHANGED
@@ -2,15 +2,17 @@ from fastapi import FastAPI, Request
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
- from policy_vector_db import PolicyVectorDB  # Make sure this is your local DB logic
- import chromadb
+ from app.policy_vector_db import PolicyVectorDB
+ import chromadb  # Only needed if chromadb is used directly later; PolicyVectorDB manages its own client.

# Create FastAPI app
app = FastAPI()

- # Load the vector database from /tmp (safe for Hugging Face Spaces)
+ # --- REVISED: Load the vector database from the path it was copied to inside the Docker container ---
print("Loading Vector Database...")
- db = PolicyVectorDB(persist_directory="/tmp/policy_vector_db")
+ # The path must match where the DB is copied in the Dockerfile
+ DB_PERSIST_DIRECTORY = "/app/vector_database"
+ db = PolicyVectorDB(persist_directory=DB_PERSIST_DIRECTORY)
print("Vector Database loaded successfully!")

# Load your quantized model from Hugging Face Hub
@@ -50,7 +52,8 @@ async def chat(query: Query):

    # Step 1: Vector DB search
    search_results = db.search(question)
-     context = "\n".join([res["content"] for res in search_results])
+     # --- FIX: Use the 'text' key, matching the dictionaries returned by search() in policy_vector_db.py ---
+     context = "\n".join([res["text"] for res in search_results])

    # Step 2: Build prompt
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
@@ -59,9 +62,7 @@ async def chat(query: Query):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

-     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     # --- REVISED: Decode only the newly generated tokens so the prompt is not echoed back ---
+     answer = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()

-     # Optionally strip out the prompt from the output
-     final_answer = answer.split("Answer:")[-1].strip()
-
-     return {"answer": final_answer}
+     return {"answer": answer}  # Return the directly decoded answer
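
Once the container is running, the revised handler can be smoke-tested with a plain HTTP call. This is a hedged usage sketch, not part of the commit: it assumes the chat handler above is mounted at /chat and that the Query model exposes a single question field, neither of which is visible in this diff:

import requests

# Hypothetical smoke test against the container started by the CMD in the Dockerfile (port 7860).
resp = requests.post(
    "http://localhost:7860/chat",
    json={"question": "What is the delegation of power for awarding contracts?"},
)
print(resp.json()["answer"])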
 
 
 
app/policy_vector_db.py CHANGED
@@ -1,6 +1,7 @@
import json
import os
- import shutil
+ import shutil  # Keep for potential cleanup during local testing, but not for deployment init
+ import torch   # Needed for the CUDA-availability check in PolicyVectorDB.__init__ below
from typing import List, Dict

import chromadb
@@ -8,15 +9,34 @@ from sentence_transformers import SentenceTransformer

class PolicyVectorDB:
    """Manages the creation and searching of a persistent vector database."""
-     def __init__(self, persist_directory: str = "/tmp/policy_vector_db"):
+     def __init__(self, persist_directory: str = "/app/policy_vector_db"):
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.collection_name = "neepco_dop_policies"
-         self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
-         self.collection = self.client.get_or_create_collection(
-             name=self.collection_name,
-             metadata={"description": "NEEPCO Delegation of Powers Policy"}
-         )
-         print(f"Loaded/Created persistent collection '{self.collection_name}' at '{persist_directory}'")
+         # Use 'cuda' for the embedding model if available, otherwise fall back to 'cpu'.
+         # Keep 'cpu' if you are sure about resource allocation.
+         self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cuda' if torch.cuda.is_available() else 'cpu')
+
+         # When loading a pre-existing DB, use get_or_create_collection cautiously:
+         # if the collection doesn't exist at the path, it creates an empty one.
+         # Since the DB is always pre-built here, get_collection is safer because it fails
+         # when the collection is missing instead of silently starting from an empty state.
+         try:
+             self.collection = self.client.get_collection(name=self.collection_name)
+             print(f"Successfully loaded existing collection '{self.collection_name}' from '{persist_directory}'")
+         except Exception as e:
+             # get_collection failing means the collection doesn't exist yet, which shouldn't
+             # happen if the DB was pre-built and copied correctly. Creating a fresh collection
+             # keeps the app running, but an empty collection here usually points to a problem
+             # with the pre-built DB or the path it was copied to.
+             print(f"Error loading collection '{self.collection_name}': {e}")
+             print("Attempting to create a new (likely empty) collection. Ensure your pre-built DB is copied correctly.")
+             self.collection = self.client.create_collection(
+                 name=self.collection_name,
+                 metadata={"description": "NEEPCO Delegation of Powers Policy"}
+             )
+
+         print(f"ChromaDB client initialized for collection '{self.collection_name}' at '{persist_directory}'")
+

    def _flatten_metadata(self, metadata: Dict) -> Dict:
        """Ensures all metadata values are strings for ChromaDB compatibility."""
@@ -24,6 +44,8 @@ class PolicyVectorDB:

    def add_chunks(self, chunks: List[Dict]):
        """Encodes and adds a list of chunk dictionaries to the database."""
+         # This method is primarily for the initial DB build, not for runtime use in the deployed RAG app.
+         # Keeping it here makes the class reusable for local rebuilds.
        if not chunks:
            print("No chunks provided to add.")
            return
@@ -40,7 +62,7 @@

        for i in range(0, len(new_chunks), batch_size):
            batch = new_chunks[i:i + batch_size]
-             print(f" - Processing batch {i//batch_size + 1}/{ -(-len(new_chunks) // batch_size) }...")
+             print(f" - Processing batch {i//batch_size + 1}/{ -(-len(new_chunks) // batch_size) }...")

            texts = [chunk['text'] for chunk in batch]
            ids = [chunk['id'] for chunk in batch]
@@ -73,38 +95,44 @@
            })
        return search_results

+ # --- REVISED: Remove database building logic from main for deployment ---
+ # This main function is used for the initial local build only.
+ # For deployment, the DB is pre-built and copied into the image.
def main():
-     """Main function to build and verify the vector database."""
+     """Main function to build and verify the vector database (for local pre-building)."""
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    INPUT_CHUNKS_PATH = os.path.join(BASE_DIR, "../processed_chunks.json")
    PERSIST_DIRECTORY = "/tmp/policy_vector_db"

    if not os.path.exists(INPUT_CHUNKS_PATH):
        print(f"FATAL ERROR: The input chunk file was not found at '{INPUT_CHUNKS_PATH}'")
-         print("Please run 'create_chunks.py' first.")
+         print("Please ensure 'processed_chunks.json' is in the root directory.")
        return

+     # Remove any existing local build directory to ensure a clean start
    if os.path.exists(PERSIST_DIRECTORY):
-         print(f"Removing existing database at '{PERSIST_DIRECTORY}' to ensure a clean build.")
+         print(f"Removing existing local build database at '{PERSIST_DIRECTORY}' to ensure a clean build.")
        shutil.rmtree(PERSIST_DIRECTORY)

    print(f"Creating database directory: '{PERSIST_DIRECTORY}'")
    os.makedirs(PERSIST_DIRECTORY, exist_ok=True)
-     os.chmod(PERSIST_DIRECTORY, 0o777)
+     os.chmod(PERSIST_DIRECTORY, 0o777)  # Ensure write permissions

    print("\nStep 1: Loading processed chunks...")
    with open(INPUT_CHUNKS_PATH, 'r', encoding='utf-8') as f:
        chunks_to_add = json.load(f)
    print(f"Loaded {len(chunks_to_add)} chunks.")

-     print("\nStep 2: Setting up persistent vector database...")
-     db = PolicyVectorDB(persist_directory=PERSIST_DIRECTORY)
+     print("\nStep 2: Setting up persistent vector database (local build)...")
+     db = PolicyVectorDB(persist_directory=PERSIST_DIRECTORY)  # Pass the local build path

    print("\nStep 3: Adding chunks to the database...")
    db.add_chunks(chunks_to_add)

    print(f"\n✅ Vector database setup complete. Total chunks in DB: {db.collection.count()}")
    print(f"Database is saved in: {os.path.abspath(PERSIST_DIRECTORY)}")
+     print("\n--- Important: Copy the contents of this directory (NOT the directory itself) into the 'vector_database' folder in the project root for deployment. ---")
+

    print("\n--- Running Verification Tests ---")
    test_questions = [
@@ -119,11 +147,11 @@ def main():
        search_results = db.search(question, top_k=2)
        if search_results:
            for j, result in enumerate(search_results, 1):
-                 print(f" Result {j} (Relevance: {result['relevance_score']:.4f}):")
-                 print(f" Text: {result['text'][:300]}...")
-                 print(f" Metadata: {result['metadata']}")
+                 print(f" Result {j} (Relevance: {result['relevance_score']:.4f}):")
+                 print(f" Text: {result['text'][:300]}...")
+                 print(f" Metadata: {result['metadata']}")
        else:
-             print(" No results found.")
+             print(" No results found.")

if __name__ == "__main__":
-     main()
+     main()
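
The final print in main() asks for the contents of /tmp/policy_vector_db to be copied into a vector_database/ folder at the project root so that the Dockerfile's COPY vector_database/ line can pick them up. A minimal sketch of that staging step, assuming it is run from the project root (the snippet below is illustrative, not part of the commit):

import shutil

# Copy the freshly built DB contents into the folder the Dockerfile bakes into the image.
shutil.copytree("/tmp/policy_vector_db", "vector_database", dirs_exist_ok=True)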