Kalpokoch committed on
Commit ce750f8 · verified · 1 Parent(s): 7e6e5a8

Update app/policy_vector_db.py

Files changed (1)
  1. app/policy_vector_db.py +40 -263
app/policy_vector_db.py CHANGED
@@ -1,16 +1,11 @@
  import os
  import json
  import torch
- import re
- import hashlib
- from typing import List, Dict, Optional, Tuple
+ from typing import List, Dict
  from sentence_transformers import SentenceTransformer
  import chromadb
  from chromadb.config import Settings
  import logging
- import multiprocessing as mp
- from concurrent.futures import ThreadPoolExecutor
- import numpy as np

  # --- Basic Logging Setup ---
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -18,50 +13,28 @@ logger = logging.getLogger(__name__)

  class PolicyVectorDB:
      """
-     Enhanced vector database for policy documents with metadata-aware search capabilities.
-     Optimized for CPU utilization.
+     Manages the connection, population, and querying of a ChromaDB vector database
+     for policy documents.
      """
      def __init__(self, persist_directory: str, top_k_default: int = 5, relevance_threshold: float = 0.5):
          self.persist_directory = persist_directory
          self.client = chromadb.PersistentClient(path=persist_directory, settings=Settings(allow_reset=True))
          self.collection_name = "neepco_dop_policies"

-         # Optimize CPU usage
-         self.cpu_count = mp.cpu_count()
-         torch.set_num_threads(self.cpu_count)
-
-         logger.info(f"Detected {self.cpu_count} CPU cores, optimizing threading...")
+         # Using a powerful open-source embedding model.
+         # Change 'cpu' to 'cuda' if a GPU is available for significantly faster embedding.
          logger.info("Loading embedding model 'BAAI/bge-large-en-v1.5'. This may take a moment...")
-
-         # Optimize model loading for CPU
-         self.embedding_model = SentenceTransformer(
-             'BAAI/bge-large-en-v1.5',
-             device='cpu',
-             # Use all available CPU cores for inference
-             model_kwargs={'torch_dtype': torch.float32}
-         )
-
-         # Set model to use optimized CPU inference
-         self.embedding_model.max_seq_length = 512  # Reduce context length for speed
-
+         self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
          logger.info("Embedding model loaded successfully.")

-         self.collection = None
+         self.collection = None  # Initialize collection as None for lazy loading
          self.top_k_default = top_k_default
          self.relevance_threshold = relevance_threshold
-
-         # Thread pool for parallel processing
-         self.thread_pool = ThreadPoolExecutor(max_workers=self.cpu_count)
-
-         # Add monetary normalization for queries
-         self.money_patterns = {
-             r'(\d+(?:,\d+)*(?:\.\d+)?)\s*crore': lambda x: float(x.replace(',', '')) * 1e7,
-             r'(\d+(?:,\d+)*(?:\.\d+)?)\s*lakh': lambda x: float(x.replace(',', '')) * 1e5,
-             r'₹\s*(\d+(?:,\d+)*(?:\.\d+)?)': lambda x: float(x.replace(',', ''))
-         }

      def _get_collection(self):
-         """Retrieves or creates the ChromaDB collection. Implements lazy loading."""
+         """
+         Retrieves or creates the ChromaDB collection. Implements lazy loading.
+         """
          if self.collection is None:
              self.collection = self.client.get_or_create_collection(
                  name=self.collection_name,
@@ -70,90 +43,13 @@ class PolicyVectorDB:
          return self.collection

      def _flatten_metadata(self, metadata: Dict) -> Dict:
-         """Ensures all metadata values are strings, as required by ChromaDB."""
-         flattened = {}
-         for key, value in metadata.items():
-             if isinstance(value, (dict, list)):
-                 # Convert complex structures to JSON strings
-                 flattened[key] = json.dumps(value, ensure_ascii=False)
-             elif value is not None:
-                 flattened[key] = str(value)
-         return flattened
+         """Ensures all metadata values are strings, as required by some ChromaDB versions."""
+         return {key: str(value) for key, value in metadata.items()}
-
-     def _extract_query_entities(self, query: str) -> Dict[str, any]:
-         """Extract structured entities from user queries for better filtering."""
-         entities = {
-             'monetary_values': [],
-             'roles': [],
-             'sections': [],
-             'keywords': []
-         }
-
-         # Extract monetary amounts
-         for pattern, converter in self.money_patterns.items():
-             matches = re.finditer(pattern, query, re.IGNORECASE)
-             for match in matches:
-                 try:
-                     value = converter(match.group(1))
-                     entities['monetary_values'].append(value)
-                 except:
-                     pass
-
-         # Extract common roles
-         role_patterns = [
-             r'\b(CMD|Chairman|Managing Director)\b',
-             r'\b(Director|D\([PT]\)|D\(P\)|D\(T\))\b',
-             r'\b(ED|Executive Director)\b',
-             r'\b(CGM|Chief General Manager)\b',
-             r'\b(GM|General Manager)\b',
-             r'\b(DGM|Deputy General Manager)\b',
-             r'\b(Sr\.?\s*M|Senior Manager)\b'
-         ]
-
-         for pattern in role_patterns:
-             matches = re.finditer(pattern, query, re.IGNORECASE)
-             entities['roles'].extend([match.group() for match in matches])
-
-         # Extract section references
-         section_matches = re.finditer(r'\b(Section|Annexure)\s*([IVX]+|[A-Z])\b', query, re.IGNORECASE)
-         entities['sections'].extend([match.group() for match in section_matches])
-
-         return entities
-
-     def _encode_batch_parallel(self, texts: List[str]) -> np.ndarray:
-         """Parallel encoding of text batches for better CPU utilization."""
-         # Split texts into smaller batches for parallel processing
-         batch_size = max(1, len(texts) // self.cpu_count)
-         if len(texts) <= batch_size:
-             return self.embedding_model.encode(
-                 texts,
-                 normalize_embeddings=True,
-                 show_progress_bar=False,
-                 batch_size=32,  # Optimize batch size for CPU
-                 convert_to_numpy=True
-             )
-
-         # Process in parallel batches
-         batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
-
-         def encode_batch(batch):
-             return self.embedding_model.encode(
-                 batch,
-                 normalize_embeddings=True,
-                 show_progress_bar=False,
-                 batch_size=16,
-                 convert_to_numpy=True
-             )
-
-         # Use thread pool for parallel encoding
-         futures = [self.thread_pool.submit(encode_batch, batch) for batch in batches]
-         results = [future.result() for future in futures]
-
-         # Concatenate results
-         return np.vstack(results) if results else np.array([])

      def add_chunks(self, chunks: List[Dict]):
-         """Enhanced chunk addition with better metadata handling and parallel processing."""
+         """
+         Adds a list of chunks to the vector database, skipping any that already exist.
+         """
          collection = self._get_collection()
          if not chunks:
              logger.info("No chunks provided to add.")
@@ -174,9 +70,8 @@ class PolicyVectorDB:

          logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")

-         # Optimized batch size for CPU processing
-         batch_size = min(64, max(16, len(new_chunks) // 4))
-
+         # Process in batches for efficiency
+         batch_size = 32  # Reduced batch size for potentially large embeddings
          for i in range(0, len(new_chunks), batch_size):
              batch = new_chunks[i:i + batch_size]

@@ -184,168 +79,56 @@ class PolicyVectorDB:
              texts = [chunk['text'] for chunk in batch]
              metadatas = [self._flatten_metadata(chunk.get('metadata', {})) for chunk in batch]

-             # Use parallel encoding
-             embeddings = self._encode_batch_parallel(texts).tolist()
+             # For BGE models, it's recommended not to add instructions to the document embeddings
+             embeddings = self.embedding_model.encode(texts, normalize_embeddings=True, show_progress_bar=False).tolist()

              collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
              logger.info(f"Added batch {i//batch_size + 1}/{(len(new_chunks) + batch_size - 1) // batch_size}")

          logger.info(f"Finished adding {len(new_chunks)} chunks.")

-     def search(self, query_text: str, top_k: int = None, filters: Dict = None) -> List[Dict]:
+     def search(self, query_text: str, top_k: int = None) -> List[Dict]:
          """
-         Enhanced search with metadata filtering and entity extraction.
-         Optimized for CPU performance.
+         Searches the vector database for a given query text.
+         Returns a list of results filtered by a relevance threshold.
          """
          collection = self._get_collection()

-         # Extract entities from query for potential filtering
-         entities = self._extract_query_entities(query_text)
-
-         # Build metadata filters
-         where_conditions = {}
-         if filters:
-             where_conditions.update(filters)
-
-         # Add entity-based filters
-         if entities['roles']:
-             # Filter by role if mentioned in query
-             where_conditions["role"] = {"$in": entities['roles']}
-
-         if entities['sections']:
-             # Filter by section if mentioned
-             where_conditions["section"] = {"$in": [s.split()[-1] for s in entities['sections']]}
-
+         # IMPROVEMENT: Add the recommended instruction prefix for BGE retrieval models.
          instructed_query = f"Represent this sentence for searching relevant passages: {query_text}"

-         # Optimized single query encoding
-         query_embedding = self.embedding_model.encode(
-             [instructed_query],
-             normalize_embeddings=True,
-             show_progress_bar=False,
-             batch_size=1,
-             convert_to_numpy=True
-         ).tolist()
+         # IMPROVEMENT: Normalize embeddings for more accurate similarity search.
+         query_embedding = self.embedding_model.encode([instructed_query], normalize_embeddings=True).tolist()

          k = top_k if top_k is not None else self.top_k_default

-         # Perform search with metadata filtering
-         search_params = {
-             "query_embeddings": query_embedding,
-             "n_results": k * 3,  # Get more for filtering
-             "include": ["documents", "metadatas", "distances"]
-         }
-
-         if where_conditions:
-             search_params["where"] = where_conditions
-
-         results = collection.query(**search_params)
+         # Retrieve more results initially to allow for filtering
+         results = collection.query(
+             query_embeddings=query_embedding,
+             n_results=k * 2,  # Retrieve more to filter by threshold
+             include=["documents", "metadatas", "distances"]
+         )

          search_results = []
          if results and results.get('documents') and results['documents'][0]:
              for i, doc in enumerate(results['documents'][0]):
+                 # The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
                  relevance_score = 1 - results['distances'][0][i]

                  if relevance_score >= self.relevance_threshold:
-                     result = {
+                     search_results.append({
                          'text': doc,
                          'metadata': results['metadatas'][0][i],
                          'relevance_score': relevance_score
-                     }
-
-                     # Add monetary filtering if amounts mentioned in query
-                     if entities['monetary_values'] and 'limit_normalized' in results['metadatas'][0][i]:
-                         try:
-                             chunk_limit = float(results['metadatas'][0][i]['limit_normalized'])
-                             query_amount = max(entities['monetary_values'])
-
-                             # Boost relevance if the limit is appropriate for the query amount
-                             if chunk_limit >= query_amount:
-                                 result['relevance_score'] += 0.1  # Small boost for relevant limits
-                         except:
-                             pass
-
-                     search_results.append(result)
-
-         return sorted(search_results, key=lambda x: x['relevance_score'], reverse=True)[:k]
-
-     def search_with_context(self, query_text: str, top_k: int = None, include_related: bool = True) -> List[Dict]:
-         """
-         Search with automatic inclusion of related/parent chunks for better context.
-         """
-         primary_results = self.search(query_text, top_k)
-
-         if not include_related or not primary_results:
-             return primary_results
-
-         # Find related chunks based on parent_id relationships
-         related_ids = set()
-         for result in primary_results:
-             metadata = result['metadata']
-             parent_id = metadata.get('parent_id')
-             if parent_id:
-                 related_ids.add(parent_id)
-
-         if related_ids:
-             collection = self._get_collection()
-             try:
-                 related_chunks = collection.get(
-                     ids=list(related_ids),
-                     include=["documents", "metadatas"]
-                 )
-
-                 for i, doc in enumerate(related_chunks['documents']):
-                     primary_results.append({
-                         'text': doc,
-                         'metadata': related_chunks['metadatas'][i],
-                         'relevance_score': 0.3,  # Lower score for context
-                         'is_context': True
                      })
-             except Exception as e:
-                 logger.warning(f"Could not retrieve related chunks: {e}")

-         return sorted(primary_results, key=lambda x: x['relevance_score'], reverse=True)
-
-     def search_by_amount(self, amount: float, comparison: str = ">=", top_k: int = None) -> List[Dict]:
-         """Search for delegation limits based on monetary amount."""
-         collection = self._get_collection()
-
-         where_condition = {}
-         if comparison == ">=":
-             where_condition = {"limit_normalized": {"$gte": amount}}
-         elif comparison == "<=":
-             where_condition = {"limit_normalized": {"$lte": amount}}
-         elif comparison == "==":
-             where_condition = {"limit_normalized": {"$eq": amount}}
-
-         try:
-             results = collection.get(
-                 where=where_condition,
-                 include=["documents", "metadatas"]
-             )
-
-             search_results = []
-             if results and results.get('documents'):
-                 for i, doc in enumerate(results['documents']):
-                     search_results.append({
-                         'text': doc,
-                         'metadata': results['metadatas'][i],
-                         'relevance_score': 1.0  # Perfect match for structured query
-                     })
-
-             k = top_k if top_k is not None else self.top_k_default
-             return search_results[:k]
-         except Exception as e:
-             logger.warning(f"Error in search_by_amount: {e}")
-             return []
-
-     def __del__(self):
-         """Cleanup thread pool on deletion."""
-         if hasattr(self, 'thread_pool'):
-             self.thread_pool.shutdown(wait=False)
+         # Sort by relevance score and return the top_k results
+         return sorted(search_results, key=lambda x: x['relevance_score'], reverse=True)[:k]

  def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str) -> bool:
-     """Checks if the DB is empty and populates it from a JSONL file if needed."""
+     """
+     Checks if the DB is empty and populates it from a JSONL file if needed.
+     """
      try:
          if db_instance._get_collection().count() > 0:
              logger.info("Vector database already contains data. Skipping population.")
@@ -368,15 +151,9 @@ def ensure_db_populated(db_instance: PolicyVectorDB, chunks_file_path: str) -> b
              logger.warning(f"Chunks file at '{chunks_file_path}' is empty or invalid. No data to add.")
              return False

-         # Process in batches to avoid memory issues
-         batch_size = 500
-         for i in range(0, len(chunks_to_add), batch_size):
-             batch = chunks_to_add[i:i + batch_size]
-             db_instance.add_chunks(batch)
-             logger.info(f"Processed batch {i//batch_size + 1}/{(len(chunks_to_add) + batch_size - 1) // batch_size}")
-
+         db_instance.add_chunks(chunks_to_add)
          logger.info("Vector database population attempt complete.")
          return True
      except Exception as e:
          logger.error(f"An error occurred during DB population check: {e}", exc_info=True)
-         return False
+         return False
 
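For reference, a minimal usage sketch of the simplified class after this change. The import path, persist directory, chunks file path, chunk schema, and query string below are illustrative assumptions, not values taken from this repository:

# Hypothetical usage sketch of PolicyVectorDB after this commit.
# Paths and the query are placeholders; the module is assumed importable
# as app.policy_vector_db, matching the file path in this diff.
from app.policy_vector_db import PolicyVectorDB, ensure_db_populated

db = PolicyVectorDB(
    persist_directory="./chroma_db",   # where ChromaDB persists its data
    top_k_default=5,
    relevance_threshold=0.5,
)

# Populate the collection once from a JSONL file of pre-chunked policy text
# (each record needs at least the 'text' and 'metadata' fields used by add_chunks;
# the exact id field lives in a part of the file collapsed in this diff).
ensure_db_populated(db, chunks_file_path="data/policy_chunks.jsonl")

# Queries get the BGE instruction prefix and normalized embeddings internally;
# results scoring below relevance_threshold are dropped.
for hit in db.search("Who approves works contracts above Rs. 5 crore?", top_k=3):
    print(f"{hit['relevance_score']:.3f}", hit['metadata'], hit['text'][:80])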