Kalpokoch committed (verified)
Commit 1e6e534 · Parent: c8871ab

Update create_granular_chunks.py

Files changed (1): create_granular_chunks.py (+91 −37)
create_granular_chunks.py CHANGED
@@ -2,20 +2,27 @@ import os
 import json
 import re
 from typing import List, Dict, Any
+import nltk
+
+# Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
+nltk.download('punkt')
 
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
-OUTPUT_FILE = "granular_chunks_final.jsonl"  # Keeping the filename consistent
+OUTPUT_FILE = "granular_chunks_final.jsonl"  # Keep filename consistent
+
 
 # --- Global State ---
 chunk_counter = 0
 
+
 def get_unique_id() -> str:
     """Returns a unique, incrementing ID for each chunk."""
     global chunk_counter
     chunk_counter += 1
     return f"chunk-{chunk_counter}"
 
+
 def create_chunk(context: Dict, text: str) -> Dict:
     """Creates a standardized chunk dictionary with rich metadata."""
     metadata = {
@@ -24,29 +31,29 @@ def create_chunk(context: Dict, text: str) -> Dict:
         "title": context.get("title"),
         "source_description": context.get("description"),
     }
+    # Add other primitive metadata keys
    for key, value in context.items():
         if key not in metadata and isinstance(value, (str, int, float, bool)):
             metadata[key] = value
-
+
     return {
         "id": get_unique_id(),
-        "text": text,
+        "text": text.strip(),
         "metadata": {k: v for k, v in metadata.items() if v is not None}
     }
 
+
 def format_delegation_text(delegation: Any) -> str:
     """
     Formats a delegation dictionary or string into a readable string.
-    --- ACCURACY FIX ---
-    This function now explicitly includes "NIL" or "---" values instead of skipping them.
-    This is crucial for the model to correctly answer questions about roles with no power.
+    Explicitly includes "NIL" or "---" to capture no-power cases.
     """
     if not isinstance(delegation, dict):
         return str(delegation)
-    # Use "is NIL" for None or "---", otherwise use "is [limit]"
     parts = [f"the limit for {auth} is {limit if limit and str(limit) != '---' else 'NIL'}" for auth, limit in delegation.items()]
     return ", ".join(parts) if parts else "No specific delegation provided."
 
+
 def format_remarks(remarks: Any) -> str:
     """Safely formats the 'remarks' field, handling various data types."""
     if isinstance(remarks, list):
@@ -60,83 +67,124 @@ def format_remarks(remarks: Any) -> str:
         return " ".join(remark_parts)
     return str(remarks)
 
+
 def build_descriptive_text(context: Dict) -> str:
     """
-    Intelligently builds a single, descriptive, natural language sentence
-    by combining all relevant fields from the context.
+    Builds a clear, descriptive, natural language text by combining fields.
+    Focused for best relevance and contextual richness.
     """
     text_parts = []
-
+
     if context.get("title"):
-        text_parts.append(f"Regarding the policy for '{context['title']}'")
+        text_parts.append(f"Regarding the policy '{context['title']}'")
 
     specific_desc = context.get('description') or context.get('method')
     if specific_desc and specific_desc != context.get('title'):
-        text_parts.append(f"specifically for '{specific_desc}'")
+        text_parts.append(f"specifically for '{specific_desc}'")
 
     if "delegation" in context:
         delegation_text = format_delegation_text(context["delegation"])
-        text_parts.append(f", the financial delegations are: {delegation_text}.")
+        text_parts.append(f", financial delegations are: {delegation_text}.")
     elif "composition" in context:
         composition_parts = []
         for item in context["composition"]:
             if isinstance(item, dict):
                 for role, members in item.items():
-                    member_text = f"the {role} is {members}" if isinstance(members, str) else f"the {role} are: {', '.join(members)}"
+                    member_text = (f"the {role} is {members}" if isinstance(members, str)
+                                   else f"the {role} are: {', '.join(members)}")
                     composition_parts.append(member_text)
         text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
-
+
     if "remarks" in context and context["remarks"]:
         remarks_text = format_remarks(context["remarks"])
         text_parts.append(f" Important remarks include: {remarks_text}")
 
-    return " ".join(text_parts)
+    # Join all parts into a flowing sentence
+    return " ".join(text_parts).strip()
+
+
+def split_text_into_chunks(text: str, max_char_length: int = 1500, overlap: int = 200) -> List[str]:
+    """
+    Splits a long text into smaller chunks with controlled overlap.
+    Uses sentence tokenization for natural splits.
+    """
+    text = text.strip()
+    if len(text) <= max_char_length:
+        return [text]
+
+    sentences = nltk.sent_tokenize(text)
+    chunks = []
+    current_chunk = ""
+
+    for sentence in sentences:
+        # +1 for space/newline likely added between sentences
+        if len(current_chunk) + len(sentence) + 1 <= max_char_length:
+            current_chunk += (" " + sentence) if current_chunk else sentence
+        else:
+            chunks.append(current_chunk.strip())
+            # Start next chunk with overlap from end of previous chunk (by characters)
+            if overlap < len(current_chunk):
+                current_chunk = current_chunk[-overlap:] + " " + sentence
+            else:
+                current_chunk = sentence
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    return chunks
+
 
 def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
     """
-    The definitive processing function. It traverses the JSON and uses a set of handlers
-    to create highly descriptive, self-contained chunks.
+    Processes a JSON policy entry and returns granular, context-rich chunks.
+    Applies recursive traversal and implements chunk size limiting.
     """
     context = {**(parent_context or {}), **data}
     chunks = []
 
-    # --- Handler 1: Simple Item Lists (e.g., Annexure A, Financial Concurrence) ---
+    # Handler 1: Simple Item Lists (ex: rules, exclusions)
     list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
     if list_key:
         base_title = context.get('title', 'a policy')
         for item in data[list_key]:
             if isinstance(item, str):
-                chunks.append(create_chunk(context, f"A rule regarding '{base_title}' is: {item}."))
+                # Build chunk text with clear descriptive prefix for relevance
+                text = f"A rule regarding '{base_title}' is: {item}."
+                # Split if too long
+                for sub_chunk in split_text_into_chunks(text):
+                    chunks.append(create_chunk(context, sub_chunk))
         return chunks
 
-    # --- Handler 2: Recursive Traversal ---
+    # Handler 2: Recursive traversal for nested dictionaries/lists
     has_recursed = False
     for key, value in data.items():
         if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
             for item in value:
                 chunks.extend(process_entry(item, context))
             has_recursed = True
-
-    # --- Handler 3: Leaf Node Creation ---
+
+    # Handler 3: Leaf nodes with delegation, composition or description
     if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
         text = build_descriptive_text(context)
-        chunks.append(create_chunk(context, text))
+        # Split long descriptive text intelligently
+        for chunk_text in split_text_into_chunks(text):
+            chunks.append(create_chunk(context, chunk_text))
 
     return chunks
 
+
 def main():
-    """Main function to read, process, and write."""
-    print(f"Starting to process '{INPUT_FILE}' with the definitive chunking strategy...")
+    """Main orchestration to read input, process, and write chunks."""
+    print(f"Starting to process '{INPUT_FILE}' for improved granular chunking...")
     all_chunks = []
-
+
     try:
         with open(INPUT_FILE, 'r', encoding='utf-8') as f:
             for i, line in enumerate(f):
                 try:
                     data = json.loads(line)
-                    processed_chunks = process_entry(data)
-                    if processed_chunks:
-                        all_chunks.extend(processed_chunks)
+                    processed = process_entry(data)
+                    if processed:
+                        all_chunks.extend(processed)
                 except json.JSONDecodeError:
                     print(f"Warning: Skipping malformed JSON on line {i+1}")
                     continue
@@ -144,17 +192,23 @@ def main():
         print(f"Error: Input file '{INPUT_FILE}' not found.")
         return
 
-    print(f"Deconstructed into {len(all_chunks)} highly descriptive chunks.")
+    print(f"Generated {len(all_chunks)} chunks before deduplication.")
+
+    # Deduplicate by text content (retaining last occurrences)
+    unique_chunks_map = {}
+    for chunk in all_chunks:
+        unique_chunks_map[chunk['text']] = chunk
 
-    # Remove duplicates before writing
-    unique_chunks = {chunk['text']: chunk for chunk in all_chunks}.values()
-    print(f"Removed duplicates, writing {len(unique_chunks)} unique chunks.")
+    unique_chunks = list(unique_chunks_map.values())
+    print(f"{len(unique_chunks)} unique chunks after deduplication.")
 
-    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
+    # Write output in JSONL format for later vector DB ingestion
+    with open(OUTPUT_FILE, 'w', encoding='utf-8') as outf:
         for chunk in unique_chunks:
-            f.write(json.dumps(chunk) + '\n')
+            outf.write(json.dumps(chunk, ensure_ascii=False) + "\n")
+
 
-    print(f"Successfully created improved granular chunks file: '{OUTPUT_FILE}'")
+    print(f"Successfully wrote improved granular chunks to '{OUTPUT_FILE}'.")
 
 
 if __name__ == "__main__":
     main()
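For reference, a minimal sketch of how the newly added split_text_into_chunks behaves, assuming the script is importable as create_granular_chunks and that the module-level nltk.download('punkt') suffices for your NLTK version; the tiny limit and sample text below are illustrative only, not values the script uses:

# Illustrative only: a small max_char_length makes the overlap visible.
# Importing the module also triggers its nltk.download('punkt') call.
from create_granular_chunks import split_text_into_chunks

text = ("The committee approves minor works. "
        "The GM approves major works. "
        "The Board approves everything else.")

for piece in split_text_into_chunks(text, max_char_length=40, overlap=10):
    print(repr(piece))

# Splits land on sentence boundaries where possible; each new chunk is
# seeded with the last `overlap` characters of the previous one, so a
# chunk can slightly exceed max_char_length after the carry-over.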
 
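For context on the output, each line of granular_chunks_final.jsonl is one chunk dict as assembled by create_chunk, after deduplication by text. A hypothetical record, shown only for shape — the title and delegation values are invented, and real metadata may carry additional primitive keys copied from the source entry:

# Hypothetical record shape; actual values come from combined_context.jsonl.
# The space before the comma reflects how build_descriptive_text joins its
# parts with " ".
example_record = {
    "id": "chunk-1",
    "text": "Regarding the policy 'Works Contracts' , financial delegations "
            "are: the limit for GM is 50 lakh, the limit for DGM is NIL.",
    "metadata": {"title": "Works Contracts"},
}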