Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files Files

Kalpokoch committed on Aug 21

Commit

1201e66

verified ·

1 Parent(s): acd510f

Update create_granular_chunks.py

Browse files

Files changed (1) hide show

create_granular_chunks.py +218 -208

create_granular_chunks.py CHANGED Viewed

@@ -1,231 +1,241 @@
-# create_granular_chunks.py
-import os
 import json
 import re
-from typing import List, Dict, Any
-import nltk
-# --- Tokenizer Import ---
-import tiktoken  # pip install tiktoken
-# Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
-nltk.download('punkt')
-# --- Configuration ---
-INPUT_FILE = "combined_context.jsonl"
-OUTPUT_FILE = "granular_chunks_final.jsonl"
-# Token-based chunking parameters (typical LLM embedding context ~512 tokens)
-MAX_TOKENS = 400
-OVERLAP_TOKENS = 50
-TOKENIZER_MODEL = "cl100k_base"  # use "cl100k_base" for OpenAI, adjust as needed
-# --- Keyword Enhancement ---
-FINANCIAL_KEYWORDS = [
-    "₹", "INR", "crore", "lakh", "limit", "delegation", "expenditure", "budget", "revenue", "capital",
-    "surplus", "investment", "write-off", "dividend", "pay", "salary", "contract value"
-]
-AUTHORITY_KEYWORDS = [
-    "CMD", "Chairman", "Board", "Director", "ED", "Executive Director", "CGM", "GM", "DGM", "Sr. M",
-    "Manager", "HOD", "Head of Finance", "Finance Head", "Project Head"
-]
-def get_encoding():
-    return tiktoken.get_encoding(TOKENIZER_MODEL)
-# --- Global State ---
-chunk_counter = 0
-def get_unique_id() -> str:
-    global chunk_counter
-    chunk_counter += 1
-    return f"chunk-{chunk_counter}"
-def enhance_chunk_with_keywords(text: str, metadata: dict) -> dict:
-    """Add keywords (financial and authority) to metadata if present in text."""
-    present_financial = [kw for kw in FINANCIAL_KEYWORDS if kw.lower() in text.lower()]
-    present_authority = [kw for kw in AUTHORITY_KEYWORDS if kw.lower() in text.lower()]
-    if present_financial:
-        metadata['financial_keywords'] = present_financial
-    if present_authority:
-        metadata['authority_keywords'] = present_authority
-    return metadata
-def create_chunk(context: Dict, text: str) -> Dict:
-    """Creates a standardized chunk dictionary with rich metadata."""
-    metadata = {
-        "section": context.get("section"),
-        "clause": context.get("clause") or context.get("Clause"),
-        "title": context.get("title"),
-        "source_description": context.get("description"),
-    }
-    for key, value in context.items():
-        if key not in metadata and isinstance(value, (str, int, float, bool)):
-            metadata[key] = value
-    # --- Keyword Enhancement ---
-    metadata = enhance_chunk_with_keywords(text, metadata)
-    return {
-        "id": get_unique_id(),
-        "text": text.strip(),
-        "metadata": {k: v for k, v in metadata.items() if v is not None}
-    }
-def format_delegation_text(delegation: Any) -> str:
-    if not isinstance(delegation, dict):
-        return str(delegation)
-    parts = [f"the limit for {auth} is {limit if limit and str(limit) != '---' else 'NIL'}"
-             for auth, limit in delegation.items()]
-    return ", ".join(parts) if parts else "No specific delegation provided."
-def format_remarks(remarks: Any) -> str:
-    if isinstance(remarks, list):
-        remark_parts = []
-        for item in remarks:
-            if isinstance(item, dict):
-                for key, value in item.items():
-                    remark_parts.append(f"{key}: {value}")
-            else:
-                remark_parts.append(str(item))
-        return " ".join(remark_parts)
-    return str(remarks)
-def build_descriptive_text(context: Dict) -> str:
-    text_parts = []
-    if context.get("title"):
-        text_parts.append(f"Regarding the policy '{context['title']}'")
-    specific_desc = context.get('description') or context.get('method')
-    if specific_desc and specific_desc != context.get('title'):
-        text_parts.append(f"specifically for '{specific_desc}'")
-    if "delegation" in context:
-        delegation_text = format_delegation_text(context["delegation"])
-        text_parts.append(f", financial delegations are: {delegation_text}.")
-    elif "composition" in context:
-        composition_parts = []
-        for item in context["composition"]:
-            if isinstance(item, dict):
-                for role, members in item.items():
-                    member_text = (f"the {role} is {members}" if isinstance(members, str)
-                                   else f"the {role} are: {', '.join(members)}")
-                    composition_parts.append(member_text)
-        text_parts.append(f", the composition is: {'; '.join(composition_parts)}.")
-    if "remarks" in context and context["remarks"]:
-        remarks_text = format_remarks(context["remarks"])
-        text_parts.append(f" Important remarks include: {remarks_text}")
-    return " ".join(text_parts).strip()
-def count_tokens(text: str) -> int:
-    encoding = get_encoding()
-    return len(encoding.encode(text))
-def get_token_overlap(text: str, overlap_tokens: int) -> str:
-    """Return the last `overlap_tokens` worth of text from the input string."""
-    encoding = get_encoding()
-    tokens = encoding.encode(text)
-    if len(tokens) <= overlap_tokens:
-        return text
-    # Decode only the last overlap_tokens tokens
-    overlapped = encoding.decode(tokens[-overlap_tokens:])
-    # Remove possible split word inconsistencies by finding last complete sentence
-    # This is optional: can simply return overlapped
-    last_period = overlapped.rfind('.')
-    if last_period != -1 and last_period < len(overlapped) - 2:
-        return overlapped[last_period+1:].strip()
-    return overlapped.strip()
-def split_text_by_tokens(text: str, max_tokens: int = MAX_TOKENS, overlap_tokens: int = OVERLAP_TOKENS) -> List[str]:
-    """Split text into chunks based on token count, with specified overlap."""
-    encoding = get_encoding()
-    sents = nltk.tokenize.sent_tokenize(text, language='english')
     chunks = []
     current_chunk = ""
     current_tokens = 0
-    for sentence in sents:
-        sentence_tokens = len(encoding.encode(sentence))
-        if current_tokens + sentence_tokens <= max_tokens:
-            current_chunk += (" " + sentence) if current_chunk else sentence
-            current_tokens += sentence_tokens
-        else:
             chunks.append(current_chunk.strip())
-            # Overlap logic
-            if overlap_tokens < current_tokens:
-                overlap_text = get_token_overlap(current_chunk, overlap_tokens)
                 current_chunk = overlap_text + " " + sentence
-                current_tokens = len(encoding.encode(current_chunk))
             else:
                 current_chunk = sentence
-                current_tokens = sentence_tokens
-    if current_chunk:
         chunks.append(current_chunk.strip())
     return chunks
-def process_entry(data: Dict, parent_context: Dict = None) -> List[Dict]:
-    context = {**(parent_context or {}), **data}
-    chunks = []
-    # Handler 1: Simple Item Lists
-    list_key = next((key for key in ["items", "exclusions"] if key in data and isinstance(data.get(key), list)), None)
-    if list_key:
-        base_title = context.get('title', 'a policy')
-        for item in data[list_key]:
-            if isinstance(item, str):
-                text = f"A rule regarding '{base_title}' is: {item}."
-                for sub_chunk in split_text_by_tokens(text):
-                    chunks.append(create_chunk(context, sub_chunk))
-        return chunks
-    # Handler 2: Recursive traversal for nested dicts/lists
-    has_recursed = False
-    for key, value in data.items():
-        if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
-            for item in value:
-                chunks.extend(process_entry(item, context))
-            has_recursed = True
-    # Handler 3: Leaf nodes with delegation, composition or description
-    if not has_recursed and ("delegation" in data or "composition" in data or "description" in data):
-        text = build_descriptive_text(context)
-        for chunk_text in split_text_by_tokens(text):
-            chunks.append(create_chunk(context, chunk_text))
-    return chunks
-def main():
-    print(f"Starting to process '{INPUT_FILE}' with token-based chunking and keyword enhancement...")
     all_chunks = []
     try:
-        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
-            for i, line in enumerate(f):
                 try:
-                    data = json.loads(line)
-                    processed = process_entry(data)
-                    if processed:
-                        all_chunks.extend(processed)
-                except json.JSONDecodeError:
-                    print(f"Warning: Skipping malformed JSON on line {i+1}")
                     continue
     except FileNotFoundError:
-        print(f"Error: Input file '{INPUT_FILE}' not found.")
         return
     print(f"Generated {len(all_chunks)} chunks before deduplication.")
-    # Deduplicate by text content (retaining last occurrences)
-    unique_chunks_map = {}
-    for chunk in all_chunks:
-        unique_chunks_map[chunk['text']] = chunk
-    unique_chunks = list(unique_chunks_map.values())
-    print(f"{len(unique_chunks)} unique chunks after deduplication.")
-    # Write output in JSONL format for later vector DB ingestion
-    with open(OUTPUT_FILE, 'w', encoding='utf-8') as outf:
-        for chunk in unique_chunks:
-            outf.write(json.dumps(chunk, ensure_ascii=False) + "\\n")
-    print(f"Successfully wrote improved granular chunks to '{OUTPUT_FILE}'.")
 if __name__ == "__main__":
-    main()

+# create_granular_chunks.py (place this in root directory)
 import json
 import re
+import hashlib
+from typing import List, Dict, Any, Set
+import tiktoken
def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens in ``text``.

    The count comes from the tiktoken encoding registered for ``model``. If
    that encoding cannot be obtained or applied, the function falls back to an
    estimate of 1.3 tokens per whitespace-separated word.

    Args:
        text: The text to measure.
        model: Model name handed to ``tiktoken.encoding_for_model``.

    Returns:
        The token count as an ``int``. In the fallback path this is the
        word-based estimate rounded up, so it never under-reports against a
        token budget and never leaks a float into chunk metadata.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
        return len(encoding.encode(text))
    except Exception:
        # Deliberate best-effort: any tokenizer failure degrades to a rough
        # estimate instead of aborting chunking.
        word_count = len(text.split())
        # ceil(word_count * 1.3) in pure integer arithmetic (no float drift).
        return (word_count * 13 + 9) // 10
def extract_financial_keywords(text: str, max_keywords: int = 10) -> List[str]:
    """Extract financial keywords from ``text`` in a deterministic order.

    Matching is case-insensitive. Keywords are returned in pattern order and,
    within a pattern, in order of appearance. A keyword that differs from an
    earlier one only by letter case is dropped, keeping the first spelling
    seen.

    Args:
        text: The chunk text to scan.
        max_keywords: Maximum number of keywords to return (default 10).

    Returns:
        Up to ``max_keywords`` distinct matches.
    """
    financial_patterns = [
        r'₹[\d,]+(?:\.\d{1,2})?(?:\s*(?:crore|lakh|thousand))?',
        r'\b(?:budget|cost|expenditure|estimate|payment|procurement)\b',
        r'\b(?:tender|contract|purchase|award)\b',
        r'\b(?:crore|lakh|thousand)\b'
    ]
    seen = set()
    keywords = []
    for pattern in financial_patterns:
        # All groups are non-capturing, so findall yields whole matches.
        for match in re.findall(pattern, text, re.IGNORECASE):
            folded = match.casefold()
            if folded in seen:
                continue
            seen.add(folded)
            keywords.append(match)
    # Truncating an ordered list (rather than a set) keeps the selection
    # stable from run to run.
    return keywords[:max_keywords]
def extract_authority_keywords(text: str, max_keywords: int = 10) -> List[str]:
    """Extract authority/designation keywords from ``text`` in a stable order.

    Matching is case-insensitive. Keywords are returned in pattern order and,
    within a pattern, in order of appearance. A keyword that differs from an
    earlier one only by letter case is dropped, keeping the first spelling
    seen.

    Args:
        text: The chunk text to scan.
        max_keywords: Maximum number of keywords to return (default 10).

    Returns:
        Up to ``max_keywords`` distinct matches.
    """
    authority_patterns = [
        # "D(T)", "D(P)", "D(F)": these end in ")", a non-word character, so a
        # trailing \b would only match when a letter follows directly. Use a
        # negative lookahead to accept a space, punctuation or end of text.
        r'\bD\([TPF]\)(?!\w)',
        r'\b(?:ED|CGM|GM|DGM|Sr\.?\s*M(?:anager)?)\b',
        r'\b(?:Director|Manager|Chief|Head)\b',
        r'\b(?:CMD|BOD|HOP|HOD|HOF)\b',
        r'\b(?:approval|sanction|delegation|authority|power)\b'
    ]
    seen = set()
    keywords = []
    for pattern in authority_patterns:
        # All groups are non-capturing, so findall yields whole matches.
        for match in re.findall(pattern, text, re.IGNORECASE):
            folded = match.casefold()
            if folded in seen:
                continue
            seen.add(folded)
            keywords.append(match)
    # Truncating an ordered list (rather than a set) keeps the selection
    # stable from run to run.
    return keywords[:max_keywords]
def _format_delegation(delegation: Any, bullet: str = "- ") -> List[str]:
    """Render a delegation value as bullet lines, skipping empty and "NIL" limits."""
    if isinstance(delegation, dict):
        return [
            f"{bullet}{role}: {limit}"
            for role, limit in delegation.items()
            if limit and limit != "NIL"
        ]
    # Free-text delegation (not a role -> limit mapping): keep it verbatim
    # rather than failing on the missing .items().
    return [f"{bullet}{delegation}"]


def _format_remarks(remarks: Any) -> List[str]:
    """Render remarks (a string, or a list of strings and/or dicts) as bullet lines."""
    if isinstance(remarks, str):
        return [f"• {remarks}"]
    if not isinstance(remarks, list):
        return []
    lines = []
    for remark in remarks:
        if isinstance(remark, str):
            lines.append(f"• {remark}")
        elif isinstance(remark, dict):
            lines.extend(f"• {key}: {value}" for key, value in remark.items())
    return lines


def create_chunk_text_from_item(item: Dict) -> str:
    """Build one descriptive text for a policy record.

    The text is assembled, in order, from the record's ``section``/``title``
    header, ``description``, ``items``, ``delegation``, ``subclauses``,
    ``methods`` and ``remarks`` fields; absent or empty fields are skipped.
    Delegation limits that are empty or equal to ``"NIL"`` are omitted.

    Args:
        item: A decoded record (dict) from the input file.

    Returns:
        The parts joined by single spaces, or ``""`` if nothing applies.
    """
    parts = []

    # Header: keep the title even when the record has no section.
    section = item.get('section')
    title = item.get('title')
    if section:
        parts.append(f"Regarding the policy '{item.get('title', 'Unknown')}' under section '{section}':")
    elif title:
        parts.append(f"Regarding the policy '{title}':")

    if item.get('description'):
        parts.append(item['description'])

    covered = item.get('items')
    if covered:
        # A bare scalar is treated as a one-element list so it is not
        # indexed/iterated character by character.
        if not isinstance(covered, (list, tuple)):
            covered = [covered]
        if len(covered) == 1:
            parts.append(f"This covers: {covered[0]}")
        else:
            parts.append("This covers the following:")
            parts.extend(f"{i}. {sub_item}" for i, sub_item in enumerate(covered, 1))

    if item.get('delegation'):
        parts.append("Authority delegation:")
        parts.extend(_format_delegation(item['delegation']))

    if item.get('subclauses'):
        parts.append("This includes:")
        for subclause in item['subclauses']:
            if not isinstance(subclause, dict):
                # Plain-text subclause: list it as its own bullet.
                parts.append(f"• {subclause}")
                continue
            if subclause.get('description'):
                parts.append(f"• {subclause['description']}")
            if subclause.get('delegation'):
                parts.extend(_format_delegation(subclause['delegation'], bullet="  - "))

    # Methods carry per-method delegation tables.
    if item.get('methods'):
        for method in item['methods']:
            if isinstance(method, dict) and method.get('delegation'):
                parts.append(f"For {method.get('method', 'this method')}:")
                parts.extend(_format_delegation(method['delegation']))

    if item.get('remarks'):
        parts.append("Important notes:")
        parts.extend(_format_remarks(item['remarks']))

    return " ".join(parts)
def _overlap_tail(text: str, overlap_tokens: int) -> str:
    """Return the longest run of trailing whole words of ``text`` within ``overlap_tokens``."""
    tail_words: List[str] = []
    for word in reversed(text.split()):
        candidate = " ".join([word] + tail_words)
        if count_tokens(candidate) > overlap_tokens:
            break
        tail_words.insert(0, word)
    return " ".join(tail_words)


def split_into_token_chunks(text: str, max_tokens: int = 400, overlap_tokens: int = 50) -> List[str]:
    """Split ``text`` into sentence-aligned chunks bounded by a token budget.

    Sentences are accumulated until adding the next one would exceed
    ``max_tokens``; the chunk is then closed and the next chunk starts with a
    token-bounded overlap taken from the end of the previous one.

    Args:
        text: The text to split.
        max_tokens: Budget checked before a sentence is added to a chunk.
        overlap_tokens: Upper bound, in tokens, on the text repeated from the
            end of one chunk at the start of the next. ``0`` disables overlap.

    Returns:
        The non-empty chunks in order. A single sentence that is itself longer
        than ``max_tokens`` is still emitted whole, so a chunk can exceed the
        budget in that case.
    """
    # Split *after* the terminator (lookbehind) so sentences keep their
    # ".", "!" or "?" instead of having it consumed by the separator.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    current_tokens = 0
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        sentence_tokens = count_tokens(sentence)
        if current_chunk and current_tokens + sentence_tokens > max_tokens:
            chunks.append(current_chunk)
            # Overlap is measured in tokens and cut on word boundaries, not
            # approximated by a raw character slice.
            overlap_text = _overlap_tail(current_chunk, overlap_tokens) if overlap_tokens > 0 else ""
            current_chunk = f"{overlap_text} {sentence}".strip()
            # Re-count: the overlap contributes to the new chunk's size.
            current_tokens = count_tokens(current_chunk)
        else:
            current_chunk = f"{current_chunk} {sentence}".strip()
            current_tokens += sentence_tokens
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def create_chunk_hash(text: str) -> str:
    """Return a short fingerprint of ``text`` used to detect duplicate chunks.

    The fingerprint is the first 12 hexadecimal characters of the MD5 digest
    of the UTF-8 encoded text.
    """
    digest = hashlib.md5()
    digest.update(text.encode('utf-8'))
    full_hex = digest.hexdigest()
    return full_hex[:12]
def process_jsonl_file(file_path: str, output_path: str):
    """Convert a JSONL file of policy records into deduplicated text chunks.

    Each non-blank line of ``file_path`` is decoded as JSON, rendered to text
    with ``create_chunk_text_from_item``, split with
    ``split_into_token_chunks``, and every chunk whose hash has not been seen
    before is written to ``output_path`` as one JSON object per line with
    ``id``, ``text`` and ``metadata`` keys.

    Blank lines are skipped silently; lines that are not valid JSON, or that
    decode to something other than an object, are reported and skipped.
    Progress and errors are printed; nothing is returned or raised to the
    caller.

    Args:
        file_path: Path of the input JSONL file.
        output_path: Path of the JSONL file to write.
    """
    print(f"Starting to process '{file_path}' with token-based chunking and keyword enhancement...")
    all_chunks = []
    chunk_hashes = set()  # For deduplication
    generated_count = 0   # Chunks produced before duplicates are dropped
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_num, line in enumerate(file, 1):
                line = line.strip()
                if not line:
                    # Blank separator/trailing lines are not malformed records.
                    continue
                try:
                    item = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON on line {line_num}: {e}")
                    continue
                if not isinstance(item, dict):
                    # Valid JSON but not a record (e.g. a list or a number).
                    print(f"Warning: Skipping non-object JSON on line {line_num}")
                    continue
                # Create comprehensive text from the item
                chunk_text = create_chunk_text_from_item(item)
                if not chunk_text.strip():
                    continue
                for i, chunk in enumerate(split_into_token_chunks(chunk_text)):
                    if not chunk.strip():
                        continue
                    # Count before the duplicate check so the two summary
                    # figures below can actually differ.
                    generated_count += 1
                    chunk_hash = create_chunk_hash(chunk)
                    if chunk_hash in chunk_hashes:
                        continue
                    chunk_hashes.add(chunk_hash)
                    all_chunks.append({
                        'id': f'chunk-{len(all_chunks) + 1}',
                        'text': chunk,
                        'metadata': {
                            'section': item.get('section', ''),
                            'clause': item.get('clause', ''),
                            'title': item.get('title', ''),
                            'chunk_index': i,
                            'source_line': line_num,
                            'financial_keywords': extract_financial_keywords(chunk),
                            'authority_keywords': extract_authority_keywords(chunk),
                            'token_count': count_tokens(chunk)
                        }
                    })
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return
    except Exception as e:
        # Best-effort boundary kept from the original: report and stop.
        print(f"Error reading file: {e}")
        return
    print(f"Generated {generated_count} chunks before deduplication.")
    print(f"{len(all_chunks)} unique chunks after deduplication.")
    # Write chunks to output file
    try:
        with open(output_path, 'w', encoding='utf-8') as output_file:
            for chunk in all_chunks:
                json.dump(chunk, output_file, ensure_ascii=False)
                output_file.write('\n')
        print(f"Successfully wrote improved granular chunks to '{output_path}'.")
        print(f"Sample chunk structure:")
        if all_chunks:
            sample = all_chunks[0]
            print(f"  ID: {sample['id']}")
            print(f"  Text length: {len(sample['text'])} chars")
            print(f"  Section: {sample['metadata']['section']}")
            print(f"  Financial keywords: {sample['metadata']['financial_keywords'][:3]}...")
            print(f"  Token count: {sample['metadata']['token_count']}")
    except Exception as e:
        print(f"Error writing output file: {e}")
 if __name__ == "__main__":
+    input_file = "combined_context.jsonl"
+    output_file = "granular_chunks_final.jsonl"
+    process_jsonl_file(input_file, output_file)