Spaces:

Seth0330
/

AIEXTRACT1

Running

App Files Files Community

Seth0330 committed 9 days ago

Commit

dd82407

verified ·

1 Parent(s): 6f6e8af

Update backend/app/openrouter_client.py

Browse files

Files changed (1) hide show

backend/app/openrouter_client.py +293 -8

backend/app/openrouter_client.py CHANGED Viewed

@@ -20,6 +20,14 @@ OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
 MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
 def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
     """
@@ -127,25 +135,302 @@ def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str
         }]
 async def extract_fields_from_document(
     file_bytes: bytes,
     content_type: str,
     filename: str,
 ) -> Dict[str, Any]:
     """
-    Call OpenRouter with Qwen3-VL and return parsed JSON with fields.
-    We instruct the model to return JSON only.
     """
-    if not OPENROUTER_API_KEY:
-        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
     # Convert file to image blocks (handles PDF conversion)
-    image_blocks = _file_to_image_blocks(file_bytes, content_type)
-    if not image_blocks:
         raise ValueError("No images generated from file")
-    print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")
     system_prompt = (
         "You are a document extraction engine with vision capabilities. "

 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
 MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
+# HuggingFace Inference API settings.
+# HF_TOKEN is read from the environment and is None when the variable is unset.
+HF_TOKEN = os.environ.get("HF_TOKEN")
+# NOTE(review): not referenced by any code visible in this diff -- confirm it is still needed.
+HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
+# Model id handed to the HuggingFace client; override with the HF_MODEL_NAME env var.
+HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "Qwen/Qwen2-VL-7B-Instruct")  # Alternative HF model
+# Backend selection: "openrouter" (the default) or "huggingface".
+# The environment value is lower-cased before use.
+EXTRACTION_BACKEND = os.environ.get("EXTRACTION_BACKEND", "openrouter").lower()
 def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
     """
         }]
+async def _extract_single_page(image_bytes: bytes, page_num: int, total_pages: int, backend: str = None) -> Dict[str, Any]:
+    """
+    Run extraction for one page image on the selected backend.
+
+    Pages are handled one at a time so each request stays small.
+
+    Args:
+        image_bytes: Raw bytes of the page image.
+        page_num: Page number forwarded to the backend function.
+        total_pages: Total page count forwarded to the backend function.
+        backend: Backend name; a falsy value means "use EXTRACTION_BACKEND".
+
+    Returns:
+        The result dict produced by the chosen backend function.
+    """
+    chosen = backend if backend else EXTRACTION_BACKEND
+    # Anything other than "huggingface" is routed to OpenRouter.
+    if chosen != "huggingface":
+        return await _extract_with_openrouter_single(image_bytes, page_num, total_pages)
+    return await _extract_with_hf(image_bytes, page_num, total_pages)
 async def extract_fields_from_document(
     file_bytes: bytes,
     content_type: str,
     filename: str,
 ) -> Dict[str, Any]:
     """
+    Extract fields from document. Processes pages separately for better reliability.
+    Supports both OpenRouter and HuggingFace Inference API.
+
+    Each page image is passed to _extract_single_page in order. A page whose
+    extraction raises is recorded with empty text/fields, confidence 0 and an
+    "error" string instead of aborting the whole document.
+
+    Args:
+        file_bytes: Raw bytes of the uploaded file.
+        content_type: MIME type; "application/pdf" (or anything ending in
+            "/pdf") is rendered to page images, anything else is treated as
+            a single image.
+        filename: Not used by the code in this function.
+
+    Returns:
+        Dict with keys "doc_type", "confidence", "full_text", "fields" and
+        "pages" (the per-page result dicts).
+
+    Raises:
+        ValueError: If _file_to_image_blocks returns nothing for the file.
     """
     # Convert file to image blocks (handles PDF conversion)
+    # Only the emptiness of this result is used below.
+    # NOTE(review): for PDFs this presumably rasterises the document once here
+    # and again in _pdf_to_images below -- confirm and consider reusing one result.
+    image_blocks_data = _file_to_image_blocks(file_bytes, content_type)
+    if not image_blocks_data:
         raise ValueError("No images generated from file")
+    # Get raw image bytes for processing
+    if content_type == "application/pdf" or content_type.endswith("/pdf"):
+        # For PDFs, we need to get the raw image bytes
+        pdf_images = _pdf_to_images(file_bytes)
+        image_bytes_list = pdf_images
+    else:
+        # For regular images, use the file bytes directly
+        image_bytes_list = [file_bytes]
+    total_pages = len(image_bytes_list)
+    print(f"[INFO] Processing {total_pages} page(s) separately for better reliability...")
+    # Process each page separately
+    # Pages are awaited one after another, not concurrently.
+    page_results = []
+    for page_num, img_bytes in enumerate(image_bytes_list):
+        print(f"[INFO] Processing page {page_num + 1}/{total_pages}...")
+        try:
+            # enumerate() is 0-based; page numbers passed on and stored are 1-based.
+            page_result = await _extract_single_page(img_bytes, page_num + 1, total_pages)
+            page_results.append({
+                "page_number": page_num + 1,
+                "text": page_result.get("full_text", ""),
+                "fields": page_result.get("fields", {}),
+                "confidence": page_result.get("confidence", 0),
+                "doc_type": page_result.get("doc_type", "other"),
+            })
+            print(f"[INFO] Page {page_num + 1} processed successfully")
+        except Exception as e:
+            # Best effort: keep going and record the failure on this page's entry.
+            # Note the failed entry carries "error" and has no "doc_type" key.
+            print(f"[ERROR] Failed to process page {page_num + 1}: {e}")
+            page_results.append({
+                "page_number": page_num + 1,
+                "text": "",
+                "fields": {},
+                "confidence": 0,
+                "error": str(e)
+            })
+    # Combine results from all pages
+    # Pages with empty text are left out of the combined text entirely.
+    combined_full_text = "\n\n".join([f"=== PAGE {p['page_number']} ===\n\n{p['text']}" for p in page_results if p.get("text")])
+    # Merge fields from all pages (prefer non-empty values)
+    # The first truthy value seen for a key wins; later pages do not overwrite it.
+    combined_fields = {}
+    for page_result in page_results:
+        page_fields = page_result.get("fields", {})
+        for key, value in page_fields.items():
+            if value and (key not in combined_fields or not combined_fields[key]):
+                combined_fields[key] = value
+    # Calculate average confidence
+    # Only confidences greater than 0 are averaged, so failed pages do not lower it.
+    confidences = [p.get("confidence", 0) for p in page_results if p.get("confidence", 0) > 0]
+    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
+    # Determine doc_type from first successful page
+    # More precisely: the first page whose doc_type is set and is not "other".
+    doc_type = "other"
+    for page_result in page_results:
+        if page_result.get("doc_type") and page_result["doc_type"] != "other":
+            doc_type = page_result["doc_type"]
+            break
+    return {
+        "doc_type": doc_type,
+        "confidence": avg_confidence,
+        "full_text": combined_full_text,
+        "fields": combined_fields,
+        "pages": page_results
+    }
+async def _extract_with_openrouter_single(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
+    """
+    Extract text and fields from a single page image using OpenRouter.
+
+    Args:
+        image_bytes: Raw bytes of one page image; converted with
+            _image_bytes_to_base64 and sent as the message's image_url.
+        page_num: Page number used in the prompt and in log/error messages.
+        total_pages: Total page count, used in the prompt.
+
+    Returns:
+        The dict produced by _parse_model_response for the model's text output.
+
+    Raises:
+        RuntimeError: If OPENROUTER_API_KEY is unset, or the HTTP request
+            times out or fails (the original exception is chained as the cause).
+        ValueError: If the response has no choices or the first choice has
+            no message content.
+    """
+    if not OPENROUTER_API_KEY:
+        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
+    # Create single image block
+    data_url = _image_bytes_to_base64(image_bytes)
+    image_block = {
+        "type": "image_url",
+        "image_url": {"url": data_url}
+    }
+    system_prompt = (
+        "You are a document extraction engine with vision capabilities. "
+        "You read and extract text from documents in any language, preserving structure, formatting, and all content. "
+        "You output structured JSON with both the full extracted text and key-value pairs."
+    )
+    user_prompt = (
+        f"Read this document page ({page_num} of {total_pages}) using your vision capability and extract ALL text content. "
+        "I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n"
+        "Extract every word, number, and piece of information, including any non-English text (Punjabi, Hindi, etc.).\n\n"
+        "Respond with JSON in this format:\n"
+        "{\n"
+        '  \"doc_type\": \"invoice | receipt | contract | report | notice | other\",\n'
+        '  \"confidence\": number between 0 and 100,\n'
+        '  \"full_text\": \"Complete extracted text from this page, preserving structure and formatting. Include all languages.\",\n'
+        '  \"fields\": {\n'
+        '    \"invoice_number\": \"...\",\n'
+        '    \"date\": \"...\",\n'
+        '    \"company_name\": \"...\",\n'
+        '    \"address\": \"...\",\n'
+        '    \"other_field\": \"...\"\n'
+        "  }\n"
+        "}\n\n"
+        "IMPORTANT:\n"
+        "- Extract ALL text from this page, including non-English languages\n"
+        "- Preserve structure, headings, and formatting\n"
+        "- Fill in fields with relevant extracted information\n"
+        "- If a field is not found, use empty string or omit it"
+    )
+    payload: Dict[str, Any] = {
+        "model": MODEL_NAME,
+        "messages": [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": system_prompt}],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": user_prompt},
+                    image_block
+                ],
+            },
+        ],
+        "max_tokens": 4096,  # Smaller for single page
+    }
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": os.environ.get("APP_URL", "https://huggingface.co/spaces/your-space"),
+        "X-Title": "Document Capture Demo",
+    }
+    # Serialised once purely to report the request size in the log.
+    payload_size_mb = len(json.dumps(payload).encode('utf-8')) / 1024 / 1024
+    print(f"[INFO] OpenRouter: Processing page {page_num}, payload: {payload_size_mb:.2f} MB")
+    try:
+        timeout = httpx.Timeout(180.0, connect=30.0)  # 3 min per page
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
+            resp.raise_for_status()
+            data = resp.json()
+    except httpx.TimeoutException as e:
+        # Chain the cause so the original traceback is not lost.
+        raise RuntimeError(f"OpenRouter API timed out for page {page_num}") from e
+    except Exception as e:
+        raise RuntimeError(f"OpenRouter API error for page {page_num}: {str(e)}") from e
+    if not isinstance(data, dict) or not data.get("choices"):
+        raise ValueError(f"No choices in OpenRouter response for page {page_num}")
+    try:
+        content = data["choices"][0]["message"]["content"]
+    except (KeyError, IndexError, TypeError) as e:
+        # A choice without message/content would otherwise surface as a bare KeyError.
+        raise ValueError(
+            f"Malformed OpenRouter response for page {page_num}: missing message content"
+        ) from e
+    if isinstance(content, list):
+        # Content may arrive as a list of parts; keep only the text parts.
+        text = "".join(
+            part.get("text", "")
+            for part in content
+            if isinstance(part, dict) and part.get("type") == "text"
+        )
+    else:
+        text = content
+    # Parse JSON response
+    return _parse_model_response(text, page_num)
+async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
+    """
+    Extract from a single page using HuggingFace Inference API.
+
+    Args:
+        image_bytes: Raw bytes of one page image.
+        page_num: Page number used in the prompt and in log/error messages.
+        total_pages: Total page count, used in the prompt.
+
+    Returns:
+        The dict produced by _parse_model_response for the model's text output.
+
+    Raises:
+        RuntimeError: If HF_TOKEN is unset, huggingface_hub is not installed,
+            or the API call / response handling fails (original exception is
+            chained as the cause).
+    """
+    if not HF_TOKEN:
+        raise RuntimeError("HF_TOKEN environment variable is not set")
+    try:
+        from huggingface_hub import InferenceClient
+    except ImportError as e:
+        raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt") from e
+    # Function-scope stdlib imports so this block does not depend on the
+    # file's import header.
+    import asyncio
+    import functools
+    client = InferenceClient(model=HF_MODEL_NAME, token=HF_TOKEN)
+    prompt = (
+        f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
+        "Extract every word, number, and piece of information, including any non-English text. "
+        "Return JSON with 'full_text', 'doc_type', 'confidence', and 'fields'."
+    )
+    print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")
+    try:
+        # Raw bytes cannot be serialised into the JSON request body, and the
+        # chat-completion format takes images as "image_url" chunks. Reuse the
+        # same helper the OpenRouter path uses to build its image_url value.
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": _image_bytes_to_base64(image_bytes)}},
+                ],
+            }
+        ]
+        # InferenceClient.chat_completion is a blocking call; run it in the
+        # default executor so the event loop is not stalled while waiting.
+        loop = asyncio.get_running_loop()
+        result = await loop.run_in_executor(
+            None,
+            functools.partial(client.chat_completion, messages=messages, max_tokens=2048),
+        )
+        # Extract response text
+        if isinstance(result, dict):
+            if "choices" in result and len(result["choices"]) > 0:
+                response_text = result["choices"][0].get("message", {}).get("content", "")
+            else:
+                response_text = result.get("generated_text", str(result))
+        elif isinstance(result, str):
+            response_text = result
+        else:
+            response_text = str(result)
+        if not response_text:
+            raise ValueError("Empty response from HuggingFace API")
+        return _parse_model_response(response_text, page_num)
+    except Exception as e:
+        print(f"[ERROR] HuggingFace API error details: {type(e).__name__}: {str(e)}")
+        raise RuntimeError(f"HuggingFace API error for page {page_num}: {str(e)}") from e
+def _parse_model_response(text: str, page_num: int = None) -> Dict[str, Any]:
+    """
+    Parse JSON response from model, handling truncation and errors.
+
+    Strategies are tried in order and the first that yields a JSON object wins:
+      1. parse the whole text as JSON;
+      2. parse a JSON object inside a markdown code fence;
+      3. parse the widest {...} span after passing it through _fix_truncated_json;
+      4. salvage just the "full_text" string value, even if the JSON is cut off;
+      5. return the first 2000 characters of the raw text.
+
+    Args:
+        text: Raw text returned by the model.
+        page_num: Page number used only in the debug log line.
+
+    Returns:
+        A dict. For strategies 1-3 it is whatever object the model produced;
+        for strategies 4-5 it has the keys "doc_type", "confidence",
+        "full_text" and "fields", with a fixed placeholder confidence.
+
+    Raises:
+        ValueError: If text is empty or whitespace only.
+    """
+    if not text or not text.strip():
+        raise ValueError("Empty response from model")
+    # Try to parse JSON
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError as e:
+        print(f"[DEBUG] Direct JSON parse failed: {e}")
+    else:
+        if isinstance(parsed, dict):
+            print(f"[DEBUG] Successfully parsed JSON for page {page_num or 'single'}")
+            return parsed
+        # Valid JSON that is not an object (list, string, number) would break
+        # callers that use .get(); treat it like a parse failure instead.
+        print(f"[DEBUG] Parsed JSON is {type(parsed).__name__}, not an object; trying fallbacks")
+    # Try to extract JSON from markdown code blocks
+    json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
+    if json_match:
+        try:
+            return json.loads(json_match.group(1))
+        except json.JSONDecodeError:
+            pass  # fall through to the next strategy
+    # Try to find JSON object
+    json_match = re.search(r'\{.*\}', text, re.DOTALL)
+    if json_match:
+        try:
+            candidate = json.loads(_fix_truncated_json(json_match.group(0)))
+        except Exception:
+            # Repair is best effort; any failure just moves on to salvage.
+            candidate = None
+        if isinstance(candidate, dict):
+            return candidate
+    # Extract full_text even from truncated JSON.
+    # The pattern skips over backslash escapes, so an escaped quote followed by
+    # "," or "}" inside the text no longer ends the match early.
+    full_text_match = re.search(r'"full_text"\s*:\s*"([^"\\]*(?:\\.[^"\\]*)*)', text, re.DOTALL)
+    if full_text_match:
+        raw_value = full_text_match.group(1)
+        try:
+            # Let the JSON decoder undo every escape in one pass; strict=False
+            # tolerates literal newlines/tabs inside the string.
+            full_text = json.loads(f'"{raw_value}"', strict=False)
+        except json.JSONDecodeError:
+            # e.g. a \uXXXX escape cut off by truncation. Single-pass
+            # substitution avoids mangling an escaped backslash followed by "n".
+            simple_escapes = {'n': '\n', 't': '\t', 'r': '\r', '"': '"', '\\': '\\', '/': '/'}
+            full_text = re.sub(
+                r'\\(.)',
+                lambda m: simple_escapes.get(m.group(1), m.group(0)),
+                raw_value,
+                flags=re.DOTALL,
+            )
+        return {
+            "doc_type": "other",
+            # Fixed placeholder value; it is not reported by the model.
+            "confidence": 90.0,
+            "full_text": full_text,
+            "fields": {"full_text": full_text}
+        }
+    # Last resort: return raw text
+    return {
+        "doc_type": "other",
+        "confidence": 50.0,
+        "full_text": text[:2000],
+        "fields": {"raw_text": text[:2000]}
+    }
     system_prompt = (
         "You are a document extraction engine with vision capabilities. "