Spaces:

Seth0330
/

AIEXTRACT1

Running

App Files Files Community

Seth0330 committed 8 days ago

Commit

88c325b

verified ·

1 Parent(s): 4eec2ab

Update backend/app/openrouter_client.py

Browse files

Files changed (1) hide show

backend/app/openrouter_client.py +100 -2

backend/app/openrouter_client.py CHANGED Viewed

@@ -25,7 +25,12 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
 HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "Qwen/Qwen3-VL-235B-A22B-Instruct")  # Default HF model
-# Backend selection: "openrouter" or "huggingface"
 EXTRACTION_BACKEND = os.environ.get("EXTRACTION_BACKEND", "openrouter").lower()
@@ -144,6 +149,8 @@ async def _extract_single_page(image_bytes: bytes, page_num: int, total_pages: i
     if backend == "huggingface":
         return await _extract_with_hf(image_bytes, page_num, total_pages)
     else:
         return await _extract_with_openrouter_single(image_bytes, page_num, total_pages)
@@ -155,7 +162,7 @@ async def extract_fields_from_document(
 ) -> Dict[str, Any]:
     """
     Extract fields from document. Processes pages separately for better reliability.
-    Supports both OpenRouter and HuggingFace Inference API.
     """
     # Convert file to image blocks (handles PDF conversion)
     image_blocks_data = _file_to_image_blocks(file_bytes, content_type)
@@ -324,6 +331,97 @@ async def _extract_with_openrouter_single(image_bytes: bytes, page_num: int, tot
     return _parse_model_response(text, page_num)
 async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
     """Extract from a single page using HuggingFace Inference API (router endpoint)."""
     if not HF_TOKEN:

 HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
 HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "Qwen/Qwen3-VL-235B-A22B-Instruct")  # Default HF model
+# OpenAI API settings
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # None when unset; _extract_with_openai_single raises RuntimeError in that case
+OPENAI_BASE_URL = "https://api.openai.com/v1/chat/completions"  # Full Chat Completions endpoint URL; requests are POSTed to it as-is
+OPENAI_MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME", "gpt-4o")  # Default OpenAI vision model; override via the OPENAI_MODEL_NAME env var
+# Backend selection: "openrouter", "huggingface", or "openai" (read from the EXTRACTION_BACKEND env var; defaults to "openrouter" and is lower-cased)
 EXTRACTION_BACKEND = os.environ.get("EXTRACTION_BACKEND", "openrouter").lower()
     if backend == "huggingface":
         return await _extract_with_hf(image_bytes, page_num, total_pages)
+    elif backend == "openai":
+        return await _extract_with_openai_single(image_bytes, page_num, total_pages)
     else:
         return await _extract_with_openrouter_single(image_bytes, page_num, total_pages)
 ) -> Dict[str, Any]:
     """
     Extract fields from document. Processes pages separately for better reliability.
+    Supports OpenRouter, HuggingFace Inference API, and OpenAI Vision API.
     """
     # Convert file to image blocks (handles PDF conversion)
     image_blocks_data = _file_to_image_blocks(file_bytes, content_type)
     return _parse_model_response(text, page_num)
+async def _extract_with_openai_single(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
+    """Extract text and key fields from one page image via the OpenAI Chat Completions API.
+
+    Args:
+        image_bytes: Raw bytes of the page image; encoded by _image_bytes_to_base64
+            and sent as a single ``image_url`` content block.
+        page_num: Number of this page; used in the prompt, logs, and error messages.
+        total_pages: Total page count; used in the prompt only.
+
+    Returns:
+        Whatever _parse_model_response returns for the model's reply text and page_num.
+
+    Raises:
+        RuntimeError: If OPENAI_API_KEY is not set, the request times out, or the
+            request fails for any other reason (HTTP error status, network error,
+            non-JSON body). The original exception is chained as ``__cause__``.
+        ValueError: If the response has no choices or the first choice carries no
+            text content.
+    """
+    if not OPENAI_API_KEY:
+        raise RuntimeError("OPENAI_API_KEY environment variable is not set")
+    # Create single image block
+    data_url = _image_bytes_to_base64(image_bytes)
+    image_block = {
+        "type": "image_url",
+        "image_url": {"url": data_url}
+    }
+    system_prompt = (
+        "You are a document extraction engine with vision capabilities. "
+        "You read and extract text from documents in any language, preserving structure, formatting, and all content. "
+        "You output structured JSON with both the full extracted text and key-value pairs."
+    )
+    user_prompt = (
+        f"Read this document page ({page_num} of {total_pages}) using your vision capability and extract ALL text content. "
+        "I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n"
+        "Extract every word, number, and piece of information, including any non-English text (Punjabi, Hindi, etc.).\n\n"
+        "Respond with JSON in this format:\n"
+        "{\n"
+        '  "doc_type": "invoice | receipt | contract | report | notice | other",\n'
+        '  "confidence": number between 0 and 100,\n'
+        '  "full_text": "Complete extracted text from this page, preserving structure and formatting. Include all languages.",\n'
+        '  "fields": {\n'
+        '    "invoice_number": "...",\n'
+        '    "date": "...",\n'
+        '    "company_name": "...",\n'
+        '    "address": "...",\n'
+        '    "other_field": "..."\n'
+        "  }\n"
+        "}\n\n"
+        "IMPORTANT:\n"
+        "- Extract ALL text from this page, including non-English languages\n"
+        "- Preserve structure, headings, and formatting\n"
+        "- Fill in fields with relevant extracted information\n"
+        "- If a field is not found, use empty string or omit it"
+    )
+    payload: Dict[str, Any] = {
+        "model": OPENAI_MODEL_NAME,
+        "messages": [
+            {
+                "role": "system",
+                "content": system_prompt,
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": user_prompt},
+                    image_block
+                ],
+            },
+        ],
+        "max_tokens": 4096,  # Similar to OpenRouter
+        "temperature": 0.1,  # Lower temperature for more consistent extraction
+    }
+    headers = {
+        "Authorization": f"Bearer {OPENAI_API_KEY}",
+        "Content-Type": "application/json",
+    }
+    payload_size_mb = len(json.dumps(payload).encode('utf-8')) / 1024 / 1024
+    print(f"[INFO] OpenAI: Processing page {page_num} with model {OPENAI_MODEL_NAME}, payload: {payload_size_mb:.2f} MB")
+    try:
+        timeout = httpx.Timeout(180.0, connect=30.0)  # 3 min per page
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            resp = await client.post(OPENAI_BASE_URL, headers=headers, json=payload)
+            resp.raise_for_status()
+            data = resp.json()
+    except httpx.TimeoutException as e:
+        raise RuntimeError(f"OpenAI API timed out for page {page_num}") from e
+    except httpx.HTTPStatusError as e:
+        # str(e) only carries the status line; the response body holds the API's
+        # own error description, so log it (truncated) to make failures diagnosable.
+        error_msg = str(e)
+        print(f"[ERROR] OpenAI API error details: {type(e).__name__}: {error_msg}")
+        print(f"[ERROR] OpenAI API error response body: {e.response.text[:1000]}")
+        raise RuntimeError(f"OpenAI API error for page {page_num}: {error_msg}") from e
+    except Exception as e:
+        error_msg = str(e)
+        print(f"[ERROR] OpenAI API error details: {type(e).__name__}: {error_msg}")
+        raise RuntimeError(f"OpenAI API error for page {page_num}: {error_msg}") from e
+    choices = data.get("choices") if isinstance(data, dict) else None
+    if not choices:
+        raise ValueError(f"No choices in OpenAI response for page {page_num}")
+    # The message content can be missing or null; fail with a clear error
+    # instead of an opaque TypeError/KeyError when slicing for the preview.
+    message = choices[0].get("message") or {}
+    response_text = message.get("content")
+    if not isinstance(response_text, str):
+        raise ValueError(f"No text content in OpenAI response for page {page_num}")
+    print(f"[DEBUG] OpenAI response preview: {response_text[:500]}")
+    return _parse_model_response(response_text, page_num)
 async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
     """Extract from a single page using HuggingFace Inference API (router endpoint)."""
     if not HF_TOKEN: