AIEXTRACT1 / backend /app /openrouter_client.py
Seth0330's picture
Update backend/app/openrouter_client.py
b5224a9 verified
raw
history blame
13.2 kB
import os
import base64
import json
import re
from io import BytesIO
from typing import Any, Dict, List
import httpx
# Optional imaging dependencies.
# Each library is probed in its own try-block so that a missing PyMuPDF does
# not also leave Pillow's ``Image`` undefined: plain image uploads only need
# Pillow, while PDF conversion needs both. PDF_SUPPORT is therefore True only
# when both imports succeed.
PDF_SUPPORT = True
try:
    import fitz  # PyMuPDF
except ImportError as e:
    PDF_SUPPORT = False
    print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")
try:
    from PIL import Image
except ImportError as e:
    PDF_SUPPORT = False
    print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")
# OpenRouter connection settings.
# The API key is read from the process environment (on Hugging Face it is
# configured as a Space secret); it is None when the variable is unset.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
# Chat-completions endpoint that every request is POSTed to.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
# Vision-capable model identifier sent in the request payload.
MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
    """
    Render every page of a PDF to a JPEG image.

    Each page is rasterised at 2x zoom and encoded as JPEG (quality 95).

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list of JPEG-encoded image bytes, one entry per page, in page order.

    Raises:
        RuntimeError: If the PDF support libraries are not available.
    """
    if not PDF_SUPPORT:
        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        print(f"[INFO] PDF has {len(pdf_doc)} page(s)")
        # 2x zoom for better quality; the matrix is the same for every page.
        zoom = fitz.Matrix(2.0, 2.0)
        images: List[bytes] = []
        for page_num, page in enumerate(pdf_doc, start=1):
            # alpha=False keeps the samples as plain 3-byte RGB, which is the
            # layout Image.frombytes("RGB", ...) below expects.
            pix = page.get_pixmap(matrix=zoom, alpha=False)
            # Convert to a PIL Image, then to JPEG bytes (better compression).
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            buffer = BytesIO()
            img.save(buffer, format="JPEG", quality=95)
            images.append(buffer.getvalue())
            print(f"[INFO] Converted page {page_num} to image ({pix.width}x{pix.height})")
        return images
    finally:
        # Release the document even when rendering a page fails.
        pdf_doc.close()
def _image_bytes_to_base64(image_bytes: bytes, mime_type: str = "image/jpeg") -> str:
    """
    Encode raw image bytes as a base64 ``data:`` URL.

    Args:
        image_bytes: The encoded image file contents.
        mime_type: MIME type to declare in the URL. Defaults to
            ``"image/jpeg"``, so existing single-argument callers are
            unaffected.

    Returns:
        A string of the form ``data:<mime_type>;base64,<payload>``.
    """
    # Base64 output is pure ASCII, so decoding as ASCII is always safe.
    b64 = base64.b64encode(image_bytes).decode("ascii")
    return f"data:{mime_type};base64,{b64}"
def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
    """
    Convert an uploaded file into image blocks for the vision model.

    - PDFs: every page is rendered to an image; one block per page.
    - Images: re-encoded as RGB JPEG, downscaled so the longest side is at
      most 1920px; a single block is returned. If the image cannot be
      processed, the original bytes are sent unchanged (best effort).

    Args:
        file_bytes: Raw contents of the uploaded file.
        content_type: MIME type reported for the file. ``None``, mixed case
            and parameters (e.g. ``"application/pdf; charset=binary"``) are
            tolerated.

    Returns:
        A list of blocks shaped like
        ``{"type": "image_url", "image_url": {"url": "data:..."}}``.

    Raises:
        RuntimeError: If the file is a PDF and PDF support is unavailable.
    """
    # Normalise the MIME type before inspecting it.
    mime = (content_type or "").split(";", 1)[0].strip().lower()

    # Handle PDF files
    if mime == "application/pdf" or mime.endswith("/pdf"):
        if not PDF_SUPPORT:
            raise RuntimeError("PDF support requires PyMuPDF. Please install it.")
        print("[INFO] Converting PDF to images...")
        image_blocks = []
        for page_no, img_bytes in enumerate(_pdf_to_images(file_bytes), start=1):
            # OpenRouter format: {"type": "image_url", "image_url": {"url": "data:..."}}
            image_blocks.append({
                "type": "image_url",
                "image_url": {"url": _image_bytes_to_base64(img_bytes)},
            })
            print(f"[INFO] Created image block for page {page_no} ({len(img_bytes)} bytes)")
        return image_blocks

    # Handle regular image files: convert to JPEG for consistency.
    max_size = 1920  # longest allowed side, in pixels
    try:
        # The context manager releases the decoder's resources when done.
        with Image.open(BytesIO(file_bytes)) as source:
            img = source if source.mode == "RGB" else source.convert("RGB")
            w, h = img.size
            if w > max_size or h > max_size:
                # Scale so the longest side equals max_size; clamp the short
                # side to 1px so extreme aspect ratios never produce a
                # zero-sized image.
                if w > h:
                    new_w = max_size
                    new_h = max(1, int(h * (max_size / w)))
                else:
                    new_h = max_size
                    new_w = max(1, int(w * (max_size / h)))
                img = img.resize((new_w, new_h), Image.LANCZOS)
                print(f"[INFO] Resized image from {w}x{h} to {new_w}x{new_h}")
            buffer = BytesIO()
            img.save(buffer, format="JPEG", quality=95)
        data_url = _image_bytes_to_base64(buffer.getvalue())
    except Exception as e:
        # Deliberate best effort: whatever went wrong (unsupported format,
        # Pillow unavailable, ...), fall back to sending the original bytes.
        print(f"[WARNING] Could not process image with PIL: {e}. Using original bytes.")
        fallback_mime = mime or "application/octet-stream"
        b64 = base64.b64encode(file_bytes).decode("utf-8")
        data_url = f"data:{fallback_mime};base64,{b64}"

    print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")
    return [{
        "type": "image_url",
        "image_url": {"url": data_url},
    }]
def _build_user_prompt(page_count: int) -> str:
    """Return the extraction instructions for a document with *page_count* page images."""
    # Opening of the JSON template, shared by both prompt variants.
    schema_head = (
        "Respond with JSON in this format:\n"
        "{\n"
        ' "doc_type": "invoice | receipt | contract | report | notice | other",\n'
        ' "confidence": number between 0 and 100,\n'
    )
    # The "fields" object (without its closing brace), shared by both variants.
    fields_schema = (
        ' "fields": {\n'
        ' "invoice_number": "...",\n'
        ' "date": "...",\n'
        ' "due_date": "...",\n'
        ' "total_amount": "...",\n'
        ' "currency": "...",\n'
        ' "vendor_name": "...",\n'
        ' "company_name": "...",\n'
        ' "address": "...",\n'
        ' "line_items": [\n'
        ' {"description": "...", "quantity": "...", "unit_price": "...", "line_total": "..."}\n'
        ' ],\n'
        ' "other_field": "..."\n'
    )

    if page_count > 1:
        # Multi-page documents additionally ask for a per-page text breakdown.
        return (
            f"Read this {page_count}-page document using your vision capability and extract ALL text content. "
            "I want the complete end-to-end text from all pages, preserving structure, headings, formatting, and content in all languages.\n\n"
            "Analyze ALL pages thoroughly, including any non-English text (Punjabi, Hindi, or other languages). "
            "Extract every word, number, and piece of information from every page.\n\n"
            + schema_head
            + ' "full_text": "Complete extracted text from all pages, preserving structure and formatting. Include all languages.",\n'
            + fields_schema
            + " },\n"
            ' "pages": [\n'
            ' {"page_number": 1, "text": "Full text from page 1"},\n'
            ' {"page_number": 2, "text": "Full text from page 2"}\n'
            ' ]\n'
            "}\n\n"
            "IMPORTANT:\n"
            "- Extract ALL text from ALL pages, including non-English languages\n"
            "- Preserve structure, headings, and formatting in the full_text field\n"
            "- Fill in fields with relevant extracted information\n"
            "- If a field is not found, use empty string or omit it\n"
            "- The full_text should contain everything readable from the document"
        )

    return (
        "Read this document using your vision capability and extract ALL text content. "
        "I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n"
        "Extract every word, number, and piece of information, including any non-English text.\n\n"
        + schema_head
        + ' "full_text": "Complete extracted text, preserving structure and formatting. Include all languages.",\n'
        + fields_schema
        + " }\n"
        "}\n\n"
        "IMPORTANT:\n"
        "- Extract ALL text, including non-English languages\n"
        "- Preserve structure, headings, and formatting in the full_text field\n"
        "- Fill in fields with relevant extracted information\n"
        "- If a field is not found, use empty string or omit it"
    )


def _extract_message_text(data: Any) -> str:
    """Pull the assistant's text out of a decoded chat-completion response body."""
    choices = data.get("choices") if isinstance(data, dict) else None
    if not choices:
        # NOTE(review): OpenRouter appears to report some failures as an
        # "error" object in the body; include it when present so the cause
        # is not lost.
        error = data.get("error") if isinstance(data, dict) else None
        if error:
            raise ValueError(f"No choices in OpenRouter response: {error}")
        raise ValueError("No choices in OpenRouter response")

    # OpenRouter returns choices[0].message.content
    first_choice = choices[0] if isinstance(choices[0], dict) else {}
    content = (first_choice.get("message") or {}).get("content")
    # Log the raw response for debugging (first 500 chars)
    print(f"[DEBUG] OpenRouter response preview: {str(content)[:500]}")

    # content may be a string or a list of content blocks
    if isinstance(content, list):
        text = "".join(
            part.get("text", "")
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        )
    else:
        text = content
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Empty response from OpenRouter API")
    return text


def _parse_model_json(text: str) -> Dict[str, Any]:
    """Recover a JSON object from model output, or return a fallback structure."""
    # The model might return bare JSON, JSON wrapped in a markdown code
    # block, or JSON surrounded by extra text; try each form in that order.
    attempts = [(text, "Successfully parsed JSON directly", "Direct JSON parse failed")]
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        attempts.append((
            fenced.group(1),
            "Successfully parsed JSON from markdown code block",
            "Markdown code block parse failed",
        ))
    braces = re.search(r'\{.*\}', text, re.DOTALL)
    if braces:
        attempts.append((
            braces.group(0),
            "Successfully parsed JSON from regex match",
            "Regex match parse failed",
        ))

    for candidate, ok_msg, fail_msg in attempts:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as e:
            print(f"[DEBUG] {fail_msg}: {e}")
            continue
        # Callers are promised a dict; a bare number/string/list is not a
        # usable extraction result, so keep trying.
        if isinstance(parsed, dict):
            print(f"[DEBUG] {ok_msg}")
            return parsed
        print(f"[DEBUG] {fail_msg}: top-level JSON value is not an object")

    # If all parsing fails, return a default structure with the raw text
    print("[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
    return {
        "doc_type": "other",
        "confidence": 50.0,
        "fields": {
            "raw_response": text[:1000],  # First 1000 chars for debugging
            "error": "Could not parse JSON from model response",
            "note": "Check server logs for full response",
        },
    }


async def extract_fields_from_document(
    file_bytes: bytes,
    content_type: str,
    filename: str,
) -> Dict[str, Any]:
    """
    Send a document to the vision model via OpenRouter and return parsed JSON.

    The file is converted to one or more image blocks, sent together with a
    prompt instructing the model to reply with JSON only, and the reply is
    parsed leniently (bare JSON, fenced JSON, or JSON embedded in text).

    Args:
        file_bytes: Raw contents of the uploaded file.
        content_type: MIME type of the file (PDF or an image type).
        filename: Original file name. Currently unused by this function.

    Returns:
        The JSON object produced by the model. If no JSON object can be
        recovered, a fallback dict with ``doc_type`` ``"other"`` and the
        first 1000 characters of the raw reply under ``fields.raw_response``.

    Raises:
        RuntimeError: If OPENROUTER_API_KEY is not set (or PDF support is
            missing for a PDF upload).
        ValueError: If no images could be produced, or the API response
            contains no choices or no text.
        httpx.HTTPStatusError: If OpenRouter answers with an error status.
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")

    # Convert file to image blocks (handles PDF conversion)
    image_blocks = _file_to_image_blocks(file_bytes, content_type)
    if not image_blocks:
        raise ValueError("No images generated from file")
    print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")

    system_prompt = (
        "You are a document extraction engine with vision capabilities. "
        "You read and extract text from documents in any language, preserving structure, formatting, and all content. "
        "You output structured JSON with both the full extracted text and key-value pairs."
    )

    # Build content array with text prompt and all image blocks
    user_content = [{"type": "text", "text": _build_user_prompt(len(image_blocks))}]
    user_content.extend(image_blocks)

    payload: Dict[str, Any] = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": user_content,
            },
        ],
        "max_tokens": 8192,  # Increased for full text extraction from multi-page documents
    }
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        # Optional attribution headers
        "HTTP-Referer": os.environ.get(
            "APP_URL",
            "https://huggingface.co/spaces/your-space",
        ),
        "X-Title": "Document Capture Demo",
    }

    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
        resp.raise_for_status()
        data = resp.json()

    text = _extract_message_text(data)
    return _parse_model_json(text)