Spaces:

Seth0330
/

AIEXTRACT1

Running

App Files Files Community

AIEXTRACT1 / backend /app /openrouter_client.py

Seth0330

Update backend/app/openrouter_client.py

b5224a9 verified 19 days ago

raw

history blame

13.2 kB

	import os
	import base64
	import json
	import re
	from io import BytesIO
	from typing import Any, Dict, List

	import httpx

	# PDF rendering is optional: it needs both PyMuPDF (imported as ``fitz``) and
	# Pillow. If either import fails the module still loads, and PDF_SUPPORT
	# records that PDF conversion is unavailable.
	try:
	    import fitz  # PyMuPDF
	    from PIL import Image
	except ImportError as import_error:
	    PDF_SUPPORT = False
	    print(f"[WARNING] PDF support libraries not available: {import_error}. PDF conversion will not work.")
	else:
	    PDF_SUPPORT = True

	# OpenRouter connection settings. The API key is read from the environment
	# (configured as a secret in the hosting environment); it is None when unset.
	OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
	OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
	MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"


	def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
	    """
	    Render every page of a PDF to a JPEG image.

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.

	    Returns:
	        A list of JPEG-encoded image bytes, one entry per page, in page order.

	    Raises:
	        RuntimeError: If the optional PDF libraries could not be imported.
	    """
	    if not PDF_SUPPORT:
	        raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")

	    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	    images: List[bytes] = []

	    # Close the document even if rendering or encoding a page fails.
	    try:
	        page_total = len(pdf_doc)
	        print(f"[INFO] PDF has {page_total} page(s)")

	        # 2x zoom for better quality; the matrix is the same for every page.
	        zoom = fitz.Matrix(2.0, 2.0)

	        for page_num in range(page_total):
	            page = pdf_doc[page_num]
	            pix = page.get_pixmap(matrix=zoom)

	            # Go through PIL so the page can be stored as JPEG (better compression).
	            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	            buffer = BytesIO()
	            img.save(buffer, format="JPEG", quality=95)
	            images.append(buffer.getvalue())

	            print(f"[INFO] Converted page {page_num + 1} to image ({pix.width}x{pix.height})")
	    finally:
	        pdf_doc.close()

	    return images


	def _image_bytes_to_base64(image_bytes: bytes) -> str:
	    """Return a ``data:image/jpeg`` URL whose payload is *image_bytes* in base64."""
	    encoded = base64.b64encode(image_bytes)
	    return "data:image/jpeg;base64," + encoded.decode("utf-8")


	def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str, Any]]:
	    """
	    Convert an uploaded file into image blocks for the vision model.

	    - PDFs: every page is rendered to a JPEG and returned as its own block.
	    - Anything else is treated as an image: converted to RGB, downscaled so the
	      longest side is at most 1920px, and re-encoded as JPEG. If that processing
	      fails for any reason, the original bytes are sent unchanged.

	    Args:
	        file_bytes: Raw bytes of the uploaded file.
	        content_type: MIME type reported for the upload. Matching is
	            case-insensitive and ignores parameters such as ``; charset=...``.
	            A missing value is treated as a non-PDF file.

	    Returns:
	        A list of ``{"type": "image_url", "image_url": {"url": <data URL>}}``
	        dictionaries.

	    Raises:
	        RuntimeError: If the file is a PDF and the PDF libraries are unavailable.
	    """
	    # MIME types are case-insensitive and may carry parameters; compare on the
	    # bare lowercase type so "Application/PDF; charset=binary" is still a PDF.
	    mime_type = (content_type or "").split(";", 1)[0].strip().lower()

	    # Handle PDF files
	    if mime_type == "application/pdf" or mime_type.endswith("/pdf"):
	        if not PDF_SUPPORT:
	            raise RuntimeError("PDF support requires PyMuPDF. Please install it.")

	        print(f"[INFO] Converting PDF to images...")
	        pdf_images = _pdf_to_images(file_bytes)

	        # OpenRouter format: {"type": "image_url", "image_url": {"url": "data:..."}}
	        image_blocks = []
	        for page_number, page_bytes in enumerate(pdf_images, start=1):
	            image_blocks.append({
	                "type": "image_url",
	                "image_url": {"url": _image_bytes_to_base64(page_bytes)},
	            })
	            print(f"[INFO] Created image block for page {page_number} ({len(page_bytes)} bytes)")

	        return image_blocks

	    # Handle regular image files: normalise to JPEG for consistency.
	    try:
	        img = Image.open(BytesIO(file_bytes))
	        if img.mode != "RGB":
	            img = img.convert("RGB")

	        # Resize if too large (max 1920px on longest side).
	        max_size = 1920
	        w, h = img.size
	        if w > max_size or h > max_size:
	            # Clamp the short side to 1px so an extreme aspect ratio does not
	            # produce a zero-sized target, which would make resize() fail.
	            if w > h:
	                new_w = max_size
	                new_h = max(1, int(h * (max_size / w)))
	            else:
	                new_h = max_size
	                new_w = max(1, int(w * (max_size / h)))
	            img = img.resize((new_w, new_h), Image.LANCZOS)
	            print(f"[INFO] Resized image from {w}x{h} to {new_w}x{new_h}")

	        jpeg_buffer = BytesIO()
	        img.save(jpeg_buffer, format="JPEG", quality=95)
	        data_url = _image_bytes_to_base64(jpeg_buffer.getvalue())
	    except Exception as e:
	        # Deliberate best-effort fallback: whatever went wrong (unreadable image,
	        # imaging library unavailable), send the original bytes as they are.
	        print(f"[WARNING] Could not process image with PIL: {e}. Using original bytes.")
	        fallback_type = content_type or "application/octet-stream"
	        b64 = base64.b64encode(file_bytes).decode("utf-8")
	        data_url = f"data:{fallback_type};base64,{b64}"

	    print(f"[DEBUG] Encoding image file. Content type: {content_type}, Size: {len(file_bytes)} bytes")

	    return [{
	        "type": "image_url",
	        "image_url": {"url": data_url},
	    }]


	async def extract_fields_from_document(
	    file_bytes: bytes,
	    content_type: str,
	    filename: str,
	) -> Dict[str, Any]:
	    """
	    Send a document to the vision model via OpenRouter and return its parsed JSON.

	    The file is converted to one image block per page, sent together with a
	    prompt asking for JSON only, and the reply is parsed leniently (plain JSON,
	    JSON inside a markdown code fence, or JSON embedded in surrounding text).

	    Args:
	        file_bytes: Raw bytes of the uploaded document.
	        content_type: MIME type of the upload; selects PDF vs. image handling.
	        filename: Name of the uploaded file. Currently unused; kept so the
	            signature stays stable for callers.

	    Returns:
	        The JSON object produced by the model. If no JSON object can be
	        recovered from the reply, a fallback dictionary with ``doc_type``
	        ``"other"`` and the first 1000 characters of the raw reply under
	        ``fields["raw_response"]``.

	    Raises:
	        RuntimeError: If ``OPENROUTER_API_KEY`` is not set.
	        ValueError: If no images could be produced from the file, or the API
	            response has no choices or an empty message.
	    """
	    if not OPENROUTER_API_KEY:
	        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")

	    # Convert file to image blocks (handles PDF conversion)
	    image_blocks = _file_to_image_blocks(file_bytes, content_type)
	    if not image_blocks:
	        raise ValueError("No images generated from file")

	    print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")

	    system_prompt = (
	        "You are a document extraction engine with vision capabilities. "
	        "You read and extract text from documents in any language, preserving structure, formatting, and all content. "
	        "You output structured JSON with both the full extracted text and key-value pairs."
	    )
	    user_prompt = _build_user_prompt(len(image_blocks))

	    # The user message is the text prompt followed by every page image.
	    user_content = [{"type": "text", "text": user_prompt}]
	    user_content.extend(image_blocks)

	    payload: Dict[str, Any] = {
	        "model": MODEL_NAME,
	        "messages": [
	            {
	                "role": "system",
	                "content": [{"type": "text", "text": system_prompt}],
	            },
	            {
	                "role": "user",
	                "content": user_content,
	            },
	        ],
	        "max_tokens": 8192,  # Room for full text extraction from multi-page documents
	    }

	    headers = {
	        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
	        "Content-Type": "application/json",
	        # Optional attribution headers
	        "HTTP-Referer": os.environ.get(
	            "APP_URL",
	            "https://huggingface.co/spaces/your-space",
	        ),
	        "X-Title": "Document Capture Demo",
	    }

	    async with httpx.AsyncClient(timeout=120) as client:
	        resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
	        resp.raise_for_status()
	        data = resp.json()

	    # OpenRouter returns choices[0].message.content
	    choices = data.get("choices") if isinstance(data, dict) else None
	    if not choices:
	        raise ValueError("No choices in OpenRouter response")

	    content = choices[0]["message"]["content"]

	    # Log the raw response for debugging (first 500 chars)
	    print(f"[DEBUG] OpenRouter response preview: {str(content)[:500]}")

	    text = _content_to_text(content)
	    if not text or not text.strip():
	        raise ValueError("Empty response from OpenRouter API")

	    return _parse_model_json(text)


	def _build_user_prompt(page_count: int) -> str:
	    """Return the extraction instructions for a document with *page_count* pages."""
	    # Parts of the requested JSON shape shared by both prompt variants.
	    schema_head = (
	        "Respond with JSON in this format:\n"
	        "{\n"
	        ' "doc_type": "invoice | receipt | contract | report | notice | other",\n'
	        ' "confidence": number between 0 and 100,\n'
	    )
	    field_schema = (
	        ' "fields": {\n'
	        ' "invoice_number": "...",\n'
	        ' "date": "...",\n'
	        ' "due_date": "...",\n'
	        ' "total_amount": "...",\n'
	        ' "currency": "...",\n'
	        ' "vendor_name": "...",\n'
	        ' "company_name": "...",\n'
	        ' "address": "...",\n'
	        ' "line_items": [\n'
	        ' {"description": "...", "quantity": "...", "unit_price": "...", "line_total": "..."}\n'
	        ' ],\n'
	        ' "other_field": "..."\n'
	    )

	    if page_count > 1:
	        # Multi-page documents additionally get a per-page text breakdown.
	        parts = [
	            f"Read this {page_count}-page document using your vision capability and extract ALL text content. ",
	            "I want the complete end-to-end text from all pages, preserving structure, headings, formatting, and content in all languages.\n\n",
	            "Analyze ALL pages thoroughly, including any non-English text (Punjabi, Hindi, or other languages). ",
	            "Extract every word, number, and piece of information from every page.\n\n",
	            schema_head,
	            ' "full_text": "Complete extracted text from all pages, preserving structure and formatting. Include all languages.",\n',
	            field_schema,
	            " },\n",
	            ' "pages": [\n',
	            ' {"page_number": 1, "text": "Full text from page 1"},\n',
	            ' {"page_number": 2, "text": "Full text from page 2"}\n',
	            ' ]\n',
	            "}\n\n",
	            "IMPORTANT:\n",
	            "- Extract ALL text from ALL pages, including non-English languages\n",
	            "- Preserve structure, headings, and formatting in the full_text field\n",
	            "- Fill in fields with relevant extracted information\n",
	            "- If a field is not found, use empty string or omit it\n",
	            "- The full_text should contain everything readable from the document",
	        ]
	    else:
	        parts = [
	            "Read this document using your vision capability and extract ALL text content. ",
	            "I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n",
	            "Extract every word, number, and piece of information, including any non-English text.\n\n",
	            schema_head,
	            ' "full_text": "Complete extracted text, preserving structure and formatting. Include all languages.",\n',
	            field_schema,
	            " }\n",
	            "}\n\n",
	            "IMPORTANT:\n",
	            "- Extract ALL text, including non-English languages\n",
	            "- Preserve structure, headings, and formatting in the full_text field\n",
	            "- Fill in fields with relevant extracted information\n",
	            "- If a field is not found, use empty string or omit it",
	        ]
	    return "".join(parts)


	def _content_to_text(content: Any) -> Any:
	    """Flatten a message ``content`` (string or list of content blocks) to text."""
	    if isinstance(content, list):
	        # Keep only the text blocks; ignore anything that is not a dict.
	        return "".join(
	            part.get("text", "")
	            for part in content
	            if isinstance(part, dict) and part.get("type") == "text"
	        )
	    return content


	def _load_json_object(candidate: str, label: str) -> Any:
	    """Parse *candidate* as a JSON object; return None (and log) when it is not one."""
	    try:
	        parsed = json.loads(candidate)
	    except json.JSONDecodeError as e:
	        print(f"[DEBUG] {label} parse failed: {e}")
	        return None
	    if not isinstance(parsed, dict):
	        # Callers expect a JSON object; a bare list/number/string is not usable.
	        print(f"[DEBUG] {label} parse failed: top-level JSON value is not an object")
	        return None
	    return parsed


	def _parse_model_json(text: str) -> Dict[str, Any]:
	    """Recover a JSON object from model output, trying progressively looser forms."""
	    # 1) The whole reply is JSON.
	    parsed = _load_json_object(text, "Direct JSON")
	    if parsed is not None:
	        print(f"[DEBUG] Successfully parsed JSON directly")
	        return parsed

	    # 2) JSON wrapped in a markdown code fence. The non-greedy body stops at the
	    #    first closing brace that is followed by the closing fence.
	    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
	    if fenced:
	        parsed = _load_json_object(fenced.group(1), "Markdown code block")
	        if parsed is not None:
	            print(f"[DEBUG] Successfully parsed JSON from markdown code block")
	            return parsed

	    # 3) Anything between the first "{" and the last "}" in the reply.
	    braced = re.search(r"\{.*\}", text, re.DOTALL)
	    if braced:
	        parsed = _load_json_object(braced.group(0), "Regex match")
	        if parsed is not None:
	            print(f"[DEBUG] Successfully parsed JSON from regex match")
	            return parsed

	    # If all parsing fails, return a default structure with the raw text
	    print(f"[WARNING] All JSON parsing attempts failed. Returning fallback structure.")
	    return {
	        "doc_type": "other",
	        "confidence": 50.0,
	        "fields": {
	            "raw_response": text[:1000],  # First 1000 chars for debugging
	            "error": "Could not parse JSON from model response",
	            "note": "Check server logs for full response",
	        },
	    }