Seth0330 committed
Commit 88c325b · verified · 1 Parent(s): 4eec2ab

Update backend/app/openrouter_client.py

Files changed (1):
  backend/app/openrouter_client.py  +100 -2
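For context, this commit adds an OpenAI backend alongside the existing OpenRouter and HuggingFace paths, selected via environment variables read at module import time. A minimal sketch of enabling it — the import path and the (file_bytes, content_type) signature of extract_fields_from_document are assumptions inferred from the hunks below; the full signature is truncated in the diff:

import asyncio
import os

# These variables are introduced/extended by this commit and are read at module
# import time, so they must be set before the module is imported.
os.environ["EXTRACTION_BACKEND"] = "openai"
os.environ["OPENAI_API_KEY"] = "sk-..."     # your real key
os.environ["OPENAI_MODEL_NAME"] = "gpt-4o"  # optional; "gpt-4o" is the default

from backend.app.openrouter_client import extract_fields_from_document  # assumed import path

async def main() -> None:
    with open("sample.pdf", "rb") as f:
        file_bytes = f.read()
    # Signature assumed from the hunks below: raw file bytes plus MIME type.
    result = await extract_fields_from_document(file_bytes, "application/pdf")
    print(result)

asyncio.run(main())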
backend/app/openrouter_client.py CHANGED
@@ -25,7 +25,12 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
 HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "Qwen/Qwen3-VL-235B-A22B-Instruct") # Default HF model
 
-# Backend selection: "openrouter" or "huggingface"
+# OpenAI API
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+OPENAI_BASE_URL = "https://api.openai.com/v1/chat/completions"
+OPENAI_MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME", "gpt-4o") # Default OpenAI vision model
+
+# Backend selection: "openrouter", "huggingface", or "openai"
 EXTRACTION_BACKEND = os.environ.get("EXTRACTION_BACKEND", "openrouter").lower()
 
 
@@ -144,6 +149,8 @@ async def _extract_single_page(image_bytes: bytes, page_num: int, total_pages: i
 
     if backend == "huggingface":
         return await _extract_with_hf(image_bytes, page_num, total_pages)
+    elif backend == "openai":
+        return await _extract_with_openai_single(image_bytes, page_num, total_pages)
     else:
         return await _extract_with_openrouter_single(image_bytes, page_num, total_pages)
 
@@ -155,7 +162,7 @@ async def extract_fields_from_document(
 ) -> Dict[str, Any]:
     """
     Extract fields from document. Processes pages separately for better reliability.
-    Supports both OpenRouter and HuggingFace Inference API.
+    Supports OpenRouter, HuggingFace Inference API, and OpenAI Vision API.
     """
     # Convert file to image blocks (handles PDF conversion)
     image_blocks_data = _file_to_image_blocks(file_bytes, content_type)
@@ -324,6 +331,97 @@ async def _extract_with_openrouter_single(image_bytes: bytes, page_num: int, tot
     return _parse_model_response(text, page_num)
 
 
+async def _extract_with_openai_single(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
+    """Extract from a single page using OpenAI GPT-4o Vision API."""
+    if not OPENAI_API_KEY:
+        raise RuntimeError("OPENAI_API_KEY environment variable is not set")
+
+    # Create single image block
+    data_url = _image_bytes_to_base64(image_bytes)
+    image_block = {
+        "type": "image_url",
+        "image_url": {"url": data_url}
+    }
+
+    system_prompt = (
+        "You are a document extraction engine with vision capabilities. "
+        "You read and extract text from documents in any language, preserving structure, formatting, and all content. "
+        "You output structured JSON with both the full extracted text and key-value pairs."
+    )
+
+    user_prompt = (
+        f"Read this document page ({page_num} of {total_pages}) using your vision capability and extract ALL text content. "
+        "I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n"
+        "Extract every word, number, and piece of information, including any non-English text (Punjabi, Hindi, etc.).\n\n"
+        "Respond with JSON in this format:\n"
+        "{\n"
+        '  "doc_type": "invoice | receipt | contract | report | notice | other",\n'
+        '  "confidence": number between 0 and 100,\n'
+        '  "full_text": "Complete extracted text from this page, preserving structure and formatting. Include all languages.",\n'
+        '  "fields": {\n'
+        '    "invoice_number": "...",\n'
+        '    "date": "...",\n'
+        '    "company_name": "...",\n'
+        '    "address": "...",\n'
+        '    "other_field": "..."\n'
+        "  }\n"
+        "}\n\n"
+        "IMPORTANT:\n"
+        "- Extract ALL text from this page, including non-English languages\n"
+        "- Preserve structure, headings, and formatting\n"
+        "- Fill in fields with relevant extracted information\n"
+        "- If a field is not found, use empty string or omit it"
+    )
+
+    payload: Dict[str, Any] = {
+        "model": OPENAI_MODEL_NAME,
+        "messages": [
+            {
+                "role": "system",
+                "content": system_prompt,
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": user_prompt},
+                    image_block
+                ],
+            },
+        ],
+        "max_tokens": 4096, # Similar to OpenRouter
+        "temperature": 0.1, # Lower temperature for more consistent extraction
+    }
+
+    headers = {
+        "Authorization": f"Bearer {OPENAI_API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    payload_size_mb = len(json.dumps(payload).encode('utf-8')) / 1024 / 1024
+    print(f"[INFO] OpenAI: Processing page {page_num} with model {OPENAI_MODEL_NAME}, payload: {payload_size_mb:.2f} MB")
+
+    try:
+        timeout = httpx.Timeout(180.0, connect=30.0) # 3 min per page
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            resp = await client.post(OPENAI_BASE_URL, headers=headers, json=payload)
+            resp.raise_for_status()
+            data = resp.json()
+    except httpx.TimeoutException:
+        raise RuntimeError(f"OpenAI API timed out for page {page_num}")
+    except Exception as e:
+        error_msg = str(e)
+        print(f"[ERROR] OpenAI API error details: {type(e).__name__}: {error_msg}")
+        raise RuntimeError(f"OpenAI API error for page {page_num}: {error_msg}")
+
+    if "choices" not in data or len(data["choices"]) == 0:
+        raise ValueError(f"No choices in OpenAI response for page {page_num}")
+
+    response_text = data["choices"][0]["message"]["content"]
+    print(f"[DEBUG] OpenAI response preview: {response_text[:500]}")
+
+    return _parse_model_response(response_text, page_num)
+
+
 async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
     """Extract from a single page using HuggingFace Inference API (router endpoint)."""
    if not HF_TOKEN:
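One dependency of the new code path is worth noting: _extract_with_openai_single reuses _image_bytes_to_base64, which this diff does not show. Because its result is passed straight through as image_url.url, it must return a complete data URL rather than bare base64 for the OpenAI vision endpoint to accept it. A plausible sketch, assuming PNG page images — the real helper lives elsewhere in this module and may differ:

import base64

def _image_bytes_to_base64(image_bytes: bytes) -> str:
    """Encode raw image bytes as a data URL usable as image_url.url.

    Assumes PNG; the actual helper in this module may detect or use a
    different MIME type.
    """
    b64 = base64.b64encode(image_bytes).decode("ascii")
    return f"data:image/png;base64,{b64}"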