Seth0330 committed on
Commit
dd82407
·
verified ·
1 Parent(s): 6f6e8af

Update backend/app/openrouter_client.py

Browse files
Files changed (1) hide show
  1. backend/app/openrouter_client.py +293 -8
backend/app/openrouter_client.py CHANGED
@@ -20,6 +20,14 @@ OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
20
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
21
  MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
22
 
 
 
 
 
 
 
 
 
23
 
24
  def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
25
  """
@@ -127,25 +135,302 @@ def _file_to_image_blocks(file_bytes: bytes, content_type: str) -> List[Dict[str
127
  }]
128
 
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  async def extract_fields_from_document(
131
  file_bytes: bytes,
132
  content_type: str,
133
  filename: str,
134
  ) -> Dict[str, Any]:
135
  """
136
- Call OpenRouter with Qwen3-VL and return parsed JSON with fields.
137
- We instruct the model to return JSON only.
138
  """
139
- if not OPENROUTER_API_KEY:
140
- raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")
141
-
142
  # Convert file to image blocks (handles PDF conversion)
143
- image_blocks = _file_to_image_blocks(file_bytes, content_type)
144
 
145
- if not image_blocks:
146
  raise ValueError("No images generated from file")
147
 
148
- print(f"[INFO] Generated {len(image_blocks)} image block(s) for processing")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  system_prompt = (
151
  "You are a document extraction engine with vision capabilities. "
 
20
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/chat/completions"
21
  MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
22
 
23
# HuggingFace Inference API configuration.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
# Alternative vision model served through the HF Inference API.
HF_MODEL_NAME = os.getenv("HF_MODEL_NAME", "Qwen/Qwen2-VL-7B-Instruct")

# Which extraction backend to use: "openrouter" (default) or "huggingface".
EXTRACTION_BACKEND = os.getenv("EXTRACTION_BACKEND", "openrouter").lower()
30
+
31
 
32
  def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
33
  """
 
135
  }]
136
 
137
 
138
async def _extract_single_page(
    image_bytes: bytes,
    page_num: int,
    total_pages: int,
    backend: "str | None" = None,
) -> Dict[str, Any]:
    """
    Extract text/fields from a single page image via the selected backend.

    Pages are processed one at a time to keep request payloads small.

    Args:
        image_bytes: Raw bytes of one page image.
        page_num: 1-based page number (used for logging/prompting).
        total_pages: Total page count of the source document.
        backend: "huggingface" or "openrouter"; falls back to the
            module-level EXTRACTION_BACKEND when None.

    Returns:
        Parsed extraction dict for this page.
    """
    backend = backend or EXTRACTION_BACKEND

    if backend == "huggingface":
        return await _extract_with_hf(image_bytes, page_num, total_pages)
    # Anything other than "huggingface" routes to OpenRouter (the default).
    return await _extract_with_openrouter_single(image_bytes, page_num, total_pages)
149
+
150
+
151
async def extract_fields_from_document(
    file_bytes: bytes,
    content_type: str,
    filename: str,
) -> Dict[str, Any]:
    """
    Extract fields from a document, processing each page separately.

    Supports both the OpenRouter and HuggingFace Inference API backends
    (chosen by EXTRACTION_BACKEND via _extract_single_page).

    Args:
        file_bytes: Raw bytes of the uploaded file (PDF or image).
        content_type: MIME type of the file (e.g. "application/pdf").
        filename: Original filename; kept for the public interface.

    Returns:
        Dict with combined "doc_type", "confidence", "full_text" and
        "fields", plus a per-page "pages" list (failed pages carry an
        "error" entry instead of text).

    Raises:
        ValueError: If no page images could be produced from the file.
    """
    # Rasterize PDFs exactly once. Previously the file was converted twice
    # (once inside _file_to_image_blocks, then again via _pdf_to_images),
    # doubling the PDF rendering work. "application/pdf" already ends with
    # "/pdf", so a single suffix check suffices.
    # NOTE(review): _file_to_image_blocks used to pre-validate the upload;
    # if it rejected unsupported content types, re-add that check — TODO confirm.
    if content_type.endswith("/pdf"):
        image_bytes_list = _pdf_to_images(file_bytes)
    else:
        # Regular images are treated as a single page.
        image_bytes_list = [file_bytes]

    if not image_bytes_list:
        raise ValueError("No images generated from file")

    total_pages = len(image_bytes_list)
    print(f"[INFO] Processing {total_pages} page(s) separately for better reliability...")

    # Process pages independently so one oversized or failing page does not
    # abort the whole document.
    page_results = []
    for page_num, img_bytes in enumerate(image_bytes_list):
        print(f"[INFO] Processing page {page_num + 1}/{total_pages}...")
        try:
            page_result = await _extract_single_page(img_bytes, page_num + 1, total_pages)
            page_results.append({
                "page_number": page_num + 1,
                "text": page_result.get("full_text", ""),
                "fields": page_result.get("fields", {}),
                "confidence": page_result.get("confidence", 0),
                "doc_type": page_result.get("doc_type", "other"),
            })
            print(f"[INFO] Page {page_num + 1} processed successfully")
        except Exception as e:
            # Best-effort: record the failure and continue with remaining pages.
            print(f"[ERROR] Failed to process page {page_num + 1}: {e}")
            page_results.append({
                "page_number": page_num + 1,
                "text": "",
                "fields": {},
                "confidence": 0,
                "error": str(e),
            })

    # Stitch page texts back together with explicit page markers.
    combined_full_text = "\n\n".join(
        f"=== PAGE {p['page_number']} ===\n\n{p['text']}"
        for p in page_results
        if p.get("text")
    )

    # Merge fields across pages; the first non-empty value for a key wins.
    combined_fields: Dict[str, Any] = {}
    for page_result in page_results:
        for key, value in page_result.get("fields", {}).items():
            if value and not combined_fields.get(key):
                combined_fields[key] = value

    # Average confidence over pages that actually reported a positive score.
    confidences = [p.get("confidence", 0) for p in page_results if p.get("confidence", 0) > 0]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

    # Use the doc_type of the first page classified as something more
    # specific than "other".
    doc_type = "other"
    for page_result in page_results:
        if page_result.get("doc_type") and page_result["doc_type"] != "other":
            doc_type = page_result["doc_type"]
            break

    return {
        "doc_type": doc_type,
        "confidence": avg_confidence,
        "full_text": combined_full_text,
        "fields": combined_fields,
        "pages": page_results,
    }
231
+
232
+
233
async def _extract_with_openrouter_single(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
    """
    Extract text/fields from a single page image via OpenRouter.

    Args:
        image_bytes: Raw bytes of one page image.
        page_num: 1-based page number (mentioned in the prompt and logs).
        total_pages: Total page count of the source document.

    Returns:
        Parsed extraction dict from _parse_model_response.

    Raises:
        RuntimeError: If the API key is missing, the request times out, the
            server returns an error status, or the call fails otherwise.
        ValueError: If the response contains no choices.
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")

    # Build a single image content part as a base64 data URL.
    data_url = _image_bytes_to_base64(image_bytes)
    image_block = {
        "type": "image_url",
        "image_url": {"url": data_url},
    }

    system_prompt = (
        "You are a document extraction engine with vision capabilities. "
        "You read and extract text from documents in any language, preserving structure, formatting, and all content. "
        "You output structured JSON with both the full extracted text and key-value pairs."
    )

    user_prompt = (
        f"Read this document page ({page_num} of {total_pages}) using your vision capability and extract ALL text content. "
        "I want the complete end-to-end text, preserving structure, headings, formatting, and content in all languages.\n\n"
        "Extract every word, number, and piece of information, including any non-English text (Punjabi, Hindi, etc.).\n\n"
        "Respond with JSON in this format:\n"
        "{\n"
        '  "doc_type": "invoice | receipt | contract | report | notice | other",\n'
        '  "confidence": number between 0 and 100,\n'
        '  "full_text": "Complete extracted text from this page, preserving structure and formatting. Include all languages.",\n'
        '  "fields": {\n'
        '    "invoice_number": "...",\n'
        '    "date": "...",\n'
        '    "company_name": "...",\n'
        '    "address": "...",\n'
        '    "other_field": "..."\n'
        "  }\n"
        "}\n\n"
        "IMPORTANT:\n"
        "- Extract ALL text from this page, including non-English languages\n"
        "- Preserve structure, headings, and formatting\n"
        "- Fill in fields with relevant extracted information\n"
        "- If a field is not found, use empty string or omit it"
    )

    payload: Dict[str, Any] = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_prompt},
                    image_block,
                ],
            },
        ],
        "max_tokens": 4096,  # smaller budget is enough for a single page
    }

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": os.environ.get("APP_URL", "https://huggingface.co/spaces/your-space"),
        "X-Title": "Document Capture Demo",
    }

    payload_size_mb = len(json.dumps(payload).encode('utf-8')) / 1024 / 1024
    print(f"[INFO] OpenRouter: Processing page {page_num}, payload: {payload_size_mb:.2f} MB")

    try:
        timeout = httpx.Timeout(180.0, connect=30.0)  # 3 min per page
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(OPENROUTER_BASE_URL, headers=headers, json=payload)
            resp.raise_for_status()
            data = resp.json()
    except httpx.TimeoutException as e:
        raise RuntimeError(f"OpenRouter API timed out for page {page_num}") from e
    except httpx.HTTPStatusError as e:
        # Surface the response body snippet: OpenRouter puts the actual
        # error reason there, which the generic wrapper used to discard.
        raise RuntimeError(
            f"OpenRouter API error for page {page_num}: "
            f"HTTP {e.response.status_code}: {e.response.text[:300]}"
        ) from e
    except Exception as e:
        raise RuntimeError(f"OpenRouter API error for page {page_num}: {str(e)}") from e

    if "choices" not in data or len(data["choices"]) == 0:
        raise ValueError(f"No choices in OpenRouter response for page {page_num}")

    # Content may be a plain string or a list of typed parts.
    content = data["choices"][0]["message"]["content"]
    if isinstance(content, list):
        text = "".join(part.get("text", "") for part in content if part.get("type") == "text")
    else:
        text = content

    # Parse (and, if needed, repair) the model's JSON response.
    return _parse_model_response(text, page_num)
325
+
326
+
327
async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
    """
    Extract text/fields from a single page image via the HuggingFace
    Inference API (chat-completion endpoint of a vision model).

    Args:
        image_bytes: Raw bytes of one page image.
        page_num: 1-based page number (mentioned in the prompt and logs).
        total_pages: Total page count of the source document.

    Returns:
        Parsed extraction dict from _parse_model_response.

    Raises:
        RuntimeError: If HF_TOKEN is missing, huggingface_hub is not
            installed, or the API call fails.
    """
    if not HF_TOKEN:
        raise RuntimeError("HF_TOKEN environment variable is not set")

    try:
        from huggingface_hub import InferenceClient
    except ImportError:
        raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt")

    import base64

    client = InferenceClient(model=HF_MODEL_NAME, token=HF_TOKEN)

    prompt = (
        f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
        "Extract every word, number, and piece of information, including any non-English text. "
        "Return JSON with 'full_text', 'doc_type', 'confidence', and 'fields'."
    )

    # The chat-completion API is OpenAI-compatible: images must be sent as an
    # "image_url" content part (a base64 data URL works). Raw bytes are not
    # JSON-serializable and would fail at request time.
    # Sniff JPEG magic bytes; anything else is assumed PNG (the pdf2image
    # path produces PNG/JPEG — TODO confirm against _pdf_to_images).
    mime = "image/jpeg" if image_bytes[:3] == b"\xff\xd8\xff" else "image/png"
    data_url = f"data:{mime};base64,{base64.b64encode(image_bytes).decode('ascii')}"

    print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")

    try:
        result = client.chat_completion(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
            max_tokens=2048,
        )

        # huggingface_hub normally returns a ChatCompletionOutput object;
        # older/raw deployments may yield plain dicts or strings, so handle
        # all three shapes instead of str()-ing the object.
        choices = getattr(result, "choices", None)
        if choices:
            response_text = getattr(choices[0].message, "content", "") or ""
        elif isinstance(result, dict):
            if result.get("choices"):
                response_text = result["choices"][0].get("message", {}).get("content", "")
            else:
                response_text = result.get("generated_text", str(result))
        elif isinstance(result, str):
            response_text = result
        else:
            response_text = str(result)

        if not response_text:
            raise ValueError("Empty response from HuggingFace API")

        return _parse_model_response(response_text, page_num)
    except Exception as e:
        print(f"[ERROR] HuggingFace API error details: {type(e).__name__}: {str(e)}")
        raise RuntimeError(f"HuggingFace API error for page {page_num}: {str(e)}") from e
381
+
382
+
383
+ def _parse_model_response(text: str, page_num: int = None) -> Dict[str, Any]:
384
+ """Parse JSON response from model, handling truncation and errors."""
385
+ if not text or not text.strip():
386
+ raise ValueError("Empty response from model")
387
+
388
+ # Try to parse JSON
389
+ try:
390
+ parsed = json.loads(text)
391
+ print(f"[DEBUG] Successfully parsed JSON for page {page_num or 'single'}")
392
+ return parsed
393
+ except json.JSONDecodeError as e:
394
+ print(f"[DEBUG] Direct JSON parse failed: {e}")
395
+
396
+ # Try to extract JSON from markdown code blocks
397
+ json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
398
+ if json_match:
399
+ try:
400
+ return json.loads(json_match.group(1))
401
+ except json.JSONDecodeError:
402
+ pass
403
+
404
+ # Try to find JSON object
405
+ json_match = re.search(r'\{.*\}', text, re.DOTALL)
406
+ if json_match:
407
+ try:
408
+ fixed_json = _fix_truncated_json(json_match.group(0))
409
+ return json.loads(fixed_json)
410
+ except Exception:
411
+ pass
412
+
413
+ # Extract full_text even from truncated JSON
414
+ full_text_match = re.search(r'"full_text"\s*:\s*"(.*?)(?:"\s*[,}]|$)', text, re.DOTALL)
415
+ if full_text_match:
416
+ full_text = (full_text_match.group(1)
417
+ .replace('\\n', '\n')
418
+ .replace('\\"', '"')
419
+ .replace('\\\\', '\\'))
420
+ return {
421
+ "doc_type": "other",
422
+ "confidence": 90.0,
423
+ "full_text": full_text,
424
+ "fields": {"full_text": full_text}
425
+ }
426
+
427
+ # Last resort: return raw text
428
+ return {
429
+ "doc_type": "other",
430
+ "confidence": 50.0,
431
+ "full_text": text[:2000],
432
+ "fields": {"raw_text": text[:2000]}
433
+ }
434
 
435
  system_prompt = (
436
  "You are a document extraction engine with vision capabilities. "