import os
import time
import traceback
from typing import List, Dict

from fastapi import FastAPI, UploadFile, File, Depends, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from sqlalchemy.orm import Session

from .db import Base, engine, SessionLocal
from .models import ExtractionRecord
from .schemas import ExtractionRecordBase, ExtractionStage
from .openrouter_client import extract_fields_from_document
# Ensure data dir exists for SQLite
os.makedirs("data", exist_ok=True)

# Create tables
Base.metadata.create_all(bind=engine)

app = FastAPI(title="Document Capture Demo – Backend")

# CORS: wide open for the demo; tighten allow_origins later
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
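# A minimal sketch of tightening CORS later, assuming a comma-separated
# ALLOWED_ORIGINS env var (hypothetical; this app does not read it today):
#
#     allowed = os.environ.get("ALLOWED_ORIGINS", "").split(",")
#     app.add_middleware(CORSMiddleware, allow_origins=allowed, allow_credentials=True)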
def get_db():
    """Yield a per-request SQLAlchemy session; FastAPI closes it after the response."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
@app.get("/api/ping")  # assumed route path; API routes sit under /api/, per the catch-all below
def ping():
    """Healthcheck."""
    return {"status": "ok", "message": "backend alive"}
def make_stages(total_ms: int, status: str) -> Dict[str, ExtractionStage]:
    """
    Build synthetic stage timing data for the History UI.
    For now we just split total_ms into 4 stages.
    """
    if total_ms <= 0:
        total_ms = 1000
    return {
        "uploading": ExtractionStage(
            time=int(total_ms * 0.15),
            status="completed",
            variation="normal",
        ),
        "aiAnalysis": ExtractionStage(
            time=int(total_ms * 0.55),
            status="completed" if status == "completed" else "failed",
            variation="normal",
        ),
        "dataExtraction": ExtractionStage(
            time=int(total_ms * 0.2),
            status="completed" if status == "completed" else "skipped",
            variation="fast",
        ),
        "outputRendering": ExtractionStage(
            time=int(total_ms * 0.1),
            status="completed" if status == "completed" else "skipped",
            variation="normal",
        ),
    }
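# Example: make_stages(2000, "completed") yields uploading=300 ms,
# aiAnalysis=1100 ms, dataExtraction=400 ms, outputRendering=200 ms, all
# "completed". On failure, aiAnalysis is "failed" and the later stages "skipped".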
@app.post("/api/extract")  # assumed route path, as above
async def extract_document(
    file: UploadFile = File(...),
    db: Session = Depends(get_db),
):
    """
    Main extraction endpoint used by the Dashboard.

    1) Read the uploaded file
    2) Call OpenRouter + Qwen3-VL
    3) Store a record in SQLite
    4) Return extraction result + metadata
    """
    start = time.time()
    content = await file.read()
    content_type = file.content_type or "application/octet-stream"
    size_mb = len(content) / 1024 / 1024
    size_str = f"{size_mb:.2f} MB"

    try:
        print(f"[INFO] Starting extraction for file: {file.filename}, type: {content_type}, size: {size_str}")
        extracted = await extract_fields_from_document(content, content_type, file.filename)
        total_ms = int((time.time() - start) * 1000)

        print(f"[INFO] Extraction completed. Response keys: {list(extracted.keys())}")
        print(f"[INFO] Fields extracted: {extracted.get('fields', {})}")

        confidence = float(extracted.get("confidence", 90))
        fields = extracted.get("fields", {})

        # Include full_text in fields if present (for frontend display)
        full_text = extracted.get("full_text", "")
        if full_text:
            fields["full_text"] = full_text
            full_text_words = len(str(full_text).split())
            print(f"[INFO] Full text extracted: {full_text_words} words")

        # Also check for pages array
        pages_data = extracted.get("pages", [])
        if pages_data and isinstance(pages_data, list):
            print(f"[INFO] Extracted text from {len(pages_data)} page(s)")
            # Add pages to fields for frontend
            fields["pages"] = pages_data

        # Count fields, including full_text if present
        fields_extracted = len(fields) if isinstance(fields, dict) else 0
        print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}")

        status = "completed"
        error_message = None
    except Exception as e:
        total_ms = int((time.time() - start) * 1000)
        confidence = 0.0
        fields = {}
        fields_extracted = 0
        status = "failed"
        error_message = str(e)
        print(f"[ERROR] Extraction failed: {error_message}")
        print(f"[ERROR] Traceback: {traceback.format_exc()}")
    # Save record to DB
    rec = ExtractionRecord(
        file_name=file.filename,
        file_type=content_type,
        file_size=size_str,
        status=status,
        confidence=confidence,
        fields_extracted=fields_extracted,
        total_time_ms=total_ms,
        raw_output=str(fields),
        error_message=error_message,
    )
    db.add(rec)
    db.commit()
    db.refresh(rec)

    stages = make_stages(total_ms, status)

    # Response shape that the frontend will consume
    return {
        "id": rec.id,
        "fileName": rec.file_name,
        "fileType": rec.file_type,
        "fileSize": rec.file_size,
        "status": status,
        "confidence": confidence,
        "fieldsExtracted": fields_extracted,
        "totalTime": total_ms,
        "fields": fields,
        "stages": {k: v.dict() for k, v in stages.items()},
        "errorMessage": error_message,
    }
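# Example call (path and port are assumptions matching the decorator above):
#
#     curl -F "file=@invoice.pdf" http://localhost:8000/api/extract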
@app.get("/api/history")  # assumed route path, as above
def get_history(db: Session = Depends(get_db)):
    """
    Used by the History page.
    Returns the last 100 records, with synthetic stage data.
    """
    recs = (
        db.query(ExtractionRecord)
        .order_by(ExtractionRecord.created_at.desc())
        .limit(100)
        .all()
    )
    output: List[ExtractionRecordBase] = []
    for r in recs:
        stages = make_stages(r.total_time_ms or 1000, r.status or "completed")
        output.append(
            ExtractionRecordBase(
                id=r.id,
                fileName=r.file_name,
                fileType=r.file_type or "",
                fileSize=r.file_size or "",
                extractedAt=r.created_at,
                status=r.status or "completed",
                confidence=r.confidence or 0.0,
                fieldsExtracted=r.fields_extracted or 0,
                totalTime=r.total_time_ms or 0,
                stages=stages,
                errorMessage=r.error_message,
            )
        )
    return output
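# FastAPI serializes the returned Pydantic models itself; declaring
# response_model=List[ExtractionRecordBase] on the decorator (not present in
# the original) would additionally document the schema in /docs.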
# Static frontend mounting (used after we build React)
# Dockerfile copies the Vite build into backend/frontend_dist
# IMPORTANT: API routes must be defined BEFORE this so they take precedence
frontend_dir = os.path.join(
    os.path.dirname(os.path.dirname(__file__)), "frontend_dist"
)

if os.path.isdir(frontend_dir):
    # Serve static files (JS, CSS, images, etc.) from assets directory
    assets_dir = os.path.join(frontend_dir, "assets")
    if os.path.isdir(assets_dir):
        app.mount(
            "/assets",
            StaticFiles(directory=assets_dir),
            name="assets",
        )
# Catch-all route to serve index.html for React Router
# This must be last so API routes are matched first
@app.get("/{full_path:path}")  # catch-all path pattern (reconstructed)
async def serve_frontend(full_path: str):
    """
    Serve the React app for all non-API routes.
    React Router will handle client-side routing.
    """
    # Skip API routes, docs, and static assets
    if (
        full_path.startswith("api/")
        or full_path.startswith("docs")
        or full_path.startswith("openapi.json")
        or full_path.startswith("assets/")
    ):
        raise HTTPException(status_code=404)

    # Serve index.html for all other routes
    index_path = os.path.join(frontend_dir, "index.html")
    if os.path.exists(index_path):
        return FileResponse(index_path)
    raise HTTPException(status_code=404)
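# Example of the catch-all in action: GET /history (a React Router page)
# returns frontend_dist/index.html, while GET /api/history is handled by the
# API route above, since routes are matched in registration order.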