r"""BabelDOC with Agentic AI - Modal Deployment.

PDF translation API with layout preservation.
20-page limit during test phase.

Setup:
    modal secret create babeldocs-secrets \
        NEBIUS_API_KEY=your_key \
        NEBIUS_API_BASE=https://api.tokenfactory.nebius.com/v1/ \
        NEBIUS_TRANSLATION_MODEL=openai/gpt-oss-120b

    NEBIUS_VISION_MODEL is also read from the environment; when it is not
    set, Qwen/Qwen2.5-VL-72B-Instruct is used.

Deploy:
    modal deploy modal_deploy.py
"""

import base64
import os
import subprocess
import tempfile
import time
from pathlib import Path
from typing import Optional

import modal

THIS_DIR = Path(__file__).parent.resolve()
BABELDOC_DIR = THIS_DIR.parent / "BabelDOC"

# Max pages allowed (test phase limit)
MAX_PAGES = 20

# Modal app - custom name for hackathon
app = modal.App("mcp1stann-babeldocs")

# Image with uv and BabelDOC installed
babeldocs_image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install(
        "git",
        "libgl1-mesa-glx",
        "libglib2.0-0",
        "libsm6",
        "libxext6",
        "libxrender-dev",
        "libgomp1",
        "curl",
        "libspatialindex-dev",  # For rtree
        "libharfbuzz-dev",  # For uharfbuzz
        "libfreetype6-dev",  # For freetype-py
        "libopencv-dev",  # For opencv dependencies
        "libzstd-dev",  # For pyzstd
    )
    .pip_install("uv")
    .env({
        "PYTHONIOENCODING": "utf-8",
        "PYTHONUNBUFFERED": "1",
        "UV_SYSTEM_PYTHON": "1",
    })
    .pip_install("fastapi[standard]")
    .add_local_dir(
        str(BABELDOC_DIR),
        remote_path="/app/BabelDOC",
        copy=True,
    )
    .run_commands(
        "cd /app/BabelDOC && uv pip install -e . --python python3.11",
    )
)

# Volume for caching models and fonts
cache_volume = modal.Volume.from_name("babeldocs-cache", create_if_missing=True)
CACHE_PATH = "/cache"


def _build_command(
    input_path: Path,
    output_dir: Path,
    target_lang: str,
    pages: str,
    no_dual: bool,
    no_mono: bool,
) -> list:
    """Assemble the ``babeldoc`` argument list for one translation run.

    Model, base URL, API key and vision model are read from the NEBIUS_*
    environment variables, with the same defaults as before.
    """
    # NOTE(review): the API key is passed as a command-line argument, so it is
    # visible in the container's process list. Left as-is because this block
    # cannot confirm an alternative way to hand the key to babeldoc.
    cmd = [
        "babeldoc",
        "--files", str(input_path),
        "--output", str(output_dir),
        "--lang-out", target_lang,
        "--openai",
        "--openai-model", os.getenv("NEBIUS_TRANSLATION_MODEL", "openai/gpt-oss-120b"),
        "--openai-base-url", os.getenv("NEBIUS_API_BASE", "https://api.tokenfactory.nebius.com/v1/"),
        "--openai-api-key", os.getenv("NEBIUS_API_KEY", ""),
        "--no-watermark",
        "--translate-table-text",
        "--enhance-compatibility",
        # Enable image translation (orchestration PASS 2) with vision model
        "--vision-model", os.getenv("NEBIUS_VISION_MODEL", "Qwen/Qwen2.5-VL-72B-Instruct"),
    ]
    if pages:
        cmd.extend(["--pages", pages])
        cmd.append("--only-include-translated-page")
    if no_dual:
        cmd.append("--no-dual")
    if no_mono:
        cmd.append("--no-mono")
    return cmd


def _find_output_pdf(
    pdfs: list,
    target_lang: str,
    layout: str,
    *,
    with_images: bool,
) -> Optional[Path]:
    """Return the first PDF in *pdfs* matching the expected output name.

    Expected names:
        name.no_watermark.{lang}.{mono|dual}.pdf
        name.no_watermark.{lang}.{mono|dual}.images_translated.pdf

    *layout* is ``"mono"`` or ``"dual"``; *with_images* selects the
    ``images_translated`` variant. Returns ``None`` when nothing matches.
    """
    if with_images:
        marker = f".{target_lang}.{layout}.images_translated.pdf"
        return next((p for p in pdfs if marker in p.name), None)
    marker = f".{target_lang}.{layout}.pdf"
    return next(
        (p for p in pdfs if marker in p.name and "images_translated" not in p.name),
        None,
    )


def _attach_pdf(result_data: dict, prefix: str, pdf_path: Path) -> None:
    """Store *pdf_path* in *result_data* under the ``{prefix}_*`` keys.

    Writes ``{prefix}_pdf_base64`` and ``{prefix}_filename`` at the top level
    and ``{prefix}_size_bytes`` inside ``result_data["stats"]``.
    """
    payload = pdf_path.read_bytes()
    result_data[f"{prefix}_pdf_base64"] = base64.b64encode(payload).decode("utf-8")
    result_data[f"{prefix}_filename"] = pdf_path.name
    result_data["stats"][f"{prefix}_size_bytes"] = len(payload)


@app.cls(
    image=babeldocs_image,
    timeout=900,  # 15 minutes
    memory=8192,
    cpu=4,
    volumes={CACHE_PATH: cache_volume},
    secrets=[modal.Secret.from_name("babeldocs-secrets")],
    scaledown_window=300,  # Keep warm for 5 minutes
)
class BabelDocsTranslator:
    """Class-based translator for BabelDOC (based on working SVG generator pattern)."""

    def _count_pdf_pages(self, pdf_bytes: bytes) -> int:
        """Count pages in a PDF using PyMuPDF.

        Returns -1 when the count cannot be determined (PyMuPDF missing or the
        bytes cannot be opened). Callers compare the result against MAX_PAGES,
        so an unknown count does not trip the page limit.
        """
        try:
            import fitz  # PyMuPDF

            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                return len(doc)
            finally:
                # Release the document even if counting fails.
                doc.close()
        except Exception:
            return -1  # Unknown

    def _translate_internal(
        self,
        pdf_base64: str,
        target_lang: str = "fr",
        pages: str = "",
        no_dual: bool = False,
        no_mono: bool = False,
    ) -> dict:
        """BabelDOC with Agentic AI - Internal translation.

        Decodes the PDF, enforces the MAX_PAGES limit, runs the ``babeldoc``
        CLI in a temporary directory and returns the generated PDFs
        base64-encoded.

        Args:
            pdf_base64: Base64-encoded input PDF.
            target_lang: Language code passed to ``--lang-out``.
            pages: Page selection passed to ``--pages``; empty means all pages.
            no_dual: Pass ``--no-dual`` and omit dual PDFs from the result.
            no_mono: Pass ``--no-mono`` and omit mono PDFs from the result.

        Returns:
            A dict with ``success``. On success it also holds ``stats`` and,
            for each PDF found, ``{prefix}_pdf_base64`` / ``{prefix}_filename``
            where prefix is one of ``mono``, ``mono_img``, ``dual``,
            ``dual_img``. On failure it holds ``message`` (plus truncated
            ``stderr`` / ``stdout`` when the CLI exited non-zero). Exceptions
            are not propagated; they are reported as a failure dict.
        """
        try:
            if not pdf_base64:
                return {"success": False, "message": "No PDF provided"}

            pdf_bytes = base64.b64decode(pdf_base64)

            # Check page limit (test phase)
            page_count = self._count_pdf_pages(pdf_bytes)
            if page_count > MAX_PAGES:
                return {
                    "success": False,
                    "message": f"PDF has {page_count} pages. Maximum allowed: {MAX_PAGES} pages (test phase limit).",
                }

            with tempfile.TemporaryDirectory() as tmpdir:
                input_path = Path(tmpdir) / "input.pdf"
                output_dir = Path(tmpdir) / "output"
                output_dir.mkdir()
                input_path.write_bytes(pdf_bytes)

                cmd = _build_command(
                    input_path, output_dir, target_lang, pages, no_dual, no_mono
                )

                # Monotonic clock: the duration must not be affected by
                # wall-clock adjustments during a long run.
                started = time.monotonic()
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    encoding="utf-8",
                    errors="replace",
                    cwd="/app/BabelDOC",
                    env={
                        **os.environ,
                        "HF_HOME": CACHE_PATH,
                    },
                )
                duration = time.monotonic() - started

                if result.returncode != 0:
                    return {
                        "success": False,
                        "message": "Translation failed",
                        "stderr": result.stderr[:1000] if result.stderr else "",
                        "stdout": result.stdout[:500] if result.stdout else "",
                    }

                # Sorted so that the "first match" is the same on every run;
                # glob() yields entries in arbitrary order.
                all_pdfs = sorted(output_dir.glob("*.pdf"))

                # (result-key prefix, matched file, suppressed by caller flag)
                candidates = [
                    ("mono", _find_output_pdf(all_pdfs, target_lang, "mono", with_images=False), no_mono),
                    ("mono_img", _find_output_pdf(all_pdfs, target_lang, "mono", with_images=True), no_mono),
                    ("dual", _find_output_pdf(all_pdfs, target_lang, "dual", with_images=False), no_dual),
                    ("dual_img", _find_output_pdf(all_pdfs, target_lang, "dual", with_images=True), no_dual),
                ]

                result_data = {
                    "success": True,
                    "stats": {
                        "duration_seconds": round(duration, 2),
                    },
                }

                if not any(found for _, found, _ in candidates):
                    # Fallback to any PDF
                    if not all_pdfs:
                        return {"success": False, "message": "No output PDF generated"}
                    # The fallback is attached regardless of no_mono: it is the
                    # only document available, and dropping it would report
                    # success with nothing to download.
                    _attach_pdf(result_data, "mono", all_pdfs[0])
                    return result_data

                for prefix, found, suppressed in candidates:
                    if found and not suppressed:
                        _attach_pdf(result_data, prefix, found)

                return result_data

        except Exception as e:
            # Top-level boundary: callers always receive a result dict.
            return {"success": False, "message": f"Error: {str(e)}"}

    @modal.method()
    def translate(
        self,
        pdf_base64: str,
        target_lang: str = "fr",
        pages: str = "",
        no_dual: bool = False,
        no_mono: bool = False,
    ) -> dict:
        """Translate method (callable via Modal)."""
        return self._translate_internal(pdf_base64, target_lang, pages, no_dual, no_mono)

    @modal.fastapi_endpoint(method="POST")
    def api(self, request: dict) -> dict:
        """
        FastAPI endpoint POST for PDF translation.

        Request body:
        {
            "pdf_base64": "base64_encoded_pdf",
            "target_lang": "fr",
            "pages": "1,2,3" (optional),
            "no_dual": false,
            "no_mono": false
        }

        Missing or null fields fall back to their defaults. ``pages`` may also
        be given as a JSON number or array; it is converted to the
        comma-separated string form before being used.
        """
        # `or` rather than a .get() default so that an explicit JSON null is
        # treated like a missing field instead of reaching the CLI as None.
        pdf_base64 = request.get("pdf_base64") or ""
        target_lang = request.get("target_lang") or "fr"

        pages = request.get("pages") or ""
        if isinstance(pages, (list, tuple)):
            pages = ",".join(str(page) for page in pages)
        else:
            pages = str(pages)

        no_dual = request.get("no_dual", False)
        no_mono = request.get("no_mono", False)

        return self._translate_internal(pdf_base64, target_lang, pages, no_dual, no_mono)

    @modal.fastapi_endpoint(method="GET")
    def health(self) -> dict:
        """Health check endpoint."""
        return {
            "status": "healthy",
            "service": "BabelDOC with Agentic AI",
            "version": "1.0.0",
            "max_pages": MAX_PAGES,
        }

    @modal.fastapi_endpoint(method="GET")
    def languages(self) -> dict:
        """Get supported languages."""
        return {
            "languages": {
                "fr": "French",
                "en": "English",
                "es": "Spanish",
                "de": "German",
                "it": "Italian",
                "pt": "Portuguese",
                "zh": "Chinese",
                "ja": "Japanese",
                "ko": "Korean",
                "ru": "Russian",
                "ar": "Arabic",
            }
        }


@app.local_entrypoint()
def main():
    """BabelDOC with Agentic AI - Local test."""
    print("BabelDOC with Agentic AI - Modal Deployment")
    print("=" * 45)
    print(f"Max pages: {MAX_PAGES} (test phase)")
    print()
    print("Deploy: modal deploy modal_deploy.py")
    print("Test: modal serve modal_deploy.py")