Integrate Gemini API for enhanced image processing in MarkItDown
- Updated `app.py` to reflect the use of Gemini for image processing in MarkItDown.
- Modified `requirements.txt` to include `ffmpeg-python` for audio processing and removed the OpenAI dependency.
- Enhanced `setup.sh` to install the Gemini dependencies and updated the installation instructions.
- Introduced `gemini_client_wrapper.py`, a wrapper around the Gemini API that mimics OpenAI's client interface so MarkItDown can use it unchanged (see the usage sketch after the file list).
- Added tests in `test_gemini_wrapper.py` to validate the Gemini integration and MarkItDown functionality.
- Refactored `markitdown_parser.py` to use Gemini for image files while keeping standard processing for other formats.
- Updated parser names and descriptions for clarity across the parsers.
- app.py +1 -1
- requirements.txt +2 -1
- setup.sh +2 -2
- src/core/gemini_client_wrapper.py +198 -0
- src/parsers/docling_parser.py +1 -1
- src/parsers/got_ocr_parser.py +1 -1
- src/parsers/markitdown_parser.py +95 -24
- src/parsers/mistral_ocr_parser.py +1 -1
- src/ui/components/document_converter.py +1 -1
- test_gemini_wrapper.py +94 -0
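
In short, the new wrapper is passed to MarkItDown in place of an OpenAI client. A minimal usage sketch of the intended call pattern, distilled from the parser changes below (`example.jpg` is a placeholder path):

```python
from markitdown import MarkItDown

from src.core.gemini_client_wrapper import create_gemini_client_for_markitdown

# Returns a GeminiClientWrapper when google-genai and a Google API key are
# configured, otherwise None (see src/core/gemini_client_wrapper.py below).
gemini_client = create_gemini_client_for_markitdown()

if gemini_client:
    # MarkItDown treats the wrapper like an OpenAI client for image descriptions.
    md = MarkItDown(llm_client=gemini_client, llm_model="gemini-2.5-flash")
else:
    # Fall back to plain conversion without LLM-generated image descriptions.
    md = MarkItDown()

result = md.convert("example.jpg")  # placeholder path
print(result.text_content)
```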
app.py (CHANGED)

@@ -35,7 +35,7 @@ except ImportError as e:
 
 try:
     from markitdown import MarkItDown
-    print("MarkItDown is available")
+    print("MarkItDown is available (using Gemini for image processing)")
 except ImportError:
     print("Installing MarkItDown...")
     subprocess.run([sys.executable, "-m", "pip", "install", "-q", "markitdown[all]"], check=False)
requirements.txt (CHANGED)

@@ -30,7 +30,8 @@ huggingface_hub[cli]>=0.19.0
 
 # MarkItDown and its dependencies
 markitdown[all]
-
+ffmpeg-python  # For audio processing in MarkItDown
+# Note: Using Gemini Flash 2.5 for LLM image descriptions instead of OpenAI
 
 # Docling dependencies
 docling
setup.sh (CHANGED)

@@ -30,8 +30,7 @@ echo "NumPy installed successfully"
 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
-
-# pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
+# Note: Using Gemini Flash 2.5 for LLM image descriptions in MarkItDown instead of OpenAI
 echo "Python dependencies installed successfully"
 
 # Install GOT-OCR transformers dependencies

@@ -50,6 +49,7 @@ echo "Spaces module installed successfully"
 # Install markitdown with all optional dependencies
 echo "Installing MarkItDown with all dependencies..."
 pip install -q -U 'markitdown[all]'
+pip install -q -U ffmpeg-python  # For audio processing
 echo "MarkItDown installed successfully"
 
 # Install Docling for advanced PDF understanding
src/core/gemini_client_wrapper.py (ADDED, 198 lines)

"""
Gemini client wrapper that mimics OpenAI client interface for MarkItDown compatibility.
This allows us to use Gemini Flash 2.5 for image processing in MarkItDown.
"""

import logging
import base64
from typing import List, Dict, Any, Optional
from pathlib import Path

try:
    from google import genai
    HAS_GEMINI = True
except ImportError:
    HAS_GEMINI = False

from src.core.config import config
from src.core.logging_config import get_logger

logger = get_logger(__name__)


class GeminiChatCompletions:
    """Chat completions interface that mimics OpenAI's chat.completions API."""

    def __init__(self, client):
        self.client = client

    def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> 'GeminiResponse':
        """Create a chat completion that mimics OpenAI's API."""
        if not messages:
            raise ValueError("Messages cannot be empty")

        # Extract the user message (MarkItDown sends a single user message with text + image)
        user_message = None
        for msg in messages:
            if msg.get("role") == "user":
                user_message = msg
                break

        if not user_message:
            raise ValueError("No user message found")

        content = user_message.get("content", [])
        if not isinstance(content, list):
            content = [{"type": "text", "text": str(content)}]

        # Extract text prompt and image
        text_prompt = ""
        image_data = None

        for item in content:
            if item.get("type") == "text":
                text_prompt = item.get("text", "")
            elif item.get("type") == "image_url":
                image_url = item.get("image_url", {}).get("url", "")
                if image_url.startswith("data:image/"):
                    # Extract base64 data from data URI
                    try:
                        header, data = image_url.split(",", 1)
                        image_data = base64.b64decode(data)
                    except Exception as e:
                        logger.error(f"Failed to decode image data: {e}")
                        raise ValueError("Invalid image data URI")

        if not text_prompt:
            text_prompt = "Describe this image in detail."

        if not image_data:
            raise ValueError("No image data found in request")

        try:
            # Use Gemini to process the image
            response = self.client.models.generate_content(
                model=config.model.gemini_model,
                contents=[
                    {
                        "parts": [
                            {"text": text_prompt},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",  # Assume JPEG for now
                                    "data": base64.b64encode(image_data).decode()
                                }
                            }
                        ]
                    }
                ],
                config={
                    "temperature": config.model.temperature,
                    "max_output_tokens": 1024,  # Reasonable limit for image descriptions
                }
            )

            # Extract text from Gemini response
            response_text = ""
            if hasattr(response, "text") and response.text:
                response_text = response.text
            elif hasattr(response, "candidates") and response.candidates:
                candidate = response.candidates[0]
                if hasattr(candidate, "content") and candidate.content:
                    if hasattr(candidate.content, "parts") and candidate.content.parts:
                        response_text = candidate.content.parts[0].text

            if not response_text:
                logger.warning("Empty response from Gemini, using fallback")
                response_text = "Image processing completed but no description generated."

            return GeminiResponse(response_text)

        except Exception as e:
            logger.error(f"Gemini API error: {str(e)}")
            # Return a fallback response to avoid breaking MarkItDown
            return GeminiResponse(f"Image description unavailable due to processing error: {str(e)}")


class GeminiChoice:
    """Mimics OpenAI's Choice object."""

    def __init__(self, content: str):
        self.message = GeminiMessage(content)


class GeminiMessage:
    """Mimics OpenAI's Message object."""

    def __init__(self, content: str):
        self.content = content


class GeminiResponse:
    """Mimics OpenAI's ChatCompletion response."""

    def __init__(self, content: str):
        self.choices = [GeminiChoice(content)]


class GeminiClientWrapper:
    """
    Gemini client wrapper that mimics OpenAI client interface for MarkItDown.

    This allows MarkItDown to use Gemini for image processing while thinking
    it's using an OpenAI client.
    """

    def __init__(self, api_key: Optional[str] = None):
        if not HAS_GEMINI:
            raise ImportError("google-genai package is required for Gemini support")

        api_key = api_key or config.api.google_api_key
        if not api_key:
            raise ValueError("Google API key is required for Gemini client")

        self.client = genai.Client(api_key=api_key)
        self.chat = GeminiChatCompletions(self.client)

        logger.info("Gemini client wrapper initialized for MarkItDown compatibility")

    @property
    def completions(self):
        """Alias for chat to match some OpenAI client patterns."""
        return self.chat


def create_gemini_client_for_markitdown() -> Optional[GeminiClientWrapper]:
    """
    Create a Gemini client wrapper for use with MarkItDown.

    Returns:
        GeminiClientWrapper if Gemini is available and configured, None otherwise.
    """
    if not HAS_GEMINI:
        logger.warning("Gemini not available for MarkItDown image processing")
        return None

    if not config.api.google_api_key:
        logger.warning("No Google API key found for MarkItDown image processing")
        return None

    try:
        return GeminiClientWrapper()
    except Exception as e:
        logger.error(f"Failed to create Gemini client for MarkItDown: {e}")
        return None


# For testing purposes
if __name__ == "__main__":
    # Test the wrapper
    try:
        client = create_gemini_client_for_markitdown()
        if client:
            print("✅ Gemini client wrapper created successfully")
            print("✅ Ready for MarkItDown integration")
        else:
            print("❌ Failed to create Gemini client wrapper")
    except Exception as e:
        print(f"❌ Error: {e}")
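
For context, `GeminiChatCompletions.create()` parses the same payload MarkItDown builds for OpenAI clients: a single user message with a text part and a base64 data-URI image part. A sketch of calling the wrapper directly, assuming `google-genai` is installed and a Google API key is configured; `sample.jpg` is a hypothetical file, and the `model` argument is accepted for compatibility while the wrapper actually uses `config.model.gemini_model`:

```python
import base64

from src.core.gemini_client_wrapper import GeminiClientWrapper

# Encode a local image the way MarkItDown would: as a data URI.
with open("sample.jpg", "rb") as f:  # hypothetical image file
    data_uri = "data:image/jpeg;base64," + base64.b64encode(f.read()).decode()

client = GeminiClientWrapper()  # raises if google-genai or the API key is missing
response = client.chat.create(
    model="gemini-2.5-flash",  # accepted for compatibility; config.model.gemini_model is used
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Write a detailed caption for this image."},
            {"type": "image_url", "image_url": {"url": data_uri}},
        ],
    }],
)
print(response.choices[0].message.content)
```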
src/parsers/docling_parser.py (CHANGED)

@@ -132,7 +132,7 @@ class DoclingParser(DocumentParser):
 
     @classmethod
     def get_name(cls) -> str:
-        return "Docling
+        return "Docling"
 
     @classmethod
     def get_supported_file_types(cls) -> Set[str]:
src/parsers/got_ocr_parser.py (CHANGED)

@@ -41,7 +41,7 @@ class GotOcrParser(DocumentParser):
 
     @classmethod
     def get_name(cls) -> str:
-        return "GOT-OCR
+        return "GOT-OCR"
 
     @classmethod
     def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
src/parsers/markitdown_parser.py (CHANGED)

@@ -1,5 +1,7 @@
 import logging
 import os
+import threading
+import time
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Union, Set
 import io

@@ -12,12 +14,18 @@ from src.core.exceptions import DocumentProcessingError, ParserError
 # Check for MarkItDown availability
 try:
     from markitdown import MarkItDown
-    from openai import OpenAI
     HAS_MARKITDOWN = True
 except ImportError:
     HAS_MARKITDOWN = False
     logging.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")
 
+# Import our Gemini wrapper for LLM support
+try:
+    from src.core.gemini_client_wrapper import create_gemini_client_for_markitdown
+    HAS_GEMINI_WRAPPER = True
+except ImportError:
+    HAS_GEMINI_WRAPPER = False
+
 # Configure logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)

@@ -33,19 +41,10 @@ class MarkItDownParser(DocumentParser):
         # Initialize MarkItDown instance
         if HAS_MARKITDOWN:
             try:
-                #
-
-
-
-                    self.markdown_instance = MarkItDown(
-                        enable_plugins=False,
-                        llm_client=client,
-                        llm_model="gpt-4o"
-                    )
-                    logger.info("MarkItDown initialized with OpenAI support for image descriptions")
-                else:
-                    self.markdown_instance = MarkItDown(enable_plugins=False)
-                    logger.info("MarkItDown initialized without OpenAI support")
+                # Initialize MarkItDown without LLM client for better performance
+                # LLM client will only be used for image files when needed
+                self.markdown_instance = MarkItDown()
+                logger.info("MarkItDown initialized successfully")
             except Exception as e:
                 logger.error(f"Error initializing MarkItDown: {str(e)}")
                 self.markdown_instance = None

@@ -72,23 +71,95 @@ class MarkItDownParser(DocumentParser):
         # Check for cancellation before starting
         if self._check_cancellation():
             raise DocumentProcessingError("Conversion cancelled")
-
+
+        file_path_str = str(file_path)
+        file_ext = Path(file_path).suffix.lower()
+
         try:
-            #
-
-
-
-
-
+            # Run conversion in a separate thread to support cancellation
+            result_container = {"result": None, "error": None, "completed": False}
+
+            def conversion_worker():
+                try:
+                    # For image files, potentially use LLM if available
+                    if file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
+                        if HAS_GEMINI_WRAPPER:
+                            try:
+                                # Create Gemini-enabled instance for image processing
+                                gemini_client = create_gemini_client_for_markitdown()
+                                if gemini_client:
+                                    llm_instance = MarkItDown(llm_client=gemini_client, llm_model="gemini-2.5-flash")
+                                    result = llm_instance.convert(file_path_str)
+                                else:
+                                    # No Gemini client available, use standard conversion
+                                    logger.info("Gemini client not available, using standard conversion for image")
+                                    result = self.markdown_instance.convert(file_path_str)
+                            except Exception as llm_error:
+                                logger.warning(f"Gemini image processing failed, falling back to basic conversion: {llm_error}")
+                                result = self.markdown_instance.convert(file_path_str)
+                        else:
+                            # No Gemini wrapper available, use standard conversion
+                            logger.info("Gemini wrapper not available, using standard conversion for image")
+                            result = self.markdown_instance.convert(file_path_str)
+                    else:
+                        # For non-image files, use standard conversion
+                        result = self.markdown_instance.convert(file_path_str)
+
+                    result_container["result"] = result
+                    result_container["completed"] = True
+                except Exception as e:
+                    result_container["error"] = e
+                    result_container["completed"] = True
+
+            # Start conversion in background thread
+            conversion_thread = threading.Thread(target=conversion_worker, daemon=True)
+            conversion_thread.start()
+
+            # Wait for completion or cancellation
+            while conversion_thread.is_alive():
+                if self._check_cancellation():
+                    logger.info("MarkItDown conversion cancelled by user")
+                    # Give thread a moment to finish cleanly
+                    conversion_thread.join(timeout=0.1)
+                    raise DocumentProcessingError("Conversion cancelled")
+                time.sleep(0.1)  # Check every 100ms
+
+            # Ensure thread has completed
+            conversion_thread.join()
+
+            # Check for errors
+            if result_container["error"]:
+                raise result_container["error"]
+
+            result = result_container["result"]
+            if result is None:
+                raise DocumentProcessingError("MarkItDown conversion returned no result")
+
+            # Use the correct attribute - MarkItDown returns .text_content
+            if hasattr(result, 'text_content') and result.text_content:
+                return result.text_content
+            elif hasattr(result, 'markdown') and result.markdown:
+                return result.markdown
+            elif hasattr(result, 'content') and result.content:
+                return result.content
+            else:
+                # Fallback - convert result to string
+                content = str(result)
+                if content and content.strip():
+                    return content
+                else:
+                    raise DocumentProcessingError("MarkItDown conversion returned empty content")
 
-
+        except DocumentProcessingError:
+            # Re-raise cancellation errors
+            raise
         except Exception as e:
             logger.error(f"Error converting file with MarkItDown: {str(e)}")
             raise DocumentProcessingError(f"MarkItDown conversion failed: {str(e)}")
 
     @classmethod
     def get_name(cls) -> str:
-        return "MarkItDown
+        return "MarkItDown"
 
     @classmethod
     def get_supported_file_types(cls) -> Set[str]:

@@ -112,7 +183,7 @@ class MarkItDownParser(DocumentParser):
 
     @classmethod
     def get_description(cls) -> str:
-        return "MarkItDown parser for converting various file formats to Markdown"
+        return "MarkItDown parser for converting various file formats to Markdown. Uses Gemini Flash 2.5 for advanced image analysis."
 
 
 # Register the parser with the registry if available
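
The refactored parser wraps the MarkItDown call in a daemon thread and polls for cancellation every 100 ms, so a long conversion can be abandoned without blocking the UI. A stripped-down sketch of that pattern on its own, with a `threading.Event` standing in for the parser's `self._check_cancellation()` and a plain `RuntimeError` in place of `DocumentProcessingError`:

```python
import threading
import time


def run_with_cancellation(work, cancelled: threading.Event, poll_interval: float = 0.1):
    """Run work() in a daemon thread; abandon it if `cancelled` is set."""
    container = {"result": None, "error": None}

    def worker():
        try:
            container["result"] = work()
        except Exception as e:  # capture errors to re-raise in the caller
            container["error"] = e

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    while thread.is_alive():
        if cancelled.is_set():
            thread.join(timeout=poll_interval)  # give the worker a moment to wind down
            raise RuntimeError("Conversion cancelled")
        time.sleep(poll_interval)

    thread.join()
    if container["error"]:
        raise container["error"]
    return container["result"]
```

Note that the worker thread is never forcibly killed; on cancellation it is simply left to finish as an orphaned daemon thread, which matches the behaviour of the parser code above.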
src/parsers/mistral_ocr_parser.py (CHANGED)

@@ -32,7 +32,7 @@ class MistralOcrParser(DocumentParser):
 
     @classmethod
     def get_name(cls) -> str:
-        return "Mistral OCR
+        return "Mistral OCR"
 
     @classmethod
     def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
src/ui/components/document_converter.py (CHANGED)

@@ -220,7 +220,7 @@ def create_document_converter_tab():
     files_input = gr.Files(
         label="Upload Document(s) - Single file or up to 5 files (20MB max combined)",
         file_count="multiple",
-        file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls", ".txt", ".md", ".html", ".htm"]
+        file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls", ".txt", ".md", ".html", ".htm", ".csv"]
     )
 
     # Processing type selector (visible only for multiple files)
test_gemini_wrapper.py (ADDED, 94 lines)

#!/usr/bin/env python3
"""
Simple test script for Gemini wrapper functionality
"""
import sys
from pathlib import Path

# Add project root to path
sys.path.append(str(Path(__file__).parent))

def test_gemini_wrapper():
    """Test Gemini wrapper without API key"""
    print("Testing Gemini wrapper structure...")

    try:
        from src.core.gemini_client_wrapper import (
            GeminiClientWrapper,
            GeminiChatCompletions,
            GeminiResponse,
            HAS_GEMINI,
            create_gemini_client_for_markitdown
        )
        print("✅ All classes imported successfully")
        print(f"✅ HAS_GEMINI: {HAS_GEMINI}")

        # Test response structure
        test_response = GeminiResponse("Test image description")
        print(f"✅ Response choices: {len(test_response.choices)}")
        print(f"✅ Message content: {test_response.choices[0].message.content}")

        # Test client creation (should fail gracefully without API key)
        client = create_gemini_client_for_markitdown()
        print(f"✅ Client creation (no API key): {client is None}")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False

    return True

def test_markitdown_availability():
    """Test MarkItDown availability"""
    print("\nTesting MarkItDown availability...")

    try:
        from markitdown import MarkItDown
        print("✅ MarkItDown imported successfully")

        # Test basic initialization
        md = MarkItDown()
        print("✅ MarkItDown initialized without LLM client")

    except Exception as e:
        print(f"❌ MarkItDown error: {e}")
        return False

    return True

def test_integration_structure():
    """Test the overall integration structure"""
    print("\nTesting integration structure...")

    try:
        # Test that our wrapper can theoretically work with MarkItDown
        from src.core.gemini_client_wrapper import GeminiClientWrapper, HAS_GEMINI
        from markitdown import MarkItDown

        print("✅ Both components available for integration")

        # Test interface compatibility (structure only)
        if HAS_GEMINI:
            print("✅ Gemini dependency available")
        else:
            print("⚠️ Gemini dependency not available")

        print("✅ Integration structure test passed")

    except Exception as e:
        print(f"❌ Integration error: {e}")
        return False

    return True

if __name__ == "__main__":
    print("=== Testing Gemini-MarkItDown Integration ===\n")

    success = True
    success &= test_gemini_wrapper()
    success &= test_markitdown_availability()
    success &= test_integration_structure()

    print(f"\n=== Overall Result: {'✅ PASS' if success else '❌ FAIL'} ===")