Spaces:
Runtime error
Runtime error
Harsh Upadhyay
committed on
Commit
·
fca1742
1
Parent(s):
aac0325
removing local file support and adding SQLAlchemy support for all database operations.
Browse files- backend/.gitignore +1 -0
- backend/app/app.py +4 -0
- backend/app/database.py +29 -13
- backend/app/routes/routes.py +119 -56
- backend/app/utils/extract_text.py +20 -4
backend/.gitignore
CHANGED
|
@@ -13,6 +13,7 @@ env/
|
|
| 13 |
venv/
|
| 14 |
instance/
|
| 15 |
*.db
|
|
|
|
| 16 |
|
| 17 |
# OS/Editor
|
| 18 |
.DS_Store
|
|
|
|
| 13 |
venv/
|
| 14 |
instance/
|
| 15 |
*.db
|
| 16 |
+
*.env
|
| 17 |
|
| 18 |
# OS/Editor
|
| 19 |
.DS_Store
|
backend/app/app.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
|
| 4 |
+
# ... existing code ...
|
backend/app/database.py
CHANGED
|
@@ -1,22 +1,23 @@
|
|
| 1 |
# All sqlite3 and local DB logic will be removed and replaced with SQLAlchemy/Postgres in the next step.
|
| 2 |
# This file will be refactored to use SQLAlchemy models and sessions.
|
| 3 |
|
| 4 |
-
from sqlalchemy import create_engine, Column, Integer, String, Text, Float, ForeignKey, DateTime
|
| 5 |
from sqlalchemy.orm import declarative_base, sessionmaker, relationship
|
| 6 |
from sqlalchemy.sql import func
|
| 7 |
import os
|
| 8 |
from sqlalchemy.exc import IntegrityError
|
| 9 |
from werkzeug.security import check_password_hash, generate_password_hash
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
| 12 |
-
DATABASE_URL
|
| 13 |
-
"sqlite:///" + os.path.join(os.path.dirname(os.path.abspath(__file__)), 'legal_docs.db')
|
| 14 |
-
)
|
| 15 |
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
if not DATABASE_URL or DATABASE_URL.strip() == "":
|
| 19 |
-
raise ValueError("DATABASE_URL is not set or is empty. Please set it as an environment variable or
|
| 20 |
|
| 21 |
engine = create_engine(DATABASE_URL)
|
| 22 |
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
|
@@ -45,7 +46,8 @@ class Document(Base):
|
|
| 45 |
clauses = Column(Text)
|
| 46 |
features = Column(Text)
|
| 47 |
context_analysis = Column(Text)
|
| 48 |
-
|
|
|
|
| 49 |
upload_time = Column(DateTime(timezone=True), server_default=func.now())
|
| 50 |
user_id = Column(Integer, ForeignKey('users.id'))
|
| 51 |
user = relationship('User', back_populates='documents')
|
|
@@ -71,7 +73,7 @@ def get_db_session():
|
|
| 71 |
return SessionLocal()
|
| 72 |
|
| 73 |
# --- Document CRUD ---
|
| 74 |
-
def save_document(title, full_text, summary, clauses, features, context_analysis,
|
| 75 |
session = get_db_session()
|
| 76 |
try:
|
| 77 |
doc = Document(
|
|
@@ -81,7 +83,8 @@ def save_document(title, full_text, summary, clauses, features, context_analysis
|
|
| 81 |
clauses=str(clauses),
|
| 82 |
features=str(features),
|
| 83 |
context_analysis=str(context_analysis),
|
| 84 |
-
|
|
|
|
| 85 |
user_id=user_id
|
| 86 |
)
|
| 87 |
session.add(doc)
|
|
@@ -104,6 +107,9 @@ def get_all_documents(user_id=None):
|
|
| 104 |
for doc in documents:
|
| 105 |
d = doc.__dict__.copy()
|
| 106 |
d.pop('_sa_instance_state', None)
|
|
|
|
|
|
|
|
|
|
| 107 |
result.append(d)
|
| 108 |
return result
|
| 109 |
finally:
|
|
@@ -119,6 +125,8 @@ def get_document_by_id(doc_id, user_id=None):
|
|
| 119 |
if doc:
|
| 120 |
d = doc.__dict__.copy()
|
| 121 |
d.pop('_sa_instance_state', None)
|
|
|
|
|
|
|
| 122 |
return d
|
| 123 |
return None
|
| 124 |
finally:
|
|
@@ -128,11 +136,10 @@ def delete_document(doc_id):
|
|
| 128 |
session = get_db_session()
|
| 129 |
try:
|
| 130 |
doc = session.query(Document).filter(Document.id == doc_id).first()
|
| 131 |
-
file_path = doc.file_path if doc else None
|
| 132 |
if doc:
|
| 133 |
session.delete(doc)
|
| 134 |
session.commit()
|
| 135 |
-
return
|
| 136 |
finally:
|
| 137 |
session.close()
|
| 138 |
|
|
@@ -172,13 +179,22 @@ def search_questions_answers(query, user_id=None):
|
|
| 172 |
'document_id': row.document_id,
|
| 173 |
'question': row.question,
|
| 174 |
'answer': row.answer,
|
| 175 |
-
'created_at': row.created_at
|
| 176 |
})
|
| 177 |
return results
|
| 178 |
finally:
|
| 179 |
session.close()
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
def save_question_answer(document_id, user_id, question, answer, score):
|
|
|
|
|
|
|
| 182 |
session = get_db_session()
|
| 183 |
try:
|
| 184 |
qa = QuestionAnswer(
|
|
|
|
| 1 |
# All sqlite3 and local DB logic will be removed and replaced with SQLAlchemy/Postgres in the next step.
# This file will be refactored to use SQLAlchemy models and sessions.

from sqlalchemy import create_engine, Column, Integer, String, Text, Float, ForeignKey, DateTime, LargeBinary
from sqlalchemy.orm import declarative_base, sessionmaker, relationship
from sqlalchemy.sql import func
import os
from sqlalchemy.exc import IntegrityError
from werkzeug.security import check_password_hash, generate_password_hash
from dotenv import load_dotenv
import re

# Load environment variables from backend/.env (one directory above this file).
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
# SECURITY: never print the raw DATABASE_URL — it embeds credentials.
# Log only whether it is configured.
print("DEBUG: DATABASE_URL configured:", bool(os.environ.get('DATABASE_URL')))

# SQLAlchemy setup
DATABASE_URL = os.environ.get('DATABASE_URL')

if not DATABASE_URL or DATABASE_URL.strip() == "":
    raise ValueError("DATABASE_URL is not set or is empty. Please set it as an environment variable or in your .env file for NeonDB.")

engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
|
|
|
| 46 |
clauses = Column(Text)
|
| 47 |
features = Column(Text)
|
| 48 |
context_analysis = Column(Text)
|
| 49 |
+
file_data = Column(LargeBinary) # Store file content in DB
|
| 50 |
+
file_size = Column(Integer) # Add this
|
| 51 |
upload_time = Column(DateTime(timezone=True), server_default=func.now())
|
| 52 |
user_id = Column(Integer, ForeignKey('users.id'))
|
| 53 |
user = relationship('User', back_populates='documents')
|
|
|
|
| 73 |
return SessionLocal()
|
| 74 |
|
| 75 |
# --- Document CRUD ---
|
| 76 |
+
def save_document(title, full_text, summary, clauses, features, context_analysis, file_data, user_id):
|
| 77 |
session = get_db_session()
|
| 78 |
try:
|
| 79 |
doc = Document(
|
|
|
|
| 83 |
clauses=str(clauses),
|
| 84 |
features=str(features),
|
| 85 |
context_analysis=str(context_analysis),
|
| 86 |
+
file_data=file_data,
|
| 87 |
+
file_size=len(file_data) if file_data else 0, # Store file size
|
| 88 |
user_id=user_id
|
| 89 |
)
|
| 90 |
session.add(doc)
|
|
|
|
| 107 |
for doc in documents:
|
| 108 |
d = doc.__dict__.copy()
|
| 109 |
d.pop('_sa_instance_state', None)
|
| 110 |
+
d.pop('file_data', None) # Don't return file data in list
|
| 111 |
+
# Do NOT pop 'summary'; keep it in the result
|
| 112 |
+
# file_size is included
|
| 113 |
result.append(d)
|
| 114 |
return result
|
| 115 |
finally:
|
|
|
|
| 125 |
if doc:
|
| 126 |
d = doc.__dict__.copy()
|
| 127 |
d.pop('_sa_instance_state', None)
|
| 128 |
+
# Don't return file_data by default
|
| 129 |
+
d.pop('file_data', None)
|
| 130 |
return d
|
| 131 |
return None
|
| 132 |
finally:
|
|
|
|
| 136 |
session = get_db_session()
|
| 137 |
try:
|
| 138 |
doc = session.query(Document).filter(Document.id == doc_id).first()
|
|
|
|
| 139 |
if doc:
|
| 140 |
session.delete(doc)
|
| 141 |
session.commit()
|
| 142 |
+
return True
|
| 143 |
finally:
|
| 144 |
session.close()
|
| 145 |
|
|
|
|
| 179 |
'document_id': row.document_id,
|
| 180 |
'question': row.question,
|
| 181 |
'answer': row.answer,
|
| 182 |
+
'created_at': row.created_at.isoformat() if row.created_at else None,
|
| 183 |
})
|
| 184 |
return results
|
| 185 |
finally:
|
| 186 |
session.close()
|
| 187 |
|
| 188 |
+
def clean_answer(answer):
    """Normalize a QA answer string for storage.

    Strips inline citation markers such as ``(3)``, collapses runs of
    whitespace into single spaces, and trims leading/trailing spaces
    and punctuation (``, . ; :``).
    """
    # Drop parenthesized digit markers, e.g. "(3)", left by the QA model.
    without_markers = re.sub(r'\(\d+\)', '', answer)
    # Collapse any whitespace runs (spaces, tabs, newlines) to one space.
    single_spaced = re.sub(r'\s+', ' ', without_markers)
    # Trim stray spaces and punctuation at either end.
    return single_spaced.strip(' ,.;:')
|
| 194 |
+
|
| 195 |
def save_question_answer(document_id, user_id, question, answer, score):
|
| 196 |
+
score = float(score) # Convert np.float64 to Python float
|
| 197 |
+
answer = clean_answer(answer) # Clean up answer format
|
| 198 |
session = get_db_session()
|
| 199 |
try:
|
| 200 |
qa = QuestionAnswer(
|
backend/app/routes/routes.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
-
from flask import Blueprint, request, jsonify,
|
| 3 |
from werkzeug.utils import secure_filename
|
| 4 |
from app.utils.extract_text import extract_text_from_pdf
|
| 5 |
from app.utils.summarizer import generate_summary
|
| 6 |
from app.utils.clause_detector import detect_clauses
|
| 7 |
-
from app.database import save_document, delete_document
|
| 8 |
from app.database import get_all_documents, get_document_by_id
|
| 9 |
from app.database import search_documents, save_question_answer, search_questions_answers
|
| 10 |
from app.nlp.qa import answer_question
|
|
@@ -20,7 +20,14 @@ import textract
|
|
| 20 |
from app.database import get_user_profile, update_user_profile, change_user_password
|
| 21 |
from app.database import SessionLocal, User
|
| 22 |
from sqlalchemy.exc import IntegrityError
|
| 23 |
-
from sqlalchemy import or_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
main = Blueprint("main", __name__)
|
| 26 |
|
|
@@ -29,12 +36,7 @@ enhanced_legal_processor = EnhancedLegalProcessor()
|
|
| 29 |
legal_domain_processor = LegalDomainFeatures()
|
| 30 |
context_processor = ContextUnderstanding()
|
| 31 |
|
| 32 |
-
|
| 33 |
-
UPLOAD_FOLDER = os.path.join(BASE_DIR, 'uploads')
|
| 34 |
-
|
| 35 |
-
# Ensure the upload folder exists
|
| 36 |
-
if not os.path.exists(UPLOAD_FOLDER):
|
| 37 |
-
os.makedirs(UPLOAD_FOLDER)
|
| 38 |
|
| 39 |
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
|
| 40 |
|
|
@@ -74,8 +76,7 @@ def upload_file():
|
|
| 74 |
if not (file.filename.lower().endswith('.pdf')):
|
| 75 |
return jsonify({'error': 'File type not allowed. Only PDF files are supported.'}), 400
|
| 76 |
filename = secure_filename(file.filename)
|
| 77 |
-
|
| 78 |
-
file.save(file_path)
|
| 79 |
identity = get_jwt_identity()
|
| 80 |
user_id = get_user_id_by_username(identity)
|
| 81 |
if not user_id:
|
|
@@ -87,7 +88,7 @@ def upload_file():
|
|
| 87 |
clauses="[]",
|
| 88 |
features="{}",
|
| 89 |
context_analysis="{}",
|
| 90 |
-
|
| 91 |
user_id=user_id
|
| 92 |
)
|
| 93 |
return jsonify({
|
|
@@ -103,9 +104,27 @@ def upload_file():
|
|
| 103 |
@main.route('/documents', methods=['GET'])
|
| 104 |
@jwt_required()
|
| 105 |
def list_documents():
|
|
|
|
|
|
|
|
|
|
| 106 |
try:
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
except Exception as e:
|
| 110 |
logging.error(f"Error listing documents: {str(e)}", exc_info=True)
|
| 111 |
return jsonify({"error": str(e)}), 500
|
|
@@ -123,31 +142,49 @@ def get_document(doc_id):
|
|
| 123 |
logging.error(f"Error getting document {doc_id}: {str(e)}", exc_info=True)
|
| 124 |
return jsonify({"error": str(e)}), 500
|
| 125 |
|
| 126 |
-
@main.route('/documents/download/<
|
| 127 |
@jwt_required()
|
| 128 |
-
def download_document(
|
| 129 |
try:
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
except Exception as e:
|
| 132 |
-
logging.error(f"Error downloading file
|
| 133 |
return jsonify({"error": f"Error downloading file: {str(e)}"}), 500
|
| 134 |
|
| 135 |
-
@main.route('/documents/view/<
|
| 136 |
@jwt_required()
|
| 137 |
-
def view_document(
|
| 138 |
try:
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
except Exception as e:
|
| 141 |
-
logging.error(f"Error viewing file
|
| 142 |
return jsonify({"error": f"Error viewing file: {str(e)}"}), 500
|
| 143 |
|
| 144 |
@main.route('/documents/<int:doc_id>', methods=['DELETE'])
|
| 145 |
@jwt_required()
|
| 146 |
def delete_document_route(doc_id):
|
| 147 |
try:
|
| 148 |
-
|
| 149 |
-
if file_path_to_delete and os.path.exists(file_path_to_delete):
|
| 150 |
-
os.remove(file_path_to_delete)
|
| 151 |
return jsonify({"success": True, "message": "Document deleted successfully"}), 200
|
| 152 |
except Exception as e:
|
| 153 |
logging.error(f"Error deleting document {doc_id}: {str(e)}", exc_info=True)
|
|
@@ -207,30 +244,31 @@ def login():
|
|
| 207 |
@jwt_required()
|
| 208 |
def process_document(doc_id):
|
| 209 |
try:
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
| 212 |
return jsonify({'error': 'Document not found'}), 404
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
| 215 |
if not text:
|
|
|
|
| 216 |
return jsonify({'error': 'Could not extract text from file'}), 400
|
| 217 |
summary = generate_summary(text)
|
| 218 |
clauses = detect_clauses(text)
|
| 219 |
features = legal_domain_processor.process_legal_document(text)
|
| 220 |
context_analysis = context_processor.analyze_context(text)
|
| 221 |
# Update the document with processed content
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
doc.features = str(features)
|
| 230 |
-
doc.context_analysis = str(context_analysis)
|
| 231 |
-
session.commit()
|
| 232 |
-
finally:
|
| 233 |
-
session.close()
|
| 234 |
return jsonify({
|
| 235 |
'message': 'Document processed successfully',
|
| 236 |
'document_id': doc_id,
|
|
@@ -244,30 +282,41 @@ def process_document(doc_id):
|
|
| 244 |
@jwt_required()
|
| 245 |
def generate_document_summary(doc_id):
|
| 246 |
try:
|
| 247 |
-
|
|
|
|
| 248 |
if not doc:
|
|
|
|
| 249 |
return jsonify({"error": "Document not found"}), 404
|
| 250 |
-
summary = doc.
|
| 251 |
if summary and summary.strip() and summary != 'Processing...':
|
|
|
|
| 252 |
return jsonify({"summary": summary}), 200
|
| 253 |
-
|
| 254 |
-
|
| 255 |
return jsonify({"error": "File not found for this document"}), 404
|
| 256 |
-
text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
if not text.strip():
|
|
|
|
| 258 |
return jsonify({"error": "No text available for summarization"}), 400
|
| 259 |
-
summary = generate_summary(text)
|
| 260 |
-
# Save the summary to the database
|
| 261 |
-
session = SessionLocal()
|
| 262 |
try:
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
document.summary = summary
|
| 266 |
-
session.commit()
|
| 267 |
-
finally:
|
| 268 |
session.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
return jsonify({"summary": summary}), 200
|
| 270 |
except Exception as e:
|
|
|
|
| 271 |
return jsonify({"error": f"Error generating summary: {str(e)}"}), 500
|
| 272 |
|
| 273 |
@main.route('/ask-question', methods=['POST', 'OPTIONS'])
|
|
@@ -390,10 +439,24 @@ def dashboard_stats():
|
|
| 390 |
processed_documents = sum(1 for doc in documents if doc.get('summary') and doc.get('summary') != 'Processing...')
|
| 391 |
pending_analysis = total_documents - processed_documents
|
| 392 |
qa_results = search_questions_answers('', user_id=user_id)
|
| 393 |
-
from datetime import datetime, timedelta
|
| 394 |
now = datetime.utcnow()
|
| 395 |
last_30_days = now - timedelta(days=30)
|
| 396 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
return jsonify({
|
| 398 |
'total_documents': total_documents,
|
| 399 |
'processed_documents': processed_documents,
|
|
|
|
| 1 |
import os
|
| 2 |
+
from flask import Blueprint, request, jsonify, send_file
|
| 3 |
from werkzeug.utils import secure_filename
|
| 4 |
from app.utils.extract_text import extract_text_from_pdf
|
| 5 |
from app.utils.summarizer import generate_summary
|
| 6 |
from app.utils.clause_detector import detect_clauses
|
| 7 |
+
from app.database import save_document, delete_document, Document
|
| 8 |
from app.database import get_all_documents, get_document_by_id
|
| 9 |
from app.database import search_documents, save_question_answer, search_questions_answers
|
| 10 |
from app.nlp.qa import answer_question
|
|
|
|
| 20 |
from app.database import get_user_profile, update_user_profile, change_user_password
|
| 21 |
from app.database import SessionLocal, User
|
| 22 |
from sqlalchemy.exc import IntegrityError
|
| 23 |
+
from sqlalchemy import or_, Index
|
| 24 |
+
import io
|
| 25 |
+
from datetime import datetime, timedelta, timezone
|
| 26 |
+
from sqlalchemy import Column, Integer, String, Text, DateTime, LargeBinary, func
|
| 27 |
+
from sqlalchemy.orm import relationship
|
| 28 |
+
from sqlalchemy.ext.declarative import declarative_base
|
| 29 |
+
from sqlalchemy import create_engine
|
| 30 |
+
from sqlalchemy.pool import NullPool
|
| 31 |
|
| 32 |
main = Blueprint("main", __name__)
|
| 33 |
|
|
|
|
| 36 |
legal_domain_processor = LegalDomainFeatures()
|
| 37 |
context_processor = ContextUnderstanding()
|
| 38 |
|
| 39 |
+
# Remove UPLOAD_FOLDER, file_path, and local file logic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
|
| 42 |
|
|
|
|
| 76 |
if not (file.filename.lower().endswith('.pdf')):
|
| 77 |
return jsonify({'error': 'File type not allowed. Only PDF files are supported.'}), 400
|
| 78 |
filename = secure_filename(file.filename)
|
| 79 |
+
file_content = file.read() # Read file content as bytes
|
|
|
|
| 80 |
identity = get_jwt_identity()
|
| 81 |
user_id = get_user_id_by_username(identity)
|
| 82 |
if not user_id:
|
|
|
|
| 88 |
clauses="[]",
|
| 89 |
features="{}",
|
| 90 |
context_analysis="{}",
|
| 91 |
+
file_data=file_content, # Store file in DB
|
| 92 |
user_id=user_id
|
| 93 |
)
|
| 94 |
return jsonify({
|
|
|
|
| 104 |
@main.route('/documents', methods=['GET'])
@jwt_required()
def list_documents():
    """List the current user's documents, newest first, paginated.

    Query params: ``page`` (1-based, default 1) and ``limit`` (default 20).
    Returns a JSON array of document metadata; file bytes are never included.
    """
    try:
        # Parse pagination inside the try so bad query strings become a 500
        # JSON error rather than an unhandled exception.
        page = int(request.args.get('page', 1))
        limit = int(request.args.get('limit', 20))
        offset = (page - 1) * limit
        identity = get_jwt_identity()
        user_id = get_user_id_by_username(identity)
        session = SessionLocal()
        try:
            query = (session.query(Document)
                     .filter(Document.user_id == user_id)
                     .order_by(Document.upload_time.desc()))
            documents = query.offset(offset).limit(limit).all()
            result = [{
                'id': doc.id,
                'title': doc.title,
                'summary': doc.summary,
                'file_size': doc.file_size,
                'upload_time': doc.upload_time.isoformat() if doc.upload_time else None,
                # Derive a display type from the filename extension.
                'type': doc.title.split('.')[-1].upper() if '.' in doc.title else 'UNKNOWN',
            } for doc in documents]
        finally:
            # Always release the DB session, even if the query raises
            # (the original leaked the session on any exception).
            session.close()
        return jsonify(result), 200
    except Exception as e:
        logging.error(f"Error listing documents: {str(e)}", exc_info=True)
        return jsonify({"error": str(e)}), 500
|
|
|
|
| 142 |
logging.error(f"Error getting document {doc_id}: {str(e)}", exc_info=True)
|
| 143 |
return jsonify({"error": str(e)}), 500
|
| 144 |
|
| 145 |
+
@main.route('/documents/download/<int:doc_id>', methods=['GET'])
@jwt_required()
def download_document(doc_id):
    """Stream a stored PDF back to the client as an attachment.

    Returns 404 when the document does not exist or has no stored bytes.
    NOTE(review): there is no ownership check — any authenticated user can
    download any document id; confirm whether that is intended.
    """
    try:
        session = SessionLocal()
        try:
            doc = session.query(Document).filter(Document.id == doc_id).first()
            # Copy the values we need BEFORE closing the session so we never
            # read attributes from a detached ORM instance afterwards.
            file_data = doc.file_data if doc else None
            title = doc.title if doc else None
        finally:
            # Release the session on every path (the original leaked it on error).
            session.close()
        if not file_data:
            return jsonify({"error": "File not found"}), 404
        return send_file(
            io.BytesIO(file_data),
            as_attachment=True,
            download_name=title,
            mimetype='application/pdf',
        )
    except Exception as e:
        logging.error(f"Error downloading file: {str(e)}", exc_info=True)
        return jsonify({"error": f"Error downloading file: {str(e)}"}), 500
|
| 163 |
|
| 164 |
+
@main.route('/documents/view/<int:doc_id>', methods=['GET'])
@jwt_required()
def view_document(doc_id):
    """Serve a stored PDF inline (not as an attachment) for in-browser viewing.

    Returns 404 when the document does not exist or has no stored bytes.
    NOTE(review): there is no ownership check — any authenticated user can
    view any document id; confirm whether that is intended.
    """
    try:
        session = SessionLocal()
        try:
            doc = session.query(Document).filter(Document.id == doc_id).first()
            # Copy the values we need BEFORE closing the session so we never
            # read attributes from a detached ORM instance afterwards.
            file_data = doc.file_data if doc else None
            title = doc.title if doc else None
        finally:
            # Release the session on every path (the original leaked it on error).
            session.close()
        if not file_data:
            return jsonify({"error": "File not found"}), 404
        return send_file(
            io.BytesIO(file_data),
            as_attachment=False,
            download_name=title,
            mimetype='application/pdf',
        )
    except Exception as e:
        logging.error(f"Error viewing file: {str(e)}", exc_info=True)
        return jsonify({"error": f"Error viewing file: {str(e)}"}), 500
|
| 182 |
|
| 183 |
@main.route('/documents/<int:doc_id>', methods=['DELETE'])
|
| 184 |
@jwt_required()
|
| 185 |
def delete_document_route(doc_id):
|
| 186 |
try:
|
| 187 |
+
delete_document(doc_id)
|
|
|
|
|
|
|
| 188 |
return jsonify({"success": True, "message": "Document deleted successfully"}), 200
|
| 189 |
except Exception as e:
|
| 190 |
logging.error(f"Error deleting document {doc_id}: {str(e)}", exc_info=True)
|
|
|
|
| 244 |
@jwt_required()
|
| 245 |
def process_document(doc_id):
|
| 246 |
try:
|
| 247 |
+
session = SessionLocal()
|
| 248 |
+
doc = session.query(Document).filter(Document.id == doc_id).first()
|
| 249 |
+
if not doc:
|
| 250 |
+
session.close()
|
| 251 |
return jsonify({'error': 'Document not found'}), 404
|
| 252 |
+
if not doc.file_data:
|
| 253 |
+
session.close()
|
| 254 |
+
return jsonify({'error': 'File not found for this document'}), 404
|
| 255 |
+
# Extract text from file_data
|
| 256 |
+
text = extract_text_from_pdf(io.BytesIO(doc.file_data))
|
| 257 |
if not text:
|
| 258 |
+
session.close()
|
| 259 |
return jsonify({'error': 'Could not extract text from file'}), 400
|
| 260 |
summary = generate_summary(text)
|
| 261 |
clauses = detect_clauses(text)
|
| 262 |
features = legal_domain_processor.process_legal_document(text)
|
| 263 |
context_analysis = context_processor.analyze_context(text)
|
| 264 |
# Update the document with processed content
|
| 265 |
+
doc.full_text = text
|
| 266 |
+
doc.summary = summary
|
| 267 |
+
doc.clauses = str(clauses)
|
| 268 |
+
doc.features = str(features)
|
| 269 |
+
doc.context_analysis = str(context_analysis)
|
| 270 |
+
session.commit()
|
| 271 |
+
session.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
return jsonify({
|
| 273 |
'message': 'Document processed successfully',
|
| 274 |
'document_id': doc_id,
|
|
|
|
| 282 |
@jwt_required()
def generate_document_summary(doc_id):
    """Return (and lazily compute) the summary for a document.

    Serves a cached summary when one exists and is not the 'Processing...'
    placeholder; otherwise extracts text from the stored PDF bytes,
    generates a summary, persists it, and returns it.
    """
    try:
        session = SessionLocal()
        try:
            doc = session.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                return jsonify({"error": "Document not found"}), 404
            summary = doc.summary
            # Reuse a previously generated summary if present.
            if summary and summary.strip() and summary != 'Processing...':
                return jsonify({"summary": summary}), 200
            if not doc.file_data:
                return jsonify({"error": "File not found for this document"}), 404
            # Extract text from the stored PDF bytes.
            try:
                text = extract_text_from_pdf(io.BytesIO(doc.file_data))
            except Exception as e:
                logging.error(f"Error extracting text from PDF: {e}")
                return jsonify({"error": f"Error extracting text from PDF: {e}"}), 500
            if not text.strip():
                return jsonify({"error": "No text available for summarization"}), 400
            try:
                summary = generate_summary(text)
            except Exception as e:
                logging.error(f"Error generating summary: {e}")
                return jsonify({"error": f"Error generating summary: {e}"}), 500
            # Persist the freshly generated summary before returning it.
            doc.summary = summary
            session.commit()
            return jsonify({"summary": summary}), 200
        finally:
            # Single cleanup point: the session is released on every return
            # and on any exception (the original leaked it if commit raised).
            session.close()
    except Exception as e:
        logging.error(f"Error in generate_document_summary: {e}", exc_info=True)
        return jsonify({"error": f"Error generating summary: {str(e)}"}), 500
|
| 321 |
|
| 322 |
@main.route('/ask-question', methods=['POST', 'OPTIONS'])
|
|
|
|
| 439 |
processed_documents = sum(1 for doc in documents if doc.get('summary') and doc.get('summary') != 'Processing...')
|
| 440 |
pending_analysis = total_documents - processed_documents
|
| 441 |
qa_results = search_questions_answers('', user_id=user_id)
|
|
|
|
| 442 |
now = datetime.utcnow()
|
| 443 |
last_30_days = now - timedelta(days=30)
|
| 444 |
+
def parse_dt(val):
|
| 445 |
+
if isinstance(val, datetime):
|
| 446 |
+
# Convert to naive UTC
|
| 447 |
+
if val.tzinfo is not None:
|
| 448 |
+
return val.astimezone(timezone.utc).replace(tzinfo=None)
|
| 449 |
+
return val
|
| 450 |
+
if isinstance(val, str):
|
| 451 |
+
try:
|
| 452 |
+
dt = datetime.fromisoformat(val)
|
| 453 |
+
if dt.tzinfo is not None:
|
| 454 |
+
return dt.astimezone(timezone.utc).replace(tzinfo=None)
|
| 455 |
+
return dt
|
| 456 |
+
except Exception:
|
| 457 |
+
return None
|
| 458 |
+
return None
|
| 459 |
+
recent_questions = sum(1 for q in qa_results if q['created_at'] and parse_dt(q['created_at']) and parse_dt(q['created_at']) >= last_30_days)
|
| 460 |
return jsonify({
|
| 461 |
'total_documents': total_documents,
|
| 462 |
'processed_documents': processed_documents,
|
backend/app/utils/extract_text.py
CHANGED
|
@@ -1,8 +1,24 @@
|
|
| 1 |
import tempfile
|
| 2 |
from pdfminer.high_level import extract_text
|
| 3 |
import os
|
|
|
|
| 4 |
|
| 5 |
-
def extract_text_from_pdf(
|
| 6 |
-
|
| 7 |
-
text
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import tempfile
|
| 2 |
from pdfminer.high_level import extract_text
|
| 3 |
import os
|
| 4 |
+
from PyPDF2 import PdfReader
|
| 5 |
|
| 6 |
+
def extract_text_from_pdf(file_or_path):
    """Extract all text from a PDF.

    Accepts either a filesystem path (str/bytes) or a binary file-like
    object (e.g. io.BytesIO) and returns the concatenated text of every
    page. Pages with no extractable text contribute an empty string.
    """
    def _read_pages(stream):
        # One shared extraction path instead of two duplicated loops;
        # "".join avoids quadratic `text +=` concatenation across pages.
        reader = PdfReader(stream)
        return "".join(page.extract_text() or "" for page in reader.pages)

    if isinstance(file_or_path, (str, bytes)):
        # A str/bytes argument is treated as a filesystem path.
        with open(file_or_path, 'rb') as f:
            return _read_pages(f)
    # Otherwise assume a file-like object positioned at the start.
    return _read_pages(file_or_path)
|