Spaces:

umarigan
/

SemanticSearch

Sleeping

App Files Files Community

SemanticSearch / app.py

umarigan

Update app.py

0cfdb4e verified over 1 year ago

raw

history blame

5.14 kB

	import gradio as gr
	import fitz # PyMuPDF for reading PDFs
	import numpy as np
	from bokeh.plotting import figure, output_file, save
	from bokeh.models import HoverTool, ColumnDataSource
	import umap
	import pandas as pd
	from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
	from sentence_transformers import SentenceTransformer
	import tempfile
	import logging

	# Configure root logging: INFO level, timestamped "time - level - message" lines.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)

	# Load the sentence-embedding model once, at module level, so the functions
	# below all share the same instance.
	model = SentenceTransformer('all-MiniLM-L6-v2')
	logging.info("Model loaded successfully.")

	def process_pdf(pdf_path):
	    """Extract the text of every page of a PDF and join it into one string.

	    Args:
	        pdf_path: Location of the PDF, passed straight to ``fitz.open``.

	    Returns:
	        The text of all pages, in page order, joined by single spaces.
	    """
	    logging.info(f"Processing PDF: {pdf_path}")
	    # Open the document in a ``with`` block so it is closed even when text
	    # extraction raises; previously the document was never closed.
	    with fitz.open(pdf_path) as doc:
	        texts = [page.get_text() for page in doc]
	    # Report through logging (not print) to match the rest of the module.
	    logging.info("PDF processed successfully.")
	    return " ".join(texts)

	def create_embeddings(text):
	    """Split text into sentence-like fragments and embed each fragment.

	    Args:
	        text: The text to split and embed.

	    Returns:
	        A ``(embeddings, sentences)`` tuple: the result of ``model.encode``
	        on the fragments, and the list of fragments in the same order.

	    Raises:
	        ValueError: If ``text`` contains no non-blank fragment.
	    """
	    logging.info("Creating embeddings.")
	    # A simple split; consider a more robust sentence splitter.
	    # Splitting on ". " leaves empty or whitespace-only pieces (for example
	    # after a trailing ". "); drop them so they are not embedded and later
	    # plotted as points without any content.
	    sentences = [fragment for fragment in text.split(". ") if fragment.strip()]
	    if not sentences:
	        # Fail early with a clear message instead of embedding nothing.
	        raise ValueError("No text found to create embeddings from.")
	    embeddings = model.encode(sentences)
	    logging.info("Embeddings created successfully.")
	    return embeddings, sentences

	def generate_plot_bokeh(query, pdf_file):
	    """Plot a 2-D UMAP projection of a PDF's sentences with Bokeh.

	    The sentences and the query are embedded and projected together; only
	    the sentence points are drawn. The five sentences with the highest
	    cosine similarity to the query are red, all others blue.

	    Args:
	        query: The query text to compare the sentences against.
	        pdf_file: Either a path string, or an object whose ``name``
	            attribute holds the path of the PDF.

	    Returns:
	        The path of a temporary ``.html`` file containing the saved plot.
	        The file is not deleted automatically.

	    Raises:
	        ValueError: If ``pdf_file`` is ``None``.
	    """
	    logging.info("Generating plot.")
	    if pdf_file is None:
	        # Give a clear error instead of an AttributeError on ``None.name``.
	        raise ValueError("A PDF file is required.")

	    # Generate embeddings for the query
	    query_embedding = model.encode([query])[0]

	    # Process the PDF and create embeddings. A plain string is used as the
	    # path directly; anything else is expected to expose it as ``.name``.
	    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
	    text = process_pdf(pdf_path)
	    embeddings, sentences = create_embeddings(text)

	    logging.info("Data prepared for UMAP.")
	    # The query is appended as the last row so it takes part in the fit.
	    all_embeddings = np.vstack([embeddings, query_embedding])

	    # UMAP transformation
	    umap_transform = umap.UMAP(
	        n_neighbors=15, min_dist=0.0, n_components=2, random_state=42
	    )
	    umap_embeddings = umap_transform.fit_transform(all_embeddings)

	    logging.info("UMAP transformation completed.")
	    # Find the sentences closest to the query (highest cosine similarity).
	    # A set makes the per-sentence membership test below O(1).
	    similarities = cosine_similarity([query_embedding], embeddings)[0]
	    closest_indices = set(similarities.argsort()[-5:].tolist())  # Adjust the number as needed

	    # Prepare data for plotting; the last row (the query point) is excluded.
	    data = {
	        'x': umap_embeddings[:-1, 0],
	        'y': umap_embeddings[:-1, 1],
	        'content': sentences,
	        'color': [
	            'red' if i in closest_indices else 'blue'
	            for i in range(len(sentences))
	        ],
	    }
	    source = ColumnDataSource(data)

	    # Create the Bokeh plot
	    p = figure(title="UMAP Projection of Sentences", width=700, height=700)
	    p.scatter('x', 'y', color='color', source=source)

	    hover = HoverTool(tooltips=[("Content", "@content")])
	    p.add_tools(hover)

	    logging.info("Plot created successfully.")
	    # Reserve a temporary file name, closing our own handle right away so no
	    # descriptor is leaked and Bokeh can write to the path afterwards.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as temp_file:
	        html_path = temp_file.name
	    logging.info(f"temp file is {html_path}")
	    output_file(html_path)
	    save(p)
	    logging.info("Plot saved to file.")
	    return html_path
	import plotly.express as px
	import plotly.graph_objects as go

	def generate_plotly_figure(query, pdf_file):
	    """Build a Plotly scatter of a 2-D UMAP projection of a PDF's sentences.

	    The sentences and the query are embedded and projected together; only
	    the sentence points are drawn. The five sentences with the highest
	    cosine similarity to the query are red, all others blue.

	    Args:
	        query: The query text to compare the sentences against.
	        pdf_file: Either a path string, or an object whose ``name``
	            attribute holds the path of the PDF.

	    Returns:
	        A ``plotly.graph_objects.Figure`` with one marker per sentence and
	        the sentence itself as the marker text.

	    Raises:
	        ValueError: If ``pdf_file`` is ``None``.
	    """
	    logging.info("Generating plot with Plotly.")
	    if pdf_file is None:
	        # Give a clear error instead of an AttributeError on ``None.name``.
	        raise ValueError("A PDF file is required.")

	    # Generate embeddings for the query
	    query_embedding = model.encode([query])[0]

	    # Process the PDF and create embeddings. A plain string is used as the
	    # path directly; anything else is expected to expose it as ``.name``.
	    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
	    text = process_pdf(pdf_path)
	    embeddings, sentences = create_embeddings(text)

	    logging.info("Data prepared for UMAP.")
	    # The query is appended as the last row so it takes part in the fit.
	    all_embeddings = np.vstack([embeddings, query_embedding])

	    # UMAP transformation
	    umap_transform = umap.UMAP(
	        n_neighbors=15, min_dist=0.0, n_components=2, random_state=42
	    )
	    umap_embeddings = umap_transform.fit_transform(all_embeddings)

	    logging.info("UMAP transformation completed.")
	    # Find the sentences closest to the query (highest cosine similarity).
	    # A set makes the per-sentence membership test below O(1).
	    similarities = cosine_similarity([query_embedding], embeddings)[0]
	    closest_indices = set(similarities.argsort()[-5:].tolist())  # Adjust the number as needed

	    # Prepare data for plotting; the last row (the query point) is excluded.
	    colors = [
	        'red' if i in closest_indices else 'blue'
	        for i in range(len(sentences))
	    ]
	    fig = go.Figure()
	    fig.add_trace(
	        go.Scatter(
	            x=umap_embeddings[:-1, 0],
	            y=umap_embeddings[:-1, 1],
	            mode='markers',
	            marker=dict(color=colors),
	            text=sentences,
	        )
	    )

	    fig.update_layout(
	        title="UMAP Projection of Sentences",
	        xaxis_title="UMAP 1",
	        yaxis_title="UMAP 2",
	    )

	    logging.info("Plotly figure created successfully.")
	    return fig

	def gradio_interface(pdf_file, query):
	    """Gradio callback: return the Plotly figure for an uploaded PDF and a query.

	    Note the argument order: this callback takes ``(pdf_file, query)`` while
	    ``generate_plotly_figure`` takes ``(query, pdf_file)``.
	    """
	    logging.info("Gradio interface called.")
	    plot = generate_plotly_figure(query=query, pdf_file=pdf_file)
	    logging.info("Returning Plotly figure.")
	    return plot
	# Gradio UI: a PDF upload and a query textbox as inputs, one plot as output.
	iface = gr.Interface(
	    title="PDF Content Visualizer",
	    description="Upload a PDF and enter a query to visualize the content.",
	    fn=gradio_interface,
	    inputs=[
	        gr.File(label="Upload PDF"),
	        gr.Textbox(label="Query"),
	    ],
	    outputs=gr.Plot(),  # gr.Plot() displays the Plotly figure returned by fn
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()