Spaces:
Build error
Build error
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| from urllib.parse import urlparse | |
| import gradio as gr | |
| import json | |
def extract_wikipedia_text(raw_text, language):
    """Group article text into per-section paragraphs.

    Walks the selected elements in document order, accumulating ``<p>`` text
    until a ``<span>`` headline marker is seen, at which point the accumulated
    paragraph is flushed as one entry.

    Parameters
    ----------
    raw_text : iterable
        bs4 elements in document order — ``<span class="mw-headline">``
        section markers interleaved with ``<p>`` tags (see ``scrape``).
    language : str
        Language code used to build the ``text-<language>`` key.

    Returns
    -------
    list[dict]
        One ``{"text-<language>": paragraph}`` dict per non-empty section.
    """
    contents = []
    paragraph = ""
    for element in raw_text:
        if element.name == "span":
            # A headline marks a section boundary: flush accumulated text.
            if paragraph:
                contents.append({f"text-{language}": paragraph})
                paragraph = ""
        else:
            clean_text = preprocessing(element.text)
            if not clean_text:
                continue
            if paragraph:
                clean_text = " " + clean_text
            paragraph += clean_text
    # Bug fix: text accumulated after the LAST headline was previously
    # dropped on loop exit — flush the trailing paragraph too.
    if paragraph:
        contents.append({f"text-{language}": paragraph})
    return contents
def preprocessing(text):
    """Return *text* with citation brackets, edge whitespace, and newlines removed.

    ``[...]`` spans are Wikipedia citation markers such as ``[1]`` or
    ``[citation needed]``.
    """
    # Raw string: "\[" in a plain literal is an invalid escape sequence
    # (SyntaxWarning in modern CPython); r"\[.*?\]" is the correct regex.
    clean_text = re.sub(r"\[.*?\]", "", text).strip()
    # Strip embedded newlines so each section is a single flowing paragraph.
    return clean_text.replace("\n", "")
def scrape(url):
    """Scrape a Wikipedia article and dump it to a local JSON file.

    Parameters
    ----------
    url : str
        Full Wikipedia article URL, e.g.
        ``https://en.wikipedia.org/wiki/Python_(programming_language)``.

    Returns
    -------
    tuple[dict, str]
        The scraped content dict and the name of the JSON file written.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched or returns an HTTP error status.
    """
    # The subdomain encodes the article language, e.g. "en" in en.wikipedia.org.
    language = urlparse(url).netloc.split(".")[0]
    # Bug fix: the original wrapped this in a bare `except:` that printed
    # "error" and then fell through to use an unbound `soup` (NameError,
    # masking the real failure). Let fetch errors propagate to the caller/UI.
    page = requests.get(url, headers={"user-agent": "Mozilla/5.0"}, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("h1", {"id": "firstHeading"}).get_text().strip()
    # Section headlines + paragraphs, in document order.
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)
    json_output = {"source": url, f"title-{language}": title, "pages": contents}
    filename = f"{url.split('/')[-1]}.json"
    # Explicit UTF-8 + ensure_ascii=False keeps non-Latin article text readable.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(json_output, f, ensure_ascii=False)
    return json_output, filename
# Cap the JSON viewer's height so long scraped articles stay scrollable.
style_sheet = "#json-output { max-height: 400px; overflow-y: auto; }"

# Gradio front end: URL textbox on the left, JSON preview + downloadable
# file on the right, and a button wiring the scrape() callback.
with gr.Blocks(css=style_sheet) as demo:
    gr.Markdown(
        """
<center>
<h1>Wikipedia Scraper π</h1>
</center>
"""
    )
    with gr.Row():
        url_box = gr.Textbox(placeholder="Wikipedia URL")
        with gr.Column():
            json_view = gr.JSON(elem_id="json-output")
            download_file = gr.File()
    scrape_button = gr.Button("Scrape")
    scrape_button.click(fn=scrape, inputs=url_box, outputs=[json_view, download_file])

demo.launch(debug=True)