Spaces:
Build error
Build error
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| from urllib.parse import urlparse | |
| import gradio as gr | |
| import json | |
def extract_wikipedia_text(raw_text, language):
    """Group article text into per-section paragraphs.

    Walks the selected elements in document order, accumulating ``<p>`` text
    until a ``<span>`` headline marker is seen, at which point the accumulated
    paragraph is flushed as one entry.

    Parameters
    ----------
    raw_text : iterable
        bs4 elements in document order — ``<span class="mw-headline">``
        section markers interleaved with ``<p>`` tags (see ``scrape``).
    language : str
        Language code used to build the ``text-<language>`` key.

    Returns
    -------
    list[dict]
        One ``{"text-<language>": paragraph}`` dict per non-empty section.
    """
    contents = []
    paragraph = ""
    for element in raw_text:
        if element.name == "span":
            # A headline marks a section boundary: flush accumulated text.
            if paragraph:
                contents.append({f"text-{language}": paragraph})
                paragraph = ""
        else:
            clean_text = preprocessing(element.text)
            if not clean_text:
                continue
            if paragraph:
                clean_text = " " + clean_text
            paragraph += clean_text
    # Bug fix: text accumulated after the LAST headline was previously
    # dropped on loop exit — flush the trailing paragraph too.
    if paragraph:
        contents.append({f"text-{language}": paragraph})
    return contents
def preprocessing(text):
    """Return *text* with citation brackets, edge whitespace, and newlines removed.

    ``[...]`` spans are Wikipedia citation markers such as ``[1]`` or
    ``[citation needed]``.
    """
    # Raw string: "\[" in a plain literal is an invalid escape sequence
    # (SyntaxWarning in modern CPython); r"\[.*?\]" is the correct regex.
    clean_text = re.sub(r"\[.*?\]", "", text).strip()
    # Strip embedded newlines so each section is a single flowing paragraph.
    return clean_text.replace("\n", "")
def scrape(url):
    """Scrape a Wikipedia article and dump it to a local JSON file.

    Parameters
    ----------
    url : str
        Full Wikipedia article URL, e.g.
        ``https://en.wikipedia.org/wiki/Python_(programming_language)``.

    Returns
    -------
    tuple[dict, str]
        The scraped content dict and the name of the JSON file written.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched or returns an HTTP error status.
    """
    # The subdomain encodes the article language, e.g. "en" in en.wikipedia.org.
    language = urlparse(url).netloc.split(".")[0]
    # Bug fix: the original wrapped this in a bare `except:` that printed
    # "error" and then fell through to use an unbound `soup` (NameError,
    # masking the real failure). Let fetch errors propagate to the caller/UI.
    page = requests.get(url, headers={"user-agent": "Mozilla/5.0"}, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    title = soup.find("h1", {"id": "firstHeading"}).get_text().strip()
    # Section headlines + paragraphs, in document order.
    raw_text = soup.select(
        "h2 span.mw-headline, h3 span.mw-headline, h4 span.mw-headline, p"
    )
    contents = extract_wikipedia_text(raw_text, language)
    json_output = {"source": url, f"title-{language}": title, "pages": contents}
    filename = f"{url.split('/')[-1]}.json"
    # Explicit UTF-8 + ensure_ascii=False keeps non-Latin article text readable.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(json_output, f, ensure_ascii=False)
    return json_output, filename
# Cap the JSON viewer's height so long scraped articles stay scrollable.
style_sheet = "#json-output { max-height: 400px; overflow-y: auto; }"

# Gradio front end: URL textbox on the left, JSON preview + downloadable
# file on the right, and a button wiring the scrape() callback.
with gr.Blocks(css=style_sheet) as demo:
    gr.Markdown(
        """
<center>
<h1>Wikipedia Scraper π</h1>
</center>
"""
    )
    with gr.Row():
        url_box = gr.Textbox(placeholder="Wikipedia URL")
        with gr.Column():
            json_view = gr.JSON(elem_id="json-output")
            download_file = gr.File()
    scrape_button = gr.Button("Scrape")
    scrape_button.click(fn=scrape, inputs=url_box, outputs=[json_view, download_file])

demo.launch(debug=True)