# app.py -- author: raannakasturi
# Commit 6e331a2 (verified): "Update app.py"
import json
import os
import pdf2doi
import gradio as gr
import requests
import html
def download_pdf(url, timeout=60):
    """Download the PDF at ``url`` into the current working directory.

    The local file is named after the last ``/``-separated segment of the
    URL with ``.pdf`` appended.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Relative path of the file that was written.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    file_path = f"{url.split('/')[-1]}.pdf"
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    response.raise_for_status()
    with open(file_path, 'wb') as file:
        file.write(response.content)
    return file_path
import json
def get_doi(pdf_path):
    """Extract the DOI, title and URL that pdf2doi reports for a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A ``(doi, title, url)`` tuple taken from the ``DOI``, ``title`` and
        ``URL`` keys of pdf2doi's validation info. Individual entries are
        None when the key is missing; all three are None when no usable
        validation info is available.
    """
    pdf2doi.config.set('verbose', False)
    results = pdf2doi.pdf2doi(pdf_path)

    # NOTE(review): pdf2doi appears to return None for files it cannot read
    # (and a list for directories) -- confirm against the library docs.
    # Guarding here avoids an AttributeError on ``results.get`` either way.
    if not isinstance(results, dict):
        print("pdf2doi did not return a result dictionary")
        return None, None, None

    validation_info = results.get('validation_info', {})
    if isinstance(validation_info, str):
        try:
            # Convert the JSON string to a dictionary.
            validation_info = json.loads(validation_info)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return None, None, None

    if not isinstance(validation_info, dict):
        print("Validation info is not a dictionary")
        return None, None, None

    doi = validation_info.get('DOI', None)
    title = validation_info.get('title', None)
    url = validation_info.get('URL', None)
    return doi, title, url
def get_paper_data(doi, timeout=30):
    """Fetch citation metadata for ``doi`` from the CiteAs API.

    Args:
        doi: DOI string to look up.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The decoded JSON body of the response, or an empty dict when the
        body is not valid JSON.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    from urllib.parse import quote

    # Escape the DOI so characters such as '?' or '#' are sent as part of the
    # path instead of being read as query/fragment delimiters; '/' is kept
    # because it separates the DOI prefix from its suffix.
    api_url = f"https://api.citeas.org/product/{quote(doi, safe='/')}"
    response = requests.get(api_url, timeout=timeout)
    try:
        return response.json()
    except ValueError:
        # requests' JSON decode errors derive from ValueError. An empty dict
        # lets the caller treat this as "no data" rather than crash.
        print(f"CiteAs returned a non-JSON response (HTTP {response.status_code})")
        return {}
def main(pdf_url):
    """Resolve a PDF URL to its DOI, title, APA citation and landing URL.

    Downloads the PDF, extracts its DOI with ``get_doi``, looks the DOI up
    with ``get_paper_data`` and returns the combined result. The downloaded
    file is removed before returning, whatever the outcome.

    Args:
        pdf_url: URL of the PDF to analyse.

    Returns:
        A JSON string. On success it holds the keys ``doi``, ``title``,
        ``citation_text`` and ``url``; otherwise it holds a single ``error``
        key describing what went wrong.
    """
    if not pdf_url or not pdf_url.strip():
        return json.dumps({"error": "PDF URL is required"}, indent=4)

    pdf_path = download_pdf(pdf_url)
    try:
        doi, title, url = get_doi(pdf_path)
        if doi is None:
            return json.dumps({"error": "DOI not found"}, indent=4)

        paper_data = get_paper_data(doi)
        if not paper_data:
            return json.dumps({"error": "Paper data not found"}, indent=4)

        # First APA-style entry, if the API returned one.
        citation_text = next(
            (
                citation.get('citation')
                for citation in paper_data.get('citations', [])
                if citation.get('style_shortname') == 'apa'
            ),
            None,
        )

        # Fall back to API-provided / derived values when pdf2doi had none.
        title = title or paper_data.get('name')
        url = url or f"https://doi.org/{doi}"

        if citation_text:
            # Strip italics markup, normalise the ellipsis character and line
            # breaks, then decode HTML entities.
            citation_text = html.unescape(
                citation_text.replace("<i>", "").replace("</i>", "").replace("\u2026", "...").replace("\n", " ")
            )
        else:
            citation_text = "Citation not found"

        data = {
            "doi": doi,
            "title": title if title else "Title not found",
            "citation_text": citation_text,
            "url": url,
        }
        return json.dumps(data, ensure_ascii=False, indent=4)
    finally:
        # Clean up on every exit path, including the early error returns and
        # exceptions. The file name is derived from the URL, so a concurrent
        # request for the same URL may already have deleted it.
        try:
            os.remove(pdf_path)
        except FileNotFoundError:
            pass
# Soft Gradio theme: purple/cyan/slate palette with Syne as the first font
# and Poppins for the remaining entries of the font stack.
theme = gr.themes.Soft(
    font=[
        gr.themes.GoogleFont(family)
        for family in ("Syne", "Poppins", "Poppins", "Poppins")
    ],
    neutral_hue="slate",
    secondary_hue="cyan",
    primary_hue="purple",
)
# UI: a URL input and a read-only result box, plus a button that runs `main`.
# NOTE(review): the original indentation was lost; the button is assumed to
# sit below the row rather than inside it -- confirm against the intended
# layout.
with gr.Blocks(theme=theme) as app:
    with gr.Row():
        pdf_path = gr.Textbox(
            label="PDF URL",
            placeholder="Enter the URL of the PDF",
            lines=1,
        )
        doi_data = gr.Textbox(
            label="DOI Data",
            placeholder="DOI data will be displayed here",
            lines=7,
            show_copy_button=True,
        )
    get_data = gr.Button(value="Get DOI Data", variant='primary')
    # Also exposed through the API under the name "getDOIData".
    get_data.click(
        fn=main,
        inputs=[pdf_path],
        outputs=[doi_data],
        api_name="getDOIData",
    )

# Enable the request queue, then start the server.
app.queue(default_concurrency_limit=250)
app.launch()