Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import pdf2doi | |
| import gradio as gr | |
| import requests | |
| import html | |
def download_pdf(url, timeout=30):
    """Download the document at *url* into the working directory.

    The local file name is the last ``/``-separated segment of the URL with
    ``.pdf`` appended.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The path of the file that was written.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    file_path = f"{url.split('/')[-1]}.pdf"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of saving an error page as a "PDF".
    response.raise_for_status()
    with open(file_path, 'wb') as file:
        file.write(response.content)
    return file_path
| import json | |
def get_doi(pdf_path):
    """Extract DOI metadata for the PDF at *pdf_path* using ``pdf2doi``.

    The ``validation_info`` entry of the pdf2doi result is read either as a
    dictionary or as a JSON string that decodes to one; its ``DOI``,
    ``title`` and ``URL`` keys are returned.

    Args:
        pdf_path: Path of a local PDF file.

    Returns:
        A ``(doi, title, url)`` tuple. Individual items are ``None`` when the
        corresponding key is absent; ``(None, None, None)`` is returned when
        no usable validation info is available.
    """
    pdf2doi.config.set('verbose', False)
    results = pdf2doi.pdf2doi(pdf_path)

    # Guard against a non-dict result (e.g. None) so the caller gets the
    # regular "not found" triple instead of an AttributeError on .get().
    if not isinstance(results, dict):
        print("pdf2doi did not return a result dictionary")
        return None, None, None

    validation_info = results.get('validation_info', {})
    if isinstance(validation_info, str):
        try:
            validation_info = json.loads(validation_info)  # JSON string -> dict
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return None, None, None

    if not isinstance(validation_info, dict):
        print("Validation info is not a dictionary")
        return None, None, None

    doi = validation_info.get('DOI')
    title = validation_info.get('title')
    url = validation_info.get('URL')
    return doi, title, url
def get_paper_data(doi, timeout=30):
    """Fetch the CiteAs product record for *doi*.

    Args:
        doi: DOI string appended to the CiteAs ``/product/`` endpoint.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The decoded JSON body of the response, or ``None`` when the request
        fails or the body is not valid JSON. Callers can therefore test the
        result for truthiness instead of handling exceptions.
    """
    api_url = f"https://api.citeas.org/product/{doi}"
    try:
        response = requests.get(api_url, timeout=timeout)
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors raised by older versions of
        # requests, which do not wrap them in a RequestException subclass.
        print(f"Error fetching paper data: {e}")
        return None
def main(pdf_url):
    """Resolve a PDF URL to its DOI, title, APA citation and landing URL.

    The PDF is downloaded, its DOI is extracted, and the matching record is
    looked up to obtain an APA-style citation.

    Args:
        pdf_url: Address of the PDF to analyse.

    Returns:
        A JSON string. On success it has the keys ``doi``, ``title``,
        ``citation_text`` and ``url``; on failure it has a single ``error``
        key describing what went wrong.
    """
    if not pdf_url or not pdf_url.strip():
        return json.dumps({"error": "PDF URL is required"}, indent=4)

    pdf_path = download_pdf(pdf_url.strip())
    try:
        doi, title, url = get_doi(pdf_path)
    finally:
        # The file is only needed for DOI extraction; delete it here so it is
        # not left behind on the early returns below or when extraction raises.
        try:
            os.remove(pdf_path)
        except FileNotFoundError:
            # The path is derived from the URL alone, so a concurrent request
            # for the same URL may already have removed the file.
            pass

    if doi is None:
        return json.dumps({"error": "DOI not found"}, indent=4)

    paper_data = get_paper_data(doi)
    if not paper_data:
        return json.dumps({"error": "Paper data not found"}, indent=4)

    # First citation rendered in APA style, if the record has one.
    citation_text = next(
        (
            citation.get('citation')
            for citation in paper_data.get('citations', [])
            if citation.get('style_shortname') == 'apa'
        ),
        None,
    )

    title = title or paper_data.get('name')
    url = url or f"https://doi.org/{doi}"

    if citation_text:
        # Drop italic markup, normalise the ellipsis character and newlines,
        # then decode HTML entities.
        citation_text = html.unescape(
            citation_text.replace("<i>", "").replace("</i>", "").replace("\u2026", "...").replace("\n", " ")
        )
    else:
        citation_text = "Citation not found"

    data = {
        "doi": doi,
        "title": title if title else "Title not found",
        "citation_text": citation_text,
        "url": url,
    }
    return json.dumps(data, ensure_ascii=False, indent=4)
# Soft Gradio theme for the app: purple/cyan/slate palette with a
# four-entry Google Fonts stack (Syne first, then Poppins fallbacks).
theme = gr.themes.Soft(
    font=[
        gr.themes.GoogleFont(font_name)
        for font_name in ("Syne", "Poppins", "Poppins", "Poppins")
    ],
    neutral_hue="slate",
    secondary_hue="cyan",
    primary_hue="purple",
)
# Build the UI: a URL input and a result box, plus a button that runs `main`.
# NOTE(review): the nesting below was reconstructed from flattened source; the
# button is assumed to sit outside the Row — confirm against the original.
with gr.Blocks(theme=theme) as app:
    with gr.Row():
        # Input: URL of the PDF to analyse.
        pdf_path = gr.Textbox(lines=1, label="PDF URL", placeholder="Enter the URL of the PDF")
        # Output: JSON string returned by `main`.
        doi_data = gr.Textbox(lines=7, label="DOI Data", placeholder="DOI data will be displayed here", show_copy_button=True)
    get_data = gr.Button(value="Get DOI Data", variant='primary')
    # Clicking the button calls main(pdf_path) and writes the result into
    # doi_data; the event is registered under the API name "getDOIData".
    get_data.click(main, inputs=[pdf_path], outputs=[doi_data], api_name="getDOIData")
# Enable request queuing (default concurrency limit of 250) and start the app.
app.queue(default_concurrency_limit=250).launch()