from flask import Flask, request, jsonify, render_template, send_file
from flask_cors import CORS
import requests
import json
import time
import pandas as pd
from typing import Dict, List, Optional
import pickle
import os
import sys
import threading
import tempfile
import shutil
from datetime import datetime
import timeit
from tqdm import tqdm
# Define 'toc' function once
def toc(start_time):
elapsed = timeit.default_timer() - start_time
print(elapsed)
# Record start time
start_time = timeit.default_timer()
# Helper function to get all pages
def get_all_pages(url, headers, upper_limit=None):
all_results = []
unique_ids = set() # Track unique paper IDs
page = 1
processing_times = [] # Track time taken per paper
# Get first page to get total count
first_response = requests.get(f"{url}&page={page}", headers=headers)
if first_response.status_code != 200:
return []
data = first_response.json()
total_count = data.get('meta', {}).get('count', 0)
start_time = time.time()
# Add only unique papers from first page
for result in data.get('results', []):
if result.get('id') not in unique_ids:
unique_ids.add(result.get('id'))
all_results.append(result)
if upper_limit and len(all_results) >= upper_limit:
return all_results
papers_processed = len(all_results)
time_taken = time.time() - start_time
if papers_processed > 0:
processing_times.append(time_taken / papers_processed)
# Continue getting remaining pages until we have all papers
target_count = min(total_count, upper_limit) if upper_limit else total_count
pbar = tqdm(total=target_count, desc="Retrieving papers",
initial=len(all_results), unit="papers")
    while len(all_results) < target_count:
page += 1
page_start_time = time.time()
paged_url = f"{url}&page={page}"
response = requests.get(paged_url, headers=headers)
if response.status_code != 200:
print(f"Error retrieving page {page}: {response.status_code}")
break
data = response.json()
results = data.get('results', [])
if not results:
break
# Add only unique papers from this page
new_papers = 0
for result in results:
if result.get('id') not in unique_ids:
unique_ids.add(result.get('id'))
all_results.append(result)
new_papers += 1
if upper_limit and len(all_results) >= upper_limit:
pbar.update(new_papers)
pbar.close()
return all_results
# Update processing times and estimated time remaining
if new_papers > 0:
time_taken = time.time() - page_start_time
processing_times.append(time_taken / new_papers)
avg_time_per_paper = sum(processing_times) / len(processing_times)
papers_remaining = target_count - len(all_results)
est_time_remaining = papers_remaining * avg_time_per_paper
pbar.set_postfix({'Est. Time Remaining': f'{est_time_remaining:.1f}s'})
pbar.update(new_papers)
# Add a small delay to respect rate limits
time.sleep(1)
pbar.close()
return all_results
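# Illustrative usage (not executed here): get_all_pages() can drive any paginated
# OpenAlex listing and returns a de-duplicated list of work dicts, e.g.
#   headers = {'User-Agent': 'LowAI (chowdhary@iiasa.ac.at)'}
#   url = "https://api.openalex.org/works?filter=cites:W2741809807&per-page=200"
#   papers = get_all_pages(url, headers, upper_limit=500)
# The seed work ID above is only an example; any valid OpenAlex work ID works.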
def get_related_papers(work_id, upper_limit=None, progress_callback=None):
# Define base URL for OpenAlex API
base_url = "https://api.openalex.org/works"
work_query = f"/{work_id}" # OpenAlex work IDs can be used directly in path
work_url = base_url + work_query
# Add email to be a polite API user
headers = {'User-Agent': 'LowAI (chowdhary@iiasa.ac.at)'}
response = requests.get(work_url, headers=headers)
    print(f"Seed work response status: {response.status_code}")
if response.status_code == 200:
paper = response.json() # For direct work queries, the response is the paper object
paper_id = paper['id']
# Use referenced_works field on the seed work directly for cited papers
referenced_ids = paper.get('referenced_works', []) or []
print("\nTotal counts:")
print(f"Cited (referenced_works) count: {len(referenced_ids)}")
def fetch_works_by_ids(ids, chunk_size=50):
results = []
seen = set()
total_chunks = (len(ids) + chunk_size - 1) // chunk_size
for i in range(0, len(ids), chunk_size):
chunk = ids[i:i+chunk_size]
# Build ids filter: ids.openalex:ID1|ID2|ID3
ids_filter = '|'.join(chunk)
url = f"{base_url}?filter=ids.openalex:{ids_filter}&per-page=200"
resp = requests.get(url, headers=headers)
if resp.status_code != 200:
print(f"Error fetching IDs chunk {i//chunk_size+1}: {resp.status_code}")
continue
data = resp.json()
for r in data.get('results', []):
rid = r.get('id')
if rid and rid not in seen:
seen.add(rid)
results.append(r)
# Update progress for cited papers (0-30%)
if progress_callback:
progress = int(30 * (i // chunk_size + 1) / total_chunks)
progress_callback(progress, f"Fetching cited papers... {len(results)} found")
time.sleep(1) # be polite to API
if upper_limit and len(results) >= upper_limit:
return results[:upper_limit]
return results
print("\nRetrieving cited papers via referenced_works IDs...")
cited_papers = fetch_works_by_ids(referenced_ids)
print(f"Found {len(cited_papers)} unique cited papers")
# Count citing papers (works that cite the seed), then paginate to collect all
citing_count_url = f"{base_url}?filter=cites:{work_id}&per-page=1"
        citing_count_resp = requests.get(citing_count_url, headers=headers)
        citing_count = citing_count_resp.json().get('meta', {}).get('count', 0) if citing_count_resp.ok else 0
print(f"Citing papers: {citing_count}")
# Get all citing papers with pagination
print("\nRetrieving citing papers (paginated)...")
page = 1
citing_papers = []
unique_ids = set()
target = citing_count if not upper_limit else min(upper_limit, citing_count)
pbar = tqdm(total=target, desc="Retrieving citing papers", unit="papers")
while len(citing_papers) < target:
paged_url = f"{base_url}?filter=cites:{work_id}&per-page=200&sort=publication_date:desc&page={page}"
resp = requests.get(paged_url, headers=headers)
if resp.status_code != 200:
print(f"Error retrieving citing page {page}: {resp.status_code}")
break
data = resp.json()
results = data.get('results', [])
if not results:
break
new = 0
for r in results:
rid = r.get('id')
if rid and rid not in unique_ids:
unique_ids.add(rid)
citing_papers.append(r)
new += 1
if len(citing_papers) >= target:
break
# Update progress for citing papers (30-70%)
if progress_callback:
progress = 30 + int(40 * len(citing_papers) / target)
progress_callback(progress, f"Fetching citing papers... {len(citing_papers)} found")
pbar.update(new)
page += 1
time.sleep(1)
pbar.close()
print(f"Found {len(citing_papers)} unique citing papers")
# Get all related papers
print("\nRetrieving related papers...")
related_url = f"{base_url}?filter=related_to:{work_id}&per-page=200&sort=publication_date:desc"
related_papers = get_all_pages(related_url, headers, upper_limit)
print(f"Found {len(related_papers)} unique related papers")
# Update progress for related papers (70-90%)
if progress_callback:
progress_callback(70, f"Fetching related papers... {len(related_papers)} found")
# Create sets of IDs for quick lookup
cited_ids = {paper['id'] for paper in cited_papers}
citing_ids = {paper['id'] for paper in citing_papers}
# Print some debug information
print(f"\nDebug Information:")
print(f"Seed paper ID: {paper_id}")
print(f"Number of unique cited papers: {len(cited_ids)}")
print(f"Number of unique citing papers: {len(citing_ids)}")
print(f"Number of papers in both sets: {len(cited_ids.intersection(citing_ids))}")
# Update progress for processing (90-95%)
if progress_callback:
progress_callback(90, "Processing and deduplicating papers...")
# Combine all papers and remove duplicates while tracking relationship
all_papers = cited_papers + citing_papers + related_papers
seen_titles = set()
unique_papers = []
for paper in all_papers:
title = paper.get('title', '')
if title not in seen_titles:
seen_titles.add(title)
# Add relationship type
if paper['id'] in cited_ids:
paper['relationship'] = 'cited'
elif paper['id'] in citing_ids:
paper['relationship'] = 'citing'
else:
paper['relationship'] = 'related'
unique_papers.append(paper)
# Final progress update
if progress_callback:
progress_callback(100, f"Collection completed! Found {len(unique_papers)} unique papers")
return unique_papers
else:
print(f"Error retrieving seed paper: {response.status_code}")
return []
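# Illustrative usage (not executed here): `upper_limit` caps each relation type separately.
#   papers = get_related_papers("W2741809807", upper_limit=200)
#   cited   = [p for p in papers if p['relationship'] == 'cited']
#   citing  = [p for p in papers if p['relationship'] == 'citing']
#   related = [p for p in papers if p['relationship'] == 'related']
# Each entry is a raw OpenAlex work record with an added 'relationship' key.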
from openai import OpenAI
import concurrent.futures
def analyze_paper_relevance(content: Dict[str, str], research_question: str, api_key: str) -> Optional[Dict]:
"""Analyze if a paper is relevant to the research question using GPT-5 mini."""
client = OpenAI(api_key=api_key)
title = content.get('title', '')
abstract = content.get('abstract', '')
has_abstract = bool(abstract and abstract.strip())
if has_abstract:
prompt = f"""
Research Question: {research_question}
Paper Title: {title}
Paper Abstract: {abstract}
Analyze this paper and determine:
1. Is this paper highly relevant to answering the research question?
2. What are the main aims/objectives of this paper?
3. What are the key takeaways or findings?
Return ONLY a valid JSON object in this exact format:
{{
"relevant": true/false,
"relevance_reason": "brief explanation of why it is/isn't relevant",
"aims_of_paper": "main objectives of the paper",
"key_takeaways": "key findings or takeaways"
}}
"""
else:
prompt = f"""
Research Question: {research_question}
Paper Title: {title}
Note: No abstract is available for this paper.
Analyze this paper based on the title only and determine:
1. Is this paper likely to be relevant to answering the research question based on the title?
Return ONLY a valid JSON object in this exact format:
{{
"relevant": true/false,
"relevance_reason": "brief explanation of why it is/isn't relevant based on title"
}}
"""
try:
# Try GPT-5 mini first, fallback to gpt-4o-mini if it fails
try:
response = client.responses.create(
model="gpt-5-mini",
input=prompt,
reasoning={"effort": "minimal"},
text={"verbosity": "low"}
)
except Exception as e:
print(f"GPT-5 mini failed, trying gpt-4o-mini: {e}")
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{
"role": "user",
"content": prompt
}],
max_completion_tokens=1000
)
# Handle different response formats
if hasattr(response, 'choices') and response.choices:
# Old format (chat completions)
result = response.choices[0].message.content
elif hasattr(response, 'output'):
# New format (responses) - extract text from output
result = ""
for item in response.output:
if hasattr(item, "content") and item.content:
for content in item.content:
if hasattr(content, "text") and content.text:
result += content.text
else:
print("Unexpected response format")
return None
if not result:
print("Empty response from GPT")
return None
# Clean and parse the JSON response
result = result.strip()
if result.startswith("```json"):
result = result[7:]
if result.endswith("```"):
result = result[:-3]
# Try to parse JSON
try:
return json.loads(result.strip())
except json.JSONDecodeError as e:
print(f"Failed to parse JSON response: {e}")
print(f"Raw response: {result[:200]}...")
return None
except Exception as e:
print(f"Error in GPT analysis: {str(e)}")
return None
def extract_abstract_from_inverted_index(inverted_index: Dict) -> str:
"""Extract abstract text from inverted index format."""
if not inverted_index:
return ""
words = []
for word, positions in inverted_index.items():
for pos in positions:
while len(words) <= pos:
words.append('')
words[pos] = word
return ' '.join(words).strip()
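# Example of the inverted-index format this function consumes (hypothetical input):
#   {'Deep': [0], 'learning': [1], 'is': [2], 'useful': [3]}
# is reassembled positionally into the string "Deep learning is useful".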
def analyze_single_paper(paper: Dict, research_question: str, api_key: str) -> Optional[Dict]:
"""Analyze a single paper with its own client."""
try:
client = OpenAI(api_key=api_key)
# Extract title and abstract
title = paper.get('title', '')
abstract = extract_abstract_from_inverted_index(paper.get('abstract_inverted_index', {}))
if not title and not abstract:
return None
# Create content for analysis
content = {
'title': title,
'abstract': abstract
}
# Analyze with GPT
analysis = analyze_paper_relevance_with_client(content, research_question, client)
if analysis:
paper['gpt_analysis'] = analysis
paper['relevance_reason'] = analysis.get('relevance_reason', 'Analysis completed')
paper['relevance_score'] = analysis.get('relevant', False)
return paper
return None
except Exception as e:
print(f"Error analyzing paper: {e}")
return None
def analyze_paper_batch(papers_batch: List[Dict], research_question: str, api_key: str, batch_id: int) -> List[Dict]:
"""Analyze a batch of papers in parallel using ThreadPoolExecutor."""
results = []
# Use ThreadPoolExecutor to process papers in parallel within the batch
with concurrent.futures.ThreadPoolExecutor(max_workers=len(papers_batch)) as executor:
# Submit all papers for parallel processing
future_to_paper = {
executor.submit(analyze_single_paper, paper, research_question, api_key): paper
for paper in papers_batch
}
# Collect results as they complete
for future in concurrent.futures.as_completed(future_to_paper):
try:
result = future.result()
if result:
results.append(result)
except Exception as e:
print(f"Error in parallel analysis: {e}")
continue
return results
def analyze_paper_relevance_with_client(content: Dict[str, str], research_question: str, client: OpenAI) -> Optional[Dict]:
"""Analyze if a paper is relevant to the research question using provided client."""
title = content.get('title', '')
abstract = content.get('abstract', '')
prompt = f"""
Research Question: {research_question}
Paper Title: {title}
Paper Abstract: {abstract or 'No abstract available'}
Analyze this paper and determine:
1. Is this paper highly relevant to answering the research question?
2. What are the main aims/objectives of this paper?
3. What are the key takeaways or findings?
Return ONLY a valid JSON object in this exact format:
{{
"relevant": true/false,
"relevance_reason": "brief explanation of why it is/isn't relevant",
"aims_of_paper": "main objectives of the paper",
"key_takeaways": "key findings or takeaways"
}}
"""
try:
# Try GPT-5 nano first, fallback to gpt-4o-mini if it fails
try:
response = client.responses.create(
model="gpt-5-nano",
input=prompt,
reasoning={"effort": "minimal"},
text={"verbosity": "low"}
)
except Exception as e:
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{
"role": "user",
"content": prompt
}],
max_completion_tokens=1000
)
# Handle different response formats
if hasattr(response, 'choices') and response.choices:
# Old format (chat completions)
result = response.choices[0].message.content
elif hasattr(response, 'output'):
# New format (responses) - extract text from output
result = ""
for item in response.output:
if hasattr(item, "content") and item.content:
for content in item.content:
if hasattr(content, "text") and content.text:
result += content.text
else:
return None
if not result:
return None
# Clean and parse the JSON response
result = result.strip()
if result.startswith("```json"):
result = result[7:]
if result.endswith("```"):
result = result[:-3]
# Try to parse JSON
try:
return json.loads(result.strip())
except json.JSONDecodeError:
return None
except Exception as e:
return None
def filter_papers_for_research_question(papers: List[Dict], research_question: str, api_key: str, limit: int = 10) -> List[Dict]:
"""Analyze exactly 'limit' number of papers for relevance using parallel processing."""
if not papers or not research_question:
return []
# Sort papers by publication date (most recent first)
sorted_papers = sorted(papers, key=lambda x: x.get('publication_date', ''), reverse=True)
# Take only the first 'limit' papers for analysis
papers_to_analyze = sorted_papers[:limit]
print(f"Analyzing {len(papers_to_analyze)} papers for relevance to: {research_question}")
# Process all papers in parallel (no batching needed for small numbers)
all_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=min(limit, 20)) as executor:
# Submit all papers for parallel processing
future_to_paper = {
executor.submit(analyze_single_paper, paper, research_question, api_key): paper
for paper in papers_to_analyze
}
# Collect results as they complete
completed = 0
for future in concurrent.futures.as_completed(future_to_paper):
try:
result = future.result()
completed += 1
if result:
all_results.append(result)
print(f"Completed {completed}/{len(papers_to_analyze)} papers")
except Exception as e:
print(f"Error in parallel analysis: {e}")
completed += 1
# Sort by publication date again (most recent first)
all_results.sort(key=lambda x: x.get('publication_date', ''), reverse=True)
print(f"Analysis complete. Processed {len(all_results)} papers.")
return all_results
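# Illustrative usage (not executed here), assuming a collected `papers` list and a
# valid OpenAI API key; the research question is a hypothetical example:
#   relevant = filter_papers_for_research_question(
#       papers, "How does land-use change affect biodiversity?", api_key, limit=10)
# Only the `limit` most recent papers are analyzed; each returned paper carries
# 'gpt_analysis', 'relevance_reason' and 'relevance_score' (the model's relevant verdict).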
import re
import html
# Try to import BeautifulSoup, fallback to simple parsing if not available
try:
from bs4 import BeautifulSoup
HAS_BS4 = True
except ImportError:
HAS_BS4 = False
print("BeautifulSoup not available, using simple HTML parsing")
app = Flask(__name__)
CORS(app)
# Configuration: read from environment (set in HF Space Secrets)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
if not OPENAI_API_KEY:
print("[WARN] OPENAI_API_KEY is not set. Set it in Space Settings → Secrets.")
# Global progress tracking
progress_data = {}
# Determine script directory and robust project root
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(SCRIPT_DIR) if os.path.basename(SCRIPT_DIR) == "code" else SCRIPT_DIR
# Ensure we can import helper modules (prefer repo root; fallback to ./code)
CODE_DIR_CANDIDATE = os.path.join(ROOT_DIR, "code")
CODE_DIR = CODE_DIR_CANDIDATE if os.path.isdir(CODE_DIR_CANDIDATE) else ROOT_DIR
if CODE_DIR not in sys.path:
sys.path.insert(0, CODE_DIR)
# Database directories: prefer repo-root `database/` when present; fallback to CODE_DIR/database
DATABASE_DIR_ROOT = os.path.join(ROOT_DIR, "database")
DATABASE_DIR = DATABASE_DIR_ROOT if os.path.isdir(DATABASE_DIR_ROOT) else os.path.join(CODE_DIR, "database")
COLLECTION_DB_DIR = os.path.join(DATABASE_DIR, "collections")
FILTER_DB_DIR = os.path.join(DATABASE_DIR, "filters")
# Ensure database directories exist
os.makedirs(COLLECTION_DB_DIR, exist_ok=True)
os.makedirs(FILTER_DB_DIR, exist_ok=True)
def ensure_db_dirs() -> None:
"""Ensure database directories exist (safe to call anytime)."""
try:
os.makedirs(COLLECTION_DB_DIR, exist_ok=True)
os.makedirs(FILTER_DB_DIR, exist_ok=True)
except Exception:
pass
# Robust HTTP headers for publisher sites
DEFAULT_HTTP_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
}
def _http_get(url: str, timeout: int = 15) -> Optional[requests.Response]:
try:
resp = requests.get(url, headers=DEFAULT_HTTP_HEADERS, timeout=timeout, allow_redirects=True)
return resp
except Exception as e:
print(f"HTTP GET failed for {url}: {e}")
return None
def fetch_abstract_from_doi(doi: str) -> Optional[str]:
"""Fetch abstract/highlights from a DOI URL with a robust, layered strategy."""
if not doi:
return None
# Normalize DOI
doi_clean = doi.replace('https://doi.org/', '').strip()
# 1) Crossref (fast, sometimes JATS)
try:
text = fetch_from_crossref(doi_clean)
if text and len(text) > 50:
return text
except Exception as e:
print(f"Crossref fetch failed: {e}")
# 2) Fetch target HTML via doi.org redirect
try:
start_url = f"https://doi.org/{doi_clean}"
resp = _http_get(start_url, timeout=15)
if not resp or resp.status_code >= 400:
return None
html_text = resp.text or ''
final_url = getattr(resp, 'url', start_url)
print(f"Resolved DOI to: {final_url}")
# Parse with robust pipeline
parsed = robust_extract_abstract(html_text)
if parsed and len(parsed) > 50:
return parsed
except Exception as e:
print(f"DOI HTML fetch failed: {e}")
# 3) PubMed placeholder (extendable)
try:
text = fetch_from_pubmed(doi_clean)
if text and len(text) > 50:
return text
except Exception:
pass
return None
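# Illustrative usage (not executed here): the DOI may be passed bare or as a full URL,
# both are normalized before the Crossref -> doi.org -> PubMed cascade runs.
#   text = fetch_abstract_from_doi("10.1016/j.example.2024.01.001")  # hypothetical DOI
#   text = fetch_abstract_from_doi("https://doi.org/10.1016/j.example.2024.01.001")
# Returns None when no candidate longer than 50 characters is found.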
def fetch_from_crossref(doi: str) -> Optional[str]:
"""Fetch abstract from Crossref API."""
try:
url = f"https://api.crossref.org/works/{doi}"
response = _http_get(url, timeout=12)
        if response and response.status_code == 200:
data = response.json()
if 'message' in data:
message = data['message']
# Check for abstract or highlights (case insensitive)
for key in message:
if key.lower() in ['abstract', 'highlights'] and message[key]:
raw = str(message[key])
# Crossref sometimes returns JATS/XML; strip tags and unescape entities
text = re.sub(r'<[^>]+>', ' ', raw)
text = html.unescape(re.sub(r'\s+', ' ', text)).strip()
return text
except Exception:
pass
return None
def fetch_from_doi_org(doi: str) -> Optional[str]:
"""Legacy wrapper kept for API compatibility; now uses robust pipeline."""
try:
url = f"https://doi.org/{doi}"
resp = _http_get(url, timeout=15)
if not resp or resp.status_code >= 400:
return None
return robust_extract_abstract(resp.text or '')
except Exception:
return None
def extract_from_preloaded_state_bruteforce(content: str) -> Optional[str]:
"""Extract abstract from window.__PRELOADED_STATE__ using brace matching and fallbacks."""
try:
start_idx = content.find('window.__PRELOADED_STATE__')
if start_idx == -1:
return None
# Find the first '{' after the equals sign
eq_idx = content.find('=', start_idx)
if eq_idx == -1:
return None
brace_idx = content.find('{', eq_idx)
if brace_idx == -1:
return None
# Brace matching to find the matching closing '}'
depth = 0
end_idx = -1
for i in range(brace_idx, min(len(content), brace_idx + 5_000_000)):
ch = content[i]
if ch == '{': depth += 1
elif ch == '}':
depth -= 1
if depth == 0:
end_idx = i
break
if end_idx == -1:
return None
json_str = content[brace_idx:end_idx+1]
try:
data = json.loads(json_str)
except Exception as e:
# Try to relax by removing trailing commas and control chars
cleaned = re.sub(r',\s*([}\]])', r'\1', json_str)
cleaned = re.sub(r'\u0000', '', cleaned)
try:
data = json.loads(cleaned)
except Exception as e2:
print(f"Failed to parse preloaded JSON: {e2}")
return None
# Same traversal as before
if isinstance(data, dict) and 'abstracts' in data and isinstance(data['abstracts'], dict) and 'content' in data['abstracts']:
abstracts = data['abstracts']['content']
if isinstance(abstracts, list):
for abstract_item in abstracts:
if isinstance(abstract_item, dict) and '$$' in abstract_item and abstract_item.get('#name') == 'abstract':
class_name = abstract_item.get('$', {}).get('class', '')
for section in abstract_item.get('$$', []):
if isinstance(section, dict) and section.get('#name') == 'abstract-sec':
section_text = extract_text_from_abstract_section(section)
section_highlights = extract_highlights_from_section(section)
if section_text and len(section_text.strip()) > 50:
return clean_text(section_text)
if section_highlights and len(section_highlights.strip()) > 50:
return clean_text(section_highlights)
if 'highlight' in class_name.lower():
highlights_text = extract_highlights_from_abstract_item(abstract_item)
if highlights_text and len(highlights_text.strip()) > 50:
return clean_text(highlights_text)
return None
except Exception as e:
print(f"Error extracting from preloaded state (bruteforce): {e}")
return None
def extract_from_json_ld(content: str) -> Optional[str]:
"""Parse JSON-LD script tags and extract abstract/description if present."""
if not HAS_BS4:
return None
try:
soup = BeautifulSoup(content, 'html.parser')
for script in soup.find_all('script', type='application/ld+json'):
try:
data = json.loads(script.string or '{}')
except Exception:
continue
candidates = []
if isinstance(data, dict):
candidates.append(data)
elif isinstance(data, list):
candidates.extend([d for d in data if isinstance(d, dict)])
for obj in candidates:
for key in ['abstract', 'description']:
if key in obj and obj[key]:
text = clean_text(str(obj[key]))
if len(text) > 50:
return text
return None
except Exception as e:
print(f"Error extracting from JSON-LD: {e}")
return None
def clean_text(s: str) -> str:
s = html.unescape(s)
s = re.sub(r'\s+', ' ', s)
return s.strip()
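# Example (hypothetical input):
#   clean_text("Rising&nbsp;seas\n\n  threaten   deltas")
# unescapes HTML entities and collapses whitespace, giving "Rising seas threaten deltas".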
def extract_from_meta_tags(soup) -> Optional[str]:
try:
# Common meta carriers of abstract-like summaries
candidates = []
# OpenGraph description
og = soup.find('meta', attrs={'property': 'og:description'})
if og and og.get('content'):
candidates.append(og['content'])
# Twitter description
tw = soup.find('meta', attrs={'name': 'twitter:description'})
if tw and tw.get('content'):
candidates.append(tw['content'])
# Dublin Core description
dc = soup.find('meta', attrs={'name': 'dc.description'})
if dc and dc.get('content'):
candidates.append(dc['content'])
# citation_abstract
cit_abs = soup.find('meta', attrs={'name': 'citation_abstract'})
if cit_abs and cit_abs.get('content'):
candidates.append(cit_abs['content'])
# Fallback: any meta description
desc = soup.find('meta', attrs={'name': 'description'})
if desc and desc.get('content'):
candidates.append(desc['content'])
# Clean and return the longest meaningful candidate
candidates = [clean_text(c) for c in candidates if isinstance(c, str)]
candidates.sort(key=lambda x: len(x), reverse=True)
for text in candidates:
if len(text) > 50:
return text
return None
except Exception:
return None
def robust_extract_abstract(html_text: str) -> Optional[str]:
"""Layered extraction over raw HTML: preloaded-state, JSON-LD, meta tags, DOM, regex."""
if not html_text:
return None
# 1) ScienceDirect/Elsevier preloaded state (brace-matched)
try:
txt = extract_from_preloaded_state_bruteforce(html_text)
if txt and len(txt) > 50:
return clean_text(txt)
except Exception:
pass
# 2) JSON-LD
try:
txt = extract_from_json_ld(html_text)
if txt and len(txt) > 50:
return clean_text(txt)
except Exception:
pass
# 3) BeautifulSoup-based DOM extraction (meta + selectors + heading-sibling)
if HAS_BS4:
try:
soup = BeautifulSoup(html_text, 'html.parser')
# meta first
meta_txt = extract_from_meta_tags(soup)
if meta_txt and len(meta_txt) > 50:
return clean_text(meta_txt)
# selector scan
selectors = [
'div.abstract', 'div.Abstract', 'div.ABSTRACT',
'div[class*="abstract" i]', 'div[class*="Abstract" i]',
'section.abstract', 'section.Abstract', 'section.ABSTRACT',
'div[data-testid="abstract" i]', 'div[data-testid="Abstract" i]',
'div.article-abstract', 'div.article-Abstract',
'div.abstract-content', 'div.Abstract-content',
'div.highlights', 'div.Highlights', 'div.HIGHLIGHTS',
'div[class*="highlights" i]', 'div[class*="Highlights" i]',
'section.highlights', 'section.Highlights', 'section.HIGHLIGHTS',
'div[data-testid="highlights" i]', 'div[data-testid="Highlights" i]'
]
for css in selectors:
node = soup.select_one(css)
if node:
t = clean_text(node.get_text(' ', strip=True))
if len(t) > 50:
return t
# headings near Abstract/Highlights
for tag in soup.find_all(['h1','h2','h3','h4','h5','h6','strong','b']):
try:
title = (tag.get_text() or '').strip().lower()
if 'abstract' in title or 'highlights' in title:
blocks = []
sib = tag
steps = 0
while sib and steps < 20:
sib = sib.find_next_sibling()
steps += 1
if not sib: break
if sib.name in ['p','div','section','article','ul','ol']:
blocks.append(sib.get_text(' ', strip=True))
joined = clean_text(' '.join(blocks))
if len(joined) > 50:
return joined
except Exception:
continue
except Exception:
pass
# 4) Regex fallback
try:
        patterns = [
            r'<div[^>]*class="[^\"]*(?:abstract|Abstract|ABSTRACT|highlights|Highlights|HIGHLIGHTS)[^\"]*"[^>]*>(.*?)</div>',
            r'<section[^>]*class="[^\"]*(?:abstract|Abstract|ABSTRACT|highlights|Highlights|HIGHLIGHTS)[^\"]*"[^>]*>(.*?)</section>',
            r'<div[^>]*data-testid="(?:abstract|Abstract|highlights|Highlights)"[^>]*>(.*?)</div>'
        ]
for pat in patterns:
for m in re.findall(pat, html_text, re.DOTALL | re.IGNORECASE):
t = clean_text(re.sub(r'<[^>]+>', ' ', m))
if len(t) > 50:
return t
except Exception:
pass
return None
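# Illustrative usage (not executed here): robust_extract_abstract() works on raw HTML
# fetched elsewhere, e.g.
#   resp = _http_get("https://doi.org/10.1016/j.example.2024.01.001")  # hypothetical DOI
#   abstract = robust_extract_abstract(resp.text) if resp else None
# The layers run in order (preloaded state, JSON-LD, meta/DOM selectors, regex) and the
# first candidate longer than 50 characters wins.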
def extract_text_from_abstract_section(section: dict) -> str:
"""Extract text content from abstract section structure."""
try:
text_parts = []
if '$$' in section:
for item in section['$$']:
if isinstance(item, dict):
# Direct text content from simple-para
if item.get('#name') == 'simple-para' and '_' in item:
text_parts.append(item['_'])
# Also check for para elements
elif item.get('#name') == 'para' and '_' in item:
text_parts.append(item['_'])
# Recursively extract from nested structure
elif '$$' in item:
nested_text = extract_text_from_abstract_section(item)
if nested_text:
text_parts.append(nested_text)
return ' '.join(text_parts)
except Exception as e:
print(f"Error extracting text from abstract section: {e}")
return ""
def extract_highlights_from_section(section: dict) -> str:
"""Extract highlights content from section structure."""
try:
text_parts = []
if '$$' in section:
for item in section['$$']:
if isinstance(item, dict):
# Look for section-title with "Highlights"
if (item.get('#name') == 'section-title' and
item.get('_') and 'highlight' in item['_'].lower()):
# Found highlights section, extract list items
highlights_text = extract_highlights_list(item, section)
if highlights_text:
text_parts.append(highlights_text)
# Also look for direct list structures
elif item.get('#name') == 'list':
# Found list, extract list items directly
highlights_text = extract_highlights_list(item, section)
if highlights_text:
text_parts.append(highlights_text)
elif '$$' in item:
# Recursively search for highlights
nested_text = extract_highlights_from_section(item)
if nested_text:
text_parts.append(nested_text)
return ' '.join(text_parts)
except Exception as e:
print(f"Error extracting highlights from section: {e}")
return ""
def extract_highlights_list(title_item: dict, parent_section: dict) -> str:
"""Extract highlights list items from the section structure."""
try:
highlights = []
# Look for the list structure after the highlights title
if '$$' in parent_section:
for item in parent_section['$$']:
if isinstance(item, dict) and item.get('#name') == 'list':
# Found list, extract list items
if '$$' in item:
for list_item in item['$$']:
if isinstance(list_item, dict) and list_item.get('#name') == 'list-item':
# Extract text from list item
item_text = extract_text_from_abstract_section(list_item)
if item_text:
highlights.append(f"• {item_text}")
# Also check if the title_item itself contains a list (for direct list structures)
if '$$' in title_item:
for item in title_item['$$']:
if isinstance(item, dict) and item.get('#name') == 'list':
if '$$' in item:
for list_item in item['$$']:
if isinstance(list_item, dict) and list_item.get('#name') == 'list-item':
item_text = extract_text_from_abstract_section(list_item)
if item_text:
highlights.append(f"• {item_text}")
return ' '.join(highlights)
except Exception as e:
print(f"Error extracting highlights list: {e}")
return ""
def extract_highlights_from_abstract_item(abstract_item: dict) -> str:
"""Extract highlights from an abstract item that contains highlights."""
try:
highlights = []
if '$$' in abstract_item:
for section in abstract_item['$$']:
if isinstance(section, dict) and section.get('#name') == 'abstract-sec':
# Look for highlights within this section
highlights_text = extract_highlights_from_section(section)
if highlights_text:
highlights.append(highlights_text)
return ' '.join(highlights)
except Exception as e:
print(f"Error extracting highlights from abstract item: {e}")
return ""
def fetch_from_pubmed(doi: str) -> Optional[str]:
"""Fetch abstract from PubMed if available."""
try:
# This is a simplified approach - in practice, you'd need to use PubMed API
# For now, we'll skip this method but could be extended to check for:
# - abstract field
# - highlights field
# - other summary fields
pass
except Exception:
pass
return None
def convert_abstract_to_inverted_index(abstract: str) -> Dict:
"""Convert abstract text to inverted index format."""
if not abstract:
return {}
# Simple word tokenization and position mapping
words = re.findall(r'\b\w+\b', abstract.lower())
inverted_index = {}
for i, word in enumerate(words):
if word not in inverted_index:
inverted_index[word] = []
inverted_index[word].append(i)
return inverted_index
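# Example (hypothetical input):
#   convert_abstract_to_inverted_index("deep learning for deep nets")
# yields {'deep': [0, 3], 'learning': [1], 'for': [2], 'nets': [4]}, i.e. the inverse of
# extract_abstract_from_inverted_index (up to lower-casing and punctuation).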
def extract_work_id_from_url(url: str) -> Optional[str]:
"""Extract OpenAlex work ID from various URL formats."""
if not url:
return None
    # Handle different URL formats
    if 'openalex.org' in url:
        # Extract the ID from URLs such as https://openalex.org/W2741809807,
        # https://openalex.org/works/W2741809807 or https://api.openalex.org/works/W2741809807
        candidate = url.rstrip('/').split('/')[-1]
        if candidate.startswith('W') and len(candidate) > 5:
            return candidate
        if '/works/' in url:
            return url.split('/works/')[-1]
        return None
    # If it's already just an ID
    if url.startswith('W') and len(url) > 5:
        return url
    return None
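# Examples of accepted inputs (all resolving to "W2741809807"):
#   extract_work_id_from_url("https://openalex.org/W2741809807")
#   extract_work_id_from_url("https://openalex.org/works/W2741809807")
#   extract_work_id_from_url("W2741809807")
# Anything else (for example a DOI URL) returns None.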
def save_to_database(session_id: str, data_type: str, data: Dict) -> str:
"""Legacy-compatible save helper that routes to the new split DB layout."""
if data_type == 'collection':
work_id = data.get('work_id', '')
title = data.get('title', '')
return save_collection_to_database(work_id, title, data)
if data_type == 'filter':
source_collection = data.get('source_collection', '')
research_question = data.get('research_question', '')
return save_filter_to_database(source_collection, research_question, data)
# Fallback legacy path (single folder)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{session_id}_{data_type}_{timestamp}.pkl"
filepath = os.path.join(DATABASE_DIR, filename)
with open(filepath, 'wb') as f: pickle.dump(data, f)
return filename
def _clean_work_id(work_id_or_url: str) -> str:
clean = extract_work_id_from_url(work_id_or_url) or work_id_or_url
clean = clean.replace('https://api.openalex.org/works/', '').replace('https://openalex.org/', '')
return clean
def save_collection_to_database(work_id_or_url: str, title: str, data: Dict) -> str:
"""Save a collection once per work. Filename is the clean work id only (dedup)."""
ensure_db_dirs()
clean_id = _clean_work_id(work_id_or_url)
filename = f"{clean_id}.pkl"
filepath = os.path.join(COLLECTION_DB_DIR, filename)
# Deduplicate: if exists, do NOT overwrite
if os.path.exists(filepath):
return filename
# Ensure helpful metadata for frontend display
data = dict(data)
data['work_id'] = work_id_or_url
data['title'] = title
data['work_identifier'] = clean_id
data['created'] = datetime.now().isoformat()
with open(filepath, 'wb') as f: pickle.dump(data, f)
return filename
def save_filter_to_database(source_collection_clean_id: str, research_question: str, data: Dict) -> str:
"""Save a filter result linked to a source collection. Multiple filters allowed."""
ensure_db_dirs()
# Slug for RQ to keep filenames short
rq_slug = ''.join(c for c in research_question[:40] if c.isalnum() or c in (' ', '-', '_')).strip().replace(' ', '_') or 'rq'
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"{source_collection_clean_id}__filter__{rq_slug}__{timestamp}.pkl"
filepath = os.path.join(FILTER_DB_DIR, filename)
data = dict(data)
data['filter_identifier'] = filename.replace('.pkl','')
data['source_collection'] = source_collection_clean_id
data['research_question'] = research_question
data['created'] = datetime.now().isoformat()
with open(filepath, 'wb') as f: pickle.dump(data, f)
return filename
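# Illustrative filenames produced by the two savers (slug and timestamp are examples):
#   collections/W2741809807.pkl
#   filters/W2741809807__filter__How_does_land-use_change__20240101_120000.pkl
# Collections are deduplicated by work id; every filter run is written as a new file.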
def get_collection_files() -> List[Dict]:
files: List[Dict] = []
if not os.path.exists(COLLECTION_DB_DIR): return files
for filename in os.listdir(COLLECTION_DB_DIR):
if not filename.endswith('.pkl'): continue
filepath = os.path.join(COLLECTION_DB_DIR, filename)
try:
stat = os.stat(filepath)
with open(filepath, 'rb') as f: data = pickle.load(f)
files.append({
'filename': filename,
'type': 'collection',
'work_identifier': data.get('work_identifier') or filename.replace('.pkl',''),
'title': data.get('title',''),
'work_id': data.get('work_id',''),
'total_papers': data.get('total_papers',0),
'created': data.get('created', datetime.fromtimestamp(stat.st_ctime).isoformat()),
'size': stat.st_size
})
except Exception:
continue
files.sort(key=lambda x: x['created'], reverse=True)
return files
def get_filter_files() -> List[Dict]:
files: List[Dict] = []
if not os.path.exists(FILTER_DB_DIR): return files
for filename in os.listdir(FILTER_DB_DIR):
if not filename.endswith('.pkl'): continue
filepath = os.path.join(FILTER_DB_DIR, filename)
try:
stat = os.stat(filepath)
with open(filepath, 'rb') as f: data = pickle.load(f)
files.append({
'filename': filename,
'type': 'filter',
'filter_identifier': data.get('filter_identifier') or filename.replace('.pkl',''),
'source_collection': data.get('source_collection',''),
'research_question': data.get('research_question',''),
'relevant_papers': data.get('relevant_papers',0),
'total_papers': data.get('total_papers',0),
'tested_papers': data.get('tested_papers',0),
'created': data.get('created', datetime.fromtimestamp(stat.st_ctime).isoformat()),
'size': stat.st_size
})
except Exception:
continue
files.sort(key=lambda x: x['created'], reverse=True)
return files
def get_database_files() -> List[Dict]:
"""Combined listing for frontend history panel."""
return get_collection_files() + get_filter_files()
def find_existing_collection(work_id_or_url: str) -> Optional[str]:
"""Return existing collection filename for a work id if present (dedup)."""
clean_id = _clean_work_id(work_id_or_url)
filename = f"{clean_id}.pkl"
filepath = os.path.join(COLLECTION_DB_DIR, filename)
return filename if os.path.exists(filepath) else None
def filter_papers_for_rq(papers: List[Dict], research_question: str) -> List[Dict]:
"""Filter papers based on research question using GPT-5 mini."""
if not papers or not research_question:
return []
relevant_papers = []
for i, paper in enumerate(papers):
print(f"Analyzing paper {i+1}/{len(papers)}: {paper.get('title', 'No title')[:50]}...")
# Extract title and abstract
title = paper.get('title', '')
abstract = ''
# Try to get abstract from inverted index
inverted_abstract = paper.get('abstract_inverted_index')
if inverted_abstract:
words = []
for word, positions in inverted_abstract.items():
for pos in positions:
while len(words) <= pos:
words.append('')
words[pos] = word
abstract = ' '.join(words).strip()
if not title and not abstract:
continue
# Create content for GPT analysis
content = {
'title': title,
'abstract': abstract
}
# Analyze with GPT-5 mini
try:
analysis = analyze_with_gpt4(content, OPENAI_API_KEY)
if analysis and analysis.get('aims_of_paper'):
# Check if paper is relevant to research question
relevance_prompt = f"""
Research Question: {research_question}
Paper Title: {title}
Paper Abstract: {abstract or 'No abstract available'}
Is this paper highly relevant to answering the research question?
Consider the paper's aims, methods, and findings.
Return ONLY a JSON object: {{"relevant": true/false, "reason": "brief explanation"}}
"""
relevance_response = analyze_with_gpt4({
'title': 'Relevance Check',
'abstract': relevance_prompt
}, OPENAI_API_KEY)
if relevance_response and relevance_response.get('aims_of_paper'):
# Parse the relevance response
try:
relevance_data = json.loads(relevance_response['aims_of_paper'])
if relevance_data.get('relevant', False):
paper['relevance_reason'] = relevance_data.get('reason', 'Relevant to research question')
paper['gpt_analysis'] = analysis
relevant_papers.append(paper)
except:
# If parsing fails, include paper anyway if it has analysis
paper['gpt_analysis'] = analysis
relevant_papers.append(paper)
except Exception as e:
print(f"Error analyzing paper {i+1}: {e}")
continue
return relevant_papers
@app.route('/')
def index():
"""Serve the main HTML page."""
return render_template('index.html')
@app.route('/health')
def health():
return jsonify({'status': 'ok', 'app': 'paper_analysis_backend', 'port': 5000})
@app.route('/api/progress/<task_id>')
def get_progress(task_id):
"""Get progress for a specific task."""
return jsonify(progress_data.get(task_id, {'status': 'not_found', 'progress': 0, 'message': 'Task not found'}))
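# Polling example (illustrative): the frontend repeatedly calls, e.g.,
#   GET /api/progress/collect_1700000000
# and receives {"status": "running", "progress": 42, "message": "..."} until the status
# becomes "completed" (then a "result" payload is included) or "error".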
def collect_papers_async(work_id, limit, task_id):
"""Async function to collect papers with progress tracking."""
try:
def progress_callback(progress, message):
progress_data[task_id] = {
'status': 'running',
'progress': progress,
'message': message
}
progress_data[task_id] = {'status': 'running', 'progress': 0, 'message': 'Starting paper collection...'}
# Get related papers with detailed counts and progress callback
papers = get_related_papers(work_id, upper_limit=limit, progress_callback=progress_callback)
if not papers:
progress_data[task_id] = {'status': 'error', 'progress': 0, 'message': 'No related papers found'}
return
# Count papers by relationship type
cited_count = sum(1 for p in papers if p.get('relationship') == 'cited')
citing_count = sum(1 for p in papers if p.get('relationship') == 'citing')
related_count = sum(1 for p in papers if p.get('relationship') == 'related')
# Save papers to temporary file
with open('temp_papers.pkl', 'wb') as f:
pickle.dump(papers, f)
# Fetch seed title for identifier; tolerate failures
title = ''
try:
seed_resp = requests.get(f'https://api.openalex.org/works/{_clean_work_id(work_id)}', timeout=10)
if seed_resp.ok:
title = (seed_resp.json() or {}).get('title','')
except Exception:
title = ''
# Save to collection database (dedup by work id)
collection_data = {
'work_id': work_id,
'total_papers': len(papers),
'cited_papers': cited_count,
'citing_papers': citing_count,
'related_papers': related_count,
'limit': limit,
'papers': papers,
}
db_filename = save_collection_to_database(work_id, title, collection_data)
progress_data[task_id] = {
'status': 'completed',
'progress': 100,
'message': 'Collection completed',
'result': {
'work_id': work_id,
'total_papers': len(papers),
'cited_papers': cited_count,
'citing_papers': citing_count,
'related_papers': related_count,
'limit': limit,
'papers': papers[:10], # Return first 10 for preview
'db_filename': db_filename
}
}
except Exception as e:
print(f"Error collecting papers: {e}")
progress_data[task_id] = {'status': 'error', 'progress': 0, 'message': str(e)}
def search_papers_by_title(title: str) -> List[Dict]:
"""Search OpenAlex for papers by title and return ranked matches."""
try:
# Clean and prepare the title for search
clean_title = title.strip()
if not clean_title:
return []
# Search OpenAlex API
import urllib.parse
params = {
'search': clean_title,
            'per-page': 10,  # Get top 10 results (OpenAlex expects the hyphenated 'per-page' parameter)
'sort': 'relevance_score:desc' # Sort by relevance
}
# Build URL with query parameters
query_string = urllib.parse.urlencode(params)
search_url = f"https://api.openalex.org/works?{query_string}"
print(f"EXACT URL BEING SEARCHED: {search_url}")
response = _http_get(search_url, timeout=10)
if not response or response.status_code != 200:
print(f"OpenAlex search failed: {response.status_code if response else 'No response'}")
return []
data = response.json()
results = data.get('results', [])
if not results:
print(f"No results found for title: {clean_title}")
return []
# Return top results (OpenAlex already ranks by relevance)
scored_results = []
for work in results[:5]: # Take top 5 from OpenAlex
work_title = work.get('title', '')
if not work_title:
continue
work_id = work.get('id', '').replace('https://openalex.org/', '')
scored_results.append({
'work_id': work_id,
'title': work_title,
'authors': ', '.join([author.get('author', {}).get('display_name', '') for author in work.get('authorships', [])[:3]]),
'year': work.get('publication_date', '')[:4] if work.get('publication_date') else 'Unknown',
'venue': work.get('primary_location', {}).get('source', {}).get('display_name', 'Unknown'),
'relevance_score': work.get('relevance_score', 0)
})
return scored_results
except Exception as e:
print(f"Error searching for papers by title: {e}")
return []
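# Illustrative usage (not executed here):
#   matches = search_papers_by_title("Attention is all you need")  # hypothetical query
# returns at most 5 dicts with 'work_id', 'title', 'authors', 'year', 'venue' and
# 'relevance_score', ready to feed the /api/search-papers response.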
@app.route('/api/search-papers', methods=['POST'])
def search_papers():
"""Search for papers by title and return matches for user selection."""
try:
data = request.get_json()
paper_title = data.get('paper_title', '').strip()
if not paper_title:
return jsonify({'error': 'Paper title is required'}), 400
matches = search_papers_by_title(paper_title)
if not matches:
return jsonify({'error': f'No papers found matching title: {paper_title}'}), 404
return jsonify({
'success': True,
'matches': matches,
'query': paper_title
})
except Exception as e:
print(f"Error searching papers: {e}")
return jsonify({'error': str(e)}), 500
@app.route('/api/collect-papers', methods=['POST'])
def collect_papers():
"""Collect related papers from a seed paper URL or title search."""
try:
data = request.get_json()
seed_url = data.get('seed_url', '').strip()
paper_title = data.get('paper_title', '').strip()
method = data.get('method', 'url')
user_api_key = data.get('user_api_key') # User's own API key for large collections
if method == 'title' and not paper_title:
return jsonify({'error': 'Paper title is required for title search'}), 400
elif method == 'url' and not seed_url:
return jsonify({'error': 'Seed URL is required for URL method'}), 400
# Handle title search or URL method
if method == 'title':
# For title search, work_id should be provided (selected by user)
work_id = data.get('selected_work_id', '').strip()
if not work_id:
return jsonify({'error': 'Selected work ID is required for title search'}), 400
else:
# Extract work ID from URL
work_id = extract_work_id_from_url(seed_url)
if not work_id:
return jsonify({'error': 'Invalid OpenAlex URL format'}), 400
print(f"Collecting papers for work ID: {work_id}")
# Check if collection already exists (dedup)
existing_file = find_existing_collection(work_id)
if existing_file:
print(f"Using existing collection: {existing_file}")
# Load existing collection data
filepath = os.path.join(COLLECTION_DB_DIR, existing_file)
with open(filepath, 'rb') as f:
existing_data = pickle.load(f)
# Generate task ID for consistency
task_id = f"collect_{int(time.time())}"
# Set progress to completed immediately
progress_data[task_id] = {
'status': 'completed',
'progress': 100,
'message': f'Using existing collection from {existing_data.get("created", "unknown time")}',
'result': {
'papers': existing_data.get('papers', []),
'total_papers': existing_data.get('total_papers', 0),
'cited_papers': existing_data.get('cited_papers', 0),
'citing_papers': existing_data.get('citing_papers', 0),
'related_papers': existing_data.get('related_papers', 0),
'db_filename': existing_file
}
}
return jsonify({'success': True, 'task_id': task_id, 'used_existing': True, 'message': 'Using existing collection'})
# Optional limit from request (None means collect all)
limit = data.get('limit')
try:
limit = int(limit) if limit is not None else None
except Exception:
limit = None
# Generate task ID
task_id = f"collect_{int(time.time())}"
# Start async collection
thread = threading.Thread(target=collect_papers_async, args=(work_id, limit, task_id))
thread.daemon = True
thread.start()
return jsonify({
'success': True,
'task_id': task_id,
'message': 'Paper collection started'
})
except Exception as e:
print(f"Error collecting papers: {e}")
return jsonify({'error': str(e)}), 500
@app.route('/api/filter-papers', methods=['POST'])
def filter_papers():
"""Filter papers based on research question."""
try:
data = request.get_json()
research_question = data.get('research_question', '').strip()
limit = data.get('limit', 10) # Default to 10 most recent relevant papers
provided_source_collection = (data.get('source_collection') or '').strip()
papers_data = data.get('papers') # Papers passed directly from frontend
user_api_key = data.get('user_api_key') # User's own API key for large analyses
if not research_question:
return jsonify({'error': 'Research question is required'}), 400
# Load papers from either passed data or temporary file
papers = []
if papers_data:
papers = papers_data
elif os.path.exists('temp_papers.pkl'):
with open('temp_papers.pkl', 'rb') as f:
papers = pickle.load(f)
else:
return jsonify({'error': 'No papers found. Please collect papers first.'}), 400
print(f"Filtering {len(papers)} papers for research question: {research_question}")
# Use user's API key if provided, otherwise use default
api_key_to_use = user_api_key if user_api_key else OPENAI_API_KEY
# Filter papers using custom analyzer (returns top N most recent relevant papers)
relevant_papers = filter_papers_for_research_question(papers, research_question, api_key_to_use, limit)
# Determine source collection id for linkage
source_collection_id = None
if provided_source_collection:
source_collection_id = provided_source_collection
else:
try:
collections = get_collection_files()
if collections:
source_collection_id = collections[0].get('work_identifier')
except Exception:
source_collection_id = None
# Count actual relevant papers from analysis results
actual_relevant = 0
for paper in relevant_papers:
if paper.get('relevance_score') == True or paper.get('relevance_score') == 'true':
actual_relevant += 1
# Calculate open access statistics
total_oa = 0
for paper in papers:
oa_info = paper.get('open_access') or {}
if oa_info.get('is_oa', False):
total_oa += 1
oa_percentage = round((total_oa / len(papers)) * 100) if papers else 0
# Calculate abstract statistics
total_with_abstract = 0
for paper in papers:
if paper.get('abstract_inverted_index') and len(paper.get('abstract_inverted_index', {})) > 0:
total_with_abstract += 1
abstract_percentage = round((total_with_abstract / len(papers)) * 100) if papers else 0
# Save filtered results to filter database (linked to collection)
tested_papers = int(limit) if isinstance(limit, int) else 0
filter_data = {
'research_question': research_question,
'total_papers': len(papers), # Total papers in collection
'tested_papers': tested_papers, # Number of papers tested for relevance
'relevant_papers': actual_relevant, # Actual count of YES responses
'oa_percentage': oa_percentage, # Open access percentage
'abstract_percentage': abstract_percentage, # Percentage with abstracts
'limit': limit,
'papers': relevant_papers,
'source_collection': source_collection_id
}
if source_collection_id:
db_filename = save_filter_to_database(source_collection_id, research_question, filter_data)
else:
# Fallback
db_filename = save_to_database(f"filter_{int(time.time())}", 'filter', filter_data)
return jsonify({
'success': True,
'research_question': research_question,
'total_papers': len(papers), # Total papers in collection
'tested_papers': tested_papers, # Number of papers tested for relevance
'relevant_papers': actual_relevant, # Actual count of YES responses
'oa_percentage': oa_percentage, # Open access percentage
'abstract_percentage': abstract_percentage, # Percentage with abstracts
'limit': limit,
'papers': relevant_papers,
'db_filename': db_filename
})
except Exception as e:
print(f"Error filtering papers: {e}")
return jsonify({'error': str(e)}), 500
@app.route('/api/database-files')
def get_database_files_endpoint():
"""Get list of all database files (collections + filters)."""
try:
files = get_database_files()
return jsonify({'success': True, 'files': files})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/api/load-database-file/<filename>')
def load_database_file(filename):
"""Load a specific database file."""
try:
# Try collections then filters then legacy
filepath = os.path.join(COLLECTION_DB_DIR, filename)
if not os.path.exists(filepath):
filepath = os.path.join(FILTER_DB_DIR, filename)
if not os.path.exists(filepath):
filepath = os.path.join(DATABASE_DIR, filename)
if not os.path.exists(filepath):
return jsonify({'error': 'File not found'}), 404
with open(filepath, 'rb') as f:
data = pickle.load(f)
return jsonify({'success': True, 'data': data})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/api/delete-database-file/<filename>', methods=['DELETE'])
def delete_database_file(filename):
"""Delete a specific database file."""
try:
# Try collections then filters then legacy
filepath = os.path.join(COLLECTION_DB_DIR, filename)
if not os.path.exists(filepath):
filepath = os.path.join(FILTER_DB_DIR, filename)
if not os.path.exists(filepath):
filepath = os.path.join(DATABASE_DIR, filename)
if not os.path.exists(filepath):
return jsonify({'error': 'File not found'}), 404
# Delete the file
os.remove(filepath)
return jsonify({'success': True, 'message': f'File {filename} deleted successfully'})
except Exception as e:
return jsonify({'error': str(e)}), 500
def generate_bibtex_entry(paper):
"""Generate a BibTeX entry for a single paper."""
try:
# Handle None or invalid paper objects
if not paper or not isinstance(paper, dict):
print(f"Invalid paper object: {paper}")
return f"@article{{error_{hash(str(paper)) % 10000},\n title={{Invalid paper data}},\n author={{Unknown}},\n year={{Unknown}}\n}}"
# Extract basic info with safe defaults
title = paper.get('title', 'Unknown Title')
year = paper.get('publication_year', 'Unknown Year')
doi = paper.get('doi', '')
# Generate a unique key (using OpenAlex ID or DOI)
work_id = paper.get('id', '')
if work_id and isinstance(work_id, str):
work_id = work_id.replace('https://openalex.org/', '')
if not work_id and doi:
work_id = doi.replace('https://doi.org/', '').replace('/', '_')
if not work_id:
work_id = f"paper_{hash(title) % 10000}"
# Extract authors safely
authorships = paper.get('authorships', [])
author_list = []
if isinstance(authorships, list):
for authorship in authorships:
if isinstance(authorship, dict):
author = authorship.get('author', {})
if isinstance(author, dict):
display_name = author.get('display_name', '')
if display_name:
# Split name and format as "Last, First"
name_parts = display_name.split()
if len(name_parts) >= 2:
last_name = name_parts[-1]
first_name = ' '.join(name_parts[:-1])
author_list.append(f"{last_name}, {first_name}")
else:
author_list.append(display_name)
authors = " and ".join(author_list) if author_list else "Unknown Author"
# Extract journal info safely
primary_location = paper.get('primary_location', {})
journal = 'Unknown Journal'
if isinstance(primary_location, dict):
source = primary_location.get('source', {})
if isinstance(source, dict):
journal = source.get('display_name', 'Unknown Journal')
# Extract volume, issue, pages safely
biblio = paper.get('biblio', {})
volume = ''
issue = ''
first_page = ''
last_page = ''
if isinstance(biblio, dict):
volume = biblio.get('volume', '')
issue = biblio.get('issue', '')
first_page = biblio.get('first_page', '')
last_page = biblio.get('last_page', '')
# Format pages
if first_page and last_page and first_page != last_page:
pages = f"{first_page}--{last_page}"
elif first_page:
pages = first_page
else:
pages = ""
# Format volume and issue
volume_info = ""
if volume:
volume_info = f"volume={{{volume}}}"
if issue:
volume_info += f", number={{{issue}}}"
elif issue:
volume_info = f"number={{{issue}}}"
# Get URL (prefer DOI, fallback to landing page)
url = doi if doi else ''
if isinstance(primary_location, dict):
landing_url = primary_location.get('landing_page_url', '')
if landing_url and not url:
url = landing_url
# Build BibTeX entry
bibtex_entry = f"""@article{{{work_id},
title={{{title}}},
author={{{authors}}},
journal={{{journal}}},
year={{{year}}}"""
if volume_info:
bibtex_entry += f",\n {volume_info}"
if pages:
bibtex_entry += f",\n pages={{{pages}}}"
if doi:
bibtex_entry += f",\n doi={{{doi.replace('https://doi.org/', '')}}}"
if url:
bibtex_entry += f",\n url={{{url}}}"
bibtex_entry += "\n}"
return bibtex_entry
except Exception as e:
print(f"Error generating BibTeX for paper: {e}")
print(f"Paper data: {paper}")
return f"@article{{error_{hash(str(paper)) % 10000},\n title={{Error generating entry}},\n author={{Unknown}},\n year={{Unknown}}\n}}"
@app.route('/api/generate-bibtex/<filename>', methods=['POST'])
def generate_bibtex(filename):
"""Generate BibTeX file for a collection."""
try:
# Load the collection
collection_path = os.path.join(COLLECTION_DB_DIR, filename)
if not os.path.exists(collection_path):
return jsonify({'success': False, 'message': 'Collection not found'}), 404
with open(collection_path, 'rb') as f:
collection_data = pickle.load(f)
papers = collection_data.get('papers', [])
if not papers:
return jsonify({'success': False, 'message': 'No papers in collection'}), 400
print(f"Found {len(papers)} papers in collection")
print(f"First paper structure: {type(papers[0]) if papers else 'No papers'}")
if papers:
print(f"First paper keys: {list(papers[0].keys()) if isinstance(papers[0], dict) else 'Not a dict'}")
# Generate BibTeX entries
bibtex_entries = []
for i, paper in enumerate(papers):
print(f"Processing paper {i+1}/{len(papers)}: {type(paper)}")
entry = generate_bibtex_entry(paper)
bibtex_entries.append(entry)
# Combine all entries
bibtex_content = "\n\n".join(bibtex_entries)
# Save BibTeX file
bibtex_filename = filename.replace('.pkl', '.bib')
bibtex_path = os.path.join(COLLECTION_DB_DIR, bibtex_filename)
with open(bibtex_path, 'w', encoding='utf-8') as f:
f.write(bibtex_content)
print(f"BibTeX file saved to: {bibtex_path}")
print(f"File exists: {os.path.exists(bibtex_path)}")
print(f"File size: {os.path.getsize(bibtex_path) if os.path.exists(bibtex_path) else 'N/A'}")
return jsonify({
'success': True,
'message': f'BibTeX file generated with {len(papers)} entries',
'filename': bibtex_filename,
'entries_count': len(papers)
})
except Exception as e:
return jsonify({'success': False, 'message': f'Error generating BibTeX: {str(e)}'}), 500
@app.route('/api/download-database-file/<filename>')
def download_database_file(filename):
"""Download a database file (collection, filter, or BibTeX)."""
try:
print(f"Attempting to download file: {filename}")
# Try collections first, then filters, then legacy
filepath = os.path.join(COLLECTION_DB_DIR, filename)
print(f"Checking collections path: {filepath}")
if not os.path.exists(filepath):
filepath = os.path.join(FILTER_DB_DIR, filename)
print(f"Checking filters path: {filepath}")
if not os.path.exists(filepath):
filepath = os.path.join(DATABASE_DIR, filename)
print(f"Checking legacy path: {filepath}")
if not os.path.exists(filepath):
print(f"File not found in any directory: {filename}")
return jsonify({'error': 'File not found'}), 404
print(f"Found file at: {filepath}")
print(f"File size: {os.path.getsize(filepath)}")
return send_file(filepath, as_attachment=True, download_name=filename)
except Exception as e:
print(f"Error in download_database_file: {e}")
return jsonify({'error': str(e)}), 500
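# Example call (a sketch; hypothetical filename, local server on port 5000 assumed):
#   curl -OJ http://localhost:5000/api/download-database-file/my_collection.bib
# send_file(..., as_attachment=True) sets a Content-Disposition header, so curl's -OJ
# saves the file under its server-provided name.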
@app.route('/api/merge-collections', methods=['POST'])
def merge_collections():
"""Merge multiple collections into a new collection with overlap analysis."""
try:
data = request.get_json()
collection_filenames = data.get('collections', [])
if len(collection_filenames) < 2:
return jsonify({'success': False, 'message': 'At least 2 collections required for merging'}), 400
# Load all collections and track their work IDs
collections_data = []
all_work_ids = set()
collection_work_ids = [] # List of sets, one per collection
for filename in collection_filenames:
collection_path = os.path.join(COLLECTION_DB_DIR, filename)
if not os.path.exists(collection_path):
return jsonify({'success': False, 'message': f'Collection {filename} not found'}), 404
with open(collection_path, 'rb') as f:
collection_data = pickle.load(f)
papers = collection_data.get('papers', [])
collection_work_ids_set = set()
# Extract work IDs for this collection
for paper in papers:
if isinstance(paper, dict):
work_id = paper.get('id', '')
if work_id:
collection_work_ids_set.add(work_id)
all_work_ids.add(work_id)
collections_data.append({
'filename': filename,
'title': collection_data.get('title', filename.replace('.pkl', '')),
'papers': papers,
'work_ids': collection_work_ids_set,
'total_papers': len(papers)
})
collection_work_ids.append(collection_work_ids_set)
# Calculate overlap statistics
overlap_stats = []
total_unique_papers = len(all_work_ids)
for i, collection in enumerate(collections_data):
collection_work_ids_i = collection_work_ids[i]
overlaps = []
# Calculate overlap with each other collection
for j, other_collection in enumerate(collections_data):
if i != j:
other_work_ids = collection_work_ids[j]
intersection = collection_work_ids_i.intersection(other_work_ids)
overlap_count = len(intersection)
overlap_percentage = (overlap_count / len(collection_work_ids_i)) * 100 if collection_work_ids_i else 0
overlaps.append({
'collection': other_collection['title'],
'overlap_count': overlap_count,
'overlap_percentage': round(overlap_percentage, 1)
})
overlap_stats.append({
'collection': collection['title'],
'total_papers': collection['total_papers'],
'overlaps': overlaps
})
# Create merged collection with unique papers only
merged_papers = []
merged_work_ids = set()
for collection in collections_data:
for paper in collection['papers']:
if isinstance(paper, dict):
work_id = paper.get('id', '')
if work_id and work_id not in merged_work_ids:
merged_papers.append(paper)
merged_work_ids.add(work_id)
if not merged_papers:
return jsonify({'success': False, 'message': 'No papers found in collections to merge'}), 400
# Calculate total papers across all collections (before deduplication)
total_papers_before_merge = sum(collection['total_papers'] for collection in collections_data)
duplicates_removed = total_papers_before_merge - len(merged_papers)
deduplication_percentage = (duplicates_removed / total_papers_before_merge) * 100 if total_papers_before_merge > 0 else 0
# Create merged collection data
collection_titles = [collection['title'] for collection in collections_data]
merged_title = f"MERGED: {' + '.join(collection_titles[:3])}"
if len(collection_titles) > 3:
merged_title += f" + {len(collection_titles) - 3} more"
merged_data = {
'work_identifier': f"merged_{int(time.time())}",
'title': merged_title,
'work_id': '',
'papers': merged_papers,
'total_papers': len(merged_papers),
'created': datetime.now().isoformat(),
'source_collections': collection_filenames,
'merge_stats': {
'total_papers_before_merge': total_papers_before_merge,
'duplicates_removed': duplicates_removed,
'deduplication_percentage': round(deduplication_percentage, 1),
'overlap_analysis': overlap_stats
}
}
# Save merged collection
merged_filename = f"merged_{int(time.time())}.pkl"
merged_path = os.path.join(COLLECTION_DB_DIR, merged_filename)
with open(merged_path, 'wb') as f:
pickle.dump(merged_data, f)
return jsonify({
'success': True,
'message': f'Merged collection created with {len(merged_papers)} unique papers (removed {duplicates_removed} duplicates)',
'filename': merged_filename,
'total_papers': len(merged_papers),
'merge_stats': {
'total_papers_before_merge': total_papers_before_merge,
'duplicates_removed': duplicates_removed,
'deduplication_percentage': round(deduplication_percentage, 1),
'overlap_analysis': overlap_stats
}
})
except Exception as e:
return jsonify({'success': False, 'message': f'Error merging collections: {str(e)}'}), 500
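# Example request body and (abridged) response for /api/merge-collections -- a sketch
# with hypothetical filenames and counts, not real output:
#   POST {"collections": ["collection_a.pkl", "collection_b.pkl"]}
#   -> {"success": true,
#       "filename": "merged_1700000000.pkl",
#       "total_papers": 180,
#       "merge_stats": {"total_papers_before_merge": 210,
#                       "duplicates_removed": 30,
#                       "deduplication_percentage": 14.3,
#                       "overlap_analysis": [...]}}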
@app.route('/api/fetch-abstracts', methods=['POST'])
def fetch_abstracts():
"""Fetch missing abstracts for papers using their DOI URLs."""
try:
data = request.get_json()
papers = data.get('papers', [])
if not papers:
return jsonify({'error': 'No papers provided'}), 400
updated_papers = []
fetched_count = 0
total_processed = 0
for paper in papers:
total_processed += 1
updated_paper = paper.copy()
# Check whether the paper already has an abstract (either a populated abstract_inverted_index or a sufficiently long abstract string)
has_abstract = (
(paper.get('abstract_inverted_index') and
len(paper.get('abstract_inverted_index', {})) > 0) or
(paper.get('abstract') and
len(str(paper.get('abstract', '')).strip()) > 50)
)
if not has_abstract and paper.get('doi'):
print(f"Fetching abstract for DOI: {paper.get('doi')}")
abstract = fetch_abstract_from_doi(paper.get('doi'))
if abstract:
# Convert to inverted index format
inverted_index = convert_abstract_to_inverted_index(abstract)
updated_paper['abstract_inverted_index'] = inverted_index
fetched_count += 1
print(f"Successfully fetched abstract for: {paper.get('title', 'Unknown')[:50]}...")
else:
print(f"Could not fetch abstract for: {paper.get('title', 'Unknown')[:50]}...")
updated_papers.append(updated_paper)
return jsonify({
'success': True,
'fetched_count': fetched_count,
'total_processed': total_processed,
'updated_papers': updated_papers
})
except Exception as e:
print(f"Error fetching abstracts: {e}")
return jsonify({'error': str(e)}), 500
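# Example request body for /api/fetch-abstracts (a sketch; the single paper shown is
# hypothetical). Papers that already carry an abstract are passed through unchanged:
#   POST {"papers": [{"id": "https://openalex.org/W0000000000",
#                     "title": "An example paper",
#                     "doi": "https://doi.org/10.1234/example"}]}
# The response echoes the papers back as "updated_papers", with any newly fetched
# abstracts stored in abstract_inverted_index form.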
@app.route('/api/export-excel/<filename>')
def export_excel_from_file(filename):
"""Export Excel from a specific database file."""
try:
# Try collections first, then filters, then legacy
filepath = os.path.join(COLLECTION_DB_DIR, filename)
if not os.path.exists(filepath):
filepath = os.path.join(FILTER_DB_DIR, filename)
if not os.path.exists(filepath):
filepath = os.path.join(DATABASE_DIR, filename)
if not os.path.exists(filepath):
return jsonify({'error': 'File not found'}), 404
with open(filepath, 'rb') as f:
data = pickle.load(f)
papers = data.get('papers', [])
if not papers:
return jsonify({'error': 'No papers found in file'}), 400
# Prepare data for Excel export
excel_data = []
for paper in papers:
# Extract abstract from inverted index
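# Worked example of the reconstruction below: an OpenAlex inverted index such as
#   {"Deep": [0], "learning": [1, 3], "improves": [2]}
# places each word at its listed positions, yielding "Deep learning improves learning".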
abstract = ""
if paper.get('abstract_inverted_index'):
words = []
for word, positions in paper['abstract_inverted_index'].items():
for pos in positions:
while len(words) <= pos:
words.append('')
words[pos] = word
abstract = ' '.join(words).strip()
# Extract open access info with null checks
oa_info = paper.get('open_access') or {}
is_oa = oa_info.get('is_oa', False) if oa_info else False
oa_status = oa_info.get('oa_status', '') if oa_info else ''
# Extract DOI with null check
doi = ""
if paper.get('doi'):
doi = paper['doi'].replace('https://doi.org/', '')
# Extract authors with null checks
authors = paper.get('authorships') or []
author_names = []
for author in authors[:5]: # Limit to first 5 authors
if author and isinstance(author, dict):
author_obj = author.get('author') or {}
if author_obj and isinstance(author_obj, dict):
author_names.append(author_obj.get('display_name', ''))
# Extract journal with null checks
journal = ""
primary_location = paper.get('primary_location')
if primary_location and isinstance(primary_location, dict):
source = primary_location.get('source')
if source and isinstance(source, dict):
journal = source.get('display_name', '')
# Extract GPT analysis with null checks
gpt_analysis = paper.get('gpt_analysis') or {}
gpt_aims = gpt_analysis.get('aims_of_paper', '') if gpt_analysis else ''
gpt_takeaways = gpt_analysis.get('key_takeaways', '') if gpt_analysis else ''
excel_data.append({
'Title': paper.get('title', ''),
'Publication Date': paper.get('publication_date', ''),
'DOI': doi,
'Is Open Access': is_oa,
'OA Status': oa_status,
'Abstract': abstract,
'Relationship': paper.get('relationship', ''),
'Authors': ', '.join(author_names),
'Journal': journal,
'OpenAlex ID': paper.get('id', ''),
'Relevance Reason': paper.get('relevance_reason', ''),
'GPT Aims': gpt_aims,
'GPT Takeaways': gpt_takeaways
})
# Create DataFrame and export to Excel
df = pd.DataFrame(excel_data)
excel_filename = f'{filename.replace(".pkl", "")}_{int(time.time())}.xlsx'
# Create Excel file in a temporary location
temp_dir = tempfile.gettempdir()
excel_path = os.path.join(temp_dir, excel_filename)
try:
df.to_excel(excel_path, index=False)
return send_file(excel_path, as_attachment=True, download_name=excel_filename)
except Exception as e:
print(f"Error creating Excel file: {e}")
# Fallback: try current directory
try:
df.to_excel(excel_filename, index=False)
return send_file(excel_filename, as_attachment=True, download_name=excel_filename)
except Exception as e2:
print(f"Error creating Excel file in current directory: {e2}")
return jsonify({'error': f'Failed to create Excel file: {str(e2)}'}), 500
except Exception as e:
print(f"Error exporting Excel: {e}")
return jsonify({'error': str(e)}), 500
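# Example call (a sketch; hypothetical filename, local server on port 5000 assumed):
#   curl -OJ http://localhost:5000/api/export-excel/my_collection.pkl
# Note that pandas' to_excel needs an Excel writer backend such as openpyxl installed.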
@app.route('/api/export-excel')
def export_excel():
"""Export collected papers to Excel format."""
try:
# Load papers from temporary file
if not os.path.exists('temp_papers.pkl'):
return jsonify({'error': 'No papers found. Please collect papers first.'}), 400
with open('temp_papers.pkl', 'rb') as f:
papers = pickle.load(f)
# Prepare data for Excel export
excel_data = []
for paper in papers:
# Extract abstract from inverted index
abstract = ""
if paper.get('abstract_inverted_index'):
words = []
for word, positions in paper['abstract_inverted_index'].items():
for pos in positions:
while len(words) <= pos:
words.append('')
words[pos] = word
abstract = ' '.join(words).strip()
# Extract open access info with null checks
oa_info = paper.get('open_access') or {}
is_oa = oa_info.get('is_oa', False) if oa_info else False
oa_status = oa_info.get('oa_status', '') if oa_info else ''
# Extract DOI with null check
doi = ""
if paper.get('doi'):
doi = paper['doi'].replace('https://doi.org/', '')
# Extract authors with null checks
authors = paper.get('authorships') or []
author_names = []
for author in authors[:5]: # Limit to first 5 authors
if author and isinstance(author, dict):
author_obj = author.get('author') or {}
if author_obj and isinstance(author_obj, dict):
author_names.append(author_obj.get('display_name', ''))
# Extract journal with null checks
journal = ""
primary_location = paper.get('primary_location')
if primary_location and isinstance(primary_location, dict):
source = primary_location.get('source')
if source and isinstance(source, dict):
journal = source.get('display_name', '')
# Extract GPT analysis with null checks
gpt_analysis = paper.get('gpt_analysis') or {}
gpt_aims = gpt_analysis.get('aims_of_paper', '') if gpt_analysis else ''
gpt_takeaways = gpt_analysis.get('key_takeaways', '') if gpt_analysis else ''
excel_data.append({
'Title': paper.get('title', ''),
'Publication Date': paper.get('publication_date', ''),
'DOI': doi,
'Is Open Access': is_oa,
'OA Status': oa_status,
'Abstract': abstract,
'Relationship': paper.get('relationship', ''),
'Authors': ', '.join(author_names),
'Journal': journal,
'OpenAlex ID': paper.get('id', ''),
'Relevance Reason': paper.get('relevance_reason', ''),
'GPT Aims': gpt_aims,
'GPT Takeaways': gpt_takeaways
})
# Create DataFrame and export to Excel
df = pd.DataFrame(excel_data)
excel_filename = f'research_papers_{int(time.time())}.xlsx'
# Create Excel file in a temporary location
temp_dir = tempfile.gettempdir()
excel_path = os.path.join(temp_dir, excel_filename)
try:
df.to_excel(excel_path, index=False)
return send_file(excel_path, as_attachment=True, download_name=excel_filename)
except Exception as e:
print(f"Error creating Excel file: {e}")
# Fallback: try current directory
try:
df.to_excel(excel_filename, index=False)
return send_file(excel_filename, as_attachment=True, download_name=excel_filename)
except Exception as e2:
print(f"Error creating Excel file in current directory: {e2}")
return jsonify({'error': f'Failed to create Excel file: {str(e2)}'}), 500
except Exception as e:
print(f"Error exporting Excel: {e}")
return jsonify({'error': str(e)}), 500
# OpenAlex work IDs are full URLs, so use the path converter to allow slashes in the parameter
@app.route('/api/paper-details/<path:work_id>')
def paper_details(work_id):
"""Get detailed analysis for a specific paper."""
try:
# Load papers from temporary file
if not os.path.exists('temp_papers.pkl'):
return jsonify({'error': 'No papers found'}), 400
with open('temp_papers.pkl', 'rb') as f:
papers = pickle.load(f)
# Find the specific paper
paper = next((p for p in papers if p.get('id') == work_id), None)
if not paper:
return jsonify({'error': 'Paper not found'}), 404
return jsonify({
'success': True,
'paper': paper
})
except Exception as e:
print(f"Error getting paper details: {e}")
return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
# Create templates directory if it doesn't exist
os.makedirs('templates', exist_ok=True)
port = int(os.getenv('PORT', '5000'))
app.run(debug=False, host='0.0.0.0', port=port)
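# Example of running the server on a non-default port (a sketch; the module filename
# "app.py" is hypothetical):
#   PORT=8080 python app.py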