from flask import Flask, request, jsonify, render_template, send_file
from flask_cors import CORS
import requests
import json
import time
import timeit
import pandas as pd
from typing import Dict, List, Optional
import pickle
import os
import sys
import threading
import tempfile
import shutil
from datetime import datetime
from tqdm import tqdm

# Define 'toc' function once (prints seconds elapsed since the given start time)
def toc(start_time):
    elapsed = timeit.default_timer() - start_time
    print(elapsed)

# Record start time
start_time = timeit.default_timer()
| # Helper function to get all pages | |
| def get_all_pages(url, headers, upper_limit=None): | |
| all_results = [] | |
| unique_ids = set() # Track unique paper IDs | |
| page = 1 | |
| processing_times = [] # Track time taken per paper | |
| # Get first page to get total count | |
| first_response = requests.get(f"{url}&page={page}", headers=headers) | |
| if first_response.status_code != 200: | |
| return [] | |
| data = first_response.json() | |
| total_count = data.get('meta', {}).get('count', 0) | |
| start_time = time.time() | |
| # Add only unique papers from first page | |
| for result in data.get('results', []): | |
| if result.get('id') not in unique_ids: | |
| unique_ids.add(result.get('id')) | |
| all_results.append(result) | |
| if upper_limit and len(all_results) >= upper_limit: | |
| return all_results | |
| papers_processed = len(all_results) | |
| time_taken = time.time() - start_time | |
| if papers_processed > 0: | |
| processing_times.append(time_taken / papers_processed) | |
| # Continue getting remaining pages until we have all papers | |
| target_count = min(total_count, upper_limit) if upper_limit else total_count | |
| pbar = tqdm(total=target_count, desc="Retrieving papers", | |
| initial=len(all_results), unit="papers") | |
| while len(all_results) < total_count: | |
| page += 1 | |
| page_start_time = time.time() | |
| paged_url = f"{url}&page={page}" | |
| response = requests.get(paged_url, headers=headers) | |
| if response.status_code != 200: | |
| print(f"Error retrieving page {page}: {response.status_code}") | |
| break | |
| data = response.json() | |
| results = data.get('results', []) | |
| if not results: | |
| break | |
| # Add only unique papers from this page | |
| new_papers = 0 | |
| for result in results: | |
| if result.get('id') not in unique_ids: | |
| unique_ids.add(result.get('id')) | |
| all_results.append(result) | |
| new_papers += 1 | |
| if upper_limit and len(all_results) >= upper_limit: | |
| pbar.update(new_papers) | |
| pbar.close() | |
| return all_results | |
| # Update processing times and estimated time remaining | |
| if new_papers > 0: | |
| time_taken = time.time() - page_start_time | |
| processing_times.append(time_taken / new_papers) | |
| avg_time_per_paper = sum(processing_times) / len(processing_times) | |
| papers_remaining = target_count - len(all_results) | |
| est_time_remaining = papers_remaining * avg_time_per_paper | |
| pbar.set_postfix({'Est. Time Remaining': f'{est_time_remaining:.1f}s'}) | |
| pbar.update(new_papers) | |
| # Add a small delay to respect rate limits | |
| time.sleep(1) | |
| pbar.close() | |
| return all_results | |
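
# Minimal usage sketch (illustrative, never called by the app): page through an
# OpenAlex filter query. The seed work ID is hypothetical; note that the caller
# passes a URL that already contains query parameters, because get_all_pages
# appends "&page=N" to it.
def _example_get_all_pages():
    headers = {'User-Agent': 'LowAI ([email protected])'}
    url = "https://api.openalex.org/works?filter=cites:W2741809807&per-page=200"
    papers = get_all_pages(url, headers, upper_limit=500)
    print(f"Retrieved {len(papers)} papers")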
| def get_related_papers(work_id, upper_limit=None, progress_callback=None): | |
| # Define base URL for OpenAlex API | |
| base_url = "https://api.openalex.org/works" | |
| work_query = f"/{work_id}" # OpenAlex work IDs can be used directly in path | |
| work_url = base_url + work_query | |
| # Add email to be a polite API user | |
| headers = {'User-Agent': 'LowAI ([email protected])'} | |
| response = requests.get(work_url, headers=headers) | |
| print(response) | |
| if response.status_code == 200: | |
| paper = response.json() # For direct work queries, the response is the paper object | |
| paper_id = paper['id'] | |
| # Use referenced_works field on the seed work directly for cited papers | |
| referenced_ids = paper.get('referenced_works', []) or [] | |
| print("\nTotal counts:") | |
| print(f"Cited (referenced_works) count: {len(referenced_ids)}") | |
| def fetch_works_by_ids(ids, chunk_size=50): | |
| results = [] | |
| seen = set() | |
| total_chunks = (len(ids) + chunk_size - 1) // chunk_size | |
| for i in range(0, len(ids), chunk_size): | |
| chunk = ids[i:i+chunk_size] | |
| # Build ids filter: ids.openalex:ID1|ID2|ID3 | |
| ids_filter = '|'.join(chunk) | |
| url = f"{base_url}?filter=ids.openalex:{ids_filter}&per-page=200" | |
| resp = requests.get(url, headers=headers) | |
| if resp.status_code != 200: | |
| print(f"Error fetching IDs chunk {i//chunk_size+1}: {resp.status_code}") | |
| continue | |
| data = resp.json() | |
| for r in data.get('results', []): | |
| rid = r.get('id') | |
| if rid and rid not in seen: | |
| seen.add(rid) | |
| results.append(r) | |
| # Update progress for cited papers (0-30%) | |
| if progress_callback: | |
| progress = int(30 * (i // chunk_size + 1) / total_chunks) | |
| progress_callback(progress, f"Fetching cited papers... {len(results)} found") | |
| time.sleep(1) # be polite to API | |
| if upper_limit and len(results) >= upper_limit: | |
| return results[:upper_limit] | |
| return results | |
| print("\nRetrieving cited papers via referenced_works IDs...") | |
| cited_papers = fetch_works_by_ids(referenced_ids) | |
| print(f"Found {len(cited_papers)} unique cited papers") | |
| # Count citing papers (works that cite the seed), then paginate to collect all | |
| citing_count_url = f"{base_url}?filter=cites:{work_id}&per-page=1" | |
| citing_count = requests.get(citing_count_url, headers=headers).json().get('meta', {}).get('count', 0) | |
| print(f"Citing papers: {citing_count}") | |
| # Get all citing papers with pagination | |
| print("\nRetrieving citing papers (paginated)...") | |
| page = 1 | |
| citing_papers = [] | |
| unique_ids = set() | |
| target = citing_count if not upper_limit else min(upper_limit, citing_count) | |
| from tqdm import tqdm | |
| pbar = tqdm(total=target, desc="Retrieving citing papers", unit="papers") | |
| while len(citing_papers) < target: | |
| paged_url = f"{base_url}?filter=cites:{work_id}&per-page=200&sort=publication_date:desc&page={page}" | |
| resp = requests.get(paged_url, headers=headers) | |
| if resp.status_code != 200: | |
| print(f"Error retrieving citing page {page}: {resp.status_code}") | |
| break | |
| data = resp.json() | |
| results = data.get('results', []) | |
| if not results: | |
| break | |
| new = 0 | |
| for r in results: | |
| rid = r.get('id') | |
| if rid and rid not in unique_ids: | |
| unique_ids.add(rid) | |
| citing_papers.append(r) | |
| new += 1 | |
| if len(citing_papers) >= target: | |
| break | |
| # Update progress for citing papers (30-70%) | |
| if progress_callback: | |
| progress = 30 + int(40 * len(citing_papers) / target) | |
| progress_callback(progress, f"Fetching citing papers... {len(citing_papers)} found") | |
| pbar.update(new) | |
| page += 1 | |
| time.sleep(1) | |
| pbar.close() | |
| print(f"Found {len(citing_papers)} unique citing papers") | |
| # Get all related papers | |
| print("\nRetrieving related papers...") | |
| related_url = f"{base_url}?filter=related_to:{work_id}&per-page=200&sort=publication_date:desc" | |
| related_papers = get_all_pages(related_url, headers, upper_limit) | |
| print(f"Found {len(related_papers)} unique related papers") | |
| # Update progress for related papers (70-90%) | |
| if progress_callback: | |
| progress_callback(70, f"Fetching related papers... {len(related_papers)} found") | |
| # Create sets of IDs for quick lookup | |
| cited_ids = {paper['id'] for paper in cited_papers} | |
| citing_ids = {paper['id'] for paper in citing_papers} | |
| # Print some debug information | |
| print(f"\nDebug Information:") | |
| print(f"Seed paper ID: {paper_id}") | |
| print(f"Number of unique cited papers: {len(cited_ids)}") | |
| print(f"Number of unique citing papers: {len(citing_ids)}") | |
| print(f"Number of papers in both sets: {len(cited_ids.intersection(citing_ids))}") | |
| # Update progress for processing (90-95%) | |
| if progress_callback: | |
| progress_callback(90, "Processing and deduplicating papers...") | |
| # Combine all papers and remove duplicates while tracking relationship | |
| all_papers = cited_papers + citing_papers + related_papers | |
| seen_titles = set() | |
| unique_papers = [] | |
| for paper in all_papers: | |
| title = paper.get('title', '') | |
| if title not in seen_titles: | |
| seen_titles.add(title) | |
| # Add relationship type | |
| if paper['id'] in cited_ids: | |
| paper['relationship'] = 'cited' | |
| elif paper['id'] in citing_ids: | |
| paper['relationship'] = 'citing' | |
| else: | |
| paper['relationship'] = 'related' | |
| unique_papers.append(paper) | |
| # Final progress update | |
| if progress_callback: | |
| progress_callback(100, f"Collection completed! Found {len(unique_papers)} unique papers") | |
| return unique_papers | |
| else: | |
| print(f"Error retrieving seed paper: {response.status_code}") | |
| return [] | |
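
# Minimal usage sketch (illustrative, never called by the app): collect papers
# around a hypothetical seed work and tally them by relationship type.
def _example_get_related_papers():
    papers = get_related_papers("W2741809807", upper_limit=200)
    for kind in ("cited", "citing", "related"):
        count = sum(1 for p in papers if p.get("relationship") == kind)
        print(f"{kind}: {count}")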
# Additional imports for GPT-based relevance analysis
from openai import OpenAI
import concurrent.futures
| def analyze_paper_relevance(content: Dict[str, str], research_question: str, api_key: str) -> Optional[Dict]: | |
| """Analyze if a paper is relevant to the research question using GPT-5 mini.""" | |
| client = OpenAI(api_key=api_key) | |
| title = content.get('title', '') | |
| abstract = content.get('abstract', '') | |
| has_abstract = bool(abstract and abstract.strip()) | |
| if has_abstract: | |
| prompt = f""" | |
| Research Question: {research_question} | |
| Paper Title: {title} | |
| Paper Abstract: {abstract} | |
| Analyze this paper and determine: | |
| 1. Is this paper highly relevant to answering the research question? | |
| 2. What are the main aims/objectives of this paper? | |
| 3. What are the key takeaways or findings? | |
| Return ONLY a valid JSON object in this exact format: | |
| {{ | |
| "relevant": true/false, | |
| "relevance_reason": "brief explanation of why it is/isn't relevant", | |
| "aims_of_paper": "main objectives of the paper", | |
| "key_takeaways": "key findings or takeaways" | |
| }} | |
| """ | |
| else: | |
| prompt = f""" | |
| Research Question: {research_question} | |
| Paper Title: {title} | |
| Note: No abstract is available for this paper. | |
| Analyze this paper based on the title only and determine: | |
| 1. Is this paper likely to be relevant to answering the research question based on the title? | |
| Return ONLY a valid JSON object in this exact format: | |
| {{ | |
| "relevant": true/false, | |
| "relevance_reason": "brief explanation of why it is/isn't relevant based on title" | |
| }} | |
| """ | |
| try: | |
| # Try GPT-5 mini first, fallback to gpt-4o-mini if it fails | |
| try: | |
| response = client.responses.create( | |
| model="gpt-5-mini", | |
| input=prompt, | |
| reasoning={"effort": "minimal"}, | |
| text={"verbosity": "low"} | |
| ) | |
| except Exception as e: | |
| print(f"GPT-5 mini failed, trying gpt-4o-mini: {e}") | |
| response = client.chat.completions.create( | |
| model="gpt-4o-mini", | |
| messages=[{ | |
| "role": "user", | |
| "content": prompt | |
| }], | |
| max_completion_tokens=1000 | |
| ) | |
| # Handle different response formats | |
| if hasattr(response, 'choices') and response.choices: | |
| # Old format (chat completions) | |
| result = response.choices[0].message.content | |
| elif hasattr(response, 'output'): | |
| # New format (responses) - extract text from output | |
| result = "" | |
| for item in response.output: | |
| if hasattr(item, "content") and item.content: | |
| for content in item.content: | |
| if hasattr(content, "text") and content.text: | |
| result += content.text | |
| else: | |
| print("Unexpected response format") | |
| return None | |
| if not result: | |
| print("Empty response from GPT") | |
| return None | |
| # Clean and parse the JSON response | |
| result = result.strip() | |
| if result.startswith("```json"): | |
| result = result[7:] | |
| if result.endswith("```"): | |
| result = result[:-3] | |
| # Try to parse JSON | |
| try: | |
| return json.loads(result.strip()) | |
| except json.JSONDecodeError as e: | |
| print(f"Failed to parse JSON response: {e}") | |
| print(f"Raw response: {result[:200]}...") | |
| return None | |
| except Exception as e: | |
| print(f"Error in GPT analysis: {str(e)}") | |
| return None | |
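
# Minimal usage sketch (illustrative, never called by the app): analyse one
# title/abstract pair. The content and research question are hypothetical, and
# an OpenAI API key must be available in the environment.
def _example_analyze_paper_relevance():
    content = {
        'title': 'A hypothetical paper title',
        'abstract': 'A hypothetical abstract describing the methods and findings.',
    }
    analysis = analyze_paper_relevance(
        content,
        research_question="How does X affect Y?",
        api_key=os.getenv("OPENAI_API_KEY", ""),
    )
    print(analysis)  # dict with 'relevant', 'relevance_reason', ... or None on failure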
| def extract_abstract_from_inverted_index(inverted_index: Dict) -> str: | |
| """Extract abstract text from inverted index format.""" | |
| if not inverted_index: | |
| return "" | |
| words = [] | |
| for word, positions in inverted_index.items(): | |
| for pos in positions: | |
| while len(words) <= pos: | |
| words.append('') | |
| words[pos] = word | |
| return ' '.join(words).strip() | |
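
# Example of the OpenAlex inverted-index format this expects (values illustrative):
#   >>> extract_abstract_from_inverted_index({'Deep': [0], 'learning': [1], 'works': [2], 'well': [3]})
#   'Deep learning works well'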
| def analyze_single_paper(paper: Dict, research_question: str, api_key: str) -> Optional[Dict]: | |
| """Analyze a single paper with its own client.""" | |
| try: | |
| client = OpenAI(api_key=api_key) | |
| # Extract title and abstract | |
| title = paper.get('title', '') | |
| abstract = extract_abstract_from_inverted_index(paper.get('abstract_inverted_index', {})) | |
| if not title and not abstract: | |
| return None | |
| # Create content for analysis | |
| content = { | |
| 'title': title, | |
| 'abstract': abstract | |
| } | |
| # Analyze with GPT | |
| analysis = analyze_paper_relevance_with_client(content, research_question, client) | |
| if analysis: | |
| paper['gpt_analysis'] = analysis | |
| paper['relevance_reason'] = analysis.get('relevance_reason', 'Analysis completed') | |
| paper['relevance_score'] = analysis.get('relevant', False) | |
| return paper | |
| return None | |
| except Exception as e: | |
| print(f"Error analyzing paper: {e}") | |
| return None | |
| def analyze_paper_batch(papers_batch: List[Dict], research_question: str, api_key: str, batch_id: int) -> List[Dict]: | |
| """Analyze a batch of papers in parallel using ThreadPoolExecutor.""" | |
| results = [] | |
| # Use ThreadPoolExecutor to process papers in parallel within the batch | |
| with concurrent.futures.ThreadPoolExecutor(max_workers=len(papers_batch)) as executor: | |
| # Submit all papers for parallel processing | |
| future_to_paper = { | |
| executor.submit(analyze_single_paper, paper, research_question, api_key): paper | |
| for paper in papers_batch | |
| } | |
| # Collect results as they complete | |
| for future in concurrent.futures.as_completed(future_to_paper): | |
| try: | |
| result = future.result() | |
| if result: | |
| results.append(result) | |
| except Exception as e: | |
| print(f"Error in parallel analysis: {e}") | |
| continue | |
| return results | |
| def analyze_paper_relevance_with_client(content: Dict[str, str], research_question: str, client: OpenAI) -> Optional[Dict]: | |
| """Analyze if a paper is relevant to the research question using provided client.""" | |
| title = content.get('title', '') | |
| abstract = content.get('abstract', '') | |
| prompt = f""" | |
| Research Question: {research_question} | |
| Paper Title: {title} | |
| Paper Abstract: {abstract or 'No abstract available'} | |
| Analyze this paper and determine: | |
| 1. Is this paper highly relevant to answering the research question? | |
| 2. What are the main aims/objectives of this paper? | |
| 3. What are the key takeaways or findings? | |
| Return ONLY a valid JSON object in this exact format: | |
| {{ | |
| "relevant": true/false, | |
| "relevance_reason": "brief explanation of why it is/isn't relevant", | |
| "aims_of_paper": "main objectives of the paper", | |
| "key_takeaways": "key findings or takeaways" | |
| }} | |
| """ | |
| try: | |
| # Try GPT-5 nano first, fallback to gpt-4o-mini if it fails | |
| try: | |
| response = client.responses.create( | |
| model="gpt-5-nano", | |
| input=prompt, | |
| reasoning={"effort": "minimal"}, | |
| text={"verbosity": "low"} | |
| ) | |
| except Exception as e: | |
| response = client.chat.completions.create( | |
| model="gpt-4o-mini", | |
| messages=[{ | |
| "role": "user", | |
| "content": prompt | |
| }], | |
| max_completion_tokens=1000 | |
| ) | |
| # Handle different response formats | |
| if hasattr(response, 'choices') and response.choices: | |
| # Old format (chat completions) | |
| result = response.choices[0].message.content | |
| elif hasattr(response, 'output'): | |
| # New format (responses) - extract text from output | |
| result = "" | |
| for item in response.output: | |
| if hasattr(item, "content") and item.content: | |
| for content in item.content: | |
| if hasattr(content, "text") and content.text: | |
| result += content.text | |
| else: | |
| return None | |
| if not result: | |
| return None | |
| # Clean and parse the JSON response | |
| result = result.strip() | |
| if result.startswith("```json"): | |
| result = result[7:] | |
| if result.endswith("```"): | |
| result = result[:-3] | |
| # Try to parse JSON | |
| try: | |
| return json.loads(result.strip()) | |
| except json.JSONDecodeError: | |
| return None | |
| except Exception as e: | |
| return None | |
| def filter_papers_for_research_question(papers: List[Dict], research_question: str, api_key: str, limit: int = 10) -> List[Dict]: | |
| """Analyze exactly 'limit' number of papers for relevance using parallel processing.""" | |
| if not papers or not research_question: | |
| return [] | |
| # Sort papers by publication date (most recent first) | |
| sorted_papers = sorted(papers, key=lambda x: x.get('publication_date', ''), reverse=True) | |
| # Take only the first 'limit' papers for analysis | |
| papers_to_analyze = sorted_papers[:limit] | |
| print(f"Analyzing {len(papers_to_analyze)} papers for relevance to: {research_question}") | |
| # Process all papers in parallel (no batching needed for small numbers) | |
| all_results = [] | |
| with concurrent.futures.ThreadPoolExecutor(max_workers=min(limit, 20)) as executor: | |
| # Submit all papers for parallel processing | |
| future_to_paper = { | |
| executor.submit(analyze_single_paper, paper, research_question, api_key): paper | |
| for paper in papers_to_analyze | |
| } | |
| # Collect results as they complete | |
| completed = 0 | |
| for future in concurrent.futures.as_completed(future_to_paper): | |
| try: | |
| result = future.result() | |
| completed += 1 | |
| if result: | |
| all_results.append(result) | |
| print(f"Completed {completed}/{len(papers_to_analyze)} papers") | |
| except Exception as e: | |
| print(f"Error in parallel analysis: {e}") | |
| completed += 1 | |
| # Sort by publication date again (most recent first) | |
| all_results.sort(key=lambda x: x.get('publication_date', ''), reverse=True) | |
| print(f"Analysis complete. Processed {len(all_results)} papers.") | |
| return all_results | |
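
# Minimal end-to-end sketch (illustrative, never called by the app): collect
# papers for a hypothetical seed work, then analyse the most recent ones against
# a hypothetical research question. Requires OPENAI_API_KEY in the environment.
def _example_filter_papers_for_research_question():
    papers = get_related_papers("W2741809807", upper_limit=100)
    relevant = filter_papers_for_research_question(
        papers,
        research_question="How does X affect Y?",
        api_key=os.getenv("OPENAI_API_KEY", ""),
        limit=10,
    )
    for p in relevant:
        print(p.get('title'), '-', p.get('relevance_reason'))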
# Additional imports for abstract extraction from publisher pages
import re
import html
| # Try to import BeautifulSoup, fallback to simple parsing if not available | |
| try: | |
| from bs4 import BeautifulSoup | |
| HAS_BS4 = True | |
| except ImportError: | |
| HAS_BS4 = False | |
| print("BeautifulSoup not available, using simple HTML parsing") | |
| app = Flask(__name__) | |
| CORS(app) | |
| # Configuration: read from environment (set in HF Space Secrets) | |
| OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip() | |
| if not OPENAI_API_KEY: | |
| print("[WARN] OPENAI_API_KEY is not set. Set it in Space Settings → Secrets.") | |
| # Global progress tracking | |
| progress_data = {} | |
| # Determine script directory and robust project root | |
| SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) | |
| ROOT_DIR = os.path.dirname(SCRIPT_DIR) if os.path.basename(SCRIPT_DIR) == "code" else SCRIPT_DIR | |
| # Ensure we can import helper modules (prefer repo root; fallback to ./code) | |
| CODE_DIR_CANDIDATE = os.path.join(ROOT_DIR, "code") | |
| CODE_DIR = CODE_DIR_CANDIDATE if os.path.isdir(CODE_DIR_CANDIDATE) else ROOT_DIR | |
| if CODE_DIR not in sys.path: | |
| sys.path.insert(0, CODE_DIR) | |
| # Database directories: prefer repo-root `database/` when present; fallback to CODE_DIR/database | |
| DATABASE_DIR_ROOT = os.path.join(ROOT_DIR, "database") | |
| DATABASE_DIR = DATABASE_DIR_ROOT if os.path.isdir(DATABASE_DIR_ROOT) else os.path.join(CODE_DIR, "database") | |
| COLLECTION_DB_DIR = os.path.join(DATABASE_DIR, "collections") | |
| FILTER_DB_DIR = os.path.join(DATABASE_DIR, "filters") | |
| # Ensure database directories exist | |
| os.makedirs(COLLECTION_DB_DIR, exist_ok=True) | |
| os.makedirs(FILTER_DB_DIR, exist_ok=True) | |
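
# Resulting on-disk layout (file names illustrative):
#   <DATABASE_DIR>/collections/W2741809807.pkl
#   <DATABASE_DIR>/filters/W2741809807__filter__How_does_X_affect_Y__20240101_120000.pkl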
| def ensure_db_dirs() -> None: | |
| """Ensure database directories exist (safe to call anytime).""" | |
| try: | |
| os.makedirs(COLLECTION_DB_DIR, exist_ok=True) | |
| os.makedirs(FILTER_DB_DIR, exist_ok=True) | |
| except Exception: | |
| pass | |
| # Robust HTTP headers for publisher sites | |
| DEFAULT_HTTP_HEADERS = { | |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36', | |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', | |
| 'Accept-Language': 'en-US,en;q=0.9', | |
| 'Cache-Control': 'no-cache', | |
| } | |
| def _http_get(url: str, timeout: int = 15) -> Optional[requests.Response]: | |
| try: | |
| resp = requests.get(url, headers=DEFAULT_HTTP_HEADERS, timeout=timeout, allow_redirects=True) | |
| return resp | |
| except Exception as e: | |
| print(f"HTTP GET failed for {url}: {e}") | |
| return None | |
| def fetch_abstract_from_doi(doi: str) -> Optional[str]: | |
| """Fetch abstract/highlights from a DOI URL with a robust, layered strategy.""" | |
| if not doi: | |
| return None | |
| # Normalize DOI | |
| doi_clean = doi.replace('https://doi.org/', '').strip() | |
| # 1) Crossref (fast, sometimes JATS) | |
| try: | |
| text = fetch_from_crossref(doi_clean) | |
| if text and len(text) > 50: | |
| return text | |
| except Exception as e: | |
| print(f"Crossref fetch failed: {e}") | |
| # 2) Fetch target HTML via doi.org redirect | |
| try: | |
| start_url = f"https://doi.org/{doi_clean}" | |
| resp = _http_get(start_url, timeout=15) | |
| if not resp or resp.status_code >= 400: | |
| return None | |
| html_text = resp.text or '' | |
| final_url = getattr(resp, 'url', start_url) | |
| print(f"Resolved DOI to: {final_url}") | |
| # Parse with robust pipeline | |
| parsed = robust_extract_abstract(html_text) | |
| if parsed and len(parsed) > 50: | |
| return parsed | |
| except Exception as e: | |
| print(f"DOI HTML fetch failed: {e}") | |
| # 3) PubMed placeholder (extendable) | |
| try: | |
| text = fetch_from_pubmed(doi_clean) | |
| if text and len(text) > 50: | |
| return text | |
| except Exception: | |
| pass | |
| return None | |
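
# Minimal usage sketch (illustrative, never called by the app): resolve a DOI and
# print whatever abstract/highlights text the layered strategy recovers. The DOI
# below is a placeholder, not a real article.
def _example_fetch_abstract_from_doi():
    text = fetch_abstract_from_doi("10.1000/xyz123")
    print(text[:200] if text else "No abstract recovered")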
| def fetch_from_crossref(doi: str) -> Optional[str]: | |
| """Fetch abstract from Crossref API.""" | |
| try: | |
| url = f"https://api.crossref.org/works/{doi}" | |
| response = _http_get(url, timeout=12) | |
        if response and response.status_code == 200:
| data = response.json() | |
| if 'message' in data: | |
| message = data['message'] | |
| # Check for abstract or highlights (case insensitive) | |
| for key in message: | |
| if key.lower() in ['abstract', 'highlights'] and message[key]: | |
| raw = str(message[key]) | |
| # Crossref sometimes returns JATS/XML; strip tags and unescape entities | |
| text = re.sub(r'<[^>]+>', ' ', raw) | |
| text = html.unescape(re.sub(r'\s+', ' ', text)).strip() | |
| return text | |
| except Exception: | |
| pass | |
| return None | |
| def fetch_from_doi_org(doi: str) -> Optional[str]: | |
| """Legacy wrapper kept for API compatibility; now uses robust pipeline.""" | |
| try: | |
| url = f"https://doi.org/{doi}" | |
| resp = _http_get(url, timeout=15) | |
| if not resp or resp.status_code >= 400: | |
| return None | |
| return robust_extract_abstract(resp.text or '') | |
| except Exception: | |
| return None | |
| def extract_from_preloaded_state_bruteforce(content: str) -> Optional[str]: | |
| """Extract abstract from window.__PRELOADED_STATE__ using brace matching and fallbacks.""" | |
| try: | |
| start_idx = content.find('window.__PRELOADED_STATE__') | |
| if start_idx == -1: | |
| return None | |
| # Find the first '{' after the equals sign | |
| eq_idx = content.find('=', start_idx) | |
| if eq_idx == -1: | |
| return None | |
| brace_idx = content.find('{', eq_idx) | |
| if brace_idx == -1: | |
| return None | |
| # Brace matching to find the matching closing '}' | |
| depth = 0 | |
| end_idx = -1 | |
| for i in range(brace_idx, min(len(content), brace_idx + 5_000_000)): | |
| ch = content[i] | |
| if ch == '{': depth += 1 | |
| elif ch == '}': | |
| depth -= 1 | |
| if depth == 0: | |
| end_idx = i | |
| break | |
| if end_idx == -1: | |
| return None | |
| json_str = content[brace_idx:end_idx+1] | |
| try: | |
| data = json.loads(json_str) | |
| except Exception as e: | |
| # Try to relax by removing trailing commas and control chars | |
| cleaned = re.sub(r',\s*([}\]])', r'\1', json_str) | |
| cleaned = re.sub(r'\u0000', '', cleaned) | |
| try: | |
| data = json.loads(cleaned) | |
| except Exception as e2: | |
| print(f"Failed to parse preloaded JSON: {e2}") | |
| return None | |
| # Same traversal as before | |
| if isinstance(data, dict) and 'abstracts' in data and isinstance(data['abstracts'], dict) and 'content' in data['abstracts']: | |
| abstracts = data['abstracts']['content'] | |
| if isinstance(abstracts, list): | |
| for abstract_item in abstracts: | |
| if isinstance(abstract_item, dict) and '$$' in abstract_item and abstract_item.get('#name') == 'abstract': | |
| class_name = abstract_item.get('$', {}).get('class', '') | |
| for section in abstract_item.get('$$', []): | |
| if isinstance(section, dict) and section.get('#name') == 'abstract-sec': | |
| section_text = extract_text_from_abstract_section(section) | |
| section_highlights = extract_highlights_from_section(section) | |
| if section_text and len(section_text.strip()) > 50: | |
| return clean_text(section_text) | |
| if section_highlights and len(section_highlights.strip()) > 50: | |
| return clean_text(section_highlights) | |
| if 'highlight' in class_name.lower(): | |
| highlights_text = extract_highlights_from_abstract_item(abstract_item) | |
| if highlights_text and len(highlights_text.strip()) > 50: | |
| return clean_text(highlights_text) | |
| return None | |
| except Exception as e: | |
| print(f"Error extracting from preloaded state (bruteforce): {e}") | |
| return None | |
| def extract_from_json_ld(content: str) -> Optional[str]: | |
| """Parse JSON-LD script tags and extract abstract/description if present.""" | |
| if not HAS_BS4: | |
| return None | |
| try: | |
| soup = BeautifulSoup(content, 'html.parser') | |
| for script in soup.find_all('script', type='application/ld+json'): | |
| try: | |
| data = json.loads(script.string or '{}') | |
| except Exception: | |
| continue | |
| candidates = [] | |
| if isinstance(data, dict): | |
| candidates.append(data) | |
| elif isinstance(data, list): | |
| candidates.extend([d for d in data if isinstance(d, dict)]) | |
| for obj in candidates: | |
| for key in ['abstract', 'description']: | |
| if key in obj and obj[key]: | |
| text = clean_text(str(obj[key])) | |
| if len(text) > 50: | |
| return text | |
| return None | |
| except Exception as e: | |
| print(f"Error extracting from JSON-LD: {e}") | |
| return None | |
| def clean_text(s: str) -> str: | |
| s = html.unescape(s) | |
| s = re.sub(r'\s+', ' ', s) | |
| return s.strip() | |
| def extract_from_meta_tags(soup) -> Optional[str]: | |
| try: | |
| # Common meta carriers of abstract-like summaries | |
| candidates = [] | |
| # OpenGraph description | |
| og = soup.find('meta', attrs={'property': 'og:description'}) | |
| if og and og.get('content'): | |
| candidates.append(og['content']) | |
| # Twitter description | |
| tw = soup.find('meta', attrs={'name': 'twitter:description'}) | |
| if tw and tw.get('content'): | |
| candidates.append(tw['content']) | |
| # Dublin Core description | |
| dc = soup.find('meta', attrs={'name': 'dc.description'}) | |
| if dc and dc.get('content'): | |
| candidates.append(dc['content']) | |
| # citation_abstract | |
| cit_abs = soup.find('meta', attrs={'name': 'citation_abstract'}) | |
| if cit_abs and cit_abs.get('content'): | |
| candidates.append(cit_abs['content']) | |
| # Fallback: any meta description | |
| desc = soup.find('meta', attrs={'name': 'description'}) | |
| if desc and desc.get('content'): | |
| candidates.append(desc['content']) | |
| # Clean and return the longest meaningful candidate | |
| candidates = [clean_text(c) for c in candidates if isinstance(c, str)] | |
| candidates.sort(key=lambda x: len(x), reverse=True) | |
| for text in candidates: | |
| if len(text) > 50: | |
| return text | |
| return None | |
| except Exception: | |
| return None | |
| def robust_extract_abstract(html_text: str) -> Optional[str]: | |
| """Layered extraction over raw HTML: preloaded-state, JSON-LD, meta tags, DOM, regex.""" | |
| if not html_text: | |
| return None | |
| # 1) ScienceDirect/Elsevier preloaded state (brace-matched) | |
| try: | |
| txt = extract_from_preloaded_state_bruteforce(html_text) | |
| if txt and len(txt) > 50: | |
| return clean_text(txt) | |
| except Exception: | |
| pass | |
| # 2) JSON-LD | |
| try: | |
| txt = extract_from_json_ld(html_text) | |
| if txt and len(txt) > 50: | |
| return clean_text(txt) | |
| except Exception: | |
| pass | |
| # 3) BeautifulSoup-based DOM extraction (meta + selectors + heading-sibling) | |
| if HAS_BS4: | |
| try: | |
| soup = BeautifulSoup(html_text, 'html.parser') | |
| # meta first | |
| meta_txt = extract_from_meta_tags(soup) | |
| if meta_txt and len(meta_txt) > 50: | |
| return clean_text(meta_txt) | |
| # selector scan | |
| selectors = [ | |
| 'div.abstract', 'div.Abstract', 'div.ABSTRACT', | |
| 'div[class*="abstract" i]', 'div[class*="Abstract" i]', | |
| 'section.abstract', 'section.Abstract', 'section.ABSTRACT', | |
| 'div[data-testid="abstract" i]', 'div[data-testid="Abstract" i]', | |
| 'div.article-abstract', 'div.article-Abstract', | |
| 'div.abstract-content', 'div.Abstract-content', | |
| 'div.highlights', 'div.Highlights', 'div.HIGHLIGHTS', | |
| 'div[class*="highlights" i]', 'div[class*="Highlights" i]', | |
| 'section.highlights', 'section.Highlights', 'section.HIGHLIGHTS', | |
| 'div[data-testid="highlights" i]', 'div[data-testid="Highlights" i]' | |
| ] | |
| for css in selectors: | |
| node = soup.select_one(css) | |
| if node: | |
| t = clean_text(node.get_text(' ', strip=True)) | |
| if len(t) > 50: | |
| return t | |
| # headings near Abstract/Highlights | |
| for tag in soup.find_all(['h1','h2','h3','h4','h5','h6','strong','b']): | |
| try: | |
| title = (tag.get_text() or '').strip().lower() | |
| if 'abstract' in title or 'highlights' in title: | |
| blocks = [] | |
| sib = tag | |
| steps = 0 | |
| while sib and steps < 20: | |
| sib = sib.find_next_sibling() | |
| steps += 1 | |
| if not sib: break | |
| if sib.name in ['p','div','section','article','ul','ol']: | |
| blocks.append(sib.get_text(' ', strip=True)) | |
| joined = clean_text(' '.join(blocks)) | |
| if len(joined) > 50: | |
| return joined | |
| except Exception: | |
| continue | |
| except Exception: | |
| pass | |
| # 4) Regex fallback | |
| try: | |
| patterns = [ | |
| r'<div[^>]*class="[^\"]*(?:abstract|Abstract|ABSTRACT|highlights|Highlights|HIGHLIGHTS)[^\"]*"[^>]*>(.*?)</div>', | |
| r'<section[^>]*class="[^\"]*(?:abstract|Abstract|ABSTRACT|highlights|Highlights|HIGHLIGHTS)[^\"]*"[^>]*>(.*?)</section>', | |
| r'<div[^>]*data-testid="(?:abstract|Abstract|highlights|Highlights)"[^>]*>(.*?)</div>' | |
| ] | |
| for pat in patterns: | |
| for m in re.findall(pat, html_text, re.DOTALL | re.IGNORECASE): | |
| t = clean_text(re.sub(r'<[^>]+>', ' ', m)) | |
| if len(t) > 50: | |
| return t | |
| except Exception: | |
| pass | |
| return None | |
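
# Minimal sketch of the layered extraction on a toy HTML snippet (illustrative;
# this exercises the meta-tag path, so it assumes BeautifulSoup is installed).
def _example_robust_extract_abstract():
    html_snippet = (
        '<html><head><meta property="og:description" content="This illustrative '
        'abstract text is deliberately longer than fifty characters so the '
        'length check passes."></head><body></body></html>'
    )
    print(robust_extract_abstract(html_snippet))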
| def extract_text_from_abstract_section(section: dict) -> str: | |
| """Extract text content from abstract section structure.""" | |
| try: | |
| text_parts = [] | |
| if '$$' in section: | |
| for item in section['$$']: | |
| if isinstance(item, dict): | |
| # Direct text content from simple-para | |
| if item.get('#name') == 'simple-para' and '_' in item: | |
| text_parts.append(item['_']) | |
| # Also check for para elements | |
| elif item.get('#name') == 'para' and '_' in item: | |
| text_parts.append(item['_']) | |
| # Recursively extract from nested structure | |
| elif '$$' in item: | |
| nested_text = extract_text_from_abstract_section(item) | |
| if nested_text: | |
| text_parts.append(nested_text) | |
| return ' '.join(text_parts) | |
| except Exception as e: | |
| print(f"Error extracting text from abstract section: {e}") | |
| return "" | |
| def extract_highlights_from_section(section: dict) -> str: | |
| """Extract highlights content from section structure.""" | |
| try: | |
| text_parts = [] | |
| if '$$' in section: | |
| for item in section['$$']: | |
| if isinstance(item, dict): | |
| # Look for section-title with "Highlights" | |
| if (item.get('#name') == 'section-title' and | |
| item.get('_') and 'highlight' in item['_'].lower()): | |
| # Found highlights section, extract list items | |
| highlights_text = extract_highlights_list(item, section) | |
| if highlights_text: | |
| text_parts.append(highlights_text) | |
| # Also look for direct list structures | |
| elif item.get('#name') == 'list': | |
| # Found list, extract list items directly | |
| highlights_text = extract_highlights_list(item, section) | |
| if highlights_text: | |
| text_parts.append(highlights_text) | |
| elif '$$' in item: | |
| # Recursively search for highlights | |
| nested_text = extract_highlights_from_section(item) | |
| if nested_text: | |
| text_parts.append(nested_text) | |
| return ' '.join(text_parts) | |
| except Exception as e: | |
| print(f"Error extracting highlights from section: {e}") | |
| return "" | |
| def extract_highlights_list(title_item: dict, parent_section: dict) -> str: | |
| """Extract highlights list items from the section structure.""" | |
| try: | |
| highlights = [] | |
| # Look for the list structure after the highlights title | |
| if '$$' in parent_section: | |
| for item in parent_section['$$']: | |
| if isinstance(item, dict) and item.get('#name') == 'list': | |
| # Found list, extract list items | |
| if '$$' in item: | |
| for list_item in item['$$']: | |
| if isinstance(list_item, dict) and list_item.get('#name') == 'list-item': | |
| # Extract text from list item | |
| item_text = extract_text_from_abstract_section(list_item) | |
| if item_text: | |
| highlights.append(f"• {item_text}") | |
| # Also check if the title_item itself contains a list (for direct list structures) | |
| if '$$' in title_item: | |
| for item in title_item['$$']: | |
| if isinstance(item, dict) and item.get('#name') == 'list': | |
| if '$$' in item: | |
| for list_item in item['$$']: | |
| if isinstance(list_item, dict) and list_item.get('#name') == 'list-item': | |
| item_text = extract_text_from_abstract_section(list_item) | |
| if item_text: | |
| highlights.append(f"• {item_text}") | |
| return ' '.join(highlights) | |
| except Exception as e: | |
| print(f"Error extracting highlights list: {e}") | |
| return "" | |
| def extract_highlights_from_abstract_item(abstract_item: dict) -> str: | |
| """Extract highlights from an abstract item that contains highlights.""" | |
| try: | |
| highlights = [] | |
| if '$$' in abstract_item: | |
| for section in abstract_item['$$']: | |
| if isinstance(section, dict) and section.get('#name') == 'abstract-sec': | |
| # Look for highlights within this section | |
| highlights_text = extract_highlights_from_section(section) | |
| if highlights_text: | |
| highlights.append(highlights_text) | |
| return ' '.join(highlights) | |
| except Exception as e: | |
| print(f"Error extracting highlights from abstract item: {e}") | |
| return "" | |
def fetch_from_pubmed(doi: str) -> Optional[str]:
    """Fetch abstract from PubMed if available (placeholder, not yet implemented)."""
    try:
        # Simplified placeholder: a full implementation would look the DOI up via
        # the PubMed/NCBI E-utilities API and return the abstract, highlights, or
        # other summary fields from the matching record.
        pass
    except Exception:
        pass
    return None
| def convert_abstract_to_inverted_index(abstract: str) -> Dict: | |
| """Convert abstract text to inverted index format.""" | |
| if not abstract: | |
| return {} | |
| # Simple word tokenization and position mapping | |
| words = re.findall(r'\b\w+\b', abstract.lower()) | |
| inverted_index = {} | |
| for i, word in enumerate(words): | |
| if word not in inverted_index: | |
| inverted_index[word] = [] | |
| inverted_index[word].append(i) | |
| return inverted_index | |
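
# Round-trip example (illustrative). Note the conversion lowercases words and
# drops punctuation, so reconstructing the text from the index is lossy:
#   >>> convert_abstract_to_inverted_index('Deep learning for deep networks')
#   {'deep': [0, 3], 'learning': [1], 'for': [2], 'networks': [4]}
#   >>> extract_abstract_from_inverted_index({'deep': [0, 3], 'learning': [1], 'for': [2], 'networks': [4]})
#   'deep learning for deep networks'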
| def extract_work_id_from_url(url: str) -> Optional[str]: | |
| """Extract OpenAlex work ID from various URL formats.""" | |
| if not url: | |
| return None | |
    # Handle different URL formats
    if 'openalex.org' in url:
        if '/works/' in url:
            # Extract ID from an API URL like https://api.openalex.org/works/W2741809807
            return url.split('/works/')[-1]
        # Extract ID from a short URL like https://openalex.org/W2741809807
        tail = url.rstrip('/').split('/')[-1]
        if tail.startswith('W') and len(tail) > 5:
            return tail
    # If it's already just an ID
    if url.startswith('W') and len(url) > 5:
        return url
    return None
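
# Accepted inputs (illustrative):
#   >>> extract_work_id_from_url('https://api.openalex.org/works/W2741809807')
#   'W2741809807'
#   >>> extract_work_id_from_url('https://openalex.org/W2741809807')
#   'W2741809807'
#   >>> extract_work_id_from_url('W2741809807')
#   'W2741809807'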
| def save_to_database(session_id: str, data_type: str, data: Dict) -> str: | |
| """Legacy-compatible save helper that routes to the new split DB layout.""" | |
| if data_type == 'collection': | |
| work_id = data.get('work_id', '') | |
| title = data.get('title', '') | |
| return save_collection_to_database(work_id, title, data) | |
| if data_type == 'filter': | |
| source_collection = data.get('source_collection', '') | |
| research_question = data.get('research_question', '') | |
| return save_filter_to_database(source_collection, research_question, data) | |
| # Fallback legacy path (single folder) | |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
| filename = f"{session_id}_{data_type}_{timestamp}.pkl" | |
| filepath = os.path.join(DATABASE_DIR, filename) | |
| with open(filepath, 'wb') as f: pickle.dump(data, f) | |
| return filename | |
| def _clean_work_id(work_id_or_url: str) -> str: | |
| clean = extract_work_id_from_url(work_id_or_url) or work_id_or_url | |
| clean = clean.replace('https://api.openalex.org/works/', '').replace('https://openalex.org/', '') | |
| return clean | |
| def save_collection_to_database(work_id_or_url: str, title: str, data: Dict) -> str: | |
| """Save a collection once per work. Filename is the clean work id only (dedup).""" | |
| ensure_db_dirs() | |
| clean_id = _clean_work_id(work_id_or_url) | |
| filename = f"{clean_id}.pkl" | |
| filepath = os.path.join(COLLECTION_DB_DIR, filename) | |
| # Deduplicate: if exists, do NOT overwrite | |
| if os.path.exists(filepath): | |
| return filename | |
| # Ensure helpful metadata for frontend display | |
| data = dict(data) | |
| data['work_id'] = work_id_or_url | |
| data['title'] = title | |
| data['work_identifier'] = clean_id | |
| data['created'] = datetime.now().isoformat() | |
| with open(filepath, 'wb') as f: pickle.dump(data, f) | |
| return filename | |
| def save_filter_to_database(source_collection_clean_id: str, research_question: str, data: Dict) -> str: | |
| """Save a filter result linked to a source collection. Multiple filters allowed.""" | |
| ensure_db_dirs() | |
| # Slug for RQ to keep filenames short | |
| rq_slug = ''.join(c for c in research_question[:40] if c.isalnum() or c in (' ', '-', '_')).strip().replace(' ', '_') or 'rq' | |
| timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') | |
| filename = f"{source_collection_clean_id}__filter__{rq_slug}__{timestamp}.pkl" | |
| filepath = os.path.join(FILTER_DB_DIR, filename) | |
| data = dict(data) | |
| data['filter_identifier'] = filename.replace('.pkl','') | |
| data['source_collection'] = source_collection_clean_id | |
| data['research_question'] = research_question | |
| data['created'] = datetime.now().isoformat() | |
| with open(filepath, 'wb') as f: pickle.dump(data, f) | |
| return filename | |
| def get_collection_files() -> List[Dict]: | |
| files: List[Dict] = [] | |
| if not os.path.exists(COLLECTION_DB_DIR): return files | |
| for filename in os.listdir(COLLECTION_DB_DIR): | |
| if not filename.endswith('.pkl'): continue | |
| filepath = os.path.join(COLLECTION_DB_DIR, filename) | |
| try: | |
| stat = os.stat(filepath) | |
| with open(filepath, 'rb') as f: data = pickle.load(f) | |
| files.append({ | |
| 'filename': filename, | |
| 'type': 'collection', | |
| 'work_identifier': data.get('work_identifier') or filename.replace('.pkl',''), | |
| 'title': data.get('title',''), | |
| 'work_id': data.get('work_id',''), | |
| 'total_papers': data.get('total_papers',0), | |
| 'created': data.get('created', datetime.fromtimestamp(stat.st_ctime).isoformat()), | |
| 'size': stat.st_size | |
| }) | |
| except Exception: | |
| continue | |
| files.sort(key=lambda x: x['created'], reverse=True) | |
| return files | |
| def get_filter_files() -> List[Dict]: | |
| files: List[Dict] = [] | |
| if not os.path.exists(FILTER_DB_DIR): return files | |
| for filename in os.listdir(FILTER_DB_DIR): | |
| if not filename.endswith('.pkl'): continue | |
| filepath = os.path.join(FILTER_DB_DIR, filename) | |
| try: | |
| stat = os.stat(filepath) | |
| with open(filepath, 'rb') as f: data = pickle.load(f) | |
| files.append({ | |
| 'filename': filename, | |
| 'type': 'filter', | |
| 'filter_identifier': data.get('filter_identifier') or filename.replace('.pkl',''), | |
| 'source_collection': data.get('source_collection',''), | |
| 'research_question': data.get('research_question',''), | |
| 'relevant_papers': data.get('relevant_papers',0), | |
| 'total_papers': data.get('total_papers',0), | |
| 'tested_papers': data.get('tested_papers',0), | |
| 'created': data.get('created', datetime.fromtimestamp(stat.st_ctime).isoformat()), | |
| 'size': stat.st_size | |
| }) | |
| except Exception: | |
| continue | |
| files.sort(key=lambda x: x['created'], reverse=True) | |
| return files | |
| def get_database_files() -> List[Dict]: | |
| """Combined listing for frontend history panel.""" | |
| return get_collection_files() + get_filter_files() | |
| def find_existing_collection(work_id_or_url: str) -> Optional[str]: | |
| """Return existing collection filename for a work id if present (dedup).""" | |
| clean_id = _clean_work_id(work_id_or_url) | |
| filename = f"{clean_id}.pkl" | |
| filepath = os.path.join(COLLECTION_DB_DIR, filename) | |
| return filename if os.path.exists(filepath) else None | |
| def filter_papers_for_rq(papers: List[Dict], research_question: str) -> List[Dict]: | |
| """Filter papers based on research question using GPT-5 mini.""" | |
| if not papers or not research_question: | |
| return [] | |
| relevant_papers = [] | |
| for i, paper in enumerate(papers): | |
| print(f"Analyzing paper {i+1}/{len(papers)}: {paper.get('title', 'No title')[:50]}...") | |
| # Extract title and abstract | |
| title = paper.get('title', '') | |
| abstract = '' | |
| # Try to get abstract from inverted index | |
| inverted_abstract = paper.get('abstract_inverted_index') | |
| if inverted_abstract: | |
| words = [] | |
| for word, positions in inverted_abstract.items(): | |
| for pos in positions: | |
| while len(words) <= pos: | |
| words.append('') | |
| words[pos] = word | |
| abstract = ' '.join(words).strip() | |
| if not title and not abstract: | |
| continue | |
| # Create content for GPT analysis | |
| content = { | |
| 'title': title, | |
| 'abstract': abstract | |
| } | |
| # Analyze with GPT-5 mini | |
| try: | |
| analysis = analyze_with_gpt4(content, OPENAI_API_KEY) | |
| if analysis and analysis.get('aims_of_paper'): | |
| # Check if paper is relevant to research question | |
| relevance_prompt = f""" | |
| Research Question: {research_question} | |
| Paper Title: {title} | |
| Paper Abstract: {abstract or 'No abstract available'} | |
| Is this paper highly relevant to answering the research question? | |
| Consider the paper's aims, methods, and findings. | |
| Return ONLY a JSON object: {{"relevant": true/false, "reason": "brief explanation"}} | |
| """ | |
| relevance_response = analyze_with_gpt4({ | |
| 'title': 'Relevance Check', | |
| 'abstract': relevance_prompt | |
| }, OPENAI_API_KEY) | |
| if relevance_response and relevance_response.get('aims_of_paper'): | |
| # Parse the relevance response | |
| try: | |
| relevance_data = json.loads(relevance_response['aims_of_paper']) | |
| if relevance_data.get('relevant', False): | |
| paper['relevance_reason'] = relevance_data.get('reason', 'Relevant to research question') | |
| paper['gpt_analysis'] = analysis | |
| relevant_papers.append(paper) | |
| except: | |
| # If parsing fails, include paper anyway if it has analysis | |
| paper['gpt_analysis'] = analysis | |
| relevant_papers.append(paper) | |
| except Exception as e: | |
| print(f"Error analyzing paper {i+1}: {e}") | |
| continue | |
| return relevant_papers | |
| def index(): | |
| """Serve the main HTML page.""" | |
| return render_template('index.html') | |
| def health(): | |
| return jsonify({'status': 'ok', 'app': 'paper_analysis_backend', 'port': 5000}) | |
| def get_progress(task_id): | |
| """Get progress for a specific task.""" | |
| return jsonify(progress_data.get(task_id, {'status': 'not_found', 'progress': 0, 'message': 'Task not found'})) | |
| def collect_papers_async(work_id, limit, task_id): | |
| """Async function to collect papers with progress tracking.""" | |
| try: | |
| def progress_callback(progress, message): | |
| progress_data[task_id] = { | |
| 'status': 'running', | |
| 'progress': progress, | |
| 'message': message | |
| } | |
| progress_data[task_id] = {'status': 'running', 'progress': 0, 'message': 'Starting paper collection...'} | |
| # Get related papers with detailed counts and progress callback | |
| papers = get_related_papers(work_id, upper_limit=limit, progress_callback=progress_callback) | |
| if not papers: | |
| progress_data[task_id] = {'status': 'error', 'progress': 0, 'message': 'No related papers found'} | |
| return | |
| # Count papers by relationship type | |
| cited_count = sum(1 for p in papers if p.get('relationship') == 'cited') | |
| citing_count = sum(1 for p in papers if p.get('relationship') == 'citing') | |
| related_count = sum(1 for p in papers if p.get('relationship') == 'related') | |
| # Save papers to temporary file | |
| with open('temp_papers.pkl', 'wb') as f: | |
| pickle.dump(papers, f) | |
| # Fetch seed title for identifier; tolerate failures | |
| title = '' | |
| try: | |
| seed_resp = requests.get(f'https://api.openalex.org/works/{_clean_work_id(work_id)}', timeout=10) | |
| if seed_resp.ok: | |
| title = (seed_resp.json() or {}).get('title','') | |
| except Exception: | |
| title = '' | |
| # Save to collection database (dedup by work id) | |
| collection_data = { | |
| 'work_id': work_id, | |
| 'total_papers': len(papers), | |
| 'cited_papers': cited_count, | |
| 'citing_papers': citing_count, | |
| 'related_papers': related_count, | |
| 'limit': limit, | |
| 'papers': papers, | |
| } | |
| db_filename = save_collection_to_database(work_id, title, collection_data) | |
| progress_data[task_id] = { | |
| 'status': 'completed', | |
| 'progress': 100, | |
| 'message': 'Collection completed', | |
| 'result': { | |
| 'work_id': work_id, | |
| 'total_papers': len(papers), | |
| 'cited_papers': cited_count, | |
| 'citing_papers': citing_count, | |
| 'related_papers': related_count, | |
| 'limit': limit, | |
| 'papers': papers[:10], # Return first 10 for preview | |
| 'db_filename': db_filename | |
| } | |
| } | |
| except Exception as e: | |
| print(f"Error collecting papers: {e}") | |
| progress_data[task_id] = {'status': 'error', 'progress': 0, 'message': str(e)} | |
| def search_papers_by_title(title: str) -> List[Dict]: | |
| """Search OpenAlex for papers by title and return ranked matches.""" | |
| try: | |
| # Clean and prepare the title for search | |
| clean_title = title.strip() | |
| if not clean_title: | |
| return [] | |
| # Search OpenAlex API | |
| import urllib.parse | |
| params = { | |
| 'search': clean_title, | |
            'per-page': 10,  # Get top 10 results (hyphenated parameter name, as used elsewhere in this file)
| 'sort': 'relevance_score:desc' # Sort by relevance | |
| } | |
| # Build URL with query parameters | |
| query_string = urllib.parse.urlencode(params) | |
| search_url = f"https://api.openalex.org/works?{query_string}" | |
| print(f"EXACT URL BEING SEARCHED: {search_url}") | |
| response = _http_get(search_url, timeout=10) | |
| if not response or response.status_code != 200: | |
| print(f"OpenAlex search failed: {response.status_code if response else 'No response'}") | |
| return [] | |
| data = response.json() | |
| results = data.get('results', []) | |
| if not results: | |
| print(f"No results found for title: {clean_title}") | |
| return [] | |
| # Return top results (OpenAlex already ranks by relevance) | |
| scored_results = [] | |
| for work in results[:5]: # Take top 5 from OpenAlex | |
| work_title = work.get('title', '') | |
| if not work_title: | |
| continue | |
| work_id = work.get('id', '').replace('https://openalex.org/', '') | |
| scored_results.append({ | |
| 'work_id': work_id, | |
| 'title': work_title, | |
| 'authors': ', '.join([author.get('author', {}).get('display_name', '') for author in work.get('authorships', [])[:3]]), | |
| 'year': work.get('publication_date', '')[:4] if work.get('publication_date') else 'Unknown', | |
| 'venue': work.get('primary_location', {}).get('source', {}).get('display_name', 'Unknown'), | |
| 'relevance_score': work.get('relevance_score', 0) | |
| }) | |
| return scored_results | |
| except Exception as e: | |
| print(f"Error searching for papers by title: {e}") | |
| return [] | |
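
# Minimal usage sketch (illustrative, never called by the app): title search
# against OpenAlex; the query title is just an example.
def _example_search_papers_by_title():
    for match in search_papers_by_title("Attention is all you need"):
        print(match['work_id'], match['title'], match['year'])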
| def search_papers(): | |
| """Search for papers by title and return matches for user selection.""" | |
| try: | |
| data = request.get_json() | |
| paper_title = data.get('paper_title', '').strip() | |
| if not paper_title: | |
| return jsonify({'error': 'Paper title is required'}), 400 | |
| matches = search_papers_by_title(paper_title) | |
| if not matches: | |
| return jsonify({'error': f'No papers found matching title: {paper_title}'}), 404 | |
| return jsonify({ | |
| 'success': True, | |
| 'matches': matches, | |
| 'query': paper_title | |
| }) | |
| except Exception as e: | |
| print(f"Error searching papers: {e}") | |
| return jsonify({'error': str(e)}), 500 | |
| def collect_papers(): | |
| """Collect related papers from a seed paper URL or title search.""" | |
| try: | |
| data = request.get_json() | |
| seed_url = data.get('seed_url', '').strip() | |
| paper_title = data.get('paper_title', '').strip() | |
| method = data.get('method', 'url') | |
| user_api_key = data.get('user_api_key') # User's own API key for large collections | |
| if method == 'title' and not paper_title: | |
| return jsonify({'error': 'Paper title is required for title search'}), 400 | |
| elif method == 'url' and not seed_url: | |
| return jsonify({'error': 'Seed URL is required for URL method'}), 400 | |
| # Handle title search or URL method | |
| if method == 'title': | |
| # For title search, work_id should be provided (selected by user) | |
| work_id = data.get('selected_work_id', '').strip() | |
| if not work_id: | |
| return jsonify({'error': 'Selected work ID is required for title search'}), 400 | |
| else: | |
| # Extract work ID from URL | |
| work_id = extract_work_id_from_url(seed_url) | |
| if not work_id: | |
| return jsonify({'error': 'Invalid OpenAlex URL format'}), 400 | |
| print(f"Collecting papers for work ID: {work_id}") | |
| # Check if collection already exists (dedup) | |
| existing_file = find_existing_collection(work_id) | |
| if existing_file: | |
| print(f"Using existing collection: {existing_file}") | |
| # Load existing collection data | |
| filepath = os.path.join(COLLECTION_DB_DIR, existing_file) | |
| with open(filepath, 'rb') as f: | |
| existing_data = pickle.load(f) | |
| # Generate task ID for consistency | |
| task_id = f"collect_{int(time.time())}" | |
| # Set progress to completed immediately | |
| progress_data[task_id] = { | |
| 'status': 'completed', | |
| 'progress': 100, | |
| 'message': f'Using existing collection from {existing_data.get("created", "unknown time")}', | |
| 'result': { | |
| 'papers': existing_data.get('papers', []), | |
| 'total_papers': existing_data.get('total_papers', 0), | |
| 'cited_papers': existing_data.get('cited_papers', 0), | |
| 'citing_papers': existing_data.get('citing_papers', 0), | |
| 'related_papers': existing_data.get('related_papers', 0), | |
| 'db_filename': existing_file | |
| } | |
| } | |
| return jsonify({'success': True, 'task_id': task_id, 'used_existing': True, 'message': 'Using existing collection'}) | |
| # Optional limit from request (None means collect all) | |
| limit = data.get('limit') | |
| try: | |
| limit = int(limit) if limit is not None else None | |
| except Exception: | |
| limit = None | |
| # Generate task ID | |
| task_id = f"collect_{int(time.time())}" | |
| # Start async collection | |
| thread = threading.Thread(target=collect_papers_async, args=(work_id, limit, task_id)) | |
| thread.daemon = True | |
| thread.start() | |
| return jsonify({ | |
| 'success': True, | |
| 'task_id': task_id, | |
| 'message': 'Paper collection started' | |
| }) | |
| except Exception as e: | |
| print(f"Error collecting papers: {e}") | |
| return jsonify({'error': str(e)}), 500 | |
| def filter_papers(): | |
| """Filter papers based on research question.""" | |
| try: | |
| data = request.get_json() | |
| research_question = data.get('research_question', '').strip() | |
| limit = data.get('limit', 10) # Default to 10 most recent relevant papers | |
| provided_source_collection = (data.get('source_collection') or '').strip() | |
| papers_data = data.get('papers') # Papers passed directly from frontend | |
| user_api_key = data.get('user_api_key') # User's own API key for large analyses | |
| if not research_question: | |
| return jsonify({'error': 'Research question is required'}), 400 | |
| # Load papers from either passed data or temporary file | |
| papers = [] | |
| if papers_data: | |
| papers = papers_data | |
| elif os.path.exists('temp_papers.pkl'): | |
| with open('temp_papers.pkl', 'rb') as f: | |
| papers = pickle.load(f) | |
| else: | |
| return jsonify({'error': 'No papers found. Please collect papers first.'}), 400 | |
| print(f"Filtering {len(papers)} papers for research question: {research_question}") | |
| # Use user's API key if provided, otherwise use default | |
| api_key_to_use = user_api_key if user_api_key else OPENAI_API_KEY | |
| # Filter papers using custom analyzer (returns top N most recent relevant papers) | |
| relevant_papers = filter_papers_for_research_question(papers, research_question, api_key_to_use, limit) | |
| # Determine source collection id for linkage | |
| source_collection_id = None | |
| if provided_source_collection: | |
| source_collection_id = provided_source_collection | |
| else: | |
| try: | |
| collections = get_collection_files() | |
| if collections: | |
| source_collection_id = collections[0].get('work_identifier') | |
| except Exception: | |
| source_collection_id = None | |
| # Count actual relevant papers from analysis results | |
| actual_relevant = 0 | |
| for paper in relevant_papers: | |
| if paper.get('relevance_score') == True or paper.get('relevance_score') == 'true': | |
| actual_relevant += 1 | |
| # Calculate open access statistics | |
| total_oa = 0 | |
| for paper in papers: | |
| oa_info = paper.get('open_access') or {} | |
| if oa_info.get('is_oa', False): | |
| total_oa += 1 | |
| oa_percentage = round((total_oa / len(papers)) * 100) if papers else 0 | |
| # Calculate abstract statistics | |
| total_with_abstract = 0 | |
| for paper in papers: | |
| if paper.get('abstract_inverted_index') and len(paper.get('abstract_inverted_index', {})) > 0: | |
| total_with_abstract += 1 | |
| abstract_percentage = round((total_with_abstract / len(papers)) * 100) if papers else 0 | |
| # Save filtered results to filter database (linked to collection) | |
| tested_papers = int(limit) if str(limit).isdigit() else 0  # Coerce; tolerates string limits from the frontend | |
| filter_data = { | |
| 'research_question': research_question, | |
| 'total_papers': len(papers), # Total papers in collection | |
| 'tested_papers': tested_papers, # Number of papers tested for relevance | |
| 'relevant_papers': actual_relevant, # Actual count of YES responses | |
| 'oa_percentage': oa_percentage, # Open access percentage | |
| 'abstract_percentage': abstract_percentage, # Percentage with abstracts | |
| 'limit': limit, | |
| 'papers': relevant_papers, | |
| 'source_collection': source_collection_id | |
| } | |
| if source_collection_id: | |
| db_filename = save_filter_to_database(source_collection_id, research_question, filter_data) | |
| else: | |
| # Fallback | |
| db_filename = save_to_database(f"filter_{int(time.time())}", 'filter', filter_data) | |
| return jsonify({ | |
| 'success': True, | |
| 'research_question': research_question, | |
| 'total_papers': len(papers), # Total papers in collection | |
| 'tested_papers': tested_papers, # Number of papers tested for relevance | |
| 'relevant_papers': actual_relevant, # Actual count of YES responses | |
| 'oa_percentage': oa_percentage, # Open access percentage | |
| 'abstract_percentage': abstract_percentage, # Percentage with abstracts | |
| 'limit': limit, | |
| 'papers': relevant_papers, | |
| 'db_filename': db_filename | |
| }) | |
| except Exception as e: | |
| print(f"Error filtering papers: {e}") | |
| return jsonify({'error': str(e)}), 500 | |
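| # Hedged sketch (not wired in): the open-access and abstract statistics computed | |
| # inline in filter_papers() could be factored into a small helper like this one. | |
| # The name _collection_stats is hypothetical; the fields mirror the handler code. | |
| def _collection_stats(papers): | |
|     """Return (oa_percentage, abstract_percentage) for a list of OpenAlex works.""" | |
|     if not papers: | |
|         return 0, 0 | |
|     total_oa = sum(1 for p in papers if (p.get('open_access') or {}).get('is_oa', False)) | |
|     total_abstract = sum(1 for p in papers if p.get('abstract_inverted_index')) | |
|     return round(total_oa / len(papers) * 100), round(total_abstract / len(papers) * 100) | |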
| def get_database_files_endpoint(): | |
| """Get list of all database files (collections + filters).""" | |
| try: | |
| files = get_database_files() | |
| return jsonify({'success': True, 'files': files}) | |
| except Exception as e: | |
| return jsonify({'error': str(e)}), 500 | |
| def load_database_file(filename): | |
| """Load a specific database file.""" | |
| try: | |
| # Try collections then filters then legacy | |
| filepath = os.path.join(COLLECTION_DB_DIR, filename) | |
| if not os.path.exists(filepath): | |
| filepath = os.path.join(FILTER_DB_DIR, filename) | |
| if not os.path.exists(filepath): | |
| filepath = os.path.join(DATABASE_DIR, filename) | |
| if not os.path.exists(filepath): | |
| return jsonify({'error': 'File not found'}), 404 | |
| with open(filepath, 'rb') as f: | |
| data = pickle.load(f) | |
| return jsonify({'success': True, 'data': data}) | |
| except Exception as e: | |
| return jsonify({'error': str(e)}), 500 | |
| def delete_database_file(filename): | |
| """Delete a specific database file.""" | |
| try: | |
| # Try collections then filters then legacy | |
| filepath = os.path.join(COLLECTION_DB_DIR, filename) | |
| if not os.path.exists(filepath): | |
| filepath = os.path.join(FILTER_DB_DIR, filename) | |
| if not os.path.exists(filepath): | |
| filepath = os.path.join(DATABASE_DIR, filename) | |
| if not os.path.exists(filepath): | |
| return jsonify({'error': 'File not found'}), 404 | |
| # Delete the file | |
| os.remove(filepath) | |
| return jsonify({'success': True, 'message': f'File {filename} deleted successfully'}) | |
| except Exception as e: | |
| return jsonify({'error': str(e)}), 500 | |
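| # Hedged sketch (not wired in): load_database_file, delete_database_file, | |
| # download_database_file and export_excel_from_file all repeat the same | |
| # collections -> filters -> legacy lookup; a shared helper such as this one | |
| # (the name _resolve_database_path is hypothetical) would remove that duplication. | |
| def _resolve_database_path(filename): | |
|     """Return the first existing path for filename across the known DB dirs, else None.""" | |
|     for directory in (COLLECTION_DB_DIR, FILTER_DB_DIR, DATABASE_DIR): | |
|         candidate = os.path.join(directory, filename) | |
|         if os.path.exists(candidate): | |
|             return candidate | |
|     return None | |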
| def generate_bibtex_entry(paper): | |
| """Generate a BibTeX entry for a single paper.""" | |
| try: | |
| # Handle None or invalid paper objects | |
| if not paper or not isinstance(paper, dict): | |
| print(f"Invalid paper object: {paper}") | |
| return f"@article{{error_{hash(str(paper)) % 10000},\n title={{Invalid paper data}},\n author={{Unknown}},\n year={{Unknown}}\n}}" | |
| # Extract basic info with safe defaults (guard against keys that exist but are None) | |
| title = paper.get('title') or 'Unknown Title' | |
| year = paper.get('publication_year') or 'Unknown Year' | |
| doi = paper.get('doi') or '' | |
| # Generate a unique key (using OpenAlex ID or DOI) | |
| work_id = paper.get('id', '') | |
| if work_id and isinstance(work_id, str): | |
| work_id = work_id.replace('https://openalex.org/', '') | |
| if not work_id and doi: | |
| work_id = doi.replace('https://doi.org/', '').replace('/', '_') | |
| if not work_id: | |
| work_id = f"paper_{hash(title) % 10000}" | |
| # Extract authors safely | |
| authorships = paper.get('authorships', []) | |
| author_list = [] | |
| if isinstance(authorships, list): | |
| for authorship in authorships: | |
| if isinstance(authorship, dict): | |
| author = authorship.get('author', {}) | |
| if isinstance(author, dict): | |
| display_name = author.get('display_name', '') | |
| if display_name: | |
| # Split name and format as "Last, First" | |
| name_parts = display_name.split() | |
| if len(name_parts) >= 2: | |
| last_name = name_parts[-1] | |
| first_name = ' '.join(name_parts[:-1]) | |
| author_list.append(f"{last_name}, {first_name}") | |
| else: | |
| author_list.append(display_name) | |
| authors = " and ".join(author_list) if author_list else "Unknown Author" | |
| # Extract journal info safely | |
| primary_location = paper.get('primary_location', {}) | |
| journal = 'Unknown Journal' | |
| if isinstance(primary_location, dict): | |
| source = primary_location.get('source', {}) | |
| if isinstance(source, dict): | |
| journal = source.get('display_name', 'Unknown Journal') | |
| # Extract volume, issue, pages safely | |
| biblio = paper.get('biblio', {}) | |
| volume = '' | |
| issue = '' | |
| first_page = '' | |
| last_page = '' | |
| if isinstance(biblio, dict): | |
| volume = biblio.get('volume', '') | |
| issue = biblio.get('issue', '') | |
| first_page = biblio.get('first_page', '') | |
| last_page = biblio.get('last_page', '') | |
| # Format pages | |
| if first_page and last_page and first_page != last_page: | |
| pages = f"{first_page}--{last_page}" | |
| elif first_page: | |
| pages = first_page | |
| else: | |
| pages = "" | |
| # Format volume and issue | |
| volume_info = "" | |
| if volume: | |
| volume_info = f"volume={{{volume}}}" | |
| if issue: | |
| volume_info += f", number={{{issue}}}" | |
| elif issue: | |
| volume_info = f"number={{{issue}}}" | |
| # Get URL (prefer DOI, fallback to landing page) | |
| url = doi if doi else '' | |
| if isinstance(primary_location, dict): | |
| landing_url = primary_location.get('landing_page_url', '') | |
| if landing_url and not url: | |
| url = landing_url | |
| # Build BibTeX entry | |
| bibtex_entry = f"""@article{{{work_id}, | |
| title={{{title}}}, | |
| author={{{authors}}}, | |
| journal={{{journal}}}, | |
| year={{{year}}}""" | |
| if volume_info: | |
| bibtex_entry += f",\n {volume_info}" | |
| if pages: | |
| bibtex_entry += f",\n pages={{{pages}}}" | |
| if doi: | |
| bibtex_entry += f",\n doi={{{doi.replace('https://doi.org/', '')}}}" | |
| if url: | |
| bibtex_entry += f",\n url={{{url}}}" | |
| bibtex_entry += "\n}" | |
| return bibtex_entry | |
| except Exception as e: | |
| print(f"Error generating BibTeX for paper: {e}") | |
| print(f"Paper data: {paper}") | |
| return f"@article{{error_{hash(str(paper)) % 10000},\n title={{Error generating entry}},\n author={{Unknown}},\n year={{Unknown}}\n}}" | |
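| # Illustrative output of generate_bibtex_entry (placeholder data), matching the | |
| # fields assembled above: | |
| #   @article{W2741809807, | |
| #     title={An example title}, | |
| #     author={Doe, Jane and Smith, John}, | |
| #     journal={Example Journal}, | |
| #     year={2020}, | |
| #     volume={12}, number={3}, | |
| #     pages={45--67}, | |
| #     doi={10.1234/example}, | |
| #     url={https://doi.org/10.1234/example} | |
| #   } | |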
| def generate_bibtex(filename): | |
| """Generate BibTeX file for a collection.""" | |
| try: | |
| # Load the collection | |
| collection_path = os.path.join(COLLECTION_DB_DIR, filename) | |
| if not os.path.exists(collection_path): | |
| return jsonify({'success': False, 'message': 'Collection not found'}), 404 | |
| with open(collection_path, 'rb') as f: | |
| collection_data = pickle.load(f) | |
| papers = collection_data.get('papers', []) | |
| if not papers: | |
| return jsonify({'success': False, 'message': 'No papers in collection'}), 400 | |
| print(f"Found {len(papers)} papers in collection") | |
| print(f"First paper structure: {type(papers[0]) if papers else 'No papers'}") | |
| if papers: | |
| print(f"First paper keys: {list(papers[0].keys()) if isinstance(papers[0], dict) else 'Not a dict'}") | |
| # Generate BibTeX entries | |
| bibtex_entries = [] | |
| for i, paper in enumerate(papers): | |
| print(f"Processing paper {i+1}/{len(papers)}: {type(paper)}") | |
| entry = generate_bibtex_entry(paper) | |
| bibtex_entries.append(entry) | |
| # Combine all entries | |
| bibtex_content = "\n\n".join(bibtex_entries) | |
| # Save BibTeX file | |
| bibtex_filename = filename.replace('.pkl', '.bib') | |
| bibtex_path = os.path.join(COLLECTION_DB_DIR, bibtex_filename) | |
| with open(bibtex_path, 'w', encoding='utf-8') as f: | |
| f.write(bibtex_content) | |
| print(f"BibTeX file saved to: {bibtex_path}") | |
| print(f"File exists: {os.path.exists(bibtex_path)}") | |
| print(f"File size: {os.path.getsize(bibtex_path) if os.path.exists(bibtex_path) else 'N/A'}") | |
| return jsonify({ | |
| 'success': True, | |
| 'message': f'BibTeX file generated with {len(papers)} entries', | |
| 'filename': bibtex_filename, | |
| 'entries_count': len(papers) | |
| }) | |
| except Exception as e: | |
| return jsonify({'success': False, 'message': f'Error generating BibTeX: {str(e)}'}), 500 | |
| def download_database_file(filename): | |
| """Download a database file (collection, filter, or BibTeX).""" | |
| try: | |
| print(f"Attempting to download file: {filename}") | |
| # Try collections first, then filters, then legacy | |
| filepath = os.path.join(COLLECTION_DB_DIR, filename) | |
| print(f"Checking collections path: {filepath}") | |
| if not os.path.exists(filepath): | |
| filepath = os.path.join(FILTER_DB_DIR, filename) | |
| print(f"Checking filters path: {filepath}") | |
| if not os.path.exists(filepath): | |
| filepath = os.path.join(DATABASE_DIR, filename) | |
| print(f"Checking legacy path: {filepath}") | |
| if not os.path.exists(filepath): | |
| print(f"File not found in any directory: {filename}") | |
| return jsonify({'error': 'File not found'}), 404 | |
| print(f"Found file at: {filepath}") | |
| print(f"File size: {os.path.getsize(filepath)}") | |
| return send_file(filepath, as_attachment=True, download_name=filename) | |
| except Exception as e: | |
| print(f"Error in download_database_file: {e}") | |
| return jsonify({'error': str(e)}), 500 | |
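| # Optional hardening sketch (an assumption, not applied by the code above): the | |
| # file endpoints build paths directly from a user-supplied filename, so | |
| # sanitising it first would guard against path traversal. secure_filename comes | |
| # from Werkzeug, which Flask already depends on; _safe_db_filename is hypothetical. | |
| from werkzeug.utils import secure_filename | |
| def _safe_db_filename(filename): | |
|     """Return a sanitised filename, or None if nothing safe remains.""" | |
|     cleaned = secure_filename(filename or '') | |
|     return cleaned or None | |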
| def merge_collections(): | |
| """Merge multiple collections into a new collection with overlap analysis.""" | |
| try: | |
| data = request.get_json() | |
| collection_filenames = data.get('collections', []) | |
| if len(collection_filenames) < 2: | |
| return jsonify({'success': False, 'message': 'At least 2 collections required for merging'}), 400 | |
| # Load all collections and track their work IDs | |
| collections_data = [] | |
| all_work_ids = set() | |
| collection_work_ids = [] # List of sets, one per collection | |
| for filename in collection_filenames: | |
| collection_path = os.path.join(COLLECTION_DB_DIR, filename) | |
| if not os.path.exists(collection_path): | |
| return jsonify({'success': False, 'message': f'Collection {filename} not found'}), 404 | |
| with open(collection_path, 'rb') as f: | |
| collection_data = pickle.load(f) | |
| papers = collection_data.get('papers', []) | |
| collection_work_ids_set = set() | |
| # Extract work IDs for this collection | |
| for paper in papers: | |
| if isinstance(paper, dict): | |
| work_id = paper.get('id', '') | |
| if work_id: | |
| collection_work_ids_set.add(work_id) | |
| all_work_ids.add(work_id) | |
| collections_data.append({ | |
| 'filename': filename, | |
| 'title': collection_data.get('title', filename.replace('.pkl', '')), | |
| 'papers': papers, | |
| 'work_ids': collection_work_ids_set, | |
| 'total_papers': len(papers) | |
| }) | |
| collection_work_ids.append(collection_work_ids_set) | |
| # Calculate overlap statistics | |
| overlap_stats = [] | |
| total_unique_papers = len(all_work_ids) | |
| for i, collection in enumerate(collections_data): | |
| collection_work_ids_i = collection_work_ids[i] | |
| overlaps = [] | |
| # Calculate overlap with each other collection | |
| for j, other_collection in enumerate(collections_data): | |
| if i != j: | |
| other_work_ids = collection_work_ids[j] | |
| intersection = collection_work_ids_i.intersection(other_work_ids) | |
| overlap_count = len(intersection) | |
| overlap_percentage = (overlap_count / len(collection_work_ids_i)) * 100 if collection_work_ids_i else 0 | |
| overlaps.append({ | |
| 'collection': other_collection['title'], | |
| 'overlap_count': overlap_count, | |
| 'overlap_percentage': round(overlap_percentage, 1) | |
| }) | |
| overlap_stats.append({ | |
| 'collection': collection['title'], | |
| 'total_papers': collection['total_papers'], | |
| 'overlaps': overlaps | |
| }) | |
| # Create merged collection with unique papers only | |
| merged_papers = [] | |
| merged_work_ids = set() | |
| for collection in collections_data: | |
| for paper in collection['papers']: | |
| if isinstance(paper, dict): | |
| work_id = paper.get('id', '') | |
| if work_id and work_id not in merged_work_ids: | |
| merged_papers.append(paper) | |
| merged_work_ids.add(work_id) | |
| if not merged_papers: | |
| return jsonify({'success': False, 'message': 'No papers found in collections to merge'}), 400 | |
| # Calculate total papers across all collections (before deduplication) | |
| total_papers_before_merge = sum(collection['total_papers'] for collection in collections_data) | |
| duplicates_removed = total_papers_before_merge - len(merged_papers) | |
| deduplication_percentage = (duplicates_removed / total_papers_before_merge) * 100 if total_papers_before_merge > 0 else 0 | |
| # Create merged collection data | |
| collection_titles = [collection['title'] for collection in collections_data] | |
| merged_title = f"MERGED: {' + '.join(collection_titles[:3])}" | |
| if len(collection_titles) > 3: | |
| merged_title += f" + {len(collection_titles) - 3} more" | |
| merged_data = { | |
| 'work_identifier': f"merged_{int(time.time())}", | |
| 'title': merged_title, | |
| 'work_id': '', | |
| 'papers': merged_papers, | |
| 'total_papers': len(merged_papers), | |
| 'created': datetime.now().isoformat(), | |
| 'source_collections': collection_filenames, | |
| 'merge_stats': { | |
| 'total_papers_before_merge': total_papers_before_merge, | |
| 'duplicates_removed': duplicates_removed, | |
| 'deduplication_percentage': round(deduplication_percentage, 1), | |
| 'overlap_analysis': overlap_stats | |
| } | |
| } | |
| # Save merged collection | |
| merged_filename = f"merged_{int(time.time())}.pkl" | |
| merged_path = os.path.join(COLLECTION_DB_DIR, merged_filename) | |
| with open(merged_path, 'wb') as f: | |
| pickle.dump(merged_data, f) | |
| return jsonify({ | |
| 'success': True, | |
| 'message': f'Merged collection created with {len(merged_papers)} unique papers (removed {duplicates_removed} duplicates)', | |
| 'filename': merged_filename, | |
| 'total_papers': len(merged_papers), | |
| 'merge_stats': { | |
| 'total_papers_before_merge': total_papers_before_merge, | |
| 'duplicates_removed': duplicates_removed, | |
| 'deduplication_percentage': round(deduplication_percentage, 1), | |
| 'overlap_analysis': overlap_stats | |
| } | |
| }) | |
| except Exception as e: | |
| return jsonify({'success': False, 'message': f'Error merging collections: {str(e)}'}), 500 | |
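| # Illustrative shape (placeholder numbers) of the merge_stats block returned above | |
| # for two collections A (180 papers) and B (120 papers) sharing 40 papers: | |
| #   {"total_papers_before_merge": 300, | |
| #    "duplicates_removed": 40, | |
| #    "deduplication_percentage": 13.3, | |
| #    "overlap_analysis": [ | |
| #      {"collection": "A", "total_papers": 180, | |
| #       "overlaps": [{"collection": "B", "overlap_count": 40, "overlap_percentage": 22.2}]}, | |
| #      {"collection": "B", "total_papers": 120, | |
| #       "overlaps": [{"collection": "A", "overlap_count": 40, "overlap_percentage": 33.3}]}]} | |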
| def fetch_abstracts(): | |
| """Fetch missing abstracts for papers using their DOI URLs.""" | |
| try: | |
| data = request.get_json() | |
| papers = data.get('papers', []) | |
| if not papers: | |
| return jsonify({'error': 'No papers provided'}), 400 | |
| updated_papers = [] | |
| fetched_count = 0 | |
| total_processed = 0 | |
| for paper in papers: | |
| total_processed += 1 | |
| updated_paper = paper.copy() | |
| # Check if paper already has abstract (check both abstract_inverted_index and abstract fields) | |
| has_abstract = ( | |
| (paper.get('abstract_inverted_index') and | |
| len(paper.get('abstract_inverted_index', {})) > 0) or | |
| (paper.get('abstract') and | |
| len(str(paper.get('abstract', '')).strip()) > 50) | |
| ) | |
| if not has_abstract and paper.get('doi'): | |
| print(f"Fetching abstract for DOI: {paper.get('doi')}") | |
| abstract = fetch_abstract_from_doi(paper.get('doi')) | |
| if abstract: | |
| # Convert to inverted index format | |
| inverted_index = convert_abstract_to_inverted_index(abstract) | |
| updated_paper['abstract_inverted_index'] = inverted_index | |
| fetched_count += 1 | |
| print(f"Successfully fetched abstract for: {paper.get('title', 'Unknown')[:50]}...") | |
| else: | |
| print(f"Could not fetch abstract for: {paper.get('title', 'Unknown')[:50]}...") | |
| updated_papers.append(updated_paper) | |
| return jsonify({ | |
| 'success': True, | |
| 'fetched_count': fetched_count, | |
| 'total_processed': total_processed, | |
| 'updated_papers': updated_papers | |
| }) | |
| except Exception as e: | |
| print(f"Error fetching abstracts: {e}") | |
| return jsonify({'error': str(e)}), 500 | |
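| # Note on the abstract format (illustrative example): convert_abstract_to_inverted_index, | |
| # defined elsewhere in this file, is expected to map each word to its list of positions, | |
| # the same structure the export routines below unpack, e.g. | |
| #   "deep learning for deep networks" -> | |
| #   {"deep": [0, 3], "learning": [1], "for": [2], "networks": [4]} | |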
| def export_excel_from_file(filename): | |
| """Export Excel from a specific database file.""" | |
| try: | |
| # Try collections then filters then legacy | |
| filepath = os.path.join(COLLECTION_DB_DIR, filename) | |
| if not os.path.exists(filepath): | |
| filepath = os.path.join(FILTER_DB_DIR, filename) | |
| if not os.path.exists(filepath): | |
| filepath = os.path.join(DATABASE_DIR, filename) | |
| if not os.path.exists(filepath): | |
| return jsonify({'error': 'File not found'}), 404 | |
| with open(filepath, 'rb') as f: | |
| data = pickle.load(f) | |
| papers = data.get('papers', []) | |
| if not papers: | |
| return jsonify({'error': 'No papers found in file'}), 400 | |
| # Prepare data for Excel export | |
| excel_data = [] | |
| for paper in papers: | |
| # Extract abstract from inverted index | |
| abstract = "" | |
| if paper.get('abstract_inverted_index'): | |
| words = [] | |
| for word, positions in paper['abstract_inverted_index'].items(): | |
| for pos in positions: | |
| while len(words) <= pos: | |
| words.append('') | |
| words[pos] = word | |
| abstract = ' '.join(words).strip() | |
| # Extract open access info with null checks | |
| oa_info = paper.get('open_access') or {} | |
| is_oa = oa_info.get('is_oa', False) if oa_info else False | |
| oa_status = oa_info.get('oa_status', '') if oa_info else '' | |
| # Extract DOI with null check | |
| doi = "" | |
| if paper.get('doi'): | |
| doi = paper['doi'].replace('https://doi.org/', '') | |
| # Extract authors with null checks | |
| authors = paper.get('authorships') or [] | |
| author_names = [] | |
| for author in authors[:5]: # Limit to first 5 authors | |
| if author and isinstance(author, dict): | |
| author_obj = author.get('author') or {} | |
| if author_obj and isinstance(author_obj, dict): | |
| author_names.append(author_obj.get('display_name', '')) | |
| # Extract journal with null checks | |
| journal = "" | |
| primary_location = paper.get('primary_location') | |
| if primary_location and isinstance(primary_location, dict): | |
| source = primary_location.get('source') | |
| if source and isinstance(source, dict): | |
| journal = source.get('display_name', '') | |
| # Extract GPT analysis with null checks | |
| gpt_analysis = paper.get('gpt_analysis') or {} | |
| gpt_aims = gpt_analysis.get('aims_of_paper', '') if gpt_analysis else '' | |
| gpt_takeaways = gpt_analysis.get('key_takeaways', '') if gpt_analysis else '' | |
| excel_data.append({ | |
| 'Title': paper.get('title', ''), | |
| 'Publication Date': paper.get('publication_date', ''), | |
| 'DOI': doi, | |
| 'Is Open Access': is_oa, | |
| 'OA Status': oa_status, | |
| 'Abstract': abstract, | |
| 'Relationship': paper.get('relationship', ''), | |
| 'Authors': ', '.join(author_names), | |
| 'Journal': journal, | |
| 'OpenAlex ID': paper.get('id', ''), | |
| 'Relevance Reason': paper.get('relevance_reason', ''), | |
| 'GPT Aims': gpt_aims, | |
| 'GPT Takeaways': gpt_takeaways | |
| }) | |
| # Create DataFrame and export to Excel | |
| df = pd.DataFrame(excel_data) | |
| excel_filename = f'{filename.replace(".pkl", "")}_{int(time.time())}.xlsx' | |
| # Create Excel file in a temporary location | |
| temp_dir = tempfile.gettempdir() | |
| excel_path = os.path.join(temp_dir, excel_filename) | |
| try: | |
| df.to_excel(excel_path, index=False) | |
| return send_file(excel_path, as_attachment=True, download_name=excel_filename) | |
| except Exception as e: | |
| print(f"Error creating Excel file: {e}") | |
| # Fallback: try current directory | |
| try: | |
| df.to_excel(excel_filename, index=False) | |
| return send_file(excel_filename, as_attachment=True, download_name=excel_filename) | |
| except Exception as e2: | |
| print(f"Error creating Excel file in current directory: {e2}") | |
| return jsonify({'error': f'Failed to create Excel file: {str(e2)}'}), 500 | |
| except Exception as e: | |
| print(f"Error exporting Excel: {e}") | |
| return jsonify({'error': str(e)}), 500 | |
| def export_excel(): | |
| """Export collected papers to Excel format.""" | |
| try: | |
| # Load papers from temporary file | |
| if not os.path.exists('temp_papers.pkl'): | |
| return jsonify({'error': 'No papers found. Please collect papers first.'}), 400 | |
| with open('temp_papers.pkl', 'rb') as f: | |
| papers = pickle.load(f) | |
| # Prepare data for Excel export | |
| excel_data = [] | |
| for paper in papers: | |
| # Extract abstract from inverted index | |
| abstract = "" | |
| if paper.get('abstract_inverted_index'): | |
| words = [] | |
| for word, positions in paper['abstract_inverted_index'].items(): | |
| for pos in positions: | |
| while len(words) <= pos: | |
| words.append('') | |
| words[pos] = word | |
| abstract = ' '.join(words).strip() | |
| # Extract open access info with null checks | |
| oa_info = paper.get('open_access') or {} | |
| is_oa = oa_info.get('is_oa', False) if oa_info else False | |
| oa_status = oa_info.get('oa_status', '') if oa_info else '' | |
| # Extract DOI with null check | |
| doi = "" | |
| if paper.get('doi'): | |
| doi = paper['doi'].replace('https://doi.org/', '') | |
| # Extract authors with null checks | |
| authors = paper.get('authorships') or [] | |
| author_names = [] | |
| for author in authors[:5]: # Limit to first 5 authors | |
| if author and isinstance(author, dict): | |
| author_obj = author.get('author') or {} | |
| if author_obj and isinstance(author_obj, dict): | |
| author_names.append(author_obj.get('display_name', '')) | |
| # Extract journal with null checks | |
| journal = "" | |
| primary_location = paper.get('primary_location') | |
| if primary_location and isinstance(primary_location, dict): | |
| source = primary_location.get('source') | |
| if source and isinstance(source, dict): | |
| journal = source.get('display_name', '') | |
| # Extract GPT analysis with null checks | |
| gpt_analysis = paper.get('gpt_analysis') or {} | |
| gpt_aims = gpt_analysis.get('aims_of_paper', '') if gpt_analysis else '' | |
| gpt_takeaways = gpt_analysis.get('key_takeaways', '') if gpt_analysis else '' | |
| excel_data.append({ | |
| 'Title': paper.get('title', ''), | |
| 'Publication Date': paper.get('publication_date', ''), | |
| 'DOI': doi, | |
| 'Is Open Access': is_oa, | |
| 'OA Status': oa_status, | |
| 'Abstract': abstract, | |
| 'Relationship': paper.get('relationship', ''), | |
| 'Authors': ', '.join(author_names), | |
| 'Journal': journal, | |
| 'OpenAlex ID': paper.get('id', ''), | |
| 'Relevance Reason': paper.get('relevance_reason', ''), | |
| 'GPT Aims': gpt_aims, | |
| 'GPT Takeaways': gpt_takeaways | |
| }) | |
| # Create DataFrame and export to Excel | |
| df = pd.DataFrame(excel_data) | |
| excel_filename = f'research_papers_{int(time.time())}.xlsx' | |
| # Create Excel file in a temporary location | |
| temp_dir = tempfile.gettempdir() | |
| excel_path = os.path.join(temp_dir, excel_filename) | |
| try: | |
| df.to_excel(excel_path, index=False) | |
| return send_file(excel_path, as_attachment=True, download_name=excel_filename) | |
| except Exception as e: | |
| print(f"Error creating Excel file: {e}") | |
| # Fallback: try current directory | |
| try: | |
| df.to_excel(excel_filename, index=False) | |
| return send_file(excel_filename, as_attachment=True, download_name=excel_filename) | |
| except Exception as e2: | |
| print(f"Error creating Excel file in current directory: {e2}") | |
| return jsonify({'error': f'Failed to create Excel file: {str(e2)}'}), 500 | |
| except Exception as e: | |
| print(f"Error exporting Excel: {e}") | |
| return jsonify({'error': str(e)}), 500 | |
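| # Hedged sketch (not wired in): export_excel_from_file and export_excel build | |
| # identical row dictionaries; a shared helper such as this one (the name | |
| # _paper_to_excel_row is hypothetical) would let both endpoints reuse that logic. | |
| def _paper_to_excel_row(paper): | |
|     """Flatten one OpenAlex work dict into the Excel row format used above.""" | |
|     # Reconstruct the abstract from the inverted index, if present | |
|     abstract = "" | |
|     if paper.get('abstract_inverted_index'): | |
|         words = [] | |
|         for word, positions in paper['abstract_inverted_index'].items(): | |
|             for pos in positions: | |
|                 while len(words) <= pos: | |
|                     words.append('') | |
|                 words[pos] = word | |
|         abstract = ' '.join(words).strip() | |
|     oa_info = paper.get('open_access') or {} | |
|     source = (paper.get('primary_location') or {}).get('source') or {} | |
|     gpt_analysis = paper.get('gpt_analysis') or {} | |
|     author_names = [] | |
|     for authorship in (paper.get('authorships') or [])[:5]:  # first 5 authors, as above | |
|         if isinstance(authorship, dict): | |
|             name = (authorship.get('author') or {}).get('display_name', '') | |
|             if name: | |
|                 author_names.append(name) | |
|     return { | |
|         'Title': paper.get('title', ''), | |
|         'Publication Date': paper.get('publication_date', ''), | |
|         'DOI': (paper.get('doi') or '').replace('https://doi.org/', ''), | |
|         'Is Open Access': oa_info.get('is_oa', False), | |
|         'OA Status': oa_info.get('oa_status', ''), | |
|         'Abstract': abstract, | |
|         'Relationship': paper.get('relationship', ''), | |
|         'Authors': ', '.join(author_names), | |
|         'Journal': source.get('display_name', ''), | |
|         'OpenAlex ID': paper.get('id', ''), | |
|         'Relevance Reason': paper.get('relevance_reason', ''), | |
|         'GPT Aims': gpt_analysis.get('aims_of_paper', ''), | |
|         'GPT Takeaways': gpt_analysis.get('key_takeaways', ''), | |
|     } | |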
| def paper_details(work_id): | |
| """Get detailed analysis for a specific paper.""" | |
| try: | |
| # Load papers from temporary file | |
| if not os.path.exists('temp_papers.pkl'): | |
| return jsonify({'error': 'No papers found'}), 400 | |
| with open('temp_papers.pkl', 'rb') as f: | |
| papers = pickle.load(f) | |
| # Find the specific paper | |
| paper = next((p for p in papers if p.get('id') == work_id), None) | |
| if not paper: | |
| return jsonify({'error': 'Paper not found'}), 404 | |
| return jsonify({ | |
| 'success': True, | |
| 'paper': paper | |
| }) | |
| except Exception as e: | |
| print(f"Error getting paper details: {e}") | |
| return jsonify({'error': str(e)}), 500 | |
| if __name__ == '__main__': | |
| # Create templates directory if it doesn't exist | |
| os.makedirs('templates', exist_ok=True) | |
| port = int(os.getenv('PORT', '5000')) | |
| app.run(debug=False, host='0.0.0.0', port=port) | |