Add Firecrawl integration for superior web scraping
- config.py +1 -0
- micro/scrape.py +149 -2
- requirements.txt +1 -0

config.py
CHANGED

@@ -9,6 +9,7 @@ load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "your_openai_key_here")
 ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "your_anthropic_key_here")
 SERPAPI_KEY = os.getenv("SERPAPI_KEY", "your_serpapi_key_here")
+FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "fc-08e46542bfcc4ca7a953fac4dea4237e")
 
 # LLM Configuration
 LLM_CONFIG: Dict[str, Any] = {
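
The new setting follows the same pattern as the existing keys: the environment variable wins and the second argument is only a fallback (note that the committed fallback here is a literal key rather than a placeholder, so overriding it via the environment matters). A minimal sketch of the matching .env entry, assuming python-dotenv as the load_dotenv() call in this file implies; the key value below is a placeholder:

    # .env (hypothetical local file, kept out of version control)
    FIRECRAWL_API_KEY=fc-your-key-here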

micro/scrape.py
CHANGED

@@ -4,6 +4,15 @@ import re
 from typing import Dict, Tuple, Optional
 from datetime import datetime
 
+# Try to import Firecrawl
+try:
+    from firecrawl import FirecrawlApp
+    from config import FIRECRAWL_API_KEY
+    FIRECRAWL_AVAILABLE = True and FIRECRAWL_API_KEY != "your_firecrawl_key_here"
+except ImportError:
+    FIRECRAWL_AVAILABLE = False
+    print("Warning: Firecrawl not available. Web scraping will use fallback methods.")
+
 # Try to import Selenium, but handle gracefully if not available
 try:
     from selenium import webdriver
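
As with the Selenium block that follows, the import is wrapped so a missing package degrades to a module-level flag instead of an ImportError at startup. The same pattern reduced to its core (a sketch; optional_dep stands in for any optional dependency):

    try:
        import optional_dep
        OPTIONAL_AVAILABLE = True
    except ImportError:
        OPTIONAL_AVAILABLE = False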

@@ -149,6 +158,17 @@ class ScrapeMicroFunction:
     """Micro-function for web scraping with enhanced preview extraction"""
 
     def __init__(self):
+        # Initialize Firecrawl client if available
+        if FIRECRAWL_AVAILABLE:
+            try:
+                self.firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
+                print("✅ Firecrawl client initialized successfully")
+            except Exception as e:
+                print(f"⚠️ Firecrawl initialization failed: {e}")
+                self.firecrawl_app = None
+        else:
+            self.firecrawl_app = None
+
         if SELENIUM_AVAILABLE:
             self.chrome_options = Options()
             self.chrome_options.add_argument('--headless')
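
Firecrawl availability is now decided once per instance and cached on self.firecrawl_app. A quick smoke test, assuming the module imports as micro.scrape in this repo:

    from micro.scrape import ScrapeMicroFunction

    scraper = ScrapeMicroFunction()           # prints the ✅/⚠️ status line when Firecrawl is configured
    print(scraper.firecrawl_app is not None)  # True only if the client initialized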

@@ -212,8 +232,11 @@ class ScrapeMicroFunction:
     def _scrape_url(self, url: str) -> dict:
         """Scrape URL and extract both preview and full content"""
         try:
-            # Try
-            if 'linkedin.com' in url:
+            # Try Firecrawl first if available (works for all sites including LinkedIn)
+            if self.firecrawl_app:
+                return self._scrape_with_firecrawl(url)
+            # Fallback to site-specific methods
+            elif 'linkedin.com' in url:
                 return self._scrape_linkedin(url)
             else:
                 return self._scrape_generic(url)
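
The effective priority is now Firecrawl first, then the LinkedIn-specific scraper, then the generic path. A usage sketch (the URL is invented, and _scrape_url is an internal method, so treat this as illustration rather than public API):

    scraper = ScrapeMicroFunction()
    result = scraper._scrape_url("https://www.linkedin.com/jobs/view/123456")  # hypothetical posting
    print(result.get("scraping_method"), result.get("preview"))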

@@ -301,6 +324,130 @@ class ScrapeMicroFunction:
         # Fallback to Selenium
         return self._scrape_with_selenium(url)
 
+    def _scrape_with_firecrawl(self, url: str) -> dict:
+        """Firecrawl scraping - works for all sites including LinkedIn"""
+        if not self.firecrawl_app:
+            # Fallback to other methods if Firecrawl not available
+            if 'linkedin.com' in url:
+                return self._scrape_linkedin(url)
+            else:
+                return self._scrape_generic(url)
+
+        try:
+            print(f"🔥 Using Firecrawl to scrape: {url}")
+
+            # Use Firecrawl to scrape the URL and get LLM-ready markdown
+            scrape_result = self.firecrawl_app.scrape_url(
+                url,
+                formats=['markdown', 'html'],
+                only_main_content=True,  # Focus on main content
+                timeout=30000
+            )
+
+            if scrape_result and hasattr(scrape_result, 'data'):
+                # Handle Firecrawl response object structure
+                data = scrape_result.data
+                markdown_content = getattr(data, 'markdown', '') or ''
+                html_content = getattr(data, 'html', '') or ''
+                metadata = getattr(data, 'metadata', {}) or {}
+
+                # Create preview from metadata and content
+                title = metadata.get('title', 'Not specified') if isinstance(metadata, dict) else 'Not specified'
+                preview = {
+                    'company': 'Not specified',
+                    'role': title,
+                    'location': 'Not specified',
+                    'posted_days': 'Recently'
+                }
+
+                # Try to extract better preview info from markdown content
+                enhanced_preview = self._extract_preview_from_markdown(markdown_content, url)
+                preview.update({k: v for k, v in enhanced_preview.items() if v != 'Not specified'})
+
+                return {
+                    'success': True,
+                    'content': markdown_content or html_content,
+                    'html_content': html_content,
+                    'markdown_content': markdown_content,
+                    'metadata': metadata,
+                    'preview': preview,
+                    'url': url,
+                    'scraping_method': 'firecrawl'
+                }
+            else:
+                error_msg = getattr(scrape_result, 'error', 'Unknown Firecrawl error') if scrape_result else 'No response from Firecrawl'
+                return {
+                    'success': False,
+                    'error': f"Firecrawl failed: {error_msg}",
+                    'preview': {'company': 'Error', 'role': 'Firecrawl failed', 'location': '', 'posted_days': ''},
+                    'content': ''
+                }
+
+        except Exception as e:
+            print(f"❌ Firecrawl error: {str(e)}")
+            # Fallback to other methods
+            if 'linkedin.com' in url:
+                return self._scrape_linkedin(url)
+            else:
+                return self._scrape_generic(url)
+
+    def _extract_preview_from_markdown(self, markdown: str, url: str) -> dict:
+        """Extract preview info from Firecrawl markdown content"""
+        preview = {
+            'company': 'Not specified',
+            'role': 'Not specified',
+            'location': 'Not specified',
+            'posted_days': 'Recently'
+        }
+
+        if not markdown:
+            return preview
+
+        lines = markdown.split('\n')
+
+        # Enhanced extraction for different job sites
+        for i, line in enumerate(lines[:15]):  # Check first 15 lines
+            line = line.strip()
+            if len(line) < 3:
+                continue
+
+            # Extract from headers (usually job titles)
+            if line.startswith('#') and i < 5:
+                clean_title = line.lstrip('#').strip()
+                if not any(word in clean_title.lower() for word in ['about', 'company', 'description', 'overview']):
+                    preview['role'] = clean_title
+
+            # LinkedIn specific patterns
+            if 'linkedin.com' in url:
+                # Company name after role
+                if ' at ' in line and preview['role'] != 'Not specified':
+                    parts = line.split(' at ')
+                    if len(parts) == 2:
+                        preview['company'] = parts[1].strip()
+
+                # Location patterns
+                location_match = re.search(r'([^,]+,\s*[A-Z]{2}(?:\s*\d{5})?)', line)
+                if location_match:
+                    preview['location'] = location_match.group(1).strip()
+
+            # General patterns for other sites
+            elif any(site in url for site in ['microsoft.com', 'google.com', 'amazon', 'meta.com']):
+                # Extract company from URL
+                if 'microsoft.com' in url:
+                    preview['company'] = 'Microsoft'
+                elif 'google.com' in url:
+                    preview['company'] = 'Google'
+                elif 'amazon' in url:
+                    preview['company'] = 'Amazon'
+                elif 'meta.com' in url:
+                    preview['company'] = 'Meta'
+
+                # Look for location in structured content
+                if re.search(r'\b(Remote|Hybrid|On-site)\b', line, re.IGNORECASE):
+                    preview['location'] = line.strip()
+
+        return preview
+
     def _scrape_with_selenium(self, url: str) -> dict:
         """Selenium fallback for sites that block requests"""
         if not SELENIUM_AVAILABLE:
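
_extract_preview_from_markdown scans only the first 15 lines of the markdown, takes the first early heading as the role, and then applies site-specific rules. An illustrative call, continuing the sketch above (the markdown is invented; the expected values follow from the rules in the diff):

    md = "# Senior Data Engineer at Acme Corp\nNew York, NY 10001"
    preview = scraper._extract_preview_from_markdown(md, "https://www.linkedin.com/jobs/view/1")
    # preview['role']     -> 'Senior Data Engineer at Acme Corp'  (first heading)
    # preview['company']  -> 'Acme Corp'                          (text after ' at ' on a LinkedIn URL)
    # preview['location'] -> 'New York, NY 10001'                 (City, ST ZIP pattern)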

requirements.txt
CHANGED

@@ -22,3 +22,4 @@ google-auth-httplib2>=0.1.0
 authlib>=1.2.0
 selenium>=4.0.0
 webdriver-manager>=3.8.0
+firecrawl-py>=0.0.20
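
Installing the new dependency into an existing environment is the usual pip install "firecrawl-py>=0.0.20", or simply a re-run of pip install -r requirements.txt.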