AvikalpK commited on
Commit
3be0d8a
·
1 Parent(s): d9dd652

Add Firecrawl integration for superior web scraping

Browse files
Files changed (3) hide show
  1. config.py +1 -0
  2. micro/scrape.py +149 -2
  3. requirements.txt +1 -0
config.py CHANGED
@@ -9,6 +9,7 @@ load_dotenv()
9
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "your_openai_key_here")
10
  ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "your_anthropic_key_here")
11
  SERPAPI_KEY = os.getenv("SERPAPI_KEY", "your_serpapi_key_here")
 
12
 
13
  # LLM Configuration
14
  LLM_CONFIG: Dict[str, Any] = {
 
9
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "your_openai_key_here")
10
  ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "your_anthropic_key_here")
11
  SERPAPI_KEY = os.getenv("SERPAPI_KEY", "your_serpapi_key_here")
12
# Firecrawl API key for web scraping.
# SECURITY FIX: a live secret ("fc-08e4…") was previously committed here as the
# getenv fallback — that key must be revoked. Use the same placeholder pattern
# as the other keys above so the availability check in micro/scrape.py
# (which compares against "your_firecrawl_key_here") works as intended.
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "your_firecrawl_key_here")
13
 
14
  # LLM Configuration
15
  LLM_CONFIG: Dict[str, Any] = {
micro/scrape.py CHANGED
@@ -4,6 +4,15 @@ import re
4
  from typing import Dict, Tuple, Optional
5
  from datetime import datetime
6
 
 
 
 
 
 
 
 
 
 
7
  # Try to import Selenium, but handle gracefully if not available
8
  try:
9
  from selenium import webdriver
@@ -149,6 +158,17 @@ class ScrapeMicroFunction:
149
  """Micro-function for web scraping with enhanced preview extraction"""
150
 
151
  def __init__(self):
 
 
 
 
 
 
 
 
 
 
 
152
  if SELENIUM_AVAILABLE:
153
  self.chrome_options = Options()
154
  self.chrome_options.add_argument('--headless')
@@ -212,8 +232,11 @@ class ScrapeMicroFunction:
212
  def _scrape_url(self, url: str) -> dict:
213
  """Scrape URL and extract both preview and full content"""
214
  try:
215
- # Try LinkedIn-specific scraping first
216
- if 'linkedin.com' in url:
 
 
 
217
  return self._scrape_linkedin(url)
218
  else:
219
  return self._scrape_generic(url)
@@ -301,6 +324,130 @@ class ScrapeMicroFunction:
301
  # Fallback to Selenium
302
  return self._scrape_with_selenium(url)
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  def _scrape_with_selenium(self, url: str) -> dict:
305
  """Selenium fallback for sites that block requests"""
306
  if not SELENIUM_AVAILABLE:
 
4
  from typing import Dict, Tuple, Optional
5
  from datetime import datetime
6
 
7
# Try to import Firecrawl; degrade gracefully when the package is missing
# or no real API key is configured.
try:
    from firecrawl import FirecrawlApp
    from config import FIRECRAWL_API_KEY
    # Enabled only when a key is present AND it is not the config placeholder.
    # (bool() also rejects an empty/unset key, which the old
    # `True and key != "your_firecrawl_key_here"` check let through.)
    FIRECRAWL_AVAILABLE = bool(FIRECRAWL_API_KEY) and FIRECRAWL_API_KEY != "your_firecrawl_key_here"
except ImportError:
    FIRECRAWL_AVAILABLE = False
    print("Warning: Firecrawl not available. Web scraping will use fallback methods.")
16
  # Try to import Selenium, but handle gracefully if not available
17
  try:
18
  from selenium import webdriver
 
158
  """Micro-function for web scraping with enhanced preview extraction"""
159
 
160
  def __init__(self):
161
+ # Initialize Firecrawl client if available
162
+ if FIRECRAWL_AVAILABLE:
163
+ try:
164
+ self.firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
165
+ print("✅ Firecrawl client initialized successfully")
166
+ except Exception as e:
167
+ print(f"⚠️ Firecrawl initialization failed: {e}")
168
+ self.firecrawl_app = None
169
+ else:
170
+ self.firecrawl_app = None
171
+
172
  if SELENIUM_AVAILABLE:
173
  self.chrome_options = Options()
174
  self.chrome_options.add_argument('--headless')
 
232
  def _scrape_url(self, url: str) -> dict:
233
  """Scrape URL and extract both preview and full content"""
234
  try:
235
+ # Try Firecrawl first if available (works for all sites including LinkedIn)
236
+ if self.firecrawl_app:
237
+ return self._scrape_with_firecrawl(url)
238
+ # Fallback to site-specific methods
239
+ elif 'linkedin.com' in url:
240
  return self._scrape_linkedin(url)
241
  else:
242
  return self._scrape_generic(url)
 
324
  # Fallback to Selenium
325
  return self._scrape_with_selenium(url)
326
 
327
def _scrape_with_firecrawl(self, url: str) -> dict:
    """Firecrawl scraping - works for all sites including LinkedIn"""

    def fallback(target: str) -> dict:
        # Route to the site-specific scraper when Firecrawl can't be used.
        if 'linkedin.com' in target:
            return self._scrape_linkedin(target)
        return self._scrape_generic(target)

    # No client configured: go straight to the fallback path.
    if not self.firecrawl_app:
        return fallback(url)

    try:
        print(f"🔥 Using Firecrawl to scrape: {url}")

        # Ask Firecrawl for LLM-ready markdown plus raw HTML.
        response = self.firecrawl_app.scrape_url(
            url,
            formats=['markdown', 'html'],
            only_main_content=True,  # Focus on main content
            timeout=30000
        )

        # Anything without a .data attribute is treated as a failure.
        if not (response and hasattr(response, 'data')):
            if response:
                reason = getattr(response, 'error', 'Unknown Firecrawl error')
            else:
                reason = 'No response from Firecrawl'
            return {
                'success': False,
                'error': f"Firecrawl failed: {reason}",
                'preview': {'company': 'Error', 'role': 'Firecrawl failed', 'location': '', 'posted_days': ''},
                'content': ''
            }

        # Pull the fields defensively — the SDK response shape may vary.
        payload = response.data
        md_text = getattr(payload, 'markdown', '') or ''
        html_text = getattr(payload, 'html', '') or ''
        meta = getattr(payload, 'metadata', {}) or {}

        # Seed the preview from page metadata (title doubles as the role).
        page_title = meta.get('title', 'Not specified') if isinstance(meta, dict) else 'Not specified'
        preview = {
            'company': 'Not specified',
            'role': page_title,
            'location': 'Not specified',
            'posted_days': 'Recently'
        }

        # Merge in whichever fields the markdown heuristics could pin down.
        refined = self._extract_preview_from_markdown(md_text, url)
        for field, value in refined.items():
            if value != 'Not specified':
                preview[field] = value

        return {
            'success': True,
            'content': md_text or html_text,
            'html_content': html_text,
            'markdown_content': md_text,
            'metadata': meta,
            'preview': preview,
            'url': url,
            'scraping_method': 'firecrawl'
        }

    except Exception as e:
        print(f"❌ Firecrawl error: {str(e)}")
        # Any Firecrawl failure falls back to the site-specific scrapers.
        return fallback(url)
393
+
394
+ def _extract_preview_from_markdown(self, markdown: str, url: str) -> dict:
395
+ """Extract preview info from Firecrawl markdown content"""
396
+ preview = {
397
+ 'company': 'Not specified',
398
+ 'role': 'Not specified',
399
+ 'location': 'Not specified',
400
+ 'posted_days': 'Recently'
401
+ }
402
+
403
+ if not markdown:
404
+ return preview
405
+
406
+ lines = markdown.split('\n')
407
+
408
+ # Enhanced extraction for different job sites
409
+ for i, line in enumerate(lines[:15]): # Check first 15 lines
410
+ line = line.strip()
411
+ if len(line) < 3:
412
+ continue
413
+
414
+ # Extract from headers (usually job titles)
415
+ if line.startswith('#') and i < 5:
416
+ clean_title = line.lstrip('#').strip()
417
+ if not any(word in clean_title.lower() for word in ['about', 'company', 'description', 'overview']):
418
+ preview['role'] = clean_title
419
+
420
+ # LinkedIn specific patterns
421
+ if 'linkedin.com' in url:
422
+ # Company name after role
423
+ if ' at ' in line and preview['role'] != 'Not specified':
424
+ parts = line.split(' at ')
425
+ if len(parts) == 2:
426
+ preview['company'] = parts[1].strip()
427
+
428
+ # Location patterns
429
+ location_match = re.search(r'([^,]+,\s*[A-Z]{2}(?:\s*\d{5})?)', line)
430
+ if location_match:
431
+ preview['location'] = location_match.group(1).strip()
432
+
433
+ # General patterns for other sites
434
+ elif any(site in url for site in ['microsoft.com', 'google.com', 'amazon', 'meta.com']):
435
+ # Extract company from URL
436
+ if 'microsoft.com' in url:
437
+ preview['company'] = 'Microsoft'
438
+ elif 'google.com' in url:
439
+ preview['company'] = 'Google'
440
+ elif 'amazon' in url:
441
+ preview['company'] = 'Amazon'
442
+ elif 'meta.com' in url:
443
+ preview['company'] = 'Meta'
444
+
445
+ # Look for location in structured content
446
+ if re.search(r'\b(Remote|Hybrid|On-site)\b', line, re.IGNORECASE):
447
+ preview['location'] = line.strip()
448
+
449
+ return preview
450
+
451
  def _scrape_with_selenium(self, url: str) -> dict:
452
  """Selenium fallback for sites that block requests"""
453
  if not SELENIUM_AVAILABLE:
requirements.txt CHANGED
@@ -22,3 +22,4 @@ google-auth-httplib2>=0.1.0
22
  authlib>=1.2.0
23
  selenium>=4.0.0
24
  webdriver-manager>=3.8.0
 
 
22
  authlib>=1.2.0
23
  selenium>=4.0.0
24
  webdriver-manager>=3.8.0
25
+ firecrawl-py>=0.0.20