Commit 2ac1fd8 · Parent: 16bd62f · commit message: "ok"

Files changed:
- modules/keywords.py (+113, -10)
- simple_pdf_generator.py (+17, -25)
modules/keywords.py CHANGED

(Deleted lines that the source diff view truncates are shown below ending in "…".)

@@ -32,10 +32,12 @@ class KeywordsModule:
 
         # RapidAPI endpoints
         self.enrichment_api_host = "google-keyword-insight1.p.rapidapi.com"
+        self.similarweb_url = "https://similarweb-traffic.p.rapidapi.com/traffic"
 
         # API priority order (tries in this order)
        self.api_sources = [
-            {'name': '…
+            {'name': 'SimilarWeb', 'available': bool(self.rapidapi_key)},     # Primary: SimilarWeb Traffic
+            {'name': 'GoogleInsight', 'available': bool(self.rapidapi_key)},  # Fallback: Google Keyword Insight
         ]
 
         # Performance Configuration
@@ -80,8 +82,11 @@ class KeywordsModule:
         # Try multiple API sources in order of preference
         main_domain_data = self._fetch_domain_keywords_multi_api(domain, quick_scan)
         if not main_domain_data['success']:
-            …
-            …
+            return ModuleResult(
+                success=False,
+                data={},
+                error="All keyword APIs failed - no real data available"
+            )
 
         # Fetch competitor data
         competitor_data = {}
@@ -125,14 +130,16 @@ class KeywordsModule:
         available_apis = [api for api in self.api_sources if api['available']]
 
         if not available_apis:
-            print("No keyword APIs configured…
-            return {'success':…
+            print("No keyword APIs configured")
+            return {'success': False, 'error': 'No RAPIDAPI_KEY configured'}
 
         for api_source in available_apis:
             try:
                 print(f"Trying {api_source['name']} for keyword data...")
 
-                if api_source['name'] == '…
+                if api_source['name'] == 'SimilarWeb':
+                    result = self._fetch_domain_keywords_similarweb(domain, quick_scan)
+                elif api_source['name'] == 'GoogleInsight':
                     result = self._fetch_keywords_enrichment_only(domain, quick_scan)
                 else:
                     continue
@@ -147,8 +154,8 @@ class KeywordsModule:
                 print(f"{api_source['name']} failed: {str(e)}")
                 continue
 
-        print("All APIs failed…
-        return {'success':…
+        print("All APIs failed")
+        return {'success': False, 'error': 'All keyword APIs failed'}
 
 
     def _calculate_domain_statistics(self, keywords: List[Dict]) -> Dict[str, Any]:
@@ -248,12 +255,14 @@ class KeywordsModule:
 
         # Set data source label based on what was actually used
         if hasattr(self, '_current_api_source'):
-            if self._current_api_source == '…
+            if self._current_api_source == 'SimilarWeb':
+                data_source = 'SimilarWeb Traffic API'
+            elif self._current_api_source == 'GoogleInsight':
                 data_source = 'Google Keyword Insight API (rankings estimated)'
             else:
                 data_source = f'{self._current_api_source} API'
         else:
-            data_source = '…
+            data_source = 'Real API data unavailable'
 
         return {
             'totals': totals,
@@ -724,3 +733,97 @@ class KeywordsModule:
             }
         }
 
+    def _fetch_domain_keywords_similarweb(self, domain: str, quick_scan: bool) -> Dict[str, Any]:
+        """Fetch keyword data from SimilarWeb Traffic API"""
+        try:
+            headers = {
+                'x-rapidapi-key': self.rapidapi_key,
+                'x-rapidapi-host': 'similarweb-traffic.p.rapidapi.com'
+            }
+
+            params = {'domain': domain}
+
+            response = requests.get(self.similarweb_url, headers=headers, params=params, timeout=self.timeout)
+
+            if response.status_code == 429:
+                print("SimilarWeb API quota exceeded")
+                raise Exception("Quota exceeded")
+            elif response.status_code == 403:
+                print("SimilarWeb API subscription required")
+                raise Exception("Not subscribed to SimilarWeb API")
+            elif response.status_code != 200:
+                print(f"SimilarWeb API error {response.status_code}: {response.text}")
+                raise Exception(f"API error {response.status_code}")
+
+            data = response.json()
+
+            # Extract top keywords from SimilarWeb response
+            top_keywords = data.get('TopKeywords', [])
+            if not top_keywords:
+                raise Exception("No keywords found in SimilarWeb response")
+
+            # Transform SimilarWeb data to our format
+            keywords = []
+            for i, kw_data in enumerate(top_keywords[:20]):  # Limit to top 20
+                keyword = kw_data.get('Name', '')
+                volume = kw_data.get('Volume', 0)
+                estimated_value = kw_data.get('EstimatedValue', 0)
+
+                # Estimate ranking based on estimated value (higher value = better ranking)
+                # Top keywords are likely ranking well for the domain
+                estimated_rank = min(i + 1, 10) if i < 10 else min(i + 5, 50)
+
+                # Calculate estimated traffic from the estimated value
+                estimated_traffic = int(estimated_value / 10) if estimated_value else 0
+
+                keywords.append({
+                    'keyword': keyword,
+                    'rank': estimated_rank,
+                    'avg_search_volume': volume,
+                    'estimated_traffic_volume': estimated_traffic,
+                    'estimated_value': estimated_value
+                })
+
+            # Calculate domain statistics based on SimilarWeb data
+            total_keywords = len(keywords)
+            top3 = sum(1 for k in keywords if k['rank'] <= 3)
+            top10 = sum(1 for k in keywords if k['rank'] <= 10)
+            top50 = sum(1 for k in keywords if k['rank'] <= 50)
+
+            # Get additional traffic metrics from SimilarWeb
+            engagements = data.get('Engagements', {})
+            visits = int(engagements.get('Visits', 0))
+
+            stats = {
+                'organic': {
+                    'keywords_in_pos_1': sum(1 for k in keywords if k['rank'] == 1),
+                    'keywords_in_pos_2_3': sum(1 for k in keywords if 2 <= k['rank'] <= 3),
+                    'keywords_in_pos_4_10': sum(1 for k in keywords if 4 <= k['rank'] <= 10),
+                    'keywords_in_pos_11_20': sum(1 for k in keywords if 11 <= k['rank'] <= 20),
+                    'keywords_in_pos_21_50': sum(1 for k in keywords if 21 <= k['rank'] <= 50),
+                    'total_keywords_count': total_keywords,
+                    'Estimated_traffic_volume': sum(k['estimated_traffic_volume'] for k in keywords),
+                    'is_new': 0,  # SimilarWeb doesn't provide historical comparison
+                    'is_up': 0,
+                    'is_down': 0,
+                    'is_lost': 0
+                }
+            }
+
+            return {
+                'success': True,
+                'data': {
+                    'domain': domain,
+                    'statistics': stats,
+                    'keywords': keywords,
+                    'traffic_data': {
+                        'monthly_visits': visits,
+                        'global_rank': data.get('GlobalRank', {}).get('Rank', 0),
+                        'bounce_rate': engagements.get('BounceRate', 0)
+                    }
+                }
+            }
+
+        except Exception as e:
+            return {'success': False, 'error': str(e)}
+
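To make the new transformation easier to follow, here is a minimal standalone sketch of the reshaping that `_fetch_domain_keywords_similarweb` performs, run against a fabricated payload. The field names (`TopKeywords`, `Name`, `Volume`, `EstimatedValue`) mirror the ones the diff reads; the sample values are invented for illustration only.

```python
# Fabricated sample shaped like the fields the new method reads from
# the SimilarWeb Traffic response; real responses carry more fields.
sample = {
    'TopKeywords': [
        {'Name': 'seo audit tool', 'Volume': 12000, 'EstimatedValue': 450},
        {'Name': 'keyword research', 'Volume': 8000, 'EstimatedValue': 220},
    ],
}

keywords = []
for i, kw_data in enumerate(sample['TopKeywords'][:20]):
    estimated_value = kw_data.get('EstimatedValue', 0)
    keywords.append({
        'keyword': kw_data.get('Name', ''),
        # Rank is estimated from list position, mirroring the diff's heuristic:
        # capped at 10 for the first ten entries, at 50 beyond that.
        'rank': min(i + 1, 10) if i < 10 else min(i + 5, 50),
        'avg_search_volume': kw_data.get('Volume', 0),
        # Traffic is derived from EstimatedValue / 10, as in the diff.
        'estimated_traffic_volume': int(estimated_value / 10) if estimated_value else 0,
        'estimated_value': estimated_value,
    })

print(keywords[0])
# {'keyword': 'seo audit tool', 'rank': 1, 'avg_search_volume': 12000,
#  'estimated_traffic_volume': 45, 'estimated_value': 450}
```

Both `rank` and `estimated_traffic_volume` are therefore heuristics rather than measured values, which is worth keeping in mind when reading the position buckets in the `stats` dict above.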
simple_pdf_generator.py CHANGED

@@ -7,36 +7,28 @@ import io
 import re
 from typing import Dict, Any
 
+# Try to import all PDF dependencies at module level
+try:
+    from reportlab.pdfgen import canvas
+    from reportlab.lib.pagesizes import letter, A4
+    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
+    from reportlab.lib.units import inch
+    from reportlab.lib.colors import Color, black, blue, green, red
+    from bs4 import BeautifulSoup
+    PDF_AVAILABLE = True
+except ImportError as e:
+    PDF_AVAILABLE = False
+    PDF_ERROR = str(e)
+
 class SimplePDFGenerator:
     def __init__(self):
-        self.available = …
-        try:
-            import reportlab
-            import bs4
-            self.available = True
-        except ImportError:
-            self.available = False
+        self.available = PDF_AVAILABLE
 
     def generate_pdf(self, html_content: str) -> bytes:
         if not self.available:
-            …
-            …
-        try:
-            from reportlab.pdfgen import canvas
-            from reportlab.lib.pagesizes import letter, A4
-            from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
-            from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
-            from reportlab.lib.units import inch
-            from reportlab.lib.colors import Color, black, blue, green, red
-        except ImportError as e:
-            raise ImportError(f"PDF generation requires reportlab components: {e}")
-
-        try:
-            from bs4 import BeautifulSoup
-        except ImportError:
-            raise ImportError("PDF generation requires beautifulsoup4: pip install beautifulsoup4")
-
-        import re
+            error_msg = PDF_ERROR if 'PDF_ERROR' in globals() else "PDF generation requires reportlab and beautifulsoup4"
+            raise ImportError(error_msg)
 
         # Parse HTML and extract content
         soup = BeautifulSoup(html_content, 'html.parser')