Update rss_processor.py

rss_processor.py  CHANGED  +35 -49

@@ -10,7 +10,7 @@ import dateutil.parser
 import hashlib
 import json
 import re
-import requests
+import requests # Ensure requests is imported
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -21,7 +21,7 @@ COLLECTION_NAME = "news_articles"
 HF_API_TOKEN = os.getenv("HF_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
 MAX_ARTICLES_PER_FEED = 1000
-RAW_FEEDS_DIR = "raw_rss_feeds"
+RAW_FEEDS_DIR = "raw_rss_feeds" # Directory for raw RSS files
 
 def initialize_hf_api():
     if not HF_API_TOKEN:
@@ -43,7 +43,7 @@ def get_embedding_model():
 def clean_text(text):
     if not text or not isinstance(text, str):
         return ""
-    text = re.sub(r'<.*?>', '', text)
+    text = re.sub(r'<|.*?>', '', text)
     text = ' '.join(text.split())
     return text.strip()
 
@@ -83,11 +83,10 @@ def fetch_rss_feeds():
 
         try:
             logger.info(f"Fetching {feed_url}")
-            # Fetch raw content first to save it
             response = requests.get(feed_url, headers={'User-Agent': 'Mozilla/5.0'})
             response.raise_for_status()
             raw_content = response.text
-            save_raw_rss_to_file(feed_url, raw_content)
+            save_raw_rss_to_file(feed_url, raw_content) # Save the raw feed
 
             feed = feedparser.parse(raw_content)
             if feed.bozo:
@@ -100,7 +99,6 @@ def fetch_rss_feeds():
                     continue
 
                 seen_links.add(link)
-
                 title = entry.get("title", "No Title")
                 description_raw = entry.get("summary", entry.get("description", ""))
                 description = clean_text(description_raw)
@@ -135,12 +133,8 @@ def fetch_rss_feeds():
                     continue
 
                 articles.append({
-                    "title": title,
-                    "link": link,
-                    "description": description,
-                    "published": published_str,
-                    "category": category,
-                    "image": image,
+                    "title": title, "link": link, "description": description,
+                    "published": published_str, "category": category, "image": image,
                 })
         except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching {feed_url}: {e}")
@@ -164,27 +158,16 @@ def process_and_store_articles(articles):
         logger.info("No existing DB found or it is empty. Starting fresh.")
         existing_ids = set()
 
-    contents_to_add = []
-    metadatas_to_add = []
-    ids_to_add = []
-
+    contents_to_add, metadatas_to_add, ids_to_add = [], [], []
     for article in articles:
-        if not article.get('link'):
-            continue
-
+        if not article.get('link'): continue
        doc_id = hashlib.sha256(article['link'].encode('utf-8')).hexdigest()
+        if doc_id in existing_ids: continue
 
-        if doc_id in existing_ids:
-            continue
-
         metadata = {
-            "title": article["title"],
-            "link": article["link"],
-            "published": article["published"],
-            "category": article["category"],
-            "image": article["image"],
+            "title": article["title"], "link": article["link"], "published": article["published"],
+            "category": article["category"], "image": article["image"],
         }
-
         contents_to_add.append(article["description"])
         metadatas_to_add.append(metadata)
         ids_to_add.append(doc_id)
@@ -194,13 +177,7 @@ def process_and_store_articles(articles):
     try:
         embedding_model = get_embedding_model()
         embeddings_to_add = embedding_model.embed_documents(contents_to_add)
-
-        collection.add(
-            embeddings=embeddings_to_add,
-            documents=contents_to_add,
-            metadatas=metadatas_to_add,
-            ids=ids_to_add
-        )
+        collection.add(embeddings=embeddings_to_add, documents=contents_to_add, metadatas=metadatas_to_add, ids=ids_to_add)
         logger.info(f"Successfully added {len(ids_to_add)} new articles to DB. Total in DB: {collection.count()}")
     except Exception as e:
         logger.error(f"Error storing articles in ChromaDB: {e}", exc_info=True)
@@ -212,12 +189,8 @@ def download_from_hf_hub():
         try:
             logger.info(f"Downloading Chroma DB from {REPO_ID} to {LOCAL_DB_DIR}...")
             snapshot_download(
-                repo_id=REPO_ID,
-                repo_type="dataset",
-                local_dir=".",
-                local_dir_use_symlinks=False,
-                allow_patterns=[f"{LOCAL_DB_DIR}/**"],
-                token=HF_API_TOKEN
+                repo_id=REPO_ID, repo_type="dataset", local_dir=".",
+                local_dir_use_symlinks=False, allow_patterns=[f"{LOCAL_DB_DIR}/**"], token=HF_API_TOKEN
             )
             logger.info("Finished downloading DB.")
         except Exception as e:
@@ -226,20 +199,33 @@ def download_from_hf_hub():
         logger.info(f"Local Chroma DB found at '{LOCAL_DB_DIR}', skipping download.")
 
 def upload_to_hf_hub():
+    """Uploads both the ChromaDB and the raw RSS feeds to the Hugging Face Hub."""
+    commit_message = f"Update RSS news database and raw feeds {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+
+    # Upload ChromaDB if it exists
     if os.path.exists(LOCAL_DB_DIR):
         try:
             logger.info(f"Uploading updated Chroma DB '{LOCAL_DB_DIR}' to {REPO_ID}...")
             hf_api.upload_folder(
-                folder_path=LOCAL_DB_DIR,
-                path_in_repo=LOCAL_DB_DIR,
-                repo_id=REPO_ID,
-                repo_type="dataset",
-                commit_message=f"Update RSS news database {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
-                ignore_patterns=["*.bak", "*.tmp"]
+                folder_path=LOCAL_DB_DIR, path_in_repo=LOCAL_DB_DIR, repo_id=REPO_ID,
+                repo_type="dataset", commit_message=commit_message, ignore_patterns=["*.bak", "*.tmp"]
             )
             logger.info(f"Database folder '{LOCAL_DB_DIR}' uploaded to: {REPO_ID}")
         except Exception as e:
-            logger.error(f"Error uploading to Hugging Face Hub: {e}", exc_info=True)
+            logger.error(f"Error uploading Chroma DB to Hugging Face Hub: {e}", exc_info=True)
+
+    # Upload Raw RSS Feeds directory if it exists
+    if os.path.exists(RAW_FEEDS_DIR):
+        try:
+            logger.info(f"Uploading raw RSS feeds from '{RAW_FEEDS_DIR}' to {REPO_ID}...")
+            hf_api.upload_folder(
+                folder_path=RAW_FEEDS_DIR, path_in_repo=RAW_FEEDS_DIR, repo_id=REPO_ID,
+                repo_type="dataset", commit_message=commit_message
+            )
+            logger.info(f"Raw feeds folder '{RAW_FEEDS_DIR}' uploaded to: {REPO_ID}")
+        except Exception as e:
+            logger.error(f"Error uploading raw feeds to Hugging Face Hub: {e}", exc_info=True)
+
 
 def main():
     try:
@@ -247,7 +233,7 @@ def main():
         articles_to_process = fetch_rss_feeds()
         if articles_to_process:
             process_and_store_articles(articles_to_process)
-            upload_to_hf_hub()
+            upload_to_hf_hub() # This now uploads both directories
         else:
            logger.info("No articles fetched, skipping database processing and upload.")
     except Exception as e:
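
Note: the helper save_raw_rss_to_file(feed_url, raw_content) that the new code calls is defined elsewhere in rss_processor.py and does not appear in this diff. A minimal sketch of what such a helper might look like follows; the file-naming scheme and timestamping are assumptions for illustration, not the author's actual implementation.

    # Hypothetical sketch of save_raw_rss_to_file -- the real helper is not shown in this diff.
    import hashlib
    import os
    from datetime import datetime, timezone

    RAW_FEEDS_DIR = "raw_rss_feeds"  # same constant as defined near the top of rss_processor.py

    def save_raw_rss_to_file(feed_url: str, raw_content: str) -> str:
        """Write one raw RSS/Atom payload into RAW_FEEDS_DIR and return the file path."""
        os.makedirs(RAW_FEEDS_DIR, exist_ok=True)
        # Derive a stable, filesystem-safe name from the feed URL (assumed scheme).
        url_hash = hashlib.sha256(feed_url.encode("utf-8")).hexdigest()[:16]
        stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        path = os.path.join(RAW_FEEDS_DIR, f"{url_hash}_{stamp}.xml")
        with open(path, "w", encoding="utf-8") as f:
            f.write(raw_content)
        return path

Because both upload_folder calls now pass path_in_repo, the LOCAL_DB_DIR and RAW_FEEDS_DIR folders keep their names inside the broadfield-dev/news-rag-db dataset repo, which lines up with the allow_patterns=[f"{LOCAL_DB_DIR}/**"] filter that snapshot_download applies on the next run.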