ai-tutor-chatbot_KK

Sleeping

App Files Files Community

omarsol committed on Mar 12

Commit

75ceb60

1 Parent(s): b5197fc

Enhance combine_all_sources function with robust source processing and logging

Browse files

Files changed (1) hide show

data/scraping_scripts/process_md_files.py +52 -11

data/scraping_scripts/process_md_files.py CHANGED Viewed

@@ -136,7 +136,7 @@ SOURCE_CONFIGS = {
     },
     "8-hour_primer": {
         "base_url": "",
-        "input_directory": "data/8-hour_primer",
         "output_file": "data/8-hour_primer_data.jsonl",  # 8-hour Generative AI Primer
         "source_name": "8-hour_primer",
         "use_include_list": False,
@@ -148,7 +148,7 @@ SOURCE_CONFIGS = {
     },
     "llm_developer": {
         "base_url": "",
-        "input_directory": "data/llm_developer",
         "output_file": "data/llm_developer_data.jsonl",  # From Beginner to Advanced LLM Developer
         "source_name": "llm_developer",
         "use_include_list": False,
@@ -160,7 +160,7 @@ SOURCE_CONFIGS = {
     },
     "python_primer": {
         "base_url": "",
-        "input_directory": "data/python_primer",
         "output_file": "data/python_primer_data.jsonl",  # From Beginner to Advanced LLM Developer
         "source_name": "python_primer",
         "use_include_list": False,
@@ -272,21 +272,62 @@ def save_jsonl(data: List[Dict], output_file: str) -> None:
 def combine_all_sources(sources: List[str]) -> None:
     all_data = []
     output_file = "data/all_sources_data.jsonl"
     for source in sources:
         if source not in SOURCE_CONFIGS:
             logger.error(f"Unknown source '{source}'. Skipping.")
             continue
         input_file = SOURCE_CONFIGS[source]["output_file"]
-        logger.info(f"Processing source: {source}")
-        with open(input_file, "r", encoding="utf-8") as f:
-            for line in f:
-                all_data.append(json.loads(line))
     logger.info(f"Total documents combined: {len(all_data)}")
     save_jsonl(all_data, output_file)
     logger.info(f"Combined data saved to {output_file}")

     },
     "8-hour_primer": {
         "base_url": "",
+        "input_directory": "data/8-hour_primer",  # Path to the directory that contains the Markdown files
         "output_file": "data/8-hour_primer_data.jsonl",  # 8-hour Generative AI Primer
         "source_name": "8-hour_primer",
         "use_include_list": False,
     },
     "llm_developer": {
         "base_url": "",
+        "input_directory": "data/llm_developer",  # Path to the directory that contains the Markdown files
         "output_file": "data/llm_developer_data.jsonl",  # From Beginner to Advanced LLM Developer
         "source_name": "llm_developer",
         "use_include_list": False,
     },
     "python_primer": {
         "base_url": "",
+        "input_directory": "data/python_primer",  # Path to the directory that contains the Markdown files
         "output_file": "data/python_primer_data.jsonl",  # From Beginner to Advanced LLM Developer
         "source_name": "python_primer",
         "use_include_list": False,
 def _load_jsonl_documents(path: str) -> List[Dict]:
     """Read a JSONL file and return its parsed documents in file order.

     Blank (whitespace-only) lines are skipped so that a stray empty line,
     such as an extra trailing newline, does not invalidate the whole file.

     Args:
         path: Location of the JSONL file to read (opened as UTF-8 text).

     Returns:
         One parsed JSON value per non-blank line.

     Raises:
         OSError: If the file cannot be opened or read.
         ValueError: If the file is not valid UTF-8, or a non-blank line is
             not valid JSON (the message names the offending line number).
     """
     documents = []
     with open(path, "r", encoding="utf-8") as f:
         for line_number, line in enumerate(f, start=1):
             if not line.strip():
                 continue
             try:
                 documents.append(json.loads(line))
             except json.JSONDecodeError as err:
                 raise ValueError(
                     f"invalid JSON on line {line_number}: {err}"
                 ) from err
     return documents


 def combine_all_sources(sources: List[str]) -> None:
     """Rebuild data/all_sources_data.jsonl from the per-source JSONL files.

     The combined file is assembled in two phases:

     1. Every name in ``sources`` that is a key of ``SOURCE_CONFIGS`` has its
        configured ``output_file`` loaded. Unknown names are logged and
        skipped; a name listed more than once is loaded only once.
     2. Every other entry of ``SOURCE_CONFIGS`` whose ``output_file`` exists
        on disk is loaded as well, so sources that were not part of this run
        are preserved in the combined file.

     Loading is best-effort and all-or-nothing per source: if a file cannot
     be read or parsed, the error is logged, none of that source's documents
     are added, and processing continues with the next source.

     Args:
         sources: Names of the sources being (re)processed in this run.

     Returns:
         None. The combined documents are written with ``save_jsonl``.
     """
     all_data = []
     output_file = "data/all_sources_data.jsonl"

     # Names handled in phase 1; phase 2 must not load them a second time.
     processed_sources = set()

     # Phase 1: sources explicitly requested by the caller.
     for source in sources:
         if source not in SOURCE_CONFIGS:
             logger.error(f"Unknown source '{source}'. Skipping.")
             continue
         if source in processed_sources:
             # Loading the same file twice would duplicate its documents.
             logger.warning(f"Source '{source}' listed more than once. Skipping duplicate.")
             continue
         processed_sources.add(source)

         input_file = SOURCE_CONFIGS[source]["output_file"]
         logger.info(f"Processing updated source: {source} from {input_file}")
         try:
             source_data = _load_jsonl_documents(input_file)
         except (OSError, ValueError) as e:
             logger.error(f"Error loading {input_file}: {e}")
             continue
         logger.info(f"Added {len(source_data)} documents from {source}")
         all_data.extend(source_data)

     # Phase 2: keep every other configured source whose file already exists.
     for source_name, config in SOURCE_CONFIGS.items():
         if source_name in processed_sources:
             continue
         source_file = config["output_file"]
         if not os.path.exists(source_file):
             # Never generated (or removed): nothing to preserve, not an error.
             continue

         logger.info(f"Preserving existing source: {source_name} from {source_file}")
         try:
             source_data = _load_jsonl_documents(source_file)
         except (OSError, ValueError) as e:
             logger.error(f"Error loading {source_file}: {e}")
             continue
         logger.info(f"Preserved {len(source_data)} documents from {source_name}")
         all_data.extend(source_data)

     logger.info(f"Total documents combined: {len(all_data)}")
     save_jsonl(all_data, output_file)
     logger.info(f"Combined data saved to {output_file}")