#!/usr/bin/env python
"""
Upload Data Files to HuggingFace

This script uploads key data files to a private HuggingFace dataset repository:

1. all_sources_data.jsonl - The raw document data
2. all_sources_contextual_nodes.pkl - The processed nodes with added context

This is useful for new team members who need the latest version of the data.

Usage:
    python upload_data_to_hf.py [--repo REPO_ID]

Arguments:
    --repo REPO_ID    HuggingFace dataset repository ID (default: towardsai-tutors/ai-tutor-data)
"""

import argparse
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi

load_dotenv()
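# HF_TOKEN (loaded from the environment or a local .env file by load_dotenv above)
# is used to authenticate every call to the HuggingFace Hub below.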


def upload_files_to_huggingface(repo_id="towardsai-tutors/ai-tutor-data"):
    """Upload data files to a private HuggingFace repository."""
    # Main files to upload
    files_to_upload = [
        # Combined data and vector store
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
        # Individual source files
        "data/transformers_data.jsonl",
        "data/peft_data.jsonl",
        "data/trl_data.jsonl",
        "data/llama_index_data.jsonl",
        "data/langchain_data.jsonl",
        "data/openai_cookbooks_data.jsonl",
        # Course files
        "data/tai_blog_data.jsonl",
        "data/8-hour_primer_data.jsonl",
        "data/llm_developer_data.jsonl",
        "data/python_primer_data.jsonl",
    ]
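    # Note: files are uploaded to the repository root (path_in_repo below is the
    # file's basename), so the local data/ prefix is not preserved in the repo.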

    # Filter to only include files that exist
    existing_files = []
    missing_files = []
    for file_path in files_to_upload:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            missing_files.append(file_path)

    # Critical files must exist
    critical_files = [
        "data/all_sources_data.jsonl",
        "data/all_sources_contextual_nodes.pkl",
    ]
    critical_missing = [f for f in critical_files if f in missing_files]

    if critical_missing:
        print(
            f"Error: The following critical files were not found: {', '.join(critical_missing)}"
        )
        return False

    if missing_files:
        print(
            f"Warning: The following files were not found and will not be uploaded: {', '.join(missing_files)}"
        )
        print("This is normal if you're only updating certain sources.")

    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))

        # Check that the repository exists; bail out if it doesn't
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            print(f"Repository {repo_id} exists")
        except Exception:
            print(
                f"Repository {repo_id} doesn't exist. Please create it first on the HuggingFace platform."
            )
            print("Make sure to set it as private if needed.")
            return False

        # Upload all existing files
        for file_path in existing_files:
            try:
                file_name = os.path.basename(file_path)
                print(f"Uploading {file_name}...")
                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=file_name,
                    repo_id=repo_id,
                    repo_type="dataset",
                )
                print(
                    f"Successfully uploaded {file_name} to HuggingFace repository {repo_id}"
                )
            except Exception as e:
                print(f"Error uploading {file_path}: {e}")
                # Continue with other files even if one fails

        return True
    except Exception as e:
        print(f"Error uploading files: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Upload Data Files to HuggingFace")
    parser.add_argument(
        "--repo",
        default="towardsai-tutors/ai-tutor-data",
        help="HuggingFace dataset repository ID",
    )
    args = parser.parse_args()

    upload_files_to_huggingface(args.repo)


if __name__ == "__main__":
    main()
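

# A teammate who needs the latest data can pull the uploaded files back from the
# same private repo with huggingface_hub; a minimal sketch (not part of this
# script's flow):
#
#   from huggingface_hub import hf_hub_download
#   path = hf_hub_download(
#       repo_id="towardsai-tutors/ai-tutor-data",
#       filename="all_sources_data.jsonl",
#       repo_type="dataset",
#       token=os.getenv("HF_TOKEN"),
#   )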