| """Download and merge all data files from a Hugging Face dataset repo. | |
| Usage: | |
| HF_TOKEN must be exported in your environment (or pass --token). | |
| HF_DATASET_ID may be exported or passed via --repo. | |
| Example: | |
| export HF_TOKEN="hf_..." | |
| python download_dataset.py --repo kathiasi/tts-rubric-responses --outdir out | |
| This script downloads any files under `data/` (parquet or arrow/ipc), reads them, | |
| concatenates into a single table, and writes `combined.parquet` and `combined.csv` in | |
| `outdir`. | |
| """ | |
import os
import argparse
import json

from huggingface_hub import HfApi, hf_hub_download
import pyarrow.parquet as pq
import pyarrow.ipc as ipc
import pandas as pd


def read_parquet(path):
    """Read a Parquet file into a pandas DataFrame."""
    try:
        tbl = pq.read_table(path)
        return tbl.to_pandas()
    except Exception as e:
        raise RuntimeError(f"Failed to read parquet {path}: {e}") from e


def read_arrow(path):
    """Read an Arrow IPC file (random-access "file" format) into a DataFrame.

    Note: files written in the Arrow IPC *stream* format will not open with
    ipc.open_file; this helper assumes the file format.
    """
    try:
        with open(path, 'rb') as f:
            reader = ipc.open_file(f)
            tbl = reader.read_all()
            return tbl.to_pandas()
    except Exception as e:
        raise RuntimeError(f"Failed to read arrow/ipc {path}: {e}") from e


def download_and_merge(repo_id, outdir, token=None):
    """List the dataset repo, download its data/ files, and merge them."""
    api = HfApi()
    # Resolve the token from the argument or the environment.
    token = token or os.environ.get('HF_TOKEN')
    if not token:
        raise RuntimeError('HF_TOKEN not provided; export HF_TOKEN or pass --token')

    files = api.list_repo_files(repo_id=repo_id, repo_type='dataset', token=token)
    data_files = [f for f in files if f.startswith('data/')]
    if not data_files:
        print('No data/ files found in dataset repo. Files found:')
        print(json.dumps(files, indent=2))
        return
    os.makedirs(outdir, exist_ok=True)
    dfs = []
    for fname in sorted(data_files):
        print('Processing', fname)
        local_path = hf_hub_download(repo_id=repo_id, repo_type='dataset', filename=fname, token=token)
        if fname.endswith('.parquet'):
            df = read_parquet(local_path)
        elif fname.endswith('.arrow') or fname.endswith('.ipc'):
            df = read_arrow(local_path)
        else:
            print('Skipping unsupported data file:', fname)
            continue
        dfs.append(df)
    if not dfs:
        print('No supported data files were read.')
        return

    combined = pd.concat(dfs, ignore_index=True)
    out_parquet = os.path.join(outdir, 'combined.parquet')
    out_csv = os.path.join(outdir, 'combined.csv')
    print(f'Writing {len(combined)} rows to', out_parquet)
    combined.to_parquet(out_parquet, index=False)
    print('Also writing CSV to', out_csv)
    combined.to_csv(out_csv, index=False)
    print('Done.')


if __name__ == '__main__':
    p = argparse.ArgumentParser()
    p.add_argument('--repo', help='Dataset repo id (user/name)', default=os.environ.get('HF_DATASET_ID'))
    p.add_argument('--outdir', help='Output directory', default='hf_dataset')
    p.add_argument('--token', help='Hugging Face token (optional)', default=None)
    args = p.parse_args()
    if not args.repo:
        print('Dataset repo id is required via --repo or HF_DATASET_ID env var')
        raise SystemExit(1)
    download_and_merge(args.repo, args.outdir, token=args.token)
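
# Example follow-up (a sketch, assuming the defaults above were used and the
# script wrote hf_dataset/combined.parquet):
#
#     import pandas as pd
#     df = pd.read_parquet('hf_dataset/combined.parquet')
#     print(df.shape)
#     print(df.head())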