"""Download and merge all data files from a Hugging Face dataset repo.
Usage:
HF_TOKEN must be exported in your environment (or pass --token).
HF_DATASET_ID may be exported or passed via --repo.
Example:
export HF_TOKEN="hf_..."
python download_dataset.py --repo kathiasi/tts-rubric-responses --outdir out
This script downloads any files under `data/` (parquet or arrow/ipc), reads them,
concatenates into a single table, and writes `combined.parquet` and `combined.csv` in
`outdir`.
"""
import os
import argparse
import json

from huggingface_hub import HfApi, hf_hub_download
import pyarrow.parquet as pq
import pyarrow.ipc as ipc
import pandas as pd
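

# --- Helpers for reading the supported data file formats (Parquet and Arrow IPC) ---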
def read_parquet(path):
    try:
        tbl = pq.read_table(path)
        return tbl.to_pandas()
    except Exception as e:
        raise RuntimeError(f"Failed to read parquet {path}: {e}")


def read_arrow(path):
    try:
        with open(path, 'rb') as f:
            reader = ipc.open_file(f)
            tbl = reader.read_all()
            return tbl.to_pandas()
    except Exception as e:
        raise RuntimeError(f"Failed to read arrow/ipc {path}: {e}")


def download_and_merge(repo_id, outdir, token=None):
    """Download all data/ files from the dataset repo and merge them into one table."""
    api = HfApi()
    token = token or os.environ.get('HF_TOKEN')
    if not token:
        raise RuntimeError('HF_TOKEN not provided; export HF_TOKEN or pass --token')

    # List every file in the dataset repo and keep only those under data/.
    files = api.list_repo_files(repo_id=repo_id, repo_type='dataset', token=token)
    data_files = [f for f in files if f.startswith('data/')]
    if not data_files:
        print('No data/ files found in dataset repo. Files found:')
        print(json.dumps(files, indent=2))
        return

    os.makedirs(outdir, exist_ok=True)

    # Download each supported file and read it into a DataFrame.
    dfs = []
    for fname in sorted(data_files):
        print('Processing', fname)
        local_path = hf_hub_download(repo_id=repo_id, repo_type='dataset', filename=fname, token=token)
        if fname.endswith('.parquet'):
            df = read_parquet(local_path)
        elif fname.endswith('.arrow') or fname.endswith('.ipc'):
            df = read_arrow(local_path)
        else:
            print('Skipping unsupported data file:', fname)
            continue
        dfs.append(df)

    if not dfs:
        print('No supported data files were read.')
        return

    # Concatenate all partial tables and write Parquet and CSV copies.
    combined = pd.concat(dfs, ignore_index=True)
    out_parquet = os.path.join(outdir, 'combined.parquet')
    out_csv = os.path.join(outdir, 'combined.csv')
    print(f'Writing {len(combined)} rows to', out_parquet)
    combined.to_parquet(out_parquet, index=False)
    print('Also writing CSV to', out_csv)
    combined.to_csv(out_csv, index=False)
    print('Done.')
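

# CLI entry point: parse arguments, then download and merge the dataset files.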
if __name__ == '__main__':
    p = argparse.ArgumentParser()
    p.add_argument('--repo', help='Dataset repo id (user/name)', default=os.environ.get('HF_DATASET_ID'))
    p.add_argument('--outdir', help='Output directory', default='hf_dataset')
    p.add_argument('--token', help='Hugging Face token (optional)', default=None)
    args = p.parse_args()
    if not args.repo:
        print('Dataset repo id is required via --repo or HF_DATASET_ID env var')
        raise SystemExit(1)
    download_and_merge(args.repo, args.outdir, token=args.token)