model_name_or_path: "openai-community/gpt2-large"
dataset_name_or_path: "allenai/tulu-3-sft-olmo-2-mixture-0225"
project_name: "scaling-post-training"
training_args:
  seed: 42
  num_train_epochs: 1
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 2
  gradient_accumulation_steps: 8
  warmup_ratio: 0.05
  weight_decay: 0.01
  logging_steps: 10
  eval_strategy: "steps"
  eval_steps: 50
  report_to: "wandb"
  fp16: true
  learning_rate: 3.0e-5
  lr_scheduler_type: "cosine"
  run_name: "gpt2-large-sft"
  output_dir: "models/gpt2-large/sft"
  save_strategy: "best"
  metric_for_best_model: "eval_loss"
  load_best_model_at_end: true
  save_total_limit: 1
  hub_model_id: "gpt2-large-sft"
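
The keys under training_args match Hugging Face TrainingArguments fields one-to-one, so the block can be unpacked straight into that constructor. Below is a minimal loading sketch, assuming the config is saved as sft.yaml and that transformers and PyYAML are installed; the file name and loader are illustrative, not part of the project.

```python
# Minimal loading sketch: map the YAML config above onto Hugging Face classes.
# Assumptions (not from the config itself): the file is saved as sft.yaml and
# the project trains with the standard transformers TrainingArguments.
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

with open("sft.yaml") as f:
    cfg = yaml.safe_load(f)

# Every key under training_args is a valid TrainingArguments field; note that
# save_strategy="best" is only accepted by recent transformers releases.
args = TrainingArguments(**cfg["training_args"])

model = AutoModelForCausalLM.from_pretrained(cfg["model_name_or_path"])
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name_or_path"])
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
```

With per_device_train_batch_size 2 and gradient_accumulation_steps 8, each optimizer step sees an effective batch of 16 examples per device, and the combination of save_strategy "best", metric_for_best_model "eval_loss", and save_total_limit 1 keeps only the single lowest-eval-loss checkpoint on disk.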