#!/usr/bin/env bash
#
# Launches offline_train_loop_gate_ARC.py under torchrun, starting one worker
# process per GPU listed in CUDA_VISIBLE_DEVICES.
#
# Usage: run from the directory that contains offline_train_loop_gate_ARC.py.
# Every path below is relative to the current working directory.

set -euo pipefail

# GPUs exposed to the job. The worker count is derived from this list so the
# two can never drift apart when the list is edited.
export CUDA_VISIBLE_DEVICES=0,1,2,3
IFS=',' read -r -a gpu_ids <<< "${CUDA_VISIBLE_DEVICES}"
readonly num_workers="${#gpu_ids[@]}"

readonly final_ckpt="saves/loop_runs/2025-12-01/gate_stage_checkpoint_final.pt"
readonly best_ckpt="saves/loop_runs/2025-12-01/gate_stage_checkpoint_best.pt"

# Create the output directories up front, in case the training script does
# not create them itself; this avoids losing a finished run at save time.
mkdir -p -- "${final_ckpt%/*}" "${best_ckpt%/*}"

# Arguments forwarded verbatim to the training script, in their original order.
# NOTE(review): the run name says "gpu45" while devices 0-3 are selected above;
# confirm whether the name is stale.
train_args=(
  --teacher-checkpoint "saves/loop_runs/2025-11-28/checkpoint_best.pt"
  --epochs 30
  --loop-core-depth 2
  --max-loop-steps 6
  --min-loop-steps 2
  --batch-size 32
  --image-size 64
  --patch-size 2
  --learning-rate 5e-5
  --weight-decay 0
  --embed-dim 512
  --mlp-dim 1024
  --num-heads 8
  --include-rearc
  --num-colors 12
  --data-root "raw_data/ARC-AGI"
  --train-split "training"
  --wandb-project "VisionARC"
  --wandb-run-name "loop_gate_varc_gpu45"
  --save-path "${final_ckpt}"
  --best-save-path "${best_ckpt}"
  --lr-scheduler "cosine"
  --vis-every 10
  --distributed
  --use-wandb
  --train-dynamic-exit
  --eval-dynamic-exit
  --tune-step-embeddings
)

torchrun --nproc_per_node="${num_workers}" offline_train_loop_gate_ARC.py \
  "${train_args[@]}"