| { | |
| "best_global_step": 800, | |
| "best_metric": 0.5648624300956726, | |
| "best_model_checkpoint": "models/generation/description/checkpoint-800", | |
| "epoch": 17.5, | |
| "eval_steps": 400, | |
| "global_step": 3500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.374443531036377, | |
| "learning_rate": 8.166666666666667e-05, | |
| "loss": 12.4423, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.090597152709961, | |
| "learning_rate": 9.996828927498017e-05, | |
| "loss": 2.8795, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 3.756592035293579, | |
| "learning_rate": 9.983493166277486e-05, | |
| "loss": 2.5534, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 3.0228748321533203, | |
| "learning_rate": 9.959768225002347e-05, | |
| "loss": 2.5155, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 3.339592456817627, | |
| "learning_rate": 9.925703563494947e-05, | |
| "loss": 2.3755, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 2.7049825191497803, | |
| "learning_rate": 9.881370196982982e-05, | |
| "loss": 2.3446, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.8367323875427246, | |
| "learning_rate": 9.826860548052725e-05, | |
| "loss": 2.3269, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.6648616790771484, | |
| "learning_rate": 9.76228825397397e-05, | |
| "loss": 2.3153, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.5770760178565979, | |
| "eval_runtime": 15.1499, | |
| "eval_samples_per_second": 26.403, | |
| "eval_steps_per_second": 6.601, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 2.7784676551818848, | |
| "learning_rate": 9.687787929798317e-05, | |
| "loss": 2.1744, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 2.5936291217803955, | |
| "learning_rate": 9.603514887724691e-05, | |
| "loss": 2.2059, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 2.7534546852111816, | |
| "learning_rate": 9.509644813317144e-05, | |
| "loss": 2.187, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 2.6206445693969727, | |
| "learning_rate": 9.406373399249911e-05, | |
| "loss": 2.1917, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 2.880136251449585, | |
| "learning_rate": 9.293915937343299e-05, | |
| "loss": 2.0629, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 2.911586284637451, | |
| "learning_rate": 9.172506869740849e-05, | |
| "loss": 2.0542, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 2.876952886581421, | |
| "learning_rate": 9.042399300163484e-05, | |
| "loss": 2.0751, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 2.7687273025512695, | |
| "learning_rate": 8.90386446625952e-05, | |
| "loss": 2.0719, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.5648624300956726, | |
| "eval_runtime": 15.2421, | |
| "eval_samples_per_second": 26.243, | |
| "eval_steps_per_second": 6.561, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 3.4546947479248047, | |
| "learning_rate": 8.757191174150532e-05, | |
| "loss": 1.9407, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 3.255340337753296, | |
| "learning_rate": 8.60268519635192e-05, | |
| "loss": 1.935, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 3.166053056716919, | |
| "learning_rate": 8.440668634323305e-05, | |
| "loss": 1.9591, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 3.0402660369873047, | |
| "learning_rate": 8.271479246977678e-05, | |
| "loss": 1.9412, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.25, | |
| "grad_norm": 3.6618967056274414, | |
| "learning_rate": 8.095469746549172e-05, | |
| "loss": 1.8086, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 3.8914847373962402, | |
| "learning_rate": 7.913007063287361e-05, | |
| "loss": 1.8275, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 5.75, | |
| "grad_norm": 3.528841018676758, | |
| "learning_rate": 7.724471580511021e-05, | |
| "loss": 1.8043, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 3.459420919418335, | |
| "learning_rate": 7.530256341615994e-05, | |
| "loss": 1.8232, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.5697617530822754, | |
| "eval_runtime": 15.3206, | |
| "eval_samples_per_second": 26.109, | |
| "eval_steps_per_second": 6.527, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 3.8563907146453857, | |
| "learning_rate": 7.33076623069039e-05, | |
| "loss": 1.6684, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 4.100020408630371, | |
| "learning_rate": 7.126417128445263e-05, | |
| "loss": 1.6671, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 6.75, | |
| "grad_norm": 4.32282829284668, | |
| "learning_rate": 6.917635045220425e-05, | |
| "loss": 1.687, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 4.05823278427124, | |
| "learning_rate": 6.704855232872843e-05, | |
| "loss": 1.6991, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.25, | |
| "grad_norm": 4.55335807800293, | |
| "learning_rate": 6.488521277399067e-05, | |
| "loss": 1.5066, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 4.818643093109131, | |
| "learning_rate": 6.26908417418333e-05, | |
| "loss": 1.5315, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 7.75, | |
| "grad_norm": 4.753720283508301, | |
| "learning_rate": 6.0470013877991525e-05, | |
| "loss": 1.5572, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 4.679745197296143, | |
| "learning_rate": 5.8227358983245274e-05, | |
| "loss": 1.5593, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.6191036105155945, | |
| "eval_runtime": 15.2291, | |
| "eval_samples_per_second": 26.266, | |
| "eval_steps_per_second": 6.566, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.25, | |
| "grad_norm": 5.900501728057861, | |
| "learning_rate": 5.5967552361588e-05, | |
| "loss": 1.3844, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 5.828335285186768, | |
| "learning_rate": 5.3695305073534455e-05, | |
| "loss": 1.3716, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 5.216298580169678, | |
| "learning_rate": 5.141535411488584e-05, | |
| "loss": 1.4014, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 5.470153331756592, | |
| "learning_rate": 4.913245254142751e-05, | |
| "loss": 1.4239, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 9.25, | |
| "grad_norm": 6.552476406097412, | |
| "learning_rate": 4.685135956014587e-05, | |
| "loss": 1.2293, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 6.554981231689453, | |
| "learning_rate": 4.4576830607621834e-05, | |
| "loss": 1.2586, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 9.75, | |
| "grad_norm": 5.98954963684082, | |
| "learning_rate": 4.231360743628464e-05, | |
| "loss": 1.2697, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 6.276147842407227, | |
| "learning_rate": 4.00664082291931e-05, | |
| "loss": 1.2779, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.6903724074363708, | |
| "eval_runtime": 15.0396, | |
| "eval_samples_per_second": 26.596, | |
| "eval_steps_per_second": 6.649, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 10.25, | |
| "grad_norm": 7.1180267333984375, | |
| "learning_rate": 3.78399177639524e-05, | |
| "loss": 1.1174, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 10.5, | |
| "grad_norm": 6.683684825897217, | |
| "learning_rate": 3.563877764627195e-05, | |
| "loss": 1.1296, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 10.75, | |
| "grad_norm": 7.034246444702148, | |
| "learning_rate": 3.34675766335243e-05, | |
| "loss": 1.1328, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 6.708085536956787, | |
| "learning_rate": 3.13308410684782e-05, | |
| "loss": 1.1434, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 11.25, | |
| "grad_norm": 6.986971378326416, | |
| "learning_rate": 2.9233025443148317e-05, | |
| "loss": 0.9911, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 11.5, | |
| "grad_norm": 6.527721881866455, | |
| "learning_rate": 2.7178503112433672e-05, | |
| "loss": 1.0082, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 11.75, | |
| "grad_norm": 7.1417341232299805, | |
| "learning_rate": 2.517155717690404e-05, | |
| "loss": 1.0233, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 7.950260639190674, | |
| "learning_rate": 2.3216371553741295e-05, | |
| "loss": 1.0301, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.8407155871391296, | |
| "eval_runtime": 15.0924, | |
| "eval_samples_per_second": 26.503, | |
| "eval_steps_per_second": 6.626, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 12.25, | |
| "grad_norm": 6.401738166809082, | |
| "learning_rate": 2.131702225445008e-05, | |
| "loss": 0.8974, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 7.200794219970703, | |
| "learning_rate": 1.9477468887521627e-05, | |
| "loss": 0.9053, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 12.75, | |
| "grad_norm": 7.977651119232178, | |
| "learning_rate": 1.770154640376479e-05, | |
| "loss": 0.9308, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 7.930706024169922, | |
| "learning_rate": 1.5992957101513524e-05, | |
| "loss": 0.9281, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 13.25, | |
| "grad_norm": 7.262845039367676, | |
| "learning_rate": 1.4355262908377271e-05, | |
| "loss": 0.8209, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 13.5, | |
| "grad_norm": 8.71459674835205, | |
| "learning_rate": 1.2791877955624859e-05, | |
| "loss": 0.8412, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 13.75, | |
| "grad_norm": 8.987716674804688, | |
| "learning_rate": 1.1306061460682072e-05, | |
| "loss": 0.8469, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 7.137842178344727, | |
| "learning_rate": 9.90091093258102e-06, | |
| "loss": 0.8318, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.9897236824035645, | |
| "eval_runtime": 15.2729, | |
| "eval_samples_per_second": 26.19, | |
| "eval_steps_per_second": 6.548, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 14.25, | |
| "grad_norm": 6.863259792327881, | |
| "learning_rate": 8.579355714525994e-06, | |
| "loss": 0.7727, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 14.5, | |
| "grad_norm": 7.495807647705078, | |
| "learning_rate": 7.3441508770376975e-06, | |
| "loss": 0.7773, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 14.75, | |
| "grad_norm": 7.848569869995117, | |
| "learning_rate": 6.197871474406936e-06, | |
| "loss": 0.7709, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 7.177825450897217, | |
| "learning_rate": 5.142907176431455e-06, | |
| "loss": 0.7807, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 15.25, | |
| "grad_norm": 7.384244918823242, | |
| "learning_rate": 4.181457286627316e-06, | |
| "loss": 0.7236, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 15.5, | |
| "grad_norm": 7.398818492889404, | |
| "learning_rate": 3.3155261573003195e-06, | |
| "loss": 0.7448, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 15.75, | |
| "grad_norm": 7.505466461181641, | |
| "learning_rate": 2.5469190110357475e-06, | |
| "loss": 0.7335, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 7.520777702331543, | |
| "learning_rate": 1.8772381773176417e-06, | |
| "loss": 0.7423, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 1.061659574508667, | |
| "eval_runtime": 15.1874, | |
| "eval_samples_per_second": 26.338, | |
| "eval_steps_per_second": 6.584, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 16.25, | |
| "grad_norm": 8.022893905639648, | |
| "learning_rate": 1.307879752122948e-06, | |
| "loss": 0.7108, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 16.5, | |
| "grad_norm": 7.253084659576416, | |
| "learning_rate": 8.40030687454535e-07, | |
| "loss": 0.7164, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 16.75, | |
| "grad_norm": 7.7122087478637695, | |
| "learning_rate": 4.746663168804566e-07, | |
| "loss": 0.708, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 7.264121055603027, | |
| "learning_rate": 2.1254832223808196e-07, | |
| "loss": 0.7214, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 17.25, | |
| "grad_norm": 7.239109516143799, | |
| "learning_rate": 5.4223145741943983e-08, | |
| "loss": 0.7047, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 17.5, | |
| "grad_norm": 7.081119060516357, | |
| "learning_rate": 2.08508055765666e-11, | |
| "loss": 0.6976, | |
| "step": 3500 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 3500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 18, | |
| "save_steps": 400, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.7756473836306752e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |