{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 879, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.034129692832764506, "grad_norm": 1.3683982802260004, "learning_rate": 5.113636363636364e-06, "loss": 0.6914, "step": 10 }, { "epoch": 0.06825938566552901, "grad_norm": 1.1329734773219333, "learning_rate": 1.0795454545454547e-05, "loss": 0.5785, "step": 20 }, { "epoch": 0.10238907849829351, "grad_norm": 0.4818340266515117, "learning_rate": 1.6477272727272726e-05, "loss": 0.5215, "step": 30 }, { "epoch": 0.13651877133105803, "grad_norm": 0.40192420678859747, "learning_rate": 2.215909090909091e-05, "loss": 0.4933, "step": 40 }, { "epoch": 0.17064846416382254, "grad_norm": 0.30113184989165265, "learning_rate": 2.784090909090909e-05, "loss": 0.4751, "step": 50 }, { "epoch": 0.20477815699658702, "grad_norm": 0.6030940025399546, "learning_rate": 3.352272727272727e-05, "loss": 0.4648, "step": 60 }, { "epoch": 0.23890784982935154, "grad_norm": 0.40486139505957924, "learning_rate": 3.9204545454545456e-05, "loss": 0.4589, "step": 70 }, { "epoch": 0.27303754266211605, "grad_norm": 0.4001169424739457, "learning_rate": 4.488636363636364e-05, "loss": 0.4492, "step": 80 }, { "epoch": 0.30716723549488056, "grad_norm": 0.7056769426783731, "learning_rate": 4.99998028230155e-05, "loss": 0.4529, "step": 90 }, { "epoch": 0.3412969283276451, "grad_norm": 0.4127457799735406, "learning_rate": 4.997614534810769e-05, "loss": 0.4483, "step": 100 }, { "epoch": 0.37542662116040953, "grad_norm": 0.4253433644026189, "learning_rate": 4.991309523184661e-05, "loss": 0.4408, "step": 110 }, { "epoch": 0.40955631399317405, "grad_norm": 0.2650767061773436, "learning_rate": 4.9810751917544375e-05, "loss": 0.4361, "step": 120 }, { "epoch": 0.44368600682593856, "grad_norm": 0.3236300985877029, "learning_rate": 4.96692768221618e-05, "loss": 0.4375, "step": 130 }, { "epoch": 0.4778156996587031, "grad_norm": 0.2677148257178138, "learning_rate": 4.948889308171989e-05, "loss": 0.4343, "step": 140 }, { "epoch": 0.5119453924914675, "grad_norm": 0.31194545617250247, "learning_rate": 4.926988519936732e-05, "loss": 0.4322, "step": 150 }, { "epoch": 0.5460750853242321, "grad_norm": 0.2742536503128216, "learning_rate": 4.9012598596659034e-05, "loss": 0.4287, "step": 160 }, { "epoch": 0.5802047781569966, "grad_norm": 0.2332933827806788, "learning_rate": 4.8717439068753755e-05, "loss": 0.4282, "step": 170 }, { "epoch": 0.6143344709897611, "grad_norm": 0.3218993549663778, "learning_rate": 4.838487214438951e-05, "loss": 0.4227, "step": 180 }, { "epoch": 0.6484641638225256, "grad_norm": 0.25861357657721123, "learning_rate": 4.801542235164678e-05, "loss": 0.4286, "step": 190 }, { "epoch": 0.6825938566552902, "grad_norm": 0.2918271012332895, "learning_rate": 4.7609672390657276e-05, "loss": 0.4287, "step": 200 }, { "epoch": 0.7167235494880546, "grad_norm": 0.24406366100330784, "learning_rate": 4.716826221456305e-05, "loss": 0.4215, "step": 210 }, { "epoch": 0.7508532423208191, "grad_norm": 0.22515461276733315, "learning_rate": 4.669188802017569e-05, "loss": 0.4197, "step": 220 }, { "epoch": 0.7849829351535836, "grad_norm": 0.2186800997634458, "learning_rate": 4.6181301149927245e-05, "loss": 0.425, "step": 230 }, { "epoch": 0.8191126279863481, "grad_norm": 0.23708113058085334, "learning_rate": 4.563730690684502e-05, "loss": 0.4232, "step": 240 }, { "epoch": 0.8532423208191127, "grad_norm": 0.2585858683862343, "learning_rate": 4.5060763284419114e-05, "loss": 0.4252, "step": 250 }, { "epoch": 0.8873720136518771, "grad_norm": 0.2695864266113866, "learning_rate": 4.445257961336594e-05, "loss": 0.4173, "step": 260 }, { "epoch": 0.9215017064846417, "grad_norm": 0.23595963689596886, "learning_rate": 4.3813715127422186e-05, "loss": 0.4205, "step": 270 }, { "epoch": 0.9556313993174061, "grad_norm": 0.24101367889924785, "learning_rate": 4.3145177450431235e-05, "loss": 0.4185, "step": 280 }, { "epoch": 0.9897610921501706, "grad_norm": 0.1941853912616131, "learning_rate": 4.244802100710809e-05, "loss": 0.417, "step": 290 }, { "epoch": 1.023890784982935, "grad_norm": 0.24088200863220327, "learning_rate": 4.172334535998958e-05, "loss": 0.3911, "step": 300 }, { "epoch": 1.0580204778156996, "grad_norm": 0.2401594568811406, "learning_rate": 4.0972293475192634e-05, "loss": 0.3733, "step": 310 }, { "epoch": 1.0921501706484642, "grad_norm": 0.22375295442616824, "learning_rate": 4.0196049919716004e-05, "loss": 0.3778, "step": 320 }, { "epoch": 1.1262798634812285, "grad_norm": 0.21296450960759827, "learning_rate": 3.9395838993128645e-05, "loss": 0.3778, "step": 330 }, { "epoch": 1.1604095563139931, "grad_norm": 0.2364577800890562, "learning_rate": 3.8572922796591415e-05, "loss": 0.3751, "step": 340 }, { "epoch": 1.1945392491467577, "grad_norm": 0.20516445679828432, "learning_rate": 3.7728599242257786e-05, "loss": 0.3755, "step": 350 }, { "epoch": 1.2286689419795223, "grad_norm": 0.2080061173225757, "learning_rate": 3.686420000619301e-05, "loss": 0.38, "step": 360 }, { "epoch": 1.2627986348122868, "grad_norm": 0.20693994205238442, "learning_rate": 3.5981088428040464e-05, "loss": 0.3802, "step": 370 }, { "epoch": 1.2969283276450512, "grad_norm": 0.16640578029703731, "learning_rate": 3.5080657360747985e-05, "loss": 0.3739, "step": 380 }, { "epoch": 1.3310580204778157, "grad_norm": 0.1996673960644544, "learning_rate": 3.416432697374533e-05, "loss": 0.3791, "step": 390 }, { "epoch": 1.36518771331058, "grad_norm": 0.23313473957202652, "learning_rate": 3.323354251303797e-05, "loss": 0.3778, "step": 400 }, { "epoch": 1.3993174061433447, "grad_norm": 0.18739705777804985, "learning_rate": 3.228977202174973e-05, "loss": 0.3773, "step": 410 }, { "epoch": 1.4334470989761092, "grad_norm": 0.19881497754130406, "learning_rate": 3.1334504024709605e-05, "loss": 0.3774, "step": 420 }, { "epoch": 1.4675767918088738, "grad_norm": 0.18919226309345596, "learning_rate": 3.0369245180734605e-05, "loss": 0.3754, "step": 430 }, { "epoch": 1.5017064846416384, "grad_norm": 0.16804781179686107, "learning_rate": 2.9395517906311494e-05, "loss": 0.3769, "step": 440 }, { "epoch": 1.5358361774744027, "grad_norm": 0.19724123435397653, "learning_rate": 2.841485797442535e-05, "loss": 0.3736, "step": 450 }, { "epoch": 1.5699658703071673, "grad_norm": 0.1492512432657133, "learning_rate": 2.742881209232215e-05, "loss": 0.3749, "step": 460 }, { "epoch": 1.6040955631399316, "grad_norm": 0.15514563941327603, "learning_rate": 2.6438935462025672e-05, "loss": 0.3725, "step": 470 }, { "epoch": 1.6382252559726962, "grad_norm": 0.15833970336928466, "learning_rate": 2.5446789327456373e-05, "loss": 0.3736, "step": 480 }, { "epoch": 1.6723549488054608, "grad_norm": 0.1508024128691703, "learning_rate": 2.4453938512020927e-05, "loss": 0.3757, "step": 490 }, { "epoch": 1.7064846416382253, "grad_norm": 0.1561566494075755, "learning_rate": 2.3461948950556133e-05, "loss": 0.3718, "step": 500 }, { "epoch": 1.74061433447099, "grad_norm": 0.16769278782996944, "learning_rate": 2.247238521951992e-05, "loss": 0.3766, "step": 510 }, { "epoch": 1.7747440273037542, "grad_norm": 0.14100411189778878, "learning_rate": 2.1486808069324687e-05, "loss": 0.375, "step": 520 }, { "epoch": 1.8088737201365188, "grad_norm": 0.16483764215517682, "learning_rate": 2.0506771962705304e-05, "loss": 0.3733, "step": 530 }, { "epoch": 1.8430034129692832, "grad_norm": 0.155764171343142, "learning_rate": 1.953382262300389e-05, "loss": 0.3716, "step": 540 }, { "epoch": 1.8771331058020477, "grad_norm": 0.15072440831975784, "learning_rate": 1.8569494596238658e-05, "loss": 0.3722, "step": 550 }, { "epoch": 1.9112627986348123, "grad_norm": 0.1452936635170902, "learning_rate": 1.7615308830801576e-05, "loss": 0.372, "step": 560 }, { "epoch": 1.9453924914675769, "grad_norm": 0.1459972272942582, "learning_rate": 1.6672770278602508e-05, "loss": 0.3728, "step": 570 }, { "epoch": 1.9795221843003414, "grad_norm": 0.15824724688437872, "learning_rate": 1.5743365521443033e-05, "loss": 0.3729, "step": 580 }, { "epoch": 2.013651877133106, "grad_norm": 0.17864631402612208, "learning_rate": 1.4828560426363918e-05, "loss": 0.3545, "step": 590 }, { "epoch": 2.04778156996587, "grad_norm": 0.1588637310837059, "learning_rate": 1.3929797833664013e-05, "loss": 0.3297, "step": 600 }, { "epoch": 2.0819112627986347, "grad_norm": 0.1418501854463478, "learning_rate": 1.30484952812373e-05, "loss": 0.3297, "step": 610 }, { "epoch": 2.1160409556313993, "grad_norm": 0.15348962450232634, "learning_rate": 1.2186042768816988e-05, "loss": 0.3304, "step": 620 }, { "epoch": 2.150170648464164, "grad_norm": 0.1348514462379786, "learning_rate": 1.1343800565653332e-05, "loss": 0.3265, "step": 630 }, { "epoch": 2.1843003412969284, "grad_norm": 0.1356947041461966, "learning_rate": 1.0523097065082413e-05, "loss": 0.329, "step": 640 }, { "epoch": 2.218430034129693, "grad_norm": 0.1494755476959583, "learning_rate": 9.725226689370154e-06, "loss": 0.3276, "step": 650 }, { "epoch": 2.252559726962457, "grad_norm": 0.13335640045718117, "learning_rate": 8.951447848135758e-06, "loss": 0.3287, "step": 660 }, { "epoch": 2.2866894197952217, "grad_norm": 0.13979403886356553, "learning_rate": 8.202980953574735e-06, "loss": 0.3298, "step": 670 }, { "epoch": 2.3208191126279862, "grad_norm": 0.11971919873813754, "learning_rate": 7.481006495611817e-06, "loss": 0.3283, "step": 680 }, { "epoch": 2.354948805460751, "grad_norm": 0.13500834901363976, "learning_rate": 6.786663180019751e-06, "loss": 0.3274, "step": 690 }, { "epoch": 2.3890784982935154, "grad_norm": 0.17890565254397334, "learning_rate": 6.121046132440458e-06, "loss": 0.3255, "step": 700 }, { "epoch": 2.42320819112628, "grad_norm": 0.1302286181845878, "learning_rate": 5.485205171141272e-06, "loss": 0.3262, "step": 710 }, { "epoch": 2.4573378839590445, "grad_norm": 0.12203910298248397, "learning_rate": 4.880143151230418e-06, "loss": 0.325, "step": 720 }, { "epoch": 2.491467576791809, "grad_norm": 0.12474109586190246, "learning_rate": 4.306814382943306e-06, "loss": 0.3255, "step": 730 }, { "epoch": 2.5255972696245736, "grad_norm": 0.12095736373178395, "learning_rate": 3.7661231264943086e-06, "loss": 0.3283, "step": 740 }, { "epoch": 2.5597269624573378, "grad_norm": 0.11718436619787706, "learning_rate": 3.2589221658679586e-06, "loss": 0.3238, "step": 750 }, { "epoch": 2.5938566552901023, "grad_norm": 0.11404515711854785, "learning_rate": 2.7860114637989933e-06, "loss": 0.3268, "step": 760 }, { "epoch": 2.627986348122867, "grad_norm": 0.1211242651755094, "learning_rate": 2.3481369000626585e-06, "loss": 0.3322, "step": 770 }, { "epoch": 2.6621160409556315, "grad_norm": 0.11417951185976549, "learning_rate": 1.9459890950652093e-06, "loss": 0.3229, "step": 780 }, { "epoch": 2.696245733788396, "grad_norm": 0.11391563747812329, "learning_rate": 1.5802023205900797e-06, "loss": 0.3243, "step": 790 }, { "epoch": 2.73037542662116, "grad_norm": 0.11074398617174237, "learning_rate": 1.251353499417704e-06, "loss": 0.3282, "step": 800 }, { "epoch": 2.7645051194539247, "grad_norm": 0.11126589393346809, "learning_rate": 9.599612953967746e-07, "loss": 0.3237, "step": 810 }, { "epoch": 2.7986348122866893, "grad_norm": 0.10702690737924332, "learning_rate": 7.064852954021373e-07, "loss": 0.3309, "step": 820 }, { "epoch": 2.832764505119454, "grad_norm": 0.11277244407549764, "learning_rate": 4.913252844694821e-07, "loss": 0.3249, "step": 830 }, { "epoch": 2.8668941979522184, "grad_norm": 0.1109451852100889, "learning_rate": 3.1482061525015537e-07, "loss": 0.3248, "step": 840 }, { "epoch": 2.901023890784983, "grad_norm": 0.11192896622215733, "learning_rate": 1.772496727805495e-07, "loss": 0.3216, "step": 850 }, { "epoch": 2.9351535836177476, "grad_norm": 0.11288193182916881, "learning_rate": 7.882943541027976e-08, "loss": 0.3255, "step": 860 }, { "epoch": 2.969283276450512, "grad_norm": 0.11665972567296903, "learning_rate": 1.9715132581624164e-08, "loss": 0.3277, "step": 870 }, { "epoch": 3.0, "step": 879, "total_flos": 4042009736118272.0, "train_loss": 0.38500409647059525, "train_runtime": 77088.3871, "train_samples_per_second": 5.837, "train_steps_per_second": 0.011 } ], "logging_steps": 10, "max_steps": 879, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4042009736118272.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }