{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9040063466878223, "eval_steps": 300, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01586671955573185, "grad_norm": 0.6832552552223206, "learning_rate": 2.6455026455026455e-06, "loss": 1.3171, "step": 10 }, { "epoch": 0.0317334391114637, "grad_norm": 0.9994391798973083, "learning_rate": 5.291005291005291e-06, "loss": 1.1922, "step": 20 }, { "epoch": 0.04760015866719556, "grad_norm": 0.9905434846878052, "learning_rate": 7.936507936507936e-06, "loss": 1.2352, "step": 30 }, { "epoch": 0.0634668782229274, "grad_norm": 1.0371067523956299, "learning_rate": 1.0582010582010582e-05, "loss": 1.1191, "step": 40 }, { "epoch": 0.07933359777865927, "grad_norm": 0.9686773419380188, "learning_rate": 1.3227513227513228e-05, "loss": 1.0697, "step": 50 }, { "epoch": 0.09520031733439112, "grad_norm": 0.5632955431938171, "learning_rate": 1.5873015873015872e-05, "loss": 0.741, "step": 60 }, { "epoch": 0.11106703689012297, "grad_norm": 0.4180806875228882, "learning_rate": 1.8518518518518518e-05, "loss": 0.6942, "step": 70 }, { "epoch": 0.1269337564458548, "grad_norm": 0.3943168520927429, "learning_rate": 2.1164021164021164e-05, "loss": 0.6314, "step": 80 }, { "epoch": 0.14280047600158668, "grad_norm": 0.3465676009654999, "learning_rate": 2.380952380952381e-05, "loss": 0.6257, "step": 90 }, { "epoch": 0.15866719555731854, "grad_norm": 0.3200153708457947, "learning_rate": 2.6455026455026456e-05, "loss": 0.5426, "step": 100 }, { "epoch": 0.1745339151130504, "grad_norm": 0.28294724225997925, "learning_rate": 2.91005291005291e-05, "loss": 0.5209, "step": 110 }, { "epoch": 0.19040063466878224, "grad_norm": 0.3477802574634552, "learning_rate": 3.1746031746031745e-05, "loss": 0.4633, "step": 120 }, { "epoch": 0.2062673542245141, "grad_norm": 0.36890918016433716, "learning_rate": 3.439153439153439e-05, "loss": 0.3798, "step": 130 }, { "epoch": 0.22213407378024594, "grad_norm": 0.38711702823638916, "learning_rate": 3.7037037037037037e-05, "loss": 0.4397, "step": 140 }, { "epoch": 0.2380007933359778, "grad_norm": 0.4116644859313965, "learning_rate": 3.968253968253968e-05, "loss": 0.4021, "step": 150 }, { "epoch": 0.2538675128917096, "grad_norm": 0.402497798204422, "learning_rate": 4.232804232804233e-05, "loss": 0.3796, "step": 160 }, { "epoch": 0.2697342324474415, "grad_norm": 0.6916673183441162, "learning_rate": 4.4973544973544974e-05, "loss": 0.4462, "step": 170 }, { "epoch": 0.28560095200317337, "grad_norm": 0.49015486240386963, "learning_rate": 4.761904761904762e-05, "loss": 0.3902, "step": 180 }, { "epoch": 0.3014676715589052, "grad_norm": 0.4605613648891449, "learning_rate": 4.999995736158938e-05, "loss": 0.3751, "step": 190 }, { "epoch": 0.31733439111463707, "grad_norm": 0.3663236200809479, "learning_rate": 4.999484092829756e-05, "loss": 0.3399, "step": 200 }, { "epoch": 0.3332011106703689, "grad_norm": 0.6052913069725037, "learning_rate": 4.998119881260576e-05, "loss": 0.3738, "step": 210 }, { "epoch": 0.3490678302261008, "grad_norm": 0.5979182720184326, "learning_rate": 4.995903566780805e-05, "loss": 0.3732, "step": 220 }, { "epoch": 0.3649345497818326, "grad_norm": 0.7640148401260376, "learning_rate": 4.992835905370186e-05, "loss": 0.4339, "step": 230 }, { "epoch": 0.3808012693375645, "grad_norm": 0.5569157600402832, "learning_rate": 4.988917943400924e-05, "loss": 0.3893, "step": 240 }, { "epoch": 0.3966679888932963, "grad_norm": 0.6464380621910095, "learning_rate": 4.9841510172807834e-05, "loss": 0.3642, "step": 250 }, { "epoch": 0.4125347084490282, "grad_norm": 0.5494194030761719, "learning_rate": 4.97853675299723e-05, "loss": 0.3481, "step": 260 }, { "epoch": 0.42840142800476, "grad_norm": 0.5208620429039001, "learning_rate": 4.972077065562821e-05, "loss": 0.3968, "step": 270 }, { "epoch": 0.4442681475604919, "grad_norm": 0.6078989505767822, "learning_rate": 4.964774158361991e-05, "loss": 0.337, "step": 280 }, { "epoch": 0.4601348671162237, "grad_norm": 0.5791096687316895, "learning_rate": 4.956630522399487e-05, "loss": 0.3495, "step": 290 }, { "epoch": 0.4760015866719556, "grad_norm": 0.5267443060874939, "learning_rate": 4.947648935450689e-05, "loss": 0.3191, "step": 300 }, { "epoch": 0.4760015866719556, "eval_loss": 0.38255831599235535, "eval_runtime": 93.6436, "eval_samples_per_second": 5.991, "eval_steps_per_second": 5.991, "step": 300 }, { "epoch": 0.4918683062276874, "grad_norm": 0.42828086018562317, "learning_rate": 4.937832461114123e-05, "loss": 0.3164, "step": 310 }, { "epoch": 0.5077350257834192, "grad_norm": 0.5684987306594849, "learning_rate": 4.927184447766467e-05, "loss": 0.3047, "step": 320 }, { "epoch": 0.5236017453391512, "grad_norm": 0.6015241742134094, "learning_rate": 4.915708527420435e-05, "loss": 0.2568, "step": 330 }, { "epoch": 0.539468464894883, "grad_norm": 0.7351698279380798, "learning_rate": 4.903408614485899e-05, "loss": 0.3554, "step": 340 }, { "epoch": 0.5553351844506148, "grad_norm": 0.5476783514022827, "learning_rate": 4.890288904434699e-05, "loss": 0.2934, "step": 350 }, { "epoch": 0.5712019040063467, "grad_norm": 0.7653456330299377, "learning_rate": 4.8763538723695726e-05, "loss": 0.3548, "step": 360 }, { "epoch": 0.5870686235620786, "grad_norm": 0.5909974575042725, "learning_rate": 4.8616082714977097e-05, "loss": 0.2958, "step": 370 }, { "epoch": 0.6029353431178104, "grad_norm": 0.8618314266204834, "learning_rate": 4.8460571315094456e-05, "loss": 0.3323, "step": 380 }, { "epoch": 0.6188020626735422, "grad_norm": 0.43425798416137695, "learning_rate": 4.829705756862642e-05, "loss": 0.3256, "step": 390 }, { "epoch": 0.6346687822292741, "grad_norm": 0.6728402972221375, "learning_rate": 4.812559724973355e-05, "loss": 0.3289, "step": 400 }, { "epoch": 0.650535501785006, "grad_norm": 0.6804693341255188, "learning_rate": 4.79462488431338e-05, "loss": 0.3494, "step": 410 }, { "epoch": 0.6664022213407378, "grad_norm": 0.7322263121604919, "learning_rate": 4.775907352415367e-05, "loss": 0.286, "step": 420 }, { "epoch": 0.6822689408964696, "grad_norm": 0.5389537215232849, "learning_rate": 4.75641351378613e-05, "loss": 0.2597, "step": 430 }, { "epoch": 0.6981356604522015, "grad_norm": 0.7679384350776672, "learning_rate": 4.7361500177289156e-05, "loss": 0.3265, "step": 440 }, { "epoch": 0.7140023800079334, "grad_norm": 0.9662689566612244, "learning_rate": 4.715123776075336e-05, "loss": 0.3493, "step": 450 }, { "epoch": 0.7298690995636652, "grad_norm": 0.7264727354049683, "learning_rate": 4.693341960827764e-05, "loss": 0.3412, "step": 460 }, { "epoch": 0.745735819119397, "grad_norm": 0.6046968698501587, "learning_rate": 4.670812001712973e-05, "loss": 0.349, "step": 470 }, { "epoch": 0.761602538675129, "grad_norm": 0.7867446541786194, "learning_rate": 4.647541583647883e-05, "loss": 0.3394, "step": 480 }, { "epoch": 0.7774692582308608, "grad_norm": 0.7447336912155151, "learning_rate": 4.623538644118244e-05, "loss": 0.3738, "step": 490 }, { "epoch": 0.7933359777865926, "grad_norm": 0.5049989819526672, "learning_rate": 4.5988113704711846e-05, "loss": 0.2899, "step": 500 }, { "epoch": 0.8092026973423245, "grad_norm": 0.8148102164268494, "learning_rate": 4.573368197122524e-05, "loss": 0.3144, "step": 510 }, { "epoch": 0.8250694168980564, "grad_norm": 0.7628346681594849, "learning_rate": 4.547217802679814e-05, "loss": 0.2996, "step": 520 }, { "epoch": 0.8409361364537882, "grad_norm": 0.6373751759529114, "learning_rate": 4.520369106982084e-05, "loss": 0.2887, "step": 530 }, { "epoch": 0.85680285600952, "grad_norm": 0.6852309107780457, "learning_rate": 4.4928312680573064e-05, "loss": 0.2862, "step": 540 }, { "epoch": 0.8726695755652519, "grad_norm": 0.7369856238365173, "learning_rate": 4.464613678998612e-05, "loss": 0.3386, "step": 550 }, { "epoch": 0.8885362951209838, "grad_norm": 0.6538604497909546, "learning_rate": 4.435725964760331e-05, "loss": 0.3225, "step": 560 }, { "epoch": 0.9044030146767156, "grad_norm": 0.8635338544845581, "learning_rate": 4.406177978874941e-05, "loss": 0.3328, "step": 570 }, { "epoch": 0.9202697342324474, "grad_norm": 0.9363442063331604, "learning_rate": 4.3759798000920496e-05, "loss": 0.3315, "step": 580 }, { "epoch": 0.9361364537881793, "grad_norm": 0.7191005349159241, "learning_rate": 4.3451417289405586e-05, "loss": 0.271, "step": 590 }, { "epoch": 0.9520031733439112, "grad_norm": 0.7281237244606018, "learning_rate": 4.313674284215176e-05, "loss": 0.2945, "step": 600 }, { "epoch": 0.9520031733439112, "eval_loss": 0.3320656418800354, "eval_runtime": 93.5162, "eval_samples_per_second": 5.999, "eval_steps_per_second": 5.999, "step": 600 }, { "epoch": 0.967869892899643, "grad_norm": 0.6777219176292419, "learning_rate": 4.281588199388476e-05, "loss": 0.2844, "step": 610 }, { "epoch": 0.9837366124553748, "grad_norm": 0.7066994905471802, "learning_rate": 4.248894418949746e-05, "loss": 0.3348, "step": 620 }, { "epoch": 0.9996033320111067, "grad_norm": 0.5948446989059448, "learning_rate": 4.215604094671835e-05, "loss": 0.2532, "step": 630 }, { "epoch": 1.0154700515668384, "grad_norm": 0.8124284744262695, "learning_rate": 4.181728581807316e-05, "loss": 0.3068, "step": 640 }, { "epoch": 1.0313367711225705, "grad_norm": 0.6020251512527466, "learning_rate": 4.1472794352152366e-05, "loss": 0.2912, "step": 650 }, { "epoch": 1.0472034906783023, "grad_norm": 0.9106693863868713, "learning_rate": 4.112268405419782e-05, "loss": 0.2739, "step": 660 }, { "epoch": 1.0630702102340341, "grad_norm": 0.9437083005905151, "learning_rate": 4.076707434602194e-05, "loss": 0.2563, "step": 670 }, { "epoch": 1.078936929789766, "grad_norm": 0.7824203968048096, "learning_rate": 4.040608652527328e-05, "loss": 0.2898, "step": 680 }, { "epoch": 1.0948036493454978, "grad_norm": 0.8340286612510681, "learning_rate": 4.003984372406212e-05, "loss": 0.2665, "step": 690 }, { "epoch": 1.1106703689012296, "grad_norm": 0.7771459817886353, "learning_rate": 3.966847086696045e-05, "loss": 0.2711, "step": 700 }, { "epoch": 1.1265370884569614, "grad_norm": 0.6131294369697571, "learning_rate": 3.929209462839041e-05, "loss": 0.2825, "step": 710 }, { "epoch": 1.1424038080126935, "grad_norm": 0.8090108633041382, "learning_rate": 3.891084338941603e-05, "loss": 0.2725, "step": 720 }, { "epoch": 1.1582705275684253, "grad_norm": 0.6462133526802063, "learning_rate": 3.852484719395264e-05, "loss": 0.2406, "step": 730 }, { "epoch": 1.1741372471241571, "grad_norm": 0.7676876783370972, "learning_rate": 3.8134237704409295e-05, "loss": 0.2648, "step": 740 }, { "epoch": 1.190003966679889, "grad_norm": 0.8399226665496826, "learning_rate": 3.773914815677897e-05, "loss": 0.2693, "step": 750 }, { "epoch": 1.2058706862356208, "grad_norm": 0.7439931631088257, "learning_rate": 3.733971331519206e-05, "loss": 0.2602, "step": 760 }, { "epoch": 1.2217374057913526, "grad_norm": 0.7992270588874817, "learning_rate": 3.693606942594873e-05, "loss": 0.2647, "step": 770 }, { "epoch": 1.2376041253470844, "grad_norm": 0.8335320949554443, "learning_rate": 3.65283541710455e-05, "loss": 0.2723, "step": 780 }, { "epoch": 1.2534708449028162, "grad_norm": 0.813829779624939, "learning_rate": 3.611670662121234e-05, "loss": 0.2285, "step": 790 }, { "epoch": 1.269337564458548, "grad_norm": 1.047428846359253, "learning_rate": 3.570126718847589e-05, "loss": 0.2788, "step": 800 }, { "epoch": 1.28520428401428, "grad_norm": 1.1291059255599976, "learning_rate": 3.5282177578265296e-05, "loss": 0.3008, "step": 810 }, { "epoch": 1.301071003570012, "grad_norm": 0.6732133626937866, "learning_rate": 3.485958074107677e-05, "loss": 0.2446, "step": 820 }, { "epoch": 1.3169377231257438, "grad_norm": 0.8102436065673828, "learning_rate": 3.4433620823713564e-05, "loss": 0.2646, "step": 830 }, { "epoch": 1.3328044426814756, "grad_norm": 0.8745511770248413, "learning_rate": 3.400444312011776e-05, "loss": 0.2606, "step": 840 }, { "epoch": 1.3486711622372074, "grad_norm": 0.8574343323707581, "learning_rate": 3.3572194021810896e-05, "loss": 0.2294, "step": 850 }, { "epoch": 1.3645378817929394, "grad_norm": 0.9338182806968689, "learning_rate": 3.3137020967960154e-05, "loss": 0.2551, "step": 860 }, { "epoch": 1.3804046013486713, "grad_norm": 0.7893859148025513, "learning_rate": 3.269907239508714e-05, "loss": 0.231, "step": 870 }, { "epoch": 1.396271320904403, "grad_norm": 0.9416842460632324, "learning_rate": 3.2258497686436606e-05, "loss": 0.2528, "step": 880 }, { "epoch": 1.412138040460135, "grad_norm": 0.7524838447570801, "learning_rate": 3.181544712102216e-05, "loss": 0.2669, "step": 890 }, { "epoch": 1.4280047600158667, "grad_norm": 0.9508939981460571, "learning_rate": 3.137007182236637e-05, "loss": 0.2436, "step": 900 }, { "epoch": 1.4280047600158667, "eval_loss": 0.31376519799232483, "eval_runtime": 93.4332, "eval_samples_per_second": 6.004, "eval_steps_per_second": 6.004, "step": 900 }, { "epoch": 1.4438714795715986, "grad_norm": 1.3880516290664673, "learning_rate": 3.092252370695298e-05, "loss": 0.2781, "step": 910 }, { "epoch": 1.4597381991273304, "grad_norm": 0.7659589648246765, "learning_rate": 3.0472955432408485e-05, "loss": 0.294, "step": 920 }, { "epoch": 1.4756049186830622, "grad_norm": 0.7849363088607788, "learning_rate": 3.002152034543098e-05, "loss": 0.2768, "step": 930 }, { "epoch": 1.491471638238794, "grad_norm": 0.666022777557373, "learning_rate": 2.9568372429483966e-05, "loss": 0.2526, "step": 940 }, { "epoch": 1.5073383577945259, "grad_norm": 0.9934321641921997, "learning_rate": 2.9113666252272943e-05, "loss": 0.2524, "step": 950 }, { "epoch": 1.5232050773502577, "grad_norm": 0.8849703073501587, "learning_rate": 2.865755691302272e-05, "loss": 0.2905, "step": 960 }, { "epoch": 1.5390717969059897, "grad_norm": 0.7805452942848206, "learning_rate": 2.8200199989573432e-05, "loss": 0.242, "step": 970 }, { "epoch": 1.5549385164617215, "grad_norm": 0.9101535081863403, "learning_rate": 2.7741751485313296e-05, "loss": 0.2178, "step": 980 }, { "epoch": 1.5708052360174534, "grad_norm": 1.0016871690750122, "learning_rate": 2.728236777596621e-05, "loss": 0.2738, "step": 990 }, { "epoch": 1.5866719555731852, "grad_norm": 0.7558535933494568, "learning_rate": 2.6822205556252383e-05, "loss": 0.2363, "step": 1000 }, { "epoch": 1.6025386751289172, "grad_norm": 0.886492908000946, "learning_rate": 2.636142178644009e-05, "loss": 0.2645, "step": 1010 }, { "epoch": 1.618405394684649, "grad_norm": 0.9074681997299194, "learning_rate": 2.590017363880691e-05, "loss": 0.2616, "step": 1020 }, { "epoch": 1.6342721142403809, "grad_norm": 0.7710486054420471, "learning_rate": 2.5438618444028627e-05, "loss": 0.2776, "step": 1030 }, { "epoch": 1.6501388337961127, "grad_norm": 0.5658883452415466, "learning_rate": 2.4976913637514103e-05, "loss": 0.2259, "step": 1040 }, { "epoch": 1.6660055533518445, "grad_norm": 0.871059000492096, "learning_rate": 2.4515216705704395e-05, "loss": 0.2106, "step": 1050 }, { "epoch": 1.6818722729075763, "grad_norm": 0.6641427874565125, "learning_rate": 2.405368513235453e-05, "loss": 0.2242, "step": 1060 }, { "epoch": 1.6977389924633082, "grad_norm": 0.8793797492980957, "learning_rate": 2.359247634481615e-05, "loss": 0.2555, "step": 1070 }, { "epoch": 1.71360571201904, "grad_norm": 0.9106749296188354, "learning_rate": 2.3131747660339394e-05, "loss": 0.2903, "step": 1080 }, { "epoch": 1.7294724315747718, "grad_norm": 0.8714670538902283, "learning_rate": 2.2671656232412378e-05, "loss": 0.2885, "step": 1090 }, { "epoch": 1.7453391511305036, "grad_norm": 0.7964282631874084, "learning_rate": 2.2212358997156445e-05, "loss": 0.2579, "step": 1100 }, { "epoch": 1.7612058706862355, "grad_norm": 0.855613648891449, "learning_rate": 2.175401261979569e-05, "loss": 0.2926, "step": 1110 }, { "epoch": 1.7770725902419675, "grad_norm": 0.8496893048286438, "learning_rate": 2.1296773441218787e-05, "loss": 0.2404, "step": 1120 }, { "epoch": 1.7929393097976993, "grad_norm": 0.7642855644226074, "learning_rate": 2.084079742465142e-05, "loss": 0.2471, "step": 1130 }, { "epoch": 1.8088060293534312, "grad_norm": 0.8110942244529724, "learning_rate": 2.0386240102457682e-05, "loss": 0.236, "step": 1140 }, { "epoch": 1.824672748909163, "grad_norm": 0.9976955056190491, "learning_rate": 1.993325652308828e-05, "loss": 0.2609, "step": 1150 }, { "epoch": 1.840539468464895, "grad_norm": 0.8436864614486694, "learning_rate": 1.9482001198193882e-05, "loss": 0.2771, "step": 1160 }, { "epoch": 1.8564061880206268, "grad_norm": 0.8823544979095459, "learning_rate": 1.903262804992156e-05, "loss": 0.2399, "step": 1170 }, { "epoch": 1.8722729075763587, "grad_norm": 0.6062878370285034, "learning_rate": 1.8585290358412297e-05, "loss": 0.2344, "step": 1180 }, { "epoch": 1.8881396271320905, "grad_norm": 0.7493007183074951, "learning_rate": 1.8140140709517465e-05, "loss": 0.2311, "step": 1190 }, { "epoch": 1.9040063466878223, "grad_norm": 0.8461095094680786, "learning_rate": 1.7697330942752193e-05, "loss": 0.2414, "step": 1200 }, { "epoch": 1.9040063466878223, "eval_loss": 0.3031667470932007, "eval_runtime": 93.4886, "eval_samples_per_second": 6.001, "eval_steps_per_second": 6.001, "step": 1200 } ], "logging_steps": 10, "max_steps": 1890, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7498295064425267e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }