{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2539, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009846396218983852, "grad_norm": 5.9883294105529785, "learning_rate": 5.000000000000001e-07, "loss": 0.2518, "step": 25 }, { "epoch": 0.019692792437967704, "grad_norm": 5.031759262084961, "learning_rate": 1.0000000000000002e-06, "loss": 0.285, "step": 50 }, { "epoch": 0.029539188656951557, "grad_norm": 4.0184102058410645, "learning_rate": 1.5e-06, "loss": 0.2444, "step": 75 }, { "epoch": 0.03938558487593541, "grad_norm": 4.6933512687683105, "learning_rate": 2.0000000000000003e-06, "loss": 0.2471, "step": 100 }, { "epoch": 0.04923198109491926, "grad_norm": 6.389063835144043, "learning_rate": 2.5e-06, "loss": 0.2567, "step": 125 }, { "epoch": 0.05907837731390311, "grad_norm": 5.830894947052002, "learning_rate": 3e-06, "loss": 0.2274, "step": 150 }, { "epoch": 0.06892477353288697, "grad_norm": 3.234384536743164, "learning_rate": 3.5e-06, "loss": 0.2415, "step": 175 }, { "epoch": 0.07877116975187082, "grad_norm": 4.506255149841309, "learning_rate": 4.000000000000001e-06, "loss": 0.2362, "step": 200 }, { "epoch": 0.08861756597085467, "grad_norm": 4.6171674728393555, "learning_rate": 4.5e-06, "loss": 0.207, "step": 225 }, { "epoch": 0.09846396218983852, "grad_norm": 3.5775146484375, "learning_rate": 5e-06, "loss": 0.209, "step": 250 }, { "epoch": 0.10831035840882237, "grad_norm": 4.521059989929199, "learning_rate": 5.500000000000001e-06, "loss": 0.2349, "step": 275 }, { "epoch": 0.11815675462780623, "grad_norm": 4.866330623626709, "learning_rate": 6e-06, "loss": 0.2528, "step": 300 }, { "epoch": 0.12800315084679006, "grad_norm": 4.609464645385742, "learning_rate": 6.5000000000000004e-06, "loss": 0.1845, "step": 325 }, { "epoch": 0.13784954706577393, "grad_norm": 4.313091278076172, "learning_rate": 7e-06, "loss": 0.2509, "step": 350 }, { "epoch": 0.14769594328475777, "grad_norm": 4.451571941375732, "learning_rate": 7.500000000000001e-06, "loss": 0.2397, "step": 375 }, { "epoch": 0.15754233950374164, "grad_norm": 4.453555107116699, "learning_rate": 8.000000000000001e-06, "loss": 0.2273, "step": 400 }, { "epoch": 0.16738873572272547, "grad_norm": 3.1133830547332764, "learning_rate": 8.5e-06, "loss": 0.2333, "step": 425 }, { "epoch": 0.17723513194170934, "grad_norm": 3.7271251678466797, "learning_rate": 9e-06, "loss": 0.2183, "step": 450 }, { "epoch": 0.18708152816069318, "grad_norm": 3.5686957836151123, "learning_rate": 9.5e-06, "loss": 0.2217, "step": 475 }, { "epoch": 0.19692792437967704, "grad_norm": 3.4622321128845215, "learning_rate": 1e-05, "loss": 0.2243, "step": 500 }, { "epoch": 0.20677432059866088, "grad_norm": 5.357515811920166, "learning_rate": 9.877390877881315e-06, "loss": 0.216, "step": 525 }, { "epoch": 0.21662071681764475, "grad_norm": 4.189847469329834, "learning_rate": 9.75478175576263e-06, "loss": 0.2293, "step": 550 }, { "epoch": 0.2264671130366286, "grad_norm": 5.216653347015381, "learning_rate": 9.632172633643944e-06, "loss": 0.2119, "step": 575 }, { "epoch": 0.23631350925561245, "grad_norm": 3.989429235458374, "learning_rate": 9.509563511525259e-06, "loss": 0.2028, "step": 600 }, { "epoch": 0.2461599054745963, "grad_norm": 4.413074970245361, "learning_rate": 9.386954389406573e-06, "loss": 0.2317, "step": 625 }, { "epoch": 0.25600630169358013, "grad_norm": 4.636322021484375, "learning_rate": 9.264345267287887e-06, "loss": 0.2295, "step": 650 }, { "epoch": 0.265852697912564, "grad_norm": 4.068817615509033, "learning_rate": 9.141736145169202e-06, "loss": 0.2168, "step": 675 }, { "epoch": 0.27569909413154786, "grad_norm": 3.76188588142395, "learning_rate": 9.019127023050516e-06, "loss": 0.2085, "step": 700 }, { "epoch": 0.2855454903505317, "grad_norm": 5.307705879211426, "learning_rate": 8.89651790093183e-06, "loss": 0.2234, "step": 725 }, { "epoch": 0.29539188656951554, "grad_norm": 5.800079822540283, "learning_rate": 8.773908778813145e-06, "loss": 0.2501, "step": 750 }, { "epoch": 0.3052382827884994, "grad_norm": 3.7706942558288574, "learning_rate": 8.651299656694458e-06, "loss": 0.2425, "step": 775 }, { "epoch": 0.31508467900748327, "grad_norm": 4.2593512535095215, "learning_rate": 8.528690534575772e-06, "loss": 0.215, "step": 800 }, { "epoch": 0.32493107522646714, "grad_norm": 5.396867752075195, "learning_rate": 8.406081412457088e-06, "loss": 0.1831, "step": 825 }, { "epoch": 0.33477747144545095, "grad_norm": 3.5747408866882324, "learning_rate": 8.283472290338403e-06, "loss": 0.1976, "step": 850 }, { "epoch": 0.3446238676644348, "grad_norm": 5.190290451049805, "learning_rate": 8.160863168219716e-06, "loss": 0.2329, "step": 875 }, { "epoch": 0.3544702638834187, "grad_norm": 6.033483505249023, "learning_rate": 8.043158410985778e-06, "loss": 0.2233, "step": 900 }, { "epoch": 0.36431666010240255, "grad_norm": 3.0238583087921143, "learning_rate": 7.920549288867092e-06, "loss": 0.2168, "step": 925 }, { "epoch": 0.37416305632138636, "grad_norm": 4.231501579284668, "learning_rate": 7.797940166748407e-06, "loss": 0.2065, "step": 950 }, { "epoch": 0.3840094525403702, "grad_norm": 3.6126151084899902, "learning_rate": 7.675331044629721e-06, "loss": 0.2126, "step": 975 }, { "epoch": 0.3938558487593541, "grad_norm": 4.870268821716309, "learning_rate": 7.552721922511036e-06, "loss": 0.2269, "step": 1000 }, { "epoch": 0.40370224497833795, "grad_norm": 4.647860527038574, "learning_rate": 7.430112800392349e-06, "loss": 0.1892, "step": 1025 }, { "epoch": 0.41354864119732176, "grad_norm": 4.208144187927246, "learning_rate": 7.307503678273664e-06, "loss": 0.181, "step": 1050 }, { "epoch": 0.42339503741630563, "grad_norm": 4.470357894897461, "learning_rate": 7.184894556154979e-06, "loss": 0.1931, "step": 1075 }, { "epoch": 0.4332414336352895, "grad_norm": 3.936370372772217, "learning_rate": 7.062285434036293e-06, "loss": 0.1884, "step": 1100 }, { "epoch": 0.44308782985427336, "grad_norm": 2.863593816757202, "learning_rate": 6.939676311917607e-06, "loss": 0.2054, "step": 1125 }, { "epoch": 0.4529342260732572, "grad_norm": 3.7073140144348145, "learning_rate": 6.817067189798921e-06, "loss": 0.202, "step": 1150 }, { "epoch": 0.46278062229224104, "grad_norm": 3.3471767902374268, "learning_rate": 6.6944580676802366e-06, "loss": 0.1962, "step": 1175 }, { "epoch": 0.4726270185112249, "grad_norm": 2.521800994873047, "learning_rate": 6.571848945561551e-06, "loss": 0.1967, "step": 1200 }, { "epoch": 0.4824734147302087, "grad_norm": 4.672091007232666, "learning_rate": 6.4492398234428646e-06, "loss": 0.1866, "step": 1225 }, { "epoch": 0.4923198109491926, "grad_norm": 4.11766242980957, "learning_rate": 6.326630701324179e-06, "loss": 0.2203, "step": 1250 }, { "epoch": 0.5021662071681764, "grad_norm": 3.7128915786743164, "learning_rate": 6.204021579205493e-06, "loss": 0.1754, "step": 1275 }, { "epoch": 0.5120126033871603, "grad_norm": 2.663398265838623, "learning_rate": 6.081412457086808e-06, "loss": 0.1737, "step": 1300 }, { "epoch": 0.5218589996061441, "grad_norm": 2.656521797180176, "learning_rate": 5.9588033349681214e-06, "loss": 0.2085, "step": 1325 }, { "epoch": 0.531705395825128, "grad_norm": 4.869094371795654, "learning_rate": 5.836194212849437e-06, "loss": 0.1943, "step": 1350 }, { "epoch": 0.5415517920441119, "grad_norm": 4.049020290374756, "learning_rate": 5.713585090730751e-06, "loss": 0.2035, "step": 1375 }, { "epoch": 0.5513981882630957, "grad_norm": 3.473222255706787, "learning_rate": 5.5909759686120656e-06, "loss": 0.1774, "step": 1400 }, { "epoch": 0.5612445844820796, "grad_norm": 4.270164489746094, "learning_rate": 5.468366846493379e-06, "loss": 0.202, "step": 1425 }, { "epoch": 0.5710909807010635, "grad_norm": 2.7588560581207275, "learning_rate": 5.345757724374694e-06, "loss": 0.2143, "step": 1450 }, { "epoch": 0.5809373769200472, "grad_norm": 4.859555721282959, "learning_rate": 5.223148602256009e-06, "loss": 0.2009, "step": 1475 }, { "epoch": 0.5907837731390311, "grad_norm": 5.985973358154297, "learning_rate": 5.100539480137323e-06, "loss": 0.2334, "step": 1500 }, { "epoch": 0.6006301693580149, "grad_norm": 4.282456398010254, "learning_rate": 4.977930358018637e-06, "loss": 0.2082, "step": 1525 }, { "epoch": 0.6104765655769988, "grad_norm": 3.605886697769165, "learning_rate": 4.855321235899951e-06, "loss": 0.2204, "step": 1550 }, { "epoch": 0.6203229617959827, "grad_norm": 4.4538044929504395, "learning_rate": 4.732712113781266e-06, "loss": 0.1548, "step": 1575 }, { "epoch": 0.6301693580149665, "grad_norm": 2.746072769165039, "learning_rate": 4.61010299166258e-06, "loss": 0.2109, "step": 1600 }, { "epoch": 0.6400157542339504, "grad_norm": 4.928244113922119, "learning_rate": 4.487493869543895e-06, "loss": 0.18, "step": 1625 }, { "epoch": 0.6498621504529343, "grad_norm": 3.3110663890838623, "learning_rate": 4.364884747425209e-06, "loss": 0.1707, "step": 1650 }, { "epoch": 0.659708546671918, "grad_norm": 2.6316301822662354, "learning_rate": 4.2422756253065234e-06, "loss": 0.2035, "step": 1675 }, { "epoch": 0.6695549428909019, "grad_norm": 4.228478908538818, "learning_rate": 4.119666503187837e-06, "loss": 0.201, "step": 1700 }, { "epoch": 0.6794013391098858, "grad_norm": 4.702217102050781, "learning_rate": 3.997057381069152e-06, "loss": 0.1919, "step": 1725 }, { "epoch": 0.6892477353288696, "grad_norm": 4.416632652282715, "learning_rate": 3.874448258950466e-06, "loss": 0.1837, "step": 1750 }, { "epoch": 0.6990941315478535, "grad_norm": 4.304840087890625, "learning_rate": 3.7518391368317807e-06, "loss": 0.1866, "step": 1775 }, { "epoch": 0.7089405277668374, "grad_norm": 2.635377883911133, "learning_rate": 3.6292300147130947e-06, "loss": 0.1949, "step": 1800 }, { "epoch": 0.7187869239858212, "grad_norm": 4.77269172668457, "learning_rate": 3.5066208925944096e-06, "loss": 0.202, "step": 1825 }, { "epoch": 0.7286333202048051, "grad_norm": 3.0173046588897705, "learning_rate": 3.3840117704757236e-06, "loss": 0.1867, "step": 1850 }, { "epoch": 0.7384797164237888, "grad_norm": 3.118154764175415, "learning_rate": 3.261402648357038e-06, "loss": 0.1681, "step": 1875 }, { "epoch": 0.7483261126427727, "grad_norm": 4.375317573547363, "learning_rate": 3.1387935262383525e-06, "loss": 0.1782, "step": 1900 }, { "epoch": 0.7581725088617566, "grad_norm": 5.0968122482299805, "learning_rate": 3.016184404119667e-06, "loss": 0.1862, "step": 1925 }, { "epoch": 0.7680189050807404, "grad_norm": 4.492552280426025, "learning_rate": 2.893575282000981e-06, "loss": 0.1951, "step": 1950 }, { "epoch": 0.7778653012997243, "grad_norm": 3.4022319316864014, "learning_rate": 2.7709661598822958e-06, "loss": 0.1873, "step": 1975 }, { "epoch": 0.7877116975187082, "grad_norm": 3.6453022956848145, "learning_rate": 2.6483570377636098e-06, "loss": 0.2134, "step": 2000 }, { "epoch": 0.797558093737692, "grad_norm": 2.7721006870269775, "learning_rate": 2.525747915644924e-06, "loss": 0.1731, "step": 2025 }, { "epoch": 0.8074044899566759, "grad_norm": 3.6965274810791016, "learning_rate": 2.4031387935262386e-06, "loss": 0.1581, "step": 2050 }, { "epoch": 0.8172508861756597, "grad_norm": 3.9200284481048584, "learning_rate": 2.2805296714075526e-06, "loss": 0.2, "step": 2075 }, { "epoch": 0.8270972823946435, "grad_norm": 3.353593587875366, "learning_rate": 2.157920549288867e-06, "loss": 0.1836, "step": 2100 }, { "epoch": 0.8369436786136274, "grad_norm": 3.3680572509765625, "learning_rate": 2.0353114271701815e-06, "loss": 0.2145, "step": 2125 }, { "epoch": 0.8467900748326113, "grad_norm": 2.7071352005004883, "learning_rate": 1.912702305051496e-06, "loss": 0.1837, "step": 2150 }, { "epoch": 0.8566364710515951, "grad_norm": 3.3706490993499756, "learning_rate": 1.7900931829328103e-06, "loss": 0.1642, "step": 2175 }, { "epoch": 0.866482867270579, "grad_norm": 3.4134597778320312, "learning_rate": 1.6674840608141246e-06, "loss": 0.1829, "step": 2200 }, { "epoch": 0.8763292634895629, "grad_norm": 4.351123809814453, "learning_rate": 1.544874938695439e-06, "loss": 0.1766, "step": 2225 }, { "epoch": 0.8861756597085467, "grad_norm": 4.491402626037598, "learning_rate": 1.4222658165767534e-06, "loss": 0.1793, "step": 2250 }, { "epoch": 0.8960220559275305, "grad_norm": 4.368827819824219, "learning_rate": 1.2996566944580676e-06, "loss": 0.1871, "step": 2275 }, { "epoch": 0.9058684521465143, "grad_norm": 3.653308868408203, "learning_rate": 1.1770475723393823e-06, "loss": 0.1685, "step": 2300 }, { "epoch": 0.9157148483654982, "grad_norm": 4.542746067047119, "learning_rate": 1.0544384502206965e-06, "loss": 0.1921, "step": 2325 }, { "epoch": 0.9255612445844821, "grad_norm": 3.869722366333008, "learning_rate": 9.318293281020109e-07, "loss": 0.1985, "step": 2350 }, { "epoch": 0.935407640803466, "grad_norm": 5.539394378662109, "learning_rate": 8.092202059833253e-07, "loss": 0.233, "step": 2375 }, { "epoch": 0.9452540370224498, "grad_norm": 4.2061872482299805, "learning_rate": 6.866110838646396e-07, "loss": 0.1844, "step": 2400 }, { "epoch": 0.9551004332414337, "grad_norm": 5.669102191925049, "learning_rate": 5.64001961745954e-07, "loss": 0.208, "step": 2425 }, { "epoch": 0.9649468294604174, "grad_norm": 2.2702748775482178, "learning_rate": 4.4139283962726833e-07, "loss": 0.188, "step": 2450 }, { "epoch": 0.9747932256794013, "grad_norm": 2.7447268962860107, "learning_rate": 3.187837175085827e-07, "loss": 0.1868, "step": 2475 }, { "epoch": 0.9846396218983852, "grad_norm": 4.356093406677246, "learning_rate": 1.9617459538989703e-07, "loss": 0.1993, "step": 2500 }, { "epoch": 0.994486018117369, "grad_norm": 3.6656863689422607, "learning_rate": 7.356547327121139e-08, "loss": 0.1833, "step": 2525 } ], "logging_steps": 25, "max_steps": 2539, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.172262754639872e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }