{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9040063466878223,
"eval_steps": 300,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01586671955573185,
"grad_norm": 0.6832552552223206,
"learning_rate": 2.6455026455026455e-06,
"loss": 1.3171,
"step": 10
},
{
"epoch": 0.0317334391114637,
"grad_norm": 0.9994391798973083,
"learning_rate": 5.291005291005291e-06,
"loss": 1.1922,
"step": 20
},
{
"epoch": 0.04760015866719556,
"grad_norm": 0.9905434846878052,
"learning_rate": 7.936507936507936e-06,
"loss": 1.2352,
"step": 30
},
{
"epoch": 0.0634668782229274,
"grad_norm": 1.0371067523956299,
"learning_rate": 1.0582010582010582e-05,
"loss": 1.1191,
"step": 40
},
{
"epoch": 0.07933359777865927,
"grad_norm": 0.9686773419380188,
"learning_rate": 1.3227513227513228e-05,
"loss": 1.0697,
"step": 50
},
{
"epoch": 0.09520031733439112,
"grad_norm": 0.5632955431938171,
"learning_rate": 1.5873015873015872e-05,
"loss": 0.741,
"step": 60
},
{
"epoch": 0.11106703689012297,
"grad_norm": 0.4180806875228882,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.6942,
"step": 70
},
{
"epoch": 0.1269337564458548,
"grad_norm": 0.3943168520927429,
"learning_rate": 2.1164021164021164e-05,
"loss": 0.6314,
"step": 80
},
{
"epoch": 0.14280047600158668,
"grad_norm": 0.3465676009654999,
"learning_rate": 2.380952380952381e-05,
"loss": 0.6257,
"step": 90
},
{
"epoch": 0.15866719555731854,
"grad_norm": 0.3200153708457947,
"learning_rate": 2.6455026455026456e-05,
"loss": 0.5426,
"step": 100
},
{
"epoch": 0.1745339151130504,
"grad_norm": 0.28294724225997925,
"learning_rate": 2.91005291005291e-05,
"loss": 0.5209,
"step": 110
},
{
"epoch": 0.19040063466878224,
"grad_norm": 0.3477802574634552,
"learning_rate": 3.1746031746031745e-05,
"loss": 0.4633,
"step": 120
},
{
"epoch": 0.2062673542245141,
"grad_norm": 0.36890918016433716,
"learning_rate": 3.439153439153439e-05,
"loss": 0.3798,
"step": 130
},
{
"epoch": 0.22213407378024594,
"grad_norm": 0.38711702823638916,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.4397,
"step": 140
},
{
"epoch": 0.2380007933359778,
"grad_norm": 0.4116644859313965,
"learning_rate": 3.968253968253968e-05,
"loss": 0.4021,
"step": 150
},
{
"epoch": 0.2538675128917096,
"grad_norm": 0.402497798204422,
"learning_rate": 4.232804232804233e-05,
"loss": 0.3796,
"step": 160
},
{
"epoch": 0.2697342324474415,
"grad_norm": 0.6916673183441162,
"learning_rate": 4.4973544973544974e-05,
"loss": 0.4462,
"step": 170
},
{
"epoch": 0.28560095200317337,
"grad_norm": 0.49015486240386963,
"learning_rate": 4.761904761904762e-05,
"loss": 0.3902,
"step": 180
},
{
"epoch": 0.3014676715589052,
"grad_norm": 0.4605613648891449,
"learning_rate": 4.999995736158938e-05,
"loss": 0.3751,
"step": 190
},
{
"epoch": 0.31733439111463707,
"grad_norm": 0.3663236200809479,
"learning_rate": 4.999484092829756e-05,
"loss": 0.3399,
"step": 200
},
{
"epoch": 0.3332011106703689,
"grad_norm": 0.6052913069725037,
"learning_rate": 4.998119881260576e-05,
"loss": 0.3738,
"step": 210
},
{
"epoch": 0.3490678302261008,
"grad_norm": 0.5979182720184326,
"learning_rate": 4.995903566780805e-05,
"loss": 0.3732,
"step": 220
},
{
"epoch": 0.3649345497818326,
"grad_norm": 0.7640148401260376,
"learning_rate": 4.992835905370186e-05,
"loss": 0.4339,
"step": 230
},
{
"epoch": 0.3808012693375645,
"grad_norm": 0.5569157600402832,
"learning_rate": 4.988917943400924e-05,
"loss": 0.3893,
"step": 240
},
{
"epoch": 0.3966679888932963,
"grad_norm": 0.6464380621910095,
"learning_rate": 4.9841510172807834e-05,
"loss": 0.3642,
"step": 250
},
{
"epoch": 0.4125347084490282,
"grad_norm": 0.5494194030761719,
"learning_rate": 4.97853675299723e-05,
"loss": 0.3481,
"step": 260
},
{
"epoch": 0.42840142800476,
"grad_norm": 0.5208620429039001,
"learning_rate": 4.972077065562821e-05,
"loss": 0.3968,
"step": 270
},
{
"epoch": 0.4442681475604919,
"grad_norm": 0.6078989505767822,
"learning_rate": 4.964774158361991e-05,
"loss": 0.337,
"step": 280
},
{
"epoch": 0.4601348671162237,
"grad_norm": 0.5791096687316895,
"learning_rate": 4.956630522399487e-05,
"loss": 0.3495,
"step": 290
},
{
"epoch": 0.4760015866719556,
"grad_norm": 0.5267443060874939,
"learning_rate": 4.947648935450689e-05,
"loss": 0.3191,
"step": 300
},
{
"epoch": 0.4760015866719556,
"eval_loss": 0.38255831599235535,
"eval_runtime": 93.6436,
"eval_samples_per_second": 5.991,
"eval_steps_per_second": 5.991,
"step": 300
},
{
"epoch": 0.4918683062276874,
"grad_norm": 0.42828086018562317,
"learning_rate": 4.937832461114123e-05,
"loss": 0.3164,
"step": 310
},
{
"epoch": 0.5077350257834192,
"grad_norm": 0.5684987306594849,
"learning_rate": 4.927184447766467e-05,
"loss": 0.3047,
"step": 320
},
{
"epoch": 0.5236017453391512,
"grad_norm": 0.6015241742134094,
"learning_rate": 4.915708527420435e-05,
"loss": 0.2568,
"step": 330
},
{
"epoch": 0.539468464894883,
"grad_norm": 0.7351698279380798,
"learning_rate": 4.903408614485899e-05,
"loss": 0.3554,
"step": 340
},
{
"epoch": 0.5553351844506148,
"grad_norm": 0.5476783514022827,
"learning_rate": 4.890288904434699e-05,
"loss": 0.2934,
"step": 350
},
{
"epoch": 0.5712019040063467,
"grad_norm": 0.7653456330299377,
"learning_rate": 4.8763538723695726e-05,
"loss": 0.3548,
"step": 360
},
{
"epoch": 0.5870686235620786,
"grad_norm": 0.5909974575042725,
"learning_rate": 4.8616082714977097e-05,
"loss": 0.2958,
"step": 370
},
{
"epoch": 0.6029353431178104,
"grad_norm": 0.8618314266204834,
"learning_rate": 4.8460571315094456e-05,
"loss": 0.3323,
"step": 380
},
{
"epoch": 0.6188020626735422,
"grad_norm": 0.43425798416137695,
"learning_rate": 4.829705756862642e-05,
"loss": 0.3256,
"step": 390
},
{
"epoch": 0.6346687822292741,
"grad_norm": 0.6728402972221375,
"learning_rate": 4.812559724973355e-05,
"loss": 0.3289,
"step": 400
},
{
"epoch": 0.650535501785006,
"grad_norm": 0.6804693341255188,
"learning_rate": 4.79462488431338e-05,
"loss": 0.3494,
"step": 410
},
{
"epoch": 0.6664022213407378,
"grad_norm": 0.7322263121604919,
"learning_rate": 4.775907352415367e-05,
"loss": 0.286,
"step": 420
},
{
"epoch": 0.6822689408964696,
"grad_norm": 0.5389537215232849,
"learning_rate": 4.75641351378613e-05,
"loss": 0.2597,
"step": 430
},
{
"epoch": 0.6981356604522015,
"grad_norm": 0.7679384350776672,
"learning_rate": 4.7361500177289156e-05,
"loss": 0.3265,
"step": 440
},
{
"epoch": 0.7140023800079334,
"grad_norm": 0.9662689566612244,
"learning_rate": 4.715123776075336e-05,
"loss": 0.3493,
"step": 450
},
{
"epoch": 0.7298690995636652,
"grad_norm": 0.7264727354049683,
"learning_rate": 4.693341960827764e-05,
"loss": 0.3412,
"step": 460
},
{
"epoch": 0.745735819119397,
"grad_norm": 0.6046968698501587,
"learning_rate": 4.670812001712973e-05,
"loss": 0.349,
"step": 470
},
{
"epoch": 0.761602538675129,
"grad_norm": 0.7867446541786194,
"learning_rate": 4.647541583647883e-05,
"loss": 0.3394,
"step": 480
},
{
"epoch": 0.7774692582308608,
"grad_norm": 0.7447336912155151,
"learning_rate": 4.623538644118244e-05,
"loss": 0.3738,
"step": 490
},
{
"epoch": 0.7933359777865926,
"grad_norm": 0.5049989819526672,
"learning_rate": 4.5988113704711846e-05,
"loss": 0.2899,
"step": 500
},
{
"epoch": 0.8092026973423245,
"grad_norm": 0.8148102164268494,
"learning_rate": 4.573368197122524e-05,
"loss": 0.3144,
"step": 510
},
{
"epoch": 0.8250694168980564,
"grad_norm": 0.7628346681594849,
"learning_rate": 4.547217802679814e-05,
"loss": 0.2996,
"step": 520
},
{
"epoch": 0.8409361364537882,
"grad_norm": 0.6373751759529114,
"learning_rate": 4.520369106982084e-05,
"loss": 0.2887,
"step": 530
},
{
"epoch": 0.85680285600952,
"grad_norm": 0.6852309107780457,
"learning_rate": 4.4928312680573064e-05,
"loss": 0.2862,
"step": 540
},
{
"epoch": 0.8726695755652519,
"grad_norm": 0.7369856238365173,
"learning_rate": 4.464613678998612e-05,
"loss": 0.3386,
"step": 550
},
{
"epoch": 0.8885362951209838,
"grad_norm": 0.6538604497909546,
"learning_rate": 4.435725964760331e-05,
"loss": 0.3225,
"step": 560
},
{
"epoch": 0.9044030146767156,
"grad_norm": 0.8635338544845581,
"learning_rate": 4.406177978874941e-05,
"loss": 0.3328,
"step": 570
},
{
"epoch": 0.9202697342324474,
"grad_norm": 0.9363442063331604,
"learning_rate": 4.3759798000920496e-05,
"loss": 0.3315,
"step": 580
},
{
"epoch": 0.9361364537881793,
"grad_norm": 0.7191005349159241,
"learning_rate": 4.3451417289405586e-05,
"loss": 0.271,
"step": 590
},
{
"epoch": 0.9520031733439112,
"grad_norm": 0.7281237244606018,
"learning_rate": 4.313674284215176e-05,
"loss": 0.2945,
"step": 600
},
{
"epoch": 0.9520031733439112,
"eval_loss": 0.3320656418800354,
"eval_runtime": 93.5162,
"eval_samples_per_second": 5.999,
"eval_steps_per_second": 5.999,
"step": 600
},
{
"epoch": 0.967869892899643,
"grad_norm": 0.6777219176292419,
"learning_rate": 4.281588199388476e-05,
"loss": 0.2844,
"step": 610
},
{
"epoch": 0.9837366124553748,
"grad_norm": 0.7066994905471802,
"learning_rate": 4.248894418949746e-05,
"loss": 0.3348,
"step": 620
},
{
"epoch": 0.9996033320111067,
"grad_norm": 0.5948446989059448,
"learning_rate": 4.215604094671835e-05,
"loss": 0.2532,
"step": 630
},
{
"epoch": 1.0154700515668384,
"grad_norm": 0.8124284744262695,
"learning_rate": 4.181728581807316e-05,
"loss": 0.3068,
"step": 640
},
{
"epoch": 1.0313367711225705,
"grad_norm": 0.6020251512527466,
"learning_rate": 4.1472794352152366e-05,
"loss": 0.2912,
"step": 650
},
{
"epoch": 1.0472034906783023,
"grad_norm": 0.9106693863868713,
"learning_rate": 4.112268405419782e-05,
"loss": 0.2739,
"step": 660
},
{
"epoch": 1.0630702102340341,
"grad_norm": 0.9437083005905151,
"learning_rate": 4.076707434602194e-05,
"loss": 0.2563,
"step": 670
},
{
"epoch": 1.078936929789766,
"grad_norm": 0.7824203968048096,
"learning_rate": 4.040608652527328e-05,
"loss": 0.2898,
"step": 680
},
{
"epoch": 1.0948036493454978,
"grad_norm": 0.8340286612510681,
"learning_rate": 4.003984372406212e-05,
"loss": 0.2665,
"step": 690
},
{
"epoch": 1.1106703689012296,
"grad_norm": 0.7771459817886353,
"learning_rate": 3.966847086696045e-05,
"loss": 0.2711,
"step": 700
},
{
"epoch": 1.1265370884569614,
"grad_norm": 0.6131294369697571,
"learning_rate": 3.929209462839041e-05,
"loss": 0.2825,
"step": 710
},
{
"epoch": 1.1424038080126935,
"grad_norm": 0.8090108633041382,
"learning_rate": 3.891084338941603e-05,
"loss": 0.2725,
"step": 720
},
{
"epoch": 1.1582705275684253,
"grad_norm": 0.6462133526802063,
"learning_rate": 3.852484719395264e-05,
"loss": 0.2406,
"step": 730
},
{
"epoch": 1.1741372471241571,
"grad_norm": 0.7676876783370972,
"learning_rate": 3.8134237704409295e-05,
"loss": 0.2648,
"step": 740
},
{
"epoch": 1.190003966679889,
"grad_norm": 0.8399226665496826,
"learning_rate": 3.773914815677897e-05,
"loss": 0.2693,
"step": 750
},
{
"epoch": 1.2058706862356208,
"grad_norm": 0.7439931631088257,
"learning_rate": 3.733971331519206e-05,
"loss": 0.2602,
"step": 760
},
{
"epoch": 1.2217374057913526,
"grad_norm": 0.7992270588874817,
"learning_rate": 3.693606942594873e-05,
"loss": 0.2647,
"step": 770
},
{
"epoch": 1.2376041253470844,
"grad_norm": 0.8335320949554443,
"learning_rate": 3.65283541710455e-05,
"loss": 0.2723,
"step": 780
},
{
"epoch": 1.2534708449028162,
"grad_norm": 0.813829779624939,
"learning_rate": 3.611670662121234e-05,
"loss": 0.2285,
"step": 790
},
{
"epoch": 1.269337564458548,
"grad_norm": 1.047428846359253,
"learning_rate": 3.570126718847589e-05,
"loss": 0.2788,
"step": 800
},
{
"epoch": 1.28520428401428,
"grad_norm": 1.1291059255599976,
"learning_rate": 3.5282177578265296e-05,
"loss": 0.3008,
"step": 810
},
{
"epoch": 1.301071003570012,
"grad_norm": 0.6732133626937866,
"learning_rate": 3.485958074107677e-05,
"loss": 0.2446,
"step": 820
},
{
"epoch": 1.3169377231257438,
"grad_norm": 0.8102436065673828,
"learning_rate": 3.4433620823713564e-05,
"loss": 0.2646,
"step": 830
},
{
"epoch": 1.3328044426814756,
"grad_norm": 0.8745511770248413,
"learning_rate": 3.400444312011776e-05,
"loss": 0.2606,
"step": 840
},
{
"epoch": 1.3486711622372074,
"grad_norm": 0.8574343323707581,
"learning_rate": 3.3572194021810896e-05,
"loss": 0.2294,
"step": 850
},
{
"epoch": 1.3645378817929394,
"grad_norm": 0.9338182806968689,
"learning_rate": 3.3137020967960154e-05,
"loss": 0.2551,
"step": 860
},
{
"epoch": 1.3804046013486713,
"grad_norm": 0.7893859148025513,
"learning_rate": 3.269907239508714e-05,
"loss": 0.231,
"step": 870
},
{
"epoch": 1.396271320904403,
"grad_norm": 0.9416842460632324,
"learning_rate": 3.2258497686436606e-05,
"loss": 0.2528,
"step": 880
},
{
"epoch": 1.412138040460135,
"grad_norm": 0.7524838447570801,
"learning_rate": 3.181544712102216e-05,
"loss": 0.2669,
"step": 890
},
{
"epoch": 1.4280047600158667,
"grad_norm": 0.9508939981460571,
"learning_rate": 3.137007182236637e-05,
"loss": 0.2436,
"step": 900
},
{
"epoch": 1.4280047600158667,
"eval_loss": 0.31376519799232483,
"eval_runtime": 93.4332,
"eval_samples_per_second": 6.004,
"eval_steps_per_second": 6.004,
"step": 900
},
{
"epoch": 1.4438714795715986,
"grad_norm": 1.3880516290664673,
"learning_rate": 3.092252370695298e-05,
"loss": 0.2781,
"step": 910
},
{
"epoch": 1.4597381991273304,
"grad_norm": 0.7659589648246765,
"learning_rate": 3.0472955432408485e-05,
"loss": 0.294,
"step": 920
},
{
"epoch": 1.4756049186830622,
"grad_norm": 0.7849363088607788,
"learning_rate": 3.002152034543098e-05,
"loss": 0.2768,
"step": 930
},
{
"epoch": 1.491471638238794,
"grad_norm": 0.666022777557373,
"learning_rate": 2.9568372429483966e-05,
"loss": 0.2526,
"step": 940
},
{
"epoch": 1.5073383577945259,
"grad_norm": 0.9934321641921997,
"learning_rate": 2.9113666252272943e-05,
"loss": 0.2524,
"step": 950
},
{
"epoch": 1.5232050773502577,
"grad_norm": 0.8849703073501587,
"learning_rate": 2.865755691302272e-05,
"loss": 0.2905,
"step": 960
},
{
"epoch": 1.5390717969059897,
"grad_norm": 0.7805452942848206,
"learning_rate": 2.8200199989573432e-05,
"loss": 0.242,
"step": 970
},
{
"epoch": 1.5549385164617215,
"grad_norm": 0.9101535081863403,
"learning_rate": 2.7741751485313296e-05,
"loss": 0.2178,
"step": 980
},
{
"epoch": 1.5708052360174534,
"grad_norm": 1.0016871690750122,
"learning_rate": 2.728236777596621e-05,
"loss": 0.2738,
"step": 990
},
{
"epoch": 1.5866719555731852,
"grad_norm": 0.7558535933494568,
"learning_rate": 2.6822205556252383e-05,
"loss": 0.2363,
"step": 1000
},
{
"epoch": 1.6025386751289172,
"grad_norm": 0.886492908000946,
"learning_rate": 2.636142178644009e-05,
"loss": 0.2645,
"step": 1010
},
{
"epoch": 1.618405394684649,
"grad_norm": 0.9074681997299194,
"learning_rate": 2.590017363880691e-05,
"loss": 0.2616,
"step": 1020
},
{
"epoch": 1.6342721142403809,
"grad_norm": 0.7710486054420471,
"learning_rate": 2.5438618444028627e-05,
"loss": 0.2776,
"step": 1030
},
{
"epoch": 1.6501388337961127,
"grad_norm": 0.5658883452415466,
"learning_rate": 2.4976913637514103e-05,
"loss": 0.2259,
"step": 1040
},
{
"epoch": 1.6660055533518445,
"grad_norm": 0.871059000492096,
"learning_rate": 2.4515216705704395e-05,
"loss": 0.2106,
"step": 1050
},
{
"epoch": 1.6818722729075763,
"grad_norm": 0.6641427874565125,
"learning_rate": 2.405368513235453e-05,
"loss": 0.2242,
"step": 1060
},
{
"epoch": 1.6977389924633082,
"grad_norm": 0.8793797492980957,
"learning_rate": 2.359247634481615e-05,
"loss": 0.2555,
"step": 1070
},
{
"epoch": 1.71360571201904,
"grad_norm": 0.9106749296188354,
"learning_rate": 2.3131747660339394e-05,
"loss": 0.2903,
"step": 1080
},
{
"epoch": 1.7294724315747718,
"grad_norm": 0.8714670538902283,
"learning_rate": 2.2671656232412378e-05,
"loss": 0.2885,
"step": 1090
},
{
"epoch": 1.7453391511305036,
"grad_norm": 0.7964282631874084,
"learning_rate": 2.2212358997156445e-05,
"loss": 0.2579,
"step": 1100
},
{
"epoch": 1.7612058706862355,
"grad_norm": 0.855613648891449,
"learning_rate": 2.175401261979569e-05,
"loss": 0.2926,
"step": 1110
},
{
"epoch": 1.7770725902419675,
"grad_norm": 0.8496893048286438,
"learning_rate": 2.1296773441218787e-05,
"loss": 0.2404,
"step": 1120
},
{
"epoch": 1.7929393097976993,
"grad_norm": 0.7642855644226074,
"learning_rate": 2.084079742465142e-05,
"loss": 0.2471,
"step": 1130
},
{
"epoch": 1.8088060293534312,
"grad_norm": 0.8110942244529724,
"learning_rate": 2.0386240102457682e-05,
"loss": 0.236,
"step": 1140
},
{
"epoch": 1.824672748909163,
"grad_norm": 0.9976955056190491,
"learning_rate": 1.993325652308828e-05,
"loss": 0.2609,
"step": 1150
},
{
"epoch": 1.840539468464895,
"grad_norm": 0.8436864614486694,
"learning_rate": 1.9482001198193882e-05,
"loss": 0.2771,
"step": 1160
},
{
"epoch": 1.8564061880206268,
"grad_norm": 0.8823544979095459,
"learning_rate": 1.903262804992156e-05,
"loss": 0.2399,
"step": 1170
},
{
"epoch": 1.8722729075763587,
"grad_norm": 0.6062878370285034,
"learning_rate": 1.8585290358412297e-05,
"loss": 0.2344,
"step": 1180
},
{
"epoch": 1.8881396271320905,
"grad_norm": 0.7493007183074951,
"learning_rate": 1.8140140709517465e-05,
"loss": 0.2311,
"step": 1190
},
{
"epoch": 1.9040063466878223,
"grad_norm": 0.8461095094680786,
"learning_rate": 1.7697330942752193e-05,
"loss": 0.2414,
"step": 1200
},
{
"epoch": 1.9040063466878223,
"eval_loss": 0.3031667470932007,
"eval_runtime": 93.4886,
"eval_samples_per_second": 6.001,
"eval_steps_per_second": 6.001,
"step": 1200
}
],
"logging_steps": 10,
"max_steps": 1890,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.7498295064425267e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}