{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 99.84,
  "eval_steps": 200,
  "global_step": 7800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0128,
      "grad_norm": 37.46966889794853,
      "learning_rate": 6.410256410256411e-06,
      "loss": 38.1241,
      "loss_layer_12_head": 7.381371974945068,
      "loss_layer_18_head": 6.901525497436523,
      "loss_layer_24_head": 5.663371562957764,
      "loss_layer_30_head": 4.744361400604248,
      "loss_layer_36_head": 3.7597877979278564,
      "loss_layer_42_head": 2.159144163131714,
      "loss_layer_6_head": 7.8785858154296875,
      "step": 1
    },
    {
      "epoch": 0.064,
      "grad_norm": 26.449641086557047,
      "learning_rate": 3.205128205128205e-05,
      "loss": 35.0284,
      "loss_layer_12_head": 6.8439621925354,
      "loss_layer_18_head": 6.3815083503723145,
      "loss_layer_24_head": 5.158053398132324,
      "loss_layer_30_head": 4.249989032745361,
      "loss_layer_36_head": 3.3109536170959473,
      "loss_layer_42_head": 1.7714693546295166,
      "loss_layer_6_head": 7.365020751953125,
      "step": 5
    },
    {
      "epoch": 0.128,
      "grad_norm": 13.599673697128445,
      "learning_rate": 6.41025641025641e-05,
      "loss": 22.7694,
      "loss_layer_12_head": 4.75654935836792,
      "loss_layer_18_head": 4.304225921630859,
      "loss_layer_24_head": 3.241687297821045,
      "loss_layer_30_head": 2.528693675994873,
      "loss_layer_36_head": 1.794163703918457,
      "loss_layer_42_head": 0.9631277322769165,
      "loss_layer_6_head": 5.269097328186035,
      "step": 10
    },
    {
      "epoch": 0.192,
      "grad_norm": 14.661030316188471,
      "learning_rate": 9.615384615384617e-05,
      "loss": 22.865,
      "loss_layer_12_head": 4.691239356994629,
      "loss_layer_18_head": 4.312588214874268,
      "loss_layer_24_head": 3.2695541381835938,
      "loss_layer_30_head": 2.5162501335144043,
      "loss_layer_36_head": 1.796791672706604,
      "loss_layer_42_head": 1.0636465549468994,
      "loss_layer_6_head": 5.050011157989502,
      "step": 15
    },
    {
      "epoch": 0.256,
      "grad_norm": 5.107867753770457,
      "learning_rate": 0.0001282051282051282,
      "loss": 18.0737,
      "loss_layer_12_head": 3.7832603454589844,
      "loss_layer_18_head": 3.4329986572265625,
      "loss_layer_24_head": 2.522334575653076,
      "loss_layer_30_head": 1.9381910562515259,
      "loss_layer_36_head": 1.4090098142623901,
      "loss_layer_42_head": 0.7508165240287781,
      "loss_layer_6_head": 4.18397331237793,
      "step": 20
    },
    {
      "epoch": 0.32,
      "grad_norm": 5.267519415751645,
      "learning_rate": 0.00016025641025641026,
      "loss": 16.0614,
      "loss_layer_12_head": 3.398284912109375,
      "loss_layer_18_head": 3.0487191677093506,
      "loss_layer_24_head": 2.190309762954712,
      "loss_layer_30_head": 1.6072800159454346,
      "loss_layer_36_head": 1.0952484607696533,
      "loss_layer_42_head": 0.6513544321060181,
      "loss_layer_6_head": 3.761554718017578,
      "step": 25
    },
    {
      "epoch": 0.384,
      "grad_norm": 3.210790190285614,
      "learning_rate": 0.00019230769230769233,
      "loss": 14.4409,
      "loss_layer_12_head": 3.1237034797668457,
      "loss_layer_18_head": 2.803212881088257,
      "loss_layer_24_head": 1.9972496032714844,
      "loss_layer_30_head": 1.4782203435897827,
      "loss_layer_36_head": 0.9908267259597778,
      "loss_layer_42_head": 0.5320068597793579,
      "loss_layer_6_head": 3.485320568084717,
      "step": 30
    },
    {
      "epoch": 0.448,
      "grad_norm": 2.214427908312317,
      "learning_rate": 0.00022435897435897436,
      "loss": 12.8345,
      "loss_layer_12_head": 2.812263011932373,
      "loss_layer_18_head": 2.5024733543395996,
      "loss_layer_24_head": 1.7400400638580322,
      "loss_layer_30_head": 1.2413004636764526,
      "loss_layer_36_head": 0.7980144619941711,
      "loss_layer_42_head": 0.42239946126937866,
      "loss_layer_6_head": 3.1914687156677246,
      "step": 35
    },
    {
      "epoch": 0.512,
      "grad_norm": 1.8194382754752458,
      "learning_rate": 0.0002564102564102564,
      "loss": 11.813,
      "loss_layer_12_head": 2.6922152042388916,
      "loss_layer_18_head": 2.374938488006592,
      "loss_layer_24_head": 1.6287014484405518,
      "loss_layer_30_head": 1.1428781747817993,
      "loss_layer_36_head": 0.7217909693717957,
      "loss_layer_42_head": 0.368852436542511,
      "loss_layer_6_head": 3.0436530113220215,
      "step": 40
    },
    {
      "epoch": 0.576,
      "grad_norm": 1.420722649608212,
      "learning_rate": 0.0002884615384615385,
      "loss": 10.8442,
      "loss_layer_12_head": 2.503904342651367,
      "loss_layer_18_head": 2.209184169769287,
      "loss_layer_24_head": 1.4891456365585327,
      "loss_layer_30_head": 1.0462461709976196,
      "loss_layer_36_head": 0.6369980573654175,
      "loss_layer_42_head": 0.32643309235572815,
      "loss_layer_6_head": 2.8512232303619385,
      "step": 45
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.2761969986146626,
      "learning_rate": 0.0003205128205128205,
      "loss": 10.2015,
      "loss_layer_12_head": 2.3154778480529785,
      "loss_layer_18_head": 2.0404632091522217,
      "loss_layer_24_head": 1.3670313358306885,
      "loss_layer_30_head": 0.9330158233642578,
      "loss_layer_36_head": 0.565844714641571,
      "loss_layer_42_head": 0.2868990898132324,
      "loss_layer_6_head": 2.67217755317688,
      "step": 50
    },
    {
      "epoch": 0.704,
      "grad_norm": 1.0630403550636491,
      "learning_rate": 0.0003525641025641026,
      "loss": 9.5749,
      "loss_layer_12_head": 2.2206552028656006,
      "loss_layer_18_head": 1.9401308298110962,
      "loss_layer_24_head": 1.2793065309524536,
      "loss_layer_30_head": 0.874772846698761,
      "loss_layer_36_head": 0.528229832649231,
      "loss_layer_42_head": 0.2617526948451996,
      "loss_layer_6_head": 2.578800678253174,
      "step": 55
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.9447109889665385,
      "learning_rate": 0.00038461538461538467,
      "loss": 9.0692,
      "loss_layer_12_head": 2.1164443492889404,
      "loss_layer_18_head": 1.8493837118148804,
      "loss_layer_24_head": 1.2103344202041626,
      "loss_layer_30_head": 0.8208127021789551,
      "loss_layer_36_head": 0.4829682409763336,
      "loss_layer_42_head": 0.24367626011371613,
      "loss_layer_6_head": 2.4864346981048584,
      "step": 60
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.8805324960486642,
      "learning_rate": 0.00041666666666666664,
      "loss": 8.6596,
      "loss_layer_12_head": 1.9864718914031982,
      "loss_layer_18_head": 1.7390094995498657,
      "loss_layer_24_head": 1.1169682741165161,
      "loss_layer_30_head": 0.7481921911239624,
      "loss_layer_36_head": 0.44111114740371704,
      "loss_layer_42_head": 0.22223806381225586,
      "loss_layer_6_head": 2.3423190116882324,
      "step": 65
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.8173380721735507,
      "learning_rate": 0.0004487179487179487,
      "loss": 8.1995,
      "loss_layer_12_head": 1.9407764673233032,
      "loss_layer_18_head": 1.6924203634262085,
      "loss_layer_24_head": 1.0880826711654663,
      "loss_layer_30_head": 0.7236670851707458,
      "loss_layer_36_head": 0.42235738039016724,
      "loss_layer_42_head": 0.20802442729473114,
      "loss_layer_6_head": 2.3072762489318848,
      "step": 70
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.7651660379980307,
      "learning_rate": 0.0004807692307692308,
      "loss": 8.0029,
      "loss_layer_12_head": 1.9295625686645508,
      "loss_layer_18_head": 1.6837472915649414,
      "loss_layer_24_head": 1.087652325630188,
      "loss_layer_30_head": 0.7212607264518738,
      "loss_layer_36_head": 0.418633371591568,
      "loss_layer_42_head": 0.20959711074829102,
      "loss_layer_6_head": 2.2810096740722656,
      "step": 75
    },
    {
      "epoch": 1.024,
      "grad_norm": 0.7337383131358958,
      "learning_rate": 0.0005128205128205128,
      "loss": 7.5284,
      "loss_layer_12_head": 1.777679443359375,
      "loss_layer_18_head": 1.5342943668365479,
      "loss_layer_24_head": 0.9637476801872253,
      "loss_layer_30_head": 0.6230955719947815,
      "loss_layer_36_head": 0.3595765233039856,
      "loss_layer_42_head": 0.1875569075345993,
      "loss_layer_6_head": 2.144977331161499,
      "step": 80
    },
    {
      "epoch": 1.088,
      "grad_norm": 0.7009584419229203,
      "learning_rate": 0.0005448717948717948,
      "loss": 7.1368,
      "loss_layer_12_head": 1.696702241897583,
      "loss_layer_18_head": 1.4718778133392334,
      "loss_layer_24_head": 0.9216338396072388,
      "loss_layer_30_head": 0.5889676213264465,
      "loss_layer_36_head": 0.3432418406009674,
      "loss_layer_42_head": 0.17707733809947968,
      "loss_layer_6_head": 2.0628068447113037,
      "step": 85
    },
    {
      "epoch": 1.152,
      "grad_norm": 0.7249431531459248,
      "learning_rate": 0.000576923076923077,
      "loss": 6.9598,
      "loss_layer_12_head": 1.6328859329223633,
      "loss_layer_18_head": 1.4174306392669678,
      "loss_layer_24_head": 0.8910695910453796,
      "loss_layer_30_head": 0.5765085816383362,
      "loss_layer_36_head": 0.3352130651473999,
      "loss_layer_42_head": 0.18452748656272888,
      "loss_layer_6_head": 2.002959728240967,
      "step": 90
    },
    {
      "epoch": 1.216,
      "grad_norm": 0.713897037635612,
      "learning_rate": 0.000608974358974359,
      "loss": 6.8369,
      "loss_layer_12_head": 1.621018648147583,
      "loss_layer_18_head": 1.3990185260772705,
      "loss_layer_24_head": 0.8698197603225708,
      "loss_layer_30_head": 0.5596818923950195,
      "loss_layer_36_head": 0.32750824093818665,
      "loss_layer_42_head": 0.1740235984325409,
      "loss_layer_6_head": 1.9899883270263672,
      "step": 95
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.7355830580076791,
      "learning_rate": 0.000641025641025641,
      "loss": 6.6869,
      "loss_layer_12_head": 1.5669212341308594,
      "loss_layer_18_head": 1.356851577758789,
      "loss_layer_24_head": 0.8450614213943481,
      "loss_layer_30_head": 0.5401099920272827,
      "loss_layer_36_head": 0.3211931884288788,
      "loss_layer_42_head": 0.17946001887321472,
      "loss_layer_6_head": 1.9455448389053345,
      "step": 100
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 0.7465867943312552,
      "learning_rate": 0.000673076923076923,
      "loss": 6.4941,
      "loss_layer_12_head": 1.5120972394943237,
      "loss_layer_18_head": 1.3155301809310913,
      "loss_layer_24_head": 0.8143423199653625,
      "loss_layer_30_head": 0.5246442556381226,
      "loss_layer_36_head": 0.310540109872818,
      "loss_layer_42_head": 0.17187541723251343,
      "loss_layer_6_head": 1.895432710647583,
      "step": 105
    },
    {
      "epoch": 1.408,
      "grad_norm": 0.6921590973909223,
      "learning_rate": 0.0007051282051282052,
      "loss": 6.3462,
      "loss_layer_12_head": 1.4897682666778564,
      "loss_layer_18_head": 1.2878010272979736,
      "loss_layer_24_head": 0.7983564138412476,
      "loss_layer_30_head": 0.5089081525802612,
      "loss_layer_36_head": 0.30936819314956665,
      "loss_layer_42_head": 0.1655205488204956,
      "loss_layer_6_head": 1.8700761795043945,
      "step": 110
    },
    {
      "epoch": 1.472,
      "grad_norm": 0.8164224053940632,
      "learning_rate": 0.0007371794871794872,
      "loss": 6.3803,
      "loss_layer_12_head": 1.4769477844238281,
      "loss_layer_18_head": 1.2795255184173584,
      "loss_layer_24_head": 0.800942063331604,
      "loss_layer_30_head": 0.5113844871520996,
      "loss_layer_36_head": 0.30834177136421204,
      "loss_layer_42_head": 0.16064013540744781,
      "loss_layer_6_head": 1.8500779867172241,
      "step": 115
    },
    {
      "epoch": 1.536,
      "grad_norm": 0.9347288444957269,
      "learning_rate": 0.0007692307692307693,
      "loss": 6.2038,
      "loss_layer_12_head": 1.412451148033142,
      "loss_layer_18_head": 1.2162846326828003,
      "loss_layer_24_head": 0.7555257678031921,
      "loss_layer_30_head": 0.486905962228775,
      "loss_layer_36_head": 0.29446402192115784,
      "loss_layer_42_head": 0.14996713399887085,
      "loss_layer_6_head": 1.7827972173690796,
      "step": 120
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.0195412041401324,
      "learning_rate": 0.0008012820512820514,
      "loss": 6.1792,
      "loss_layer_12_head": 1.4261515140533447,
      "loss_layer_18_head": 1.2306697368621826,
      "loss_layer_24_head": 0.7668939828872681,
      "loss_layer_30_head": 0.49948734045028687,
      "loss_layer_36_head": 0.2962823510169983,
      "loss_layer_42_head": 0.1473720371723175,
      "loss_layer_6_head": 1.7997850179672241,
      "step": 125
    },
    {
      "epoch": 1.6640000000000001,
      "grad_norm": 1.202758573196248,
      "learning_rate": 0.0008333333333333333,
      "loss": 6.1759,
      "loss_layer_12_head": 1.3696413040161133,
      "loss_layer_18_head": 1.1943457126617432,
      "loss_layer_24_head": 0.7365537881851196,
      "loss_layer_30_head": 0.4711362421512604,
      "loss_layer_36_head": 0.2902056574821472,
      "loss_layer_42_head": 0.14509081840515137,
      "loss_layer_6_head": 1.7396440505981445,
      "step": 130
    },
    {
      "epoch": 1.728,
      "grad_norm": 1.3920792917165532,
      "learning_rate": 0.0008653846153846154,
      "loss": 6.1628,
      "loss_layer_12_head": 1.4630802869796753,
      "loss_layer_18_head": 1.2507648468017578,
      "loss_layer_24_head": 0.7907452583312988,
      "loss_layer_30_head": 0.4928191602230072,
      "loss_layer_36_head": 0.29016149044036865,
      "loss_layer_42_head": 0.18300706148147583,
      "loss_layer_6_head": 1.779584527015686,
      "step": 135
    },
    {
      "epoch": 1.792,
      "grad_norm": 1.7865661756407187,
      "learning_rate": 0.0008974358974358974,
      "loss": 6.2154,
      "loss_layer_12_head": 1.4242961406707764,
      "loss_layer_18_head": 1.2039217948913574,
      "loss_layer_24_head": 0.796029269695282,
      "loss_layer_30_head": 0.5012171864509583,
      "loss_layer_36_head": 0.28143981099128723,
      "loss_layer_42_head": 0.23380407691001892,
      "loss_layer_6_head": 1.7281856536865234,
      "step": 140
    },
    {
      "epoch": 1.8559999999999999,
      "grad_norm": 1.3391866291369003,
      "learning_rate": 0.0009294871794871796,
      "loss": 6.1459,
      "loss_layer_12_head": 1.4044183492660522,
      "loss_layer_18_head": 1.195464849472046,
      "loss_layer_24_head": 0.7732126116752625,
      "loss_layer_30_head": 0.5159396529197693,
      "loss_layer_36_head": 0.31018996238708496,
      "loss_layer_42_head": 0.23216986656188965,
      "loss_layer_6_head": 1.7235887050628662,
      "step": 145
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.232859650120295,
      "learning_rate": 0.0009615384615384616,
      "loss": 6.0659,
      "loss_layer_12_head": 1.351414680480957,
      "loss_layer_18_head": 1.161749005317688,
      "loss_layer_24_head": 0.7462221384048462,
      "loss_layer_30_head": 0.4966951906681061,
      "loss_layer_36_head": 0.30694982409477234,
      "loss_layer_42_head": 0.19564782083034515,
      "loss_layer_6_head": 1.6975170373916626,
      "step": 150
    },
    {
      "epoch": 1.984,
      "grad_norm": 1.4637350305351655,
      "learning_rate": 0.0009935897435897436,
      "loss": 6.0057,
      "loss_layer_12_head": 1.3477075099945068,
      "loss_layer_18_head": 1.1702073812484741,
      "loss_layer_24_head": 0.7490946650505066,
      "loss_layer_30_head": 0.4988594651222229,
      "loss_layer_36_head": 0.2953687906265259,
      "loss_layer_42_head": 0.21625740826129913,
      "loss_layer_6_head": 1.7067304849624634,
      "step": 155
    },
    {
      "epoch": 2.048,
      "grad_norm": 1.2595688783451227,
      "learning_rate": 0.0010256410256410256,
      "loss": 5.4249,
      "loss_layer_12_head": 1.2489827871322632,
      "loss_layer_18_head": 1.0638232231140137,
      "loss_layer_24_head": 0.6694899797439575,
      "loss_layer_30_head": 0.4312301576137543,
      "loss_layer_36_head": 0.25463834404945374,
      "loss_layer_42_head": 0.1779422163963318,
      "loss_layer_6_head": 1.6177845001220703,
      "step": 160
    },
    {
      "epoch": 2.112,
      "grad_norm": 1.3285144399488442,
      "learning_rate": 0.0010576923076923077,
      "loss": 5.1574,
      "loss_layer_12_head": 1.2045129537582397,
      "loss_layer_18_head": 1.0227335691452026,
      "loss_layer_24_head": 0.6365531086921692,
      "loss_layer_30_head": 0.3906553387641907,
      "loss_layer_36_head": 0.23611776530742645,
      "loss_layer_42_head": 0.1476689726114273,
      "loss_layer_6_head": 1.5717567205429077,
      "step": 165
    },
    {
      "epoch": 2.176,
      "grad_norm": 1.8242976717722958,
      "learning_rate": 0.0010897435897435897,
      "loss": 5.2334,
      "loss_layer_12_head": 1.1918764114379883,
      "loss_layer_18_head": 1.0200471878051758,
      "loss_layer_24_head": 0.6467212438583374,
      "loss_layer_30_head": 0.3948553800582886,
      "loss_layer_36_head": 0.23422153294086456,
      "loss_layer_42_head": 0.15323373675346375,
      "loss_layer_6_head": 1.5479586124420166,
      "step": 170
    },
    {
      "epoch": 2.24,
      "grad_norm": 2.7604160421919217,
      "learning_rate": 0.001121794871794872,
      "loss": 5.3358,
      "loss_layer_12_head": 1.2481719255447388,
      "loss_layer_18_head": 1.015689730644226,
      "loss_layer_24_head": 0.7251331210136414,
      "loss_layer_30_head": 0.3761669993400574,
      "loss_layer_36_head": 0.23120105266571045,
      "loss_layer_42_head": 0.14694413542747498,
      "loss_layer_6_head": 1.5622069835662842,
      "step": 175
    },
    {
      "epoch": 2.304,
      "grad_norm": 3.0160878801269173,
      "learning_rate": 0.001153846153846154,
      "loss": 5.5909,
      "loss_layer_12_head": 1.381043553352356,
      "loss_layer_18_head": 1.1257587671279907,
      "loss_layer_24_head": 0.8395994901657104,
      "loss_layer_30_head": 0.44287142157554626,
      "loss_layer_36_head": 0.2527807354927063,
      "loss_layer_42_head": 0.13897545635700226,
      "loss_layer_6_head": 1.6381677389144897,
      "step": 180
    },
    {
      "epoch": 2.368,
      "grad_norm": 2.650271324864963,
      "learning_rate": 0.001185897435897436,
      "loss": 5.9432,
      "loss_layer_12_head": 1.4114423990249634,
      "loss_layer_18_head": 1.3900816440582275,
      "loss_layer_24_head": 0.7341901063919067,
      "loss_layer_30_head": 0.49527424573898315,
      "loss_layer_36_head": 0.25369030237197876,
      "loss_layer_42_head": 0.12579014897346497,
      "loss_layer_6_head": 1.5585331916809082,
      "step": 185
    },
    {
      "epoch": 2.432,
      "grad_norm": 2.2057643269803804,
      "learning_rate": 0.001217948717948718,
      "loss": 6.1003,
      "loss_layer_12_head": 1.4078388214111328,
      "loss_layer_18_head": 1.5590273141860962,
      "loss_layer_24_head": 0.7257327437400818,
      "loss_layer_30_head": 0.5263029932975769,
      "loss_layer_36_head": 0.32185983657836914,
      "loss_layer_42_head": 0.14923664927482605,
      "loss_layer_6_head": 1.5773340463638306,
      "step": 190
    },
    {
      "epoch": 2.496,
      "grad_norm": 1.6016054466802783,
      "learning_rate": 0.00125,
      "loss": 5.8783,
      "loss_layer_12_head": 1.3602807521820068,
      "loss_layer_18_head": 1.3887226581573486,
      "loss_layer_24_head": 0.6914435625076294,
      "loss_layer_30_head": 0.4761180281639099,
      "loss_layer_36_head": 0.3037206530570984,
      "loss_layer_42_head": 0.18457774817943573,
      "loss_layer_6_head": 1.5961837768554688,
      "step": 195
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.5894494398292354,
      "learning_rate": 0.001282051282051282,
      "loss": 5.7055,
      "loss_layer_12_head": 1.2778801918029785,
      "loss_layer_18_head": 1.2522013187408447,
      "loss_layer_24_head": 0.6793658137321472,
      "loss_layer_30_head": 0.45726174116134644,
      "loss_layer_36_head": 0.30848923325538635,
      "loss_layer_42_head": 0.2648147940635681,
      "loss_layer_6_head": 1.557602882385254,
      "step": 200
    },
    {
      "epoch": 2.56,
      "eval_loss": 6.73535680770874,
      "eval_loss_layer_12_head": 1.4923317432403564,
      "eval_loss_layer_18_head": 1.420610785484314,
      "eval_loss_layer_24_head": 0.8735315203666687,
      "eval_loss_layer_30_head": 0.6246117353439331,
      "eval_loss_layer_36_head": 0.4657672047615051,
      "eval_loss_layer_42_head": 0.4023154377937317,
      "eval_loss_layer_6_head": 1.7853742837905884,
      "eval_runtime": 33.1303,
      "eval_samples_per_second": 9.659,
      "eval_steps_per_second": 0.604,
      "step": 200
    },
    {
      "epoch": 2.624,
      "grad_norm": 1.8128627683017784,
      "learning_rate": 0.001314102564102564,
      "loss": 5.5585,
      "loss_layer_12_head": 1.2553622722625732,
      "loss_layer_18_head": 1.1636579036712646,
      "loss_layer_24_head": 0.6378979682922363,
      "loss_layer_30_head": 0.4233826696872711,
      "loss_layer_36_head": 0.2628973126411438,
      "loss_layer_42_head": 0.221687912940979,
      "loss_layer_6_head": 1.5374953746795654,
      "step": 205
    },
    {
      "epoch": 2.6879999999999997,
      "grad_norm": 1.002823815544602,
      "learning_rate": 0.001346153846153846,
      "loss": 5.4613,
      "loss_layer_12_head": 1.2105135917663574,
      "loss_layer_18_head": 1.10488760471344,
      "loss_layer_24_head": 0.6486782431602478,
      "loss_layer_30_head": 0.4790101945400238,
      "loss_layer_36_head": 0.38131821155548096,
      "loss_layer_42_head": 0.2511555552482605,
      "loss_layer_6_head": 1.5660761594772339,
      "step": 210
    },
    {
      "epoch": 2.752,
      "grad_norm": 1.7095772384713652,
      "learning_rate": 0.0013782051282051283,
      "loss": 5.316,
      "loss_layer_12_head": 1.2337706089019775,
      "loss_layer_18_head": 1.0718581676483154,
      "loss_layer_24_head": 0.620266318321228,
      "loss_layer_30_head": 0.40365323424339294,
      "loss_layer_36_head": 0.27666035294532776,
      "loss_layer_42_head": 0.17687053978443146,
      "loss_layer_6_head": 1.5316277742385864,
      "step": 215
    },
    {
      "epoch": 2.816,
      "grad_norm": 1.714551805274401,
      "learning_rate": 0.0014102564102564104,
      "loss": 5.2826,
      "loss_layer_12_head": 1.2206809520721436,
      "loss_layer_18_head": 1.0399658679962158,
      "loss_layer_24_head": 0.6188268661499023,
      "loss_layer_30_head": 0.38949504494667053,
      "loss_layer_36_head": 0.28615182638168335,
      "loss_layer_42_head": 0.16158878803253174,
      "loss_layer_6_head": 1.5471827983856201,
      "step": 220
    },
    {
      "epoch": 2.88,
      "grad_norm": 1.5674921702883369,
      "learning_rate": 0.0014423076923076922,
      "loss": 5.2878,
      "loss_layer_12_head": 1.2041304111480713,
      "loss_layer_18_head": 1.0012842416763306,
      "loss_layer_24_head": 0.6141806840896606,
      "loss_layer_30_head": 0.38315606117248535,
      "loss_layer_36_head": 0.29070788621902466,
      "loss_layer_42_head": 0.16414299607276917,
      "loss_layer_6_head": 1.5484325885772705,
      "step": 225
    },
    {
      "epoch": 2.944,
      "grad_norm": 2.2148064115783215,
      "learning_rate": 0.0014743589743589744,
      "loss": 5.361,
      "loss_layer_12_head": 1.1711474657058716,
      "loss_layer_18_head": 0.989611029624939,
      "loss_layer_24_head": 0.6490617990493774,
      "loss_layer_30_head": 0.41618824005126953,
      "loss_layer_36_head": 0.3345591723918915,
      "loss_layer_42_head": 0.16940604150295258,
      "loss_layer_6_head": 1.6113455295562744,
      "step": 230
    },
    {
      "epoch": 3.008,
      "grad_norm": 2.7476882588188474,
      "learning_rate": 0.0015064102564102564,
      "loss": 5.3154,
      "loss_layer_12_head": 1.180027723312378,
      "loss_layer_18_head": 0.9775432348251343,
      "loss_layer_24_head": 0.6728541851043701,
      "loss_layer_30_head": 0.41325920820236206,
      "loss_layer_36_head": 0.3218420147895813,
      "loss_layer_42_head": 0.15750746428966522,
      "loss_layer_6_head": 1.7114267349243164,
      "step": 235
    },
    {
      "epoch": 3.072,
      "grad_norm": 1.8594191463899867,
      "learning_rate": 0.0015384615384615387,
      "loss": 4.9434,
      "loss_layer_12_head": 1.1500180959701538,
      "loss_layer_18_head": 0.8422005772590637,
      "loss_layer_24_head": 0.5771728754043579,
      "loss_layer_30_head": 0.354364275932312,
      "loss_layer_36_head": 0.2512372136116028,
      "loss_layer_42_head": 0.1405602991580963,
      "loss_layer_6_head": 1.543402910232544,
      "step": 240
    },
    {
      "epoch": 3.136,
      "grad_norm": 1.8623394445264356,
      "learning_rate": 0.0015705128205128205,
      "loss": 4.9496,
      "loss_layer_12_head": 1.1674667596817017,
      "loss_layer_18_head": 0.8629356622695923,
      "loss_layer_24_head": 0.6125825643539429,
      "loss_layer_30_head": 0.36996617913246155,
      "loss_layer_36_head": 0.2859323024749756,
      "loss_layer_42_head": 0.1452450454235077,
      "loss_layer_6_head": 1.510387659072876,
      "step": 245
    },
    {
      "epoch": 3.2,
      "grad_norm": 1.632577822842244,
      "learning_rate": 0.0016025641025641027,
      "loss": 4.8385,
      "loss_layer_12_head": 1.1753814220428467,
      "loss_layer_18_head": 0.8263545036315918,
      "loss_layer_24_head": 0.602365255355835,
      "loss_layer_30_head": 0.36572879552841187,
      "loss_layer_36_head": 0.2508179843425751,
      "loss_layer_42_head": 0.14004835486412048,
      "loss_layer_6_head": 1.4294288158416748,
      "step": 250
    },
    {
      "epoch": 3.2640000000000002,
      "grad_norm": 1.3539730910441845,
      "learning_rate": 0.0016346153846153847,
      "loss": 4.8845,
      "loss_layer_12_head": 1.1988013982772827,
      "loss_layer_18_head": 0.8767802119255066,
      "loss_layer_24_head": 0.6163928508758545,
      "loss_layer_30_head": 0.3718445897102356,
      "loss_layer_36_head": 0.2381191998720169,
      "loss_layer_42_head": 0.1942732334136963,
      "loss_layer_6_head": 1.4342732429504395,
      "step": 255
    },
    {
      "epoch": 3.328,
      "grad_norm": 1.6317964794238538,
      "learning_rate": 0.0016666666666666666,
      "loss": 4.9381,
      "loss_layer_12_head": 1.1393346786499023,
      "loss_layer_18_head": 0.8653410077095032,
      "loss_layer_24_head": 0.6422839760780334,
      "loss_layer_30_head": 0.39166685938835144,
      "loss_layer_36_head": 0.21941299736499786,
      "loss_layer_42_head": 0.19203431904315948,
      "loss_layer_6_head": 1.4244370460510254,
      "step": 260
    },
    {
      "epoch": 3.392,
      "grad_norm": 2.263488910232371,
      "learning_rate": 0.0016987179487179488,
      "loss": 4.8506,
      "loss_layer_12_head": 1.0970256328582764,
      "loss_layer_18_head": 0.8824162483215332,
      "loss_layer_24_head": 0.6799618005752563,
      "loss_layer_30_head": 0.41541925072669983,
      "loss_layer_36_head": 0.2162415087223053,
      "loss_layer_42_head": 0.19602268934249878,
      "loss_layer_6_head": 1.4223828315734863,
      "step": 265
    },
    {
      "epoch": 3.456,
      "grad_norm": 1.5473674903367038,
      "learning_rate": 0.0017307692307692308,
      "loss": 5.132,
      "loss_layer_12_head": 1.0690199136734009,
      "loss_layer_18_head": 0.9159833192825317,
      "loss_layer_24_head": 0.852130115032196,
      "loss_layer_30_head": 0.4602586627006531,
      "loss_layer_36_head": 0.22735771536827087,
      "loss_layer_42_head": 0.21244323253631592,
      "loss_layer_6_head": 1.3965208530426025,
      "step": 270
    },
    {
      "epoch": 3.52,
      "grad_norm": 1.4382976453996263,
      "learning_rate": 0.001762820512820513,
      "loss": 5.0897,
      "loss_layer_12_head": 1.0752629041671753,
      "loss_layer_18_head": 0.9956910014152527,
      "loss_layer_24_head": 0.7844030261039734,
      "loss_layer_30_head": 0.44828978180885315,
      "loss_layer_36_head": 0.2555540204048157,
      "loss_layer_42_head": 0.19376778602600098,
      "loss_layer_6_head": 1.422660231590271,
      "step": 275
    },
    {
      "epoch": 3.584,
      "grad_norm": 2.2002637704552033,
      "learning_rate": 0.0017948717948717949,
      "loss": 5.0483,
      "loss_layer_12_head": 1.0711702108383179,
      "loss_layer_18_head": 1.049375295639038,
      "loss_layer_24_head": 0.7038620114326477,
      "loss_layer_30_head": 0.3932981789112091,
      "loss_layer_36_head": 0.2658161520957947,
      "loss_layer_42_head": 0.19047731161117554,
      "loss_layer_6_head": 1.4400508403778076,
      "step": 280
    },
    {
      "epoch": 3.648,
      "grad_norm": 2.2350298323016338,
      "learning_rate": 0.001826923076923077,
      "loss": 5.1412,
      "loss_layer_12_head": 1.0619419813156128,
      "loss_layer_18_head": 1.1268839836120605,
      "loss_layer_24_head": 0.6878223419189453,
      "loss_layer_30_head": 0.3887704908847809,
      "loss_layer_36_head": 0.3069573640823364,
      "loss_layer_42_head": 0.20403429865837097,
      "loss_layer_6_head": 1.4380861520767212,
      "step": 285
    },
    {
      "epoch": 3.7119999999999997,
      "grad_norm": 1.7587774365139515,
      "learning_rate": 0.0018589743589743591,
      "loss": 5.157,
      "loss_layer_12_head": 1.0203802585601807,
      "loss_layer_18_head": 1.0854065418243408,
      "loss_layer_24_head": 0.6086921691894531,
      "loss_layer_30_head": 0.35945647954940796,
      "loss_layer_36_head": 0.3403290808200836,
      "loss_layer_42_head": 0.16456429660320282,
      "loss_layer_6_head": 1.4395874738693237,
      "step": 290
    },
    {
      "epoch": 3.776,
      "grad_norm": 1.6686618334285492,
      "learning_rate": 0.001891025641025641,
      "loss": 5.0356,
      "loss_layer_12_head": 1.068701982498169,
      "loss_layer_18_head": 1.0769846439361572,
      "loss_layer_24_head": 0.6416333913803101,
      "loss_layer_30_head": 0.3786800801753998,
      "loss_layer_36_head": 0.3261027932167053,
      "loss_layer_42_head": 0.17153556644916534,
      "loss_layer_6_head": 1.4446418285369873,
      "step": 295
    },
    {
      "epoch": 3.84,
      "grad_norm": 1.6412688965344944,
      "learning_rate": 0.0019230769230769232,
      "loss": 5.0676,
      "loss_layer_12_head": 1.1414475440979004,
      "loss_layer_18_head": 0.9928023219108582,
      "loss_layer_24_head": 0.5756922364234924,
      "loss_layer_30_head": 0.37908726930618286,
      "loss_layer_36_head": 0.303453266620636,
      "loss_layer_42_head": 0.1407882571220398,
      "loss_layer_6_head": 1.4420920610427856,
      "step": 300
    },
    {
      "epoch": 3.904,
      "grad_norm": 2.40352247158346,
      "learning_rate": 0.001955128205128205,
      "loss": 5.148,
      "loss_layer_12_head": 1.2359702587127686,
      "loss_layer_18_head": 0.9578679203987122,
      "loss_layer_24_head": 0.5664467215538025,
      "loss_layer_30_head": 0.47213059663772583,
      "loss_layer_36_head": 0.2848809063434601,
      "loss_layer_42_head": 0.13193312287330627,
      "loss_layer_6_head": 1.4372678995132446,
      "step": 305
    },
    {
      "epoch": 3.968,
      "grad_norm": 1.559874040817101,
      "learning_rate": 0.0019871794871794872,
      "loss": 5.2764,
      "loss_layer_12_head": 1.4110486507415771,
      "loss_layer_18_head": 0.8875546455383301,
      "loss_layer_24_head": 0.5397177338600159,
      "loss_layer_30_head": 0.4647239148616791,
      "loss_layer_36_head": 0.26638108491897583,
      "loss_layer_42_head": 0.14694492518901825,
      "loss_layer_6_head": 1.4002892971038818,
      "step": 310
    },
    {
      "epoch": 4.032,
      "grad_norm": 1.8123879661805182,
      "learning_rate": 0.0020192307692307693,
      "loss": 4.9416,
      "loss_layer_12_head": 1.2729299068450928,
      "loss_layer_18_head": 0.868046760559082,
      "loss_layer_24_head": 0.5329074859619141,
      "loss_layer_30_head": 0.43405523896217346,
      "loss_layer_36_head": 0.2538921535015106,
      "loss_layer_42_head": 0.1853492259979248,
      "loss_layer_6_head": 1.431146502494812,
      "step": 315
    },
    {
      "epoch": 4.096,
      "grad_norm": 1.2525152251140055,
      "learning_rate": 0.0020512820512820513,
      "loss": 4.5417,
      "loss_layer_12_head": 1.1057329177856445,
      "loss_layer_18_head": 0.7626581788063049,
      "loss_layer_24_head": 0.48418840765953064,
      "loss_layer_30_head": 0.4015658497810364,
      "loss_layer_36_head": 0.23147901892662048,
      "loss_layer_42_head": 0.19527146220207214,
      "loss_layer_6_head": 1.3196216821670532,
      "step": 320
    },
    {
      "epoch": 4.16,
      "grad_norm": 1.257348104360811,
      "learning_rate": 0.0020833333333333333,
      "loss": 4.496,
      "loss_layer_12_head": 1.067979097366333,
      "loss_layer_18_head": 0.7820265293121338,
      "loss_layer_24_head": 0.4917287230491638,
      "loss_layer_30_head": 0.3839060366153717,
      "loss_layer_36_head": 0.22729161381721497,
      "loss_layer_42_head": 0.19780611991882324,
      "loss_layer_6_head": 1.3909142017364502,
      "step": 325
    },
    {
      "epoch": 4.224,
      "grad_norm": 2.22371753427275,
      "learning_rate": 0.0021153846153846153,
      "loss": 4.6205,
      "loss_layer_12_head": 1.0380141735076904,
      "loss_layer_18_head": 0.8120869398117065,
      "loss_layer_24_head": 0.5308312177658081,
      "loss_layer_30_head": 0.3838871121406555,
      "loss_layer_36_head": 0.21532897651195526,
      "loss_layer_42_head": 0.16444726288318634,
      "loss_layer_6_head": 1.519095540046692,
      "step": 330
    },
    {
      "epoch": 4.288,
      "grad_norm": 1.5458235702482288,
      "learning_rate": 0.0021474358974358974,
      "loss": 4.6074,
      "loss_layer_12_head": 0.9993322491645813,
      "loss_layer_18_head": 0.8261927366256714,
      "loss_layer_24_head": 0.5410993695259094,
      "loss_layer_30_head": 0.38132694363594055,
      "loss_layer_36_head": 0.22273953258991241,
      "loss_layer_42_head": 0.14796359837055206,
      "loss_layer_6_head": 1.5583345890045166,
      "step": 335
    },
    {
      "epoch": 4.352,
      "grad_norm": 2.065552699038677,
      "learning_rate": 0.0021794871794871794,
      "loss": 4.8012,
      "loss_layer_12_head": 0.9577761888504028,
      "loss_layer_18_head": 0.943595290184021,
      "loss_layer_24_head": 0.6405254006385803,
      "loss_layer_30_head": 0.3599250316619873,
      "loss_layer_36_head": 0.2304101437330246,
      "loss_layer_42_head": 0.16143080592155457,
      "loss_layer_6_head": 1.449658751487732,
      "step": 340
    },
    {
      "epoch": 4.416,
      "grad_norm": 1.9164712312773848,
      "learning_rate": 0.0022115384615384614,
      "loss": 5.156,
      "loss_layer_12_head": 1.0099929571151733,
      "loss_layer_18_head": 1.1876944303512573,
      "loss_layer_24_head": 0.8076890707015991,
      "loss_layer_30_head": 0.35284629464149475,
      "loss_layer_36_head": 0.2251063883304596,
      "loss_layer_42_head": 0.1491672843694687,
      "loss_layer_6_head": 1.4466583728790283,
      "step": 345
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.1959281262833081,
      "learning_rate": 0.002243589743589744,
      "loss": 4.9407,
      "loss_layer_12_head": 1.0012428760528564,
      "loss_layer_18_head": 1.1133602857589722,
      "loss_layer_24_head": 0.7792037725448608,
      "loss_layer_30_head": 0.3430318236351013,
      "loss_layer_36_head": 0.25059375166893005,
      "loss_layer_42_head": 0.1691216677427292,
      "loss_layer_6_head": 1.3917280435562134,
      "step": 350
    },
    {
      "epoch": 4.5440000000000005,
      "grad_norm": 1.2019935676567937,
      "learning_rate": 0.0022756410256410254,
      "loss": 4.854,
      "loss_layer_12_head": 0.9871445894241333,
      "loss_layer_18_head": 1.0343468189239502,
      "loss_layer_24_head": 0.7177326083183289,
      "loss_layer_30_head": 0.32834428548812866,
      "loss_layer_36_head": 0.2527516484260559,
      "loss_layer_42_head": 0.16427282989025116,
      "loss_layer_6_head": 1.3611738681793213,
      "step": 355
    },
    {
      "epoch": 4.608,
      "grad_norm": 1.3790209448169755,
      "learning_rate": 0.002307692307692308,
      "loss": 4.8081,
      "loss_layer_12_head": 0.9923605918884277,
      "loss_layer_18_head": 0.9624230265617371,
      "loss_layer_24_head": 0.6422814130783081,
      "loss_layer_30_head": 0.3139871060848236,
      "loss_layer_36_head": 0.2991451323032379,
      "loss_layer_42_head": 0.17165526747703552,
      "loss_layer_6_head": 1.3332586288452148,
      "step": 360
    },
    {
      "epoch": 4.672,
      "grad_norm": 1.2567642348119363,
      "learning_rate": 0.00233974358974359,
      "loss": 4.7918,
      "loss_layer_12_head": 1.0560863018035889,
      "loss_layer_18_head": 0.9091771245002747,
      "loss_layer_24_head": 0.6125902533531189,
      "loss_layer_30_head": 0.380319207906723,
      "loss_layer_36_head": 0.3379630446434021,
      "loss_layer_42_head": 0.19742897152900696,
      "loss_layer_6_head": 1.3430302143096924,
      "step": 365
    },
    {
      "epoch": 4.736,
      "grad_norm": 1.150627385919798,
      "learning_rate": 0.002371794871794872,
      "loss": 4.7062,
      "loss_layer_12_head": 1.0729880332946777,
      "loss_layer_18_head": 0.8711227178573608,
      "loss_layer_24_head": 0.5666517019271851,
      "loss_layer_30_head": 0.3383065164089203,
      "loss_layer_36_head": 0.29244428873062134,
      "loss_layer_42_head": 0.20372024178504944,
      "loss_layer_6_head": 1.3268954753875732,
      "step": 370
    },
    {
      "epoch": 4.8,
      "grad_norm": 1.2679919868951717,
      "learning_rate": 0.002403846153846154,
      "loss": 4.7592,
      "loss_layer_12_head": 1.1276118755340576,
      "loss_layer_18_head": 0.8566043972969055,
      "loss_layer_24_head": 0.5548937320709229,
      "loss_layer_30_head": 0.3628123998641968,
      "loss_layer_36_head": 0.26117438077926636,
      "loss_layer_42_head": 0.19338004291057587,
      "loss_layer_6_head": 1.3220725059509277,
      "step": 375
    },
    {
      "epoch": 4.864,
      "grad_norm": 1.1835291744550351,
      "learning_rate": 0.002435897435897436,
      "loss": 4.6888,
      "loss_layer_12_head": 1.1320111751556396,
      "loss_layer_18_head": 0.838676929473877,
      "loss_layer_24_head": 0.5394530296325684,
      "loss_layer_30_head": 0.3824705481529236,
      "loss_layer_36_head": 0.24661512672901154,
      "loss_layer_42_head": 0.18477994203567505,
      "loss_layer_6_head": 1.3196923732757568,
      "step": 380
    },
    {
      "epoch": 4.928,
      "grad_norm": 0.93297205510994,
      "learning_rate": 0.002467948717948718,
      "loss": 4.6268,
      "loss_layer_12_head": 1.129207968711853,
      "loss_layer_18_head": 0.8534852862358093,
      "loss_layer_24_head": 0.5393639802932739,
      "loss_layer_30_head": 0.37884843349456787,
      "loss_layer_36_head": 0.23918715119361877,
      "loss_layer_42_head": 0.17380383610725403,
      "loss_layer_6_head": 1.3462952375411987,
      "step": 385
    },
    {
      "epoch": 4.992,
      "grad_norm": 1.1118442858024633,
      "learning_rate": 0.0025,
      "loss": 4.6166,
      "loss_layer_12_head": 1.057861089706421,
      "loss_layer_18_head": 0.8227025270462036,
      "loss_layer_24_head": 0.5256088376045227,
      "loss_layer_30_head": 0.40619951486587524,
      "loss_layer_36_head": 0.22796925902366638,
      "loss_layer_42_head": 0.17456981539726257,
      "loss_layer_6_head": 1.3249610662460327,
      "step": 390
    },
    {
      "epoch": 5.056,
      "grad_norm": 1.434047640503835,
      "learning_rate": 0.002532051282051282,
      "loss": 4.1882,
      "loss_layer_12_head": 1.0051571130752563,
      "loss_layer_18_head": 0.7884635925292969,
      "loss_layer_24_head": 0.5074498653411865,
      "loss_layer_30_head": 0.39994820952415466,
      "loss_layer_36_head": 0.23031273484230042,
      "loss_layer_42_head": 0.16539302468299866,
      "loss_layer_6_head": 1.3033608198165894,
      "step": 395
    },
    {
      "epoch": 5.12,
      "grad_norm": 1.3644910422058338,
      "learning_rate": 0.002564102564102564,
      "loss": 4.2132,
      "loss_layer_12_head": 0.9761854410171509,
      "loss_layer_18_head": 0.7867392301559448,
      "loss_layer_24_head": 0.48766621947288513,
      "loss_layer_30_head": 0.3911130130290985,
      "loss_layer_36_head": 0.22689180076122284,
      "loss_layer_42_head": 0.14693288505077362,
      "loss_layer_6_head": 1.3670469522476196,
      "step": 400
    },
    {
      "epoch": 5.12,
      "eval_loss": 6.154580116271973,
      "eval_loss_layer_12_head": 1.395673155784607,
      "eval_loss_layer_18_head": 1.143993854522705,
      "eval_loss_layer_24_head": 0.7580552101135254,
      "eval_loss_layer_30_head": 0.6526235938072205,
      "eval_loss_layer_36_head": 0.3537737727165222,
      "eval_loss_layer_42_head": 0.21668951213359833,
      "eval_loss_layer_6_head": 1.7830324172973633,
      "eval_runtime": 33.078,
      "eval_samples_per_second": 9.674,
      "eval_steps_per_second": 0.605,
      "step": 400
    },
    {
      "epoch": 5.184,
      "grad_norm": 1.8730269346761994,
      "learning_rate": 0.0025961538461538466,
      "loss": 4.3432,
      "loss_layer_12_head": 0.9276682734489441,
      "loss_layer_18_head": 0.7169128656387329,
      "loss_layer_24_head": 0.4453452229499817,
      "loss_layer_30_head": 0.40672698616981506,
      "loss_layer_36_head": 0.2135811746120453,
      "loss_layer_42_head": 0.16693656146526337,
      "loss_layer_6_head": 1.3595335483551025,
      "step": 405
    },
    {
      "epoch": 5.248,
      "grad_norm": 1.801244908548722,
      "learning_rate": 0.002628205128205128,
      "loss": 4.547,
      "loss_layer_12_head": 1.0343167781829834,
      "loss_layer_18_head": 0.7185916304588318,
      "loss_layer_24_head": 0.47973212599754333,
      "loss_layer_30_head": 0.3832799792289734,
      "loss_layer_36_head": 0.24773678183555603,
      "loss_layer_42_head": 0.1812468022108078,
      "loss_layer_6_head": 1.4077363014221191,
      "step": 410
    },
    {
      "epoch": 5.312,
      "grad_norm": 1.569899746305991,
      "learning_rate": 0.00266025641025641,
      "loss": 4.6794,
      "loss_layer_12_head": 1.0456072092056274,
      "loss_layer_18_head": 0.8095369338989258,
      "loss_layer_24_head": 0.5158900022506714,
      "loss_layer_30_head": 0.36340200901031494,
      "loss_layer_36_head": 0.3141656517982483,
      "loss_layer_42_head": 0.17219407856464386,
      "loss_layer_6_head": 1.4655390977859497,
      "step": 415
    },
    {
      "epoch": 5.376,
      "grad_norm": 1.5447932736310925,
      "learning_rate": 0.002692307692307692,
      "loss": 4.8808,
      "loss_layer_12_head": 1.048090934753418,
      "loss_layer_18_head": 0.9932184219360352,
      "loss_layer_24_head": 0.6526538133621216,
      "loss_layer_30_head": 0.36372339725494385,
      "loss_layer_36_head": 0.3433704674243927,
      "loss_layer_42_head": 0.17487025260925293,
      "loss_layer_6_head": 1.505536437034607,
      "step": 420
    },
    {
      "epoch": 5.44,
      "grad_norm": 1.3185188756169084,
      "learning_rate": 0.0027243589743589742,
      "loss": 4.8864,
      "loss_layer_12_head": 0.9522102475166321,
      "loss_layer_18_head": 1.044856071472168,
      "loss_layer_24_head": 0.7105457782745361,
      "loss_layer_30_head": 0.32473626732826233,
      "loss_layer_36_head": 0.2757802903652191,
      "loss_layer_42_head": 0.16760139167308807,
      "loss_layer_6_head": 1.3566120862960815,
      "step": 425
    },
    {
      "epoch": 5.504,
      "grad_norm": 1.0495143215178655,
      "learning_rate": 0.0027564102564102567,
      "loss": 4.7014,
      "loss_layer_12_head": 0.9236788749694824,
      "loss_layer_18_head": 0.9669715762138367,
      "loss_layer_24_head": 0.6264371275901794,
      "loss_layer_30_head": 0.31668153405189514,
      "loss_layer_36_head": 0.2569767236709595,
      "loss_layer_42_head": 0.16442960500717163,
      "loss_layer_6_head": 1.3053088188171387,
      "step": 430
    },
    {
      "epoch": 5.568,
      "grad_norm": 1.0699955567391883,
      "learning_rate": 0.0027884615384615387,
      "loss": 4.6046,
      "loss_layer_12_head": 0.9677447080612183,
      "loss_layer_18_head": 0.9345834851264954,
      "loss_layer_24_head": 0.6055396795272827,
      "loss_layer_30_head": 0.338079035282135,
      "loss_layer_36_head": 0.24012379348278046,
      "loss_layer_42_head": 0.2280844897031784,
      "loss_layer_6_head": 1.3431198596954346,
      "step": 435
    },
    {
      "epoch": 5.632,
      "grad_norm": 0.8143527732539593,
      "learning_rate": 0.0028205128205128207,
      "loss": 4.528,
      "loss_layer_12_head": 0.9626684188842773,
      "loss_layer_18_head": 0.8939258456230164,
      "loss_layer_24_head": 0.5646082758903503,
      "loss_layer_30_head": 0.3391712009906769,
      "loss_layer_36_head": 0.2381356656551361,
      "loss_layer_42_head": 0.21609929203987122,
      "loss_layer_6_head": 1.322672724723816,
      "step": 440
    },
    {
      "epoch": 5.696,
      "grad_norm": 1.0137754592918653,
      "learning_rate": 0.0028525641025641023,
      "loss": 4.3659,
      "loss_layer_12_head": 0.949466347694397,
      "loss_layer_18_head": 0.8459054827690125,
      "loss_layer_24_head": 0.5401872992515564,
      "loss_layer_30_head": 0.3393881916999817,
      "loss_layer_36_head": 0.22150559723377228,
      "loss_layer_42_head": 0.18342123925685883,
      "loss_layer_6_head": 1.2657294273376465,
      "step": 445
    },
    {
      "epoch": 5.76,
      "grad_norm": 0.7570886580752633,
      "learning_rate": 0.0028846153846153843,
      "loss": 4.3652,
      "loss_layer_12_head": 1.0390746593475342,
      "loss_layer_18_head": 0.8489527702331543,
      "loss_layer_24_head": 0.5171689987182617,
      "loss_layer_30_head": 0.33211398124694824,
      "loss_layer_36_head": 0.21194668114185333,
      "loss_layer_42_head": 0.15884031355381012,
      "loss_layer_6_head": 1.2906792163848877,
      "step": 450
    },
    {
      "epoch": 5.824,
      "grad_norm": 1.254205915108188,
      "learning_rate": 0.002916666666666667,
      "loss": 4.4785,
      "loss_layer_12_head": 1.1366043090820312,
      "loss_layer_18_head": 0.8151988983154297,
      "loss_layer_24_head": 0.48534002900123596,
      "loss_layer_30_head": 0.31870633363723755,
      "loss_layer_36_head": 0.20031118392944336,
      "loss_layer_42_head": 0.14673419296741486,
      "loss_layer_6_head": 1.2437018156051636,
      "step": 455
    },
    {
      "epoch": 5.888,
      "grad_norm": 1.0776074618151747,
      "learning_rate": 0.002948717948717949,
      "loss": 4.5772,
      "loss_layer_12_head": 1.1926627159118652,
      "loss_layer_18_head": 0.7862736582756042,
      "loss_layer_24_head": 0.47694167494773865,
      "loss_layer_30_head": 0.3496543765068054,
      "loss_layer_36_head": 0.19799809157848358,
      "loss_layer_42_head": 0.13700205087661743,
      "loss_layer_6_head": 1.237363576889038,
      "step": 460
    },
    {
      "epoch": 5.952,
      "grad_norm": 0.8471130293622348,
      "learning_rate": 0.002980769230769231,
      "loss": 4.4384,
      "loss_layer_12_head": 1.1546635627746582,
      "loss_layer_18_head": 0.7836486101150513,
      "loss_layer_24_head": 0.4960857331752777,
      "loss_layer_30_head": 0.3857091963291168,
      "loss_layer_36_head": 0.24500486254692078,
      "loss_layer_42_head": 0.13489064574241638,
      "loss_layer_6_head": 1.244358777999878,
      "step": 465
    },
    {
      "epoch": 6.016,
      "grad_norm": 0.8265654180605722,
      "learning_rate": 0.003012820512820513,
      "loss": 4.4001,
      "loss_layer_12_head": 1.0673664808273315,
      "loss_layer_18_head": 0.7711819410324097,
      "loss_layer_24_head": 0.47758427262306213,
      "loss_layer_30_head": 0.3348618149757385,
      "loss_layer_36_head": 0.21865704655647278,
      "loss_layer_42_head": 0.1468522548675537,
      "loss_layer_6_head": 1.2463687658309937,
      "step": 470
    },
    {
      "epoch": 6.08,
      "grad_norm": 0.890075097723861,
      "learning_rate": 0.0030448717948717945,
      "loss": 3.9821,
      "loss_layer_12_head": 0.9638459086418152,
      "loss_layer_18_head": 0.7209836840629578,
      "loss_layer_24_head": 0.4490864872932434,
      "loss_layer_30_head": 0.3297625482082367,
      "loss_layer_36_head": 0.22605113685131073,
      "loss_layer_42_head": 0.13971206545829773,
      "loss_layer_6_head": 1.1923015117645264,
      "step": 475
    },
    {
      "epoch": 6.144,
      "grad_norm": 1.8263986054382972,
      "learning_rate": 0.0030769230769230774,
      "loss": 4.1905,
      "loss_layer_12_head": 0.9166902303695679,
      "loss_layer_18_head": 0.7430115342140198,
      "loss_layer_24_head": 0.46037110686302185,
      "loss_layer_30_head": 0.31110483407974243,
      "loss_layer_36_head": 0.2877315878868103,
      "loss_layer_42_head": 0.14303576946258545,
      "loss_layer_6_head": 1.313041090965271,
      "step": 480
    },
    {
      "epoch": 6.208,
      "grad_norm": 1.5436261864803291,
      "learning_rate": 0.003108974358974359,
      "loss": 4.5676,
      "loss_layer_12_head": 0.8770769238471985,
      "loss_layer_18_head": 0.8532589077949524,
      "loss_layer_24_head": 0.44717922806739807,
      "loss_layer_30_head": 0.3180530071258545,
      "loss_layer_36_head": 0.30972445011138916,
      "loss_layer_42_head": 0.17381611466407776,
      "loss_layer_6_head": 1.4809176921844482,
      "step": 485
    },
    {
      "epoch": 6.272,
      "grad_norm": 0.9554494907475025,
      "learning_rate": 0.003141025641025641,
      "loss": 4.6513,
      "loss_layer_12_head": 0.8698602914810181,
      "loss_layer_18_head": 0.8265264630317688,
      "loss_layer_24_head": 0.5025246739387512,
      "loss_layer_30_head": 0.3753402829170227,
      "loss_layer_36_head": 0.28678324818611145,
      "loss_layer_42_head": 0.26798298954963684,
      "loss_layer_6_head": 1.530792474746704,
      "step": 490
    },
    {
      "epoch": 6.336,
      "grad_norm": 1.2057259121600852,
      "learning_rate": 0.003173076923076923,
      "loss": 4.6448,
      "loss_layer_12_head": 0.8546503782272339,
      "loss_layer_18_head": 0.840173602104187,
      "loss_layer_24_head": 0.6339765191078186,
      "loss_layer_30_head": 0.32096096873283386,
      "loss_layer_36_head": 0.25971919298171997,
      "loss_layer_42_head": 0.21377480030059814,
      "loss_layer_6_head": 1.42341947555542,
      "step": 495
    },
    {
      "epoch": 6.4,
      "grad_norm": 1.0490588078953151,
      "learning_rate": 0.0032051282051282055,
      "loss": 4.5947,
      "loss_layer_12_head": 0.856784462928772,
      "loss_layer_18_head": 0.8372087478637695,
      "loss_layer_24_head": 0.6806648969650269,
      "loss_layer_30_head": 0.3442658483982086,
      "loss_layer_36_head": 0.2468332052230835,
      "loss_layer_42_head": 0.20357272028923035,
      "loss_layer_6_head": 1.3583182096481323,
      "step": 500
    },
    {
      "epoch": 6.464,
      "grad_norm": 0.8575077972077892,
      "learning_rate": 0.0032371794871794875,
      "loss": 4.4413,
      "loss_layer_12_head": 0.8581374883651733,
      "loss_layer_18_head": 0.795878529548645,
      "loss_layer_24_head": 0.5982597470283508,
      "loss_layer_30_head": 0.38770443201065063,
      "loss_layer_36_head": 0.23882412910461426,
      "loss_layer_42_head": 0.18923978507518768,
      "loss_layer_6_head": 1.3043135404586792,
      "step": 505
    },
    {
      "epoch": 6.5280000000000005,
      "grad_norm": 0.7551993190002948,
      "learning_rate": 0.0032692307692307695,
      "loss": 4.2962,
      "loss_layer_12_head": 0.8788810968399048,
      "loss_layer_18_head": 0.7811435461044312,
      "loss_layer_24_head": 0.5521578192710876,
      "loss_layer_30_head": 0.40578141808509827,
      "loss_layer_36_head": 0.23572847247123718,
      "loss_layer_42_head": 0.15992064774036407,
      "loss_layer_6_head": 1.2605953216552734,
      "step": 510
    },
    {
      "epoch": 6.592,
      "grad_norm": 0.8052547950147906,
      "learning_rate": 0.003301282051282051,
      "loss": 4.1919,
      "loss_layer_12_head": 0.919018566608429,
      "loss_layer_18_head": 0.7793886065483093,
      "loss_layer_24_head": 0.5306540727615356,
      "loss_layer_30_head": 0.37470743060112,
      "loss_layer_36_head": 0.23630854487419128,
      "loss_layer_42_head": 0.1456299126148224,
      "loss_layer_6_head": 1.2628568410873413,
      "step": 515
    },
    {
      "epoch": 6.656,
      "grad_norm": 1.1501260763896224,
      "learning_rate": 0.003333333333333333,
      "loss": 4.1942,
      "loss_layer_12_head": 0.9507948756217957,
      "loss_layer_18_head": 0.7825154066085815,
      "loss_layer_24_head": 0.5200616717338562,
      "loss_layer_30_head": 0.3602267801761627,
      "loss_layer_36_head": 0.22598901391029358,
      "loss_layer_42_head": 0.13484326004981995,
      "loss_layer_6_head": 1.271482229232788,
      "step": 520
    },
    {
      "epoch": 6.72,
      "grad_norm": 1.3402864479006709,
      "learning_rate": 0.0033653846153846156,
      "loss": 4.3211,
      "loss_layer_12_head": 1.0922011137008667,
      "loss_layer_18_head": 0.8022863268852234,
      "loss_layer_24_head": 0.5143569707870483,
      "loss_layer_30_head": 0.3965030908584595,
      "loss_layer_36_head": 0.2344985455274582,
      "loss_layer_42_head": 0.1375109702348709,
      "loss_layer_6_head": 1.2579545974731445,
      "step": 525
    },
    {
      "epoch": 6.784,
      "grad_norm": 1.2141701437693182,
      "learning_rate": 0.0033974358974358976,
      "loss": 4.5096,
      "loss_layer_12_head": 1.2246015071868896,
      "loss_layer_18_head": 0.7686507105827332,
      "loss_layer_24_head": 0.49818333983421326,
      "loss_layer_30_head": 0.34936264157295227,
      "loss_layer_36_head": 0.2610321640968323,
      "loss_layer_42_head": 0.12700845301151276,
      "loss_layer_6_head": 1.2108025550842285,
      "step": 530
    },
    {
      "epoch": 6.848,
      "grad_norm": 0.904221634424843,
      "learning_rate": 0.0034294871794871796,
      "loss": 4.5247,
      "loss_layer_12_head": 1.2784221172332764,
      "loss_layer_18_head": 0.7871406674385071,
      "loss_layer_24_head": 0.5177046060562134,
      "loss_layer_30_head": 0.4392577111721039,
      "loss_layer_36_head": 0.27904826402664185,
      "loss_layer_42_head": 0.14788372814655304,
      "loss_layer_6_head": 1.2267922163009644,
      "step": 535
    },
    {
      "epoch": 6.912,
      "grad_norm": 0.7372333895332408,
      "learning_rate": 0.0034615384615384616,
      "loss": 4.4031,
      "loss_layer_12_head": 1.1357537508010864,
      "loss_layer_18_head": 0.7835424542427063,
      "loss_layer_24_head": 0.4916456341743469,
      "loss_layer_30_head": 0.3651697337627411,
      "loss_layer_36_head": 0.24365434050559998,
      "loss_layer_42_head": 0.14184752106666565,
      "loss_layer_6_head": 1.250795602798462,
      "step": 540
    },
    {
      "epoch": 6.976,
      "grad_norm": 1.1518282393412917,
      "learning_rate": 0.0034935897435897437,
      "loss": 4.3684,
      "loss_layer_12_head": 1.0992364883422852,
      "loss_layer_18_head": 0.8232961893081665,
      "loss_layer_24_head": 0.5295633673667908,
      "loss_layer_30_head": 0.36773088574409485,
      "loss_layer_36_head": 0.23921005427837372,
      "loss_layer_42_head": 0.14976578950881958,
      "loss_layer_6_head": 1.2840278148651123,
      "step": 545
    },
    {
      "epoch": 7.04,
      "grad_norm": 0.9540059730742603,
      "learning_rate": 0.003525641025641026,
      "loss": 4.0574,
      "loss_layer_12_head": 0.9577862024307251,
      "loss_layer_18_head": 0.7789104580879211,
      "loss_layer_24_head": 0.4452721178531647,
      "loss_layer_30_head": 0.3106338679790497,
      "loss_layer_36_head": 0.21194377541542053,
      "loss_layer_42_head": 0.15220779180526733,
      "loss_layer_6_head": 1.198129653930664,
      "step": 550
    },
    {
      "epoch": 7.104,
      "grad_norm": 1.38998640031747,
      "learning_rate": 0.0035576923076923077,
      "loss": 4.029,
      "loss_layer_12_head": 0.882230281829834,
      "loss_layer_18_head": 0.8701518774032593,
      "loss_layer_24_head": 0.42898431420326233,
      "loss_layer_30_head": 0.2921150028705597,
      "loss_layer_36_head": 0.20976360142230988,
      "loss_layer_42_head": 0.17359986901283264,
      "loss_layer_6_head": 1.1614536046981812,
      "step": 555
    },
    {
      "epoch": 7.168,
      "grad_norm": 1.2363741975414473,
      "learning_rate": 0.0035897435897435897,
      "loss": 4.1461,
      "loss_layer_12_head": 0.875937283039093,
      "loss_layer_18_head": 0.987763524055481,
      "loss_layer_24_head": 0.45980095863342285,
      "loss_layer_30_head": 0.29953113198280334,
      "loss_layer_36_head": 0.2310682088136673,
      "loss_layer_42_head": 0.17081494629383087,
      "loss_layer_6_head": 1.188507080078125,
      "step": 560
    },
    {
      "epoch": 7.232,
      "grad_norm": 1.3629196402016688,
      "learning_rate": 0.0036217948717948718,
      "loss": 4.0577,
      "loss_layer_12_head": 0.8127782940864563,
      "loss_layer_18_head": 0.8724933862686157,
      "loss_layer_24_head": 0.43443790078163147,
      "loss_layer_30_head": 0.27760523557662964,
      "loss_layer_36_head": 0.20727689564228058,
      "loss_layer_42_head": 0.15831176936626434,
      "loss_layer_6_head": 1.1757885217666626,
      "step": 565
    },
    {
      "epoch": 7.296,
      "grad_norm": 1.240952373818873,
      "learning_rate": 0.003653846153846154,
      "loss": 4.104,
      "loss_layer_12_head": 0.8376092910766602,
      "loss_layer_18_head": 0.8506229519844055,
      "loss_layer_24_head": 0.44845181703567505,
      "loss_layer_30_head": 0.2983912229537964,
      "loss_layer_36_head": 0.21085870265960693,
      "loss_layer_42_head": 0.16104820370674133,
      "loss_layer_6_head": 1.3250142335891724,
      "step": 570
    },
    {
      "epoch": 7.36,
      "grad_norm": 1.3133458596014376,
      "learning_rate": 0.0036858974358974362,
      "loss": 4.1512,
      "loss_layer_12_head": 0.8345325589179993,
      "loss_layer_18_head": 0.7679761648178101,
      "loss_layer_24_head": 0.4447510838508606,
      "loss_layer_30_head": 0.29866132140159607,
      "loss_layer_36_head": 0.2203553467988968,
      "loss_layer_42_head": 0.17855314910411835,
      "loss_layer_6_head": 1.342247724533081,
      "step": 575
    },
    {
      "epoch": 7.424,
      "grad_norm": 0.8461197040152086,
      "learning_rate": 0.0037179487179487183,
      "loss": 4.2459,
      "loss_layer_12_head": 0.8581629991531372,
      "loss_layer_18_head": 0.7624831795692444,
      "loss_layer_24_head": 0.46204739809036255,
      "loss_layer_30_head": 0.2906055748462677,
      "loss_layer_36_head": 0.20187988877296448,
      "loss_layer_42_head": 0.15841703116893768,
      "loss_layer_6_head": 1.5256927013397217,
      "step": 580
    },
    {
      "epoch": 7.4879999999999995,
      "grad_norm": 0.8474525094168374,
      "learning_rate": 0.00375,
      "loss": 4.1751,
      "loss_layer_12_head": 0.8404186964035034,
      "loss_layer_18_head": 0.7366082072257996,
      "loss_layer_24_head": 0.4866747260093689,
      "loss_layer_30_head": 0.30878889560699463,
      "loss_layer_36_head": 0.2227415293455124,
      "loss_layer_42_head": 0.16082309186458588,
      "loss_layer_6_head": 1.4227885007858276,
      "step": 585
    },
    {
      "epoch": 7.552,
      "grad_norm": 0.8347713672764921,
      "learning_rate": 0.003782051282051282,
      "loss": 4.0779,
      "loss_layer_12_head": 0.8894980549812317,
      "loss_layer_18_head": 0.7416540384292603,
      "loss_layer_24_head": 0.5072723627090454,
      "loss_layer_30_head": 0.2958090603351593,
      "loss_layer_36_head": 0.20604205131530762,
      "loss_layer_42_head": 0.14479871094226837,
      "loss_layer_6_head": 1.3990154266357422,
      "step": 590
    },
    {
      "epoch": 7.616,
      "grad_norm": 0.6317410878521267,
      "learning_rate": 0.003814102564102564,
      "loss": 3.993,
      "loss_layer_12_head": 0.8329566717147827,
      "loss_layer_18_head": 0.692613422870636,
      "loss_layer_24_head": 0.490090936422348,
      "loss_layer_30_head": 0.28243404626846313,
      "loss_layer_36_head": 0.2037874162197113,
      "loss_layer_42_head": 0.13317333161830902,
      "loss_layer_6_head": 1.2798428535461426,
      "step": 595
    },
    {
      "epoch": 7.68,
      "grad_norm": 1.0688879520585153,
      "learning_rate": 0.0038461538461538464,
      "loss": 4.081,
      "loss_layer_12_head": 0.9211309552192688,
      "loss_layer_18_head": 0.7072088718414307,
      "loss_layer_24_head": 0.5382223725318909,
      "loss_layer_30_head": 0.30142974853515625,
      "loss_layer_36_head": 0.2008870542049408,
      "loss_layer_42_head": 0.12783093750476837,
      "loss_layer_6_head": 1.2565667629241943,
      "step": 600
    },
    {
      "epoch": 7.68,
      "eval_loss": 6.064269065856934,
      "eval_loss_layer_12_head": 1.4230351448059082,
      "eval_loss_layer_18_head": 1.1565618515014648,
      "eval_loss_layer_24_head": 0.8412654995918274,
      "eval_loss_layer_30_head": 0.529054582118988,
      "eval_loss_layer_36_head": 0.3588993549346924,
      "eval_loss_layer_42_head": 0.2186090499162674,
      "eval_loss_layer_6_head": 1.6945667266845703,
      "eval_runtime": 33.1186,
      "eval_samples_per_second": 9.662,
      "eval_steps_per_second": 0.604,
      "step": 600
    },
    {
      "epoch": 7.744,
      "grad_norm": 0.7138553749173591,
      "learning_rate": 0.0038782051282051284,
      "loss": 4.2201,
      "loss_layer_12_head": 1.0682505369186401,
      "loss_layer_18_head": 0.751064121723175,
      "loss_layer_24_head": 0.5447863340377808,
      "loss_layer_30_head": 0.3238937258720398,
      "loss_layer_36_head": 0.22007110714912415,
      "loss_layer_42_head": 0.1354173868894577,
      "loss_layer_6_head": 1.2807810306549072,
      "step": 605
    },
    {
      "epoch": 7.808,
      "grad_norm": 0.8490313341296922,
      "learning_rate": 0.00391025641025641,
      "loss": 4.2939,
      "loss_layer_12_head": 0.9529972076416016,
      "loss_layer_18_head": 0.7455770373344421,
      "loss_layer_24_head": 0.5497812032699585,
      "loss_layer_30_head": 0.3712846040725708,
      "loss_layer_36_head": 0.28533223271369934,
      "loss_layer_42_head": 0.12184665352106094,
      "loss_layer_6_head": 1.2286648750305176,
      "step": 610
    },
    {
      "epoch": 7.872,
      "grad_norm": 0.9024593184388664,
      "learning_rate": 0.003942307692307692,
      "loss": 4.3743,
      "loss_layer_12_head": 0.9823104739189148,
      "loss_layer_18_head": 0.7695722579956055,
      "loss_layer_24_head": 0.5379729866981506,
      "loss_layer_30_head": 0.4097183644771576,
      "loss_layer_36_head": 0.3168531656265259,
      "loss_layer_42_head": 0.12440316379070282,
      "loss_layer_6_head": 1.2501013278961182,
      "step": 615
    },
    {
      "epoch": 7.936,
      "grad_norm": 0.7475790582335897,
      "learning_rate": 0.0039743589743589745,
      "loss": 4.2725,
      "loss_layer_12_head": 0.9023788571357727,
      "loss_layer_18_head": 0.7425622344017029,
      "loss_layer_24_head": 0.4975395202636719,
      "loss_layer_30_head": 0.4235556125640869,
      "loss_layer_36_head": 0.27163082361221313,
      "loss_layer_42_head": 0.14592969417572021,
      "loss_layer_6_head": 1.1735336780548096,
      "step": 620
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.6885028312453886,
      "learning_rate": 0.004006410256410257,
      "loss": 4.2075,
      "loss_layer_12_head": 0.9071186184883118,
      "loss_layer_18_head": 0.7721095085144043,
      "loss_layer_24_head": 0.49094945192337036,
      "loss_layer_30_head": 0.4024503231048584,
      "loss_layer_36_head": 0.25670623779296875,
      "loss_layer_42_head": 0.1462739259004593,
      "loss_layer_6_head": 1.2107973098754883,
      "step": 625
    },
    {
      "epoch": 8.064,
      "grad_norm": 0.7930899356114212,
      "learning_rate": 0.0040384615384615385,
      "loss": 3.7216,
      "loss_layer_12_head": 0.833595871925354,
      "loss_layer_18_head": 0.6859909296035767,
      "loss_layer_24_head": 0.4429362416267395,
      "loss_layer_30_head": 0.3521788716316223,
      "loss_layer_36_head": 0.2264052927494049,
      "loss_layer_42_head": 0.14582210779190063,
      "loss_layer_6_head": 1.1322518587112427,
      "step": 630
    },
    {
      "epoch": 8.128,
      "grad_norm": 1.0166276836444685,
      "learning_rate": 0.004070512820512821,
      "loss": 3.8125,
      "loss_layer_12_head": 0.8520393371582031,
      "loss_layer_18_head": 0.7408521771430969,
      "loss_layer_24_head": 0.48459750413894653,
      "loss_layer_30_head": 0.33646121621131897,
      "loss_layer_36_head": 0.22250790894031525,
      "loss_layer_42_head": 0.14810803532600403,
      "loss_layer_6_head": 1.167609453201294,
      "step": 635
    },
    {
      "epoch": 8.192,
      "grad_norm": 1.1566682607871035,
      "learning_rate": 0.0041025641025641026,
      "loss": 3.8419,
      "loss_layer_12_head": 0.8167425394058228,
      "loss_layer_18_head": 0.7366998791694641,
      "loss_layer_24_head": 0.46892642974853516,
      "loss_layer_30_head": 0.324607789516449,
      "loss_layer_36_head": 0.21795305609703064,
      "loss_layer_42_head": 0.14156334102153778,
      "loss_layer_6_head": 1.1235706806182861,
      "step": 640
    },
    {
      "epoch": 8.256,
      "grad_norm": 1.4750565025540163,
      "learning_rate": 0.004134615384615384,
      "loss": 4.0744,
      "loss_layer_12_head": 0.9226335287094116,
      "loss_layer_18_head": 0.809144139289856,
      "loss_layer_24_head": 0.4757646918296814,
      "loss_layer_30_head": 0.31459298729896545,
      "loss_layer_36_head": 0.20873066782951355,
      "loss_layer_42_head": 0.1286831796169281,
      "loss_layer_6_head": 1.1599065065383911,
      "step": 645
    },
    {
      "epoch": 8.32,
      "grad_norm": 1.3774586293314957,
      "learning_rate": 0.004166666666666667,
      "loss": 4.5192,
      "loss_layer_12_head": 1.4602680206298828,
      "loss_layer_18_head": 0.8041539192199707,
      "loss_layer_24_head": 0.47331005334854126,
      "loss_layer_30_head": 0.31224408745765686,
      "loss_layer_36_head": 0.22158387303352356,
      "loss_layer_42_head": 0.13520172238349915,
      "loss_layer_6_head": 1.2111009359359741,
      "step": 650
    },
    {
      "epoch": 8.384,
      "grad_norm": 0.913194514455112,
      "learning_rate": 0.004198717948717949,
      "loss": 4.5784,
      "loss_layer_12_head": 1.4019635915756226,
      "loss_layer_18_head": 0.7819536924362183,
      "loss_layer_24_head": 0.4640430808067322,
      "loss_layer_30_head": 0.30181169509887695,
      "loss_layer_36_head": 0.2320955991744995,
      "loss_layer_42_head": 0.1298990696668625,
      "loss_layer_6_head": 1.3035223484039307,
      "step": 655
    },
    {
      "epoch": 8.448,
      "grad_norm": 0.7493350950664678,
      "learning_rate": 0.004230769230769231,
      "loss": 4.3498,
      "loss_layer_12_head": 1.2139297723770142,
      "loss_layer_18_head": 0.7345114946365356,
      "loss_layer_24_head": 0.4715096950531006,
      "loss_layer_30_head": 0.3014346957206726,
      "loss_layer_36_head": 0.27791768312454224,
      "loss_layer_42_head": 0.19256797432899475,
      "loss_layer_6_head": 1.202768087387085,
      "step": 660
    },
    {
      "epoch": 8.512,
      "grad_norm": 0.6502739102103727,
      "learning_rate": 0.004262820512820513,
      "loss": 4.1186,
      "loss_layer_12_head": 1.0816333293914795,
      "loss_layer_18_head": 0.693094789981842,
      "loss_layer_24_head": 0.44037437438964844,
      "loss_layer_30_head": 0.27115243673324585,
      "loss_layer_36_head": 0.20725497603416443,
      "loss_layer_42_head": 0.12962451577186584,
      "loss_layer_6_head": 1.1937158107757568,
      "step": 665
    },
    {
      "epoch": 8.576,
      "grad_norm": 0.7313705718476786,
      "learning_rate": 0.004294871794871795,
      "loss": 4.013,
      "loss_layer_12_head": 1.0277862548828125,
      "loss_layer_18_head": 0.7302553653717041,
      "loss_layer_24_head": 0.46252337098121643,
      "loss_layer_30_head": 0.281171053647995,
      "loss_layer_36_head": 0.2022136151790619,
      "loss_layer_42_head": 0.14281702041625977,
      "loss_layer_6_head": 1.223236083984375,
      "step": 670
    },
    {
      "epoch": 8.64,
      "grad_norm": 0.736020403800073,
      "learning_rate": 0.004326923076923077,
      "loss": 3.9946,
      "loss_layer_12_head": 0.9839867353439331,
      "loss_layer_18_head": 0.719161868095398,
      "loss_layer_24_head": 0.5124601125717163,
      "loss_layer_30_head": 0.29098740220069885,
      "loss_layer_36_head": 0.19581526517868042,
      "loss_layer_42_head": 0.13490745425224304,
      "loss_layer_6_head": 1.241705298423767,
      "step": 675
    },
    {
      "epoch": 8.704,
      "grad_norm": 0.8092532169345228,
      "learning_rate": 0.004358974358974359,
      "loss": 4.1571,
      "loss_layer_12_head": 0.917145848274231,
      "loss_layer_18_head": 0.7412582635879517,
      "loss_layer_24_head": 0.6232308149337769,
      "loss_layer_30_head": 0.29547226428985596,
      "loss_layer_36_head": 0.20242604613304138,
      "loss_layer_42_head": 0.16369326412677765,
      "loss_layer_6_head": 1.1925930976867676,
      "step": 680
    },
    {
      "epoch": 8.768,
      "grad_norm": 1.1324177730415395,
      "learning_rate": 0.004391025641025641,
      "loss": 4.2441,
      "loss_layer_12_head": 0.8849488496780396,
      "loss_layer_18_head": 0.7745043635368347,
      "loss_layer_24_head": 0.6524385213851929,
      "loss_layer_30_head": 0.28186529874801636,
      "loss_layer_36_head": 0.19327642023563385,
      "loss_layer_42_head": 0.15677759051322937,
      "loss_layer_6_head": 1.1674079895019531,
      "step": 685
    },
    {
      "epoch": 8.832,
      "grad_norm": 0.7412420767975169,
      "learning_rate": 0.004423076923076923,
      "loss": 4.5218,
      "loss_layer_12_head": 0.8412851095199585,
      "loss_layer_18_head": 1.0960347652435303,
      "loss_layer_24_head": 0.5754497647285461,
      "loss_layer_30_head": 0.32664886116981506,
      "loss_layer_36_head": 0.23857375979423523,
      "loss_layer_42_head": 0.21322807669639587,
      "loss_layer_6_head": 1.127020001411438,
      "step": 690
    },
    {
      "epoch": 8.896,
      "grad_norm": 0.8291825973396217,
      "learning_rate": 0.004455128205128205,
      "loss": 4.5892,
      "loss_layer_12_head": 0.8623842000961304,
      "loss_layer_18_head": 1.3173308372497559,
      "loss_layer_24_head": 0.548263430595398,
      "loss_layer_30_head": 0.2995585799217224,
      "loss_layer_36_head": 0.2090265303850174,
      "loss_layer_42_head": 0.1565561294555664,
      "loss_layer_6_head": 1.1702009439468384,
      "step": 695
    },
    {
      "epoch": 8.96,
      "grad_norm": 0.5926635228888841,
      "learning_rate": 0.004487179487179488,
      "loss": 4.2789,
      "loss_layer_12_head": 0.8864520788192749,
      "loss_layer_18_head": 1.0335904359817505,
      "loss_layer_24_head": 0.5306079983711243,
      "loss_layer_30_head": 0.3287288546562195,
      "loss_layer_36_head": 0.20537595450878143,
      "loss_layer_42_head": 0.15819822251796722,
      "loss_layer_6_head": 1.2337465286254883,
      "step": 700
    },
    {
      "epoch": 9.024,
      "grad_norm": 0.9309568852246792,
      "learning_rate": 0.004519230769230769,
      "loss": 4.1292,
      "loss_layer_12_head": 0.8643959760665894,
      "loss_layer_18_head": 0.956216037273407,
      "loss_layer_24_head": 0.49535542726516724,
      "loss_layer_30_head": 0.32274430990219116,
      "loss_layer_36_head": 0.21061468124389648,
      "loss_layer_42_head": 0.14522498846054077,
      "loss_layer_6_head": 1.2452510595321655,
      "step": 705
    },
    {
      "epoch": 9.088,
      "grad_norm": 0.9073428836594415,
      "learning_rate": 0.004551282051282051,
      "loss": 3.8754,
      "loss_layer_12_head": 0.7484974265098572,
      "loss_layer_18_head": 0.7865571975708008,
      "loss_layer_24_head": 0.421290785074234,
      "loss_layer_30_head": 0.29494112730026245,
      "loss_layer_36_head": 0.1857721209526062,
      "loss_layer_42_head": 0.13087508082389832,
      "loss_layer_6_head": 1.2146589756011963,
      "step": 710
    },
    {
      "epoch": 9.152,
      "grad_norm": 0.7982886572718486,
      "learning_rate": 0.004583333333333333,
      "loss": 3.8852,
      "loss_layer_12_head": 0.7922431230545044,
      "loss_layer_18_head": 0.7730211019515991,
      "loss_layer_24_head": 0.4380415380001068,
      "loss_layer_30_head": 0.3227178752422333,
      "loss_layer_36_head": 0.20073524117469788,
      "loss_layer_42_head": 0.14083267748355865,
      "loss_layer_6_head": 1.2430880069732666,
      "step": 715
    },
    {
      "epoch": 9.216,
      "grad_norm": 0.8793203507102387,
      "learning_rate": 0.004615384615384616,
      "loss": 3.7695,
      "loss_layer_12_head": 0.7930987477302551,
      "loss_layer_18_head": 0.7404181957244873,
      "loss_layer_24_head": 0.4397159516811371,
      "loss_layer_30_head": 0.323036253452301,
      "loss_layer_36_head": 0.19060379266738892,
      "loss_layer_42_head": 0.13104695081710815,
      "loss_layer_6_head": 1.2130464315414429,
      "step": 720
    },
    {
      "epoch": 9.28,
      "grad_norm": 0.7942344284140812,
      "learning_rate": 0.004647435897435897,
      "loss": 3.7671,
      "loss_layer_12_head": 0.8263927698135376,
      "loss_layer_18_head": 0.7249279618263245,
      "loss_layer_24_head": 0.44984954595565796,
      "loss_layer_30_head": 0.3150155544281006,
      "loss_layer_36_head": 0.20275506377220154,
      "loss_layer_42_head": 0.12700262665748596,
      "loss_layer_6_head": 1.2755789756774902,
      "step": 725
    },
    {
      "epoch": 9.344,
      "grad_norm": 0.8435364827805502,
      "learning_rate": 0.00467948717948718,
      "loss": 3.8156,
      "loss_layer_12_head": 0.8390544056892395,
      "loss_layer_18_head": 0.7215897440910339,
      "loss_layer_24_head": 0.45940589904785156,
      "loss_layer_30_head": 0.3094175457954407,
      "loss_layer_36_head": 0.21510329842567444,
      "loss_layer_42_head": 0.1464075893163681,
      "loss_layer_6_head": 1.3084766864776611,
      "step": 730
    },
    {
      "epoch": 9.408,
      "grad_norm": 0.6480798628146789,
      "learning_rate": 0.0047115384615384615,
      "loss": 3.7881,
      "loss_layer_12_head": 0.8014190793037415,
      "loss_layer_18_head": 0.6649230122566223,
      "loss_layer_24_head": 0.42368221282958984,
      "loss_layer_30_head": 0.28849735856056213,
      "loss_layer_36_head": 0.19939973950386047,
      "loss_layer_42_head": 0.1339743286371231,
      "loss_layer_6_head": 1.221366047859192,
      "step": 735
    },
    {
      "epoch": 9.472,
      "grad_norm": 0.6339993293899585,
      "learning_rate": 0.004743589743589744,
      "loss": 3.8176,
      "loss_layer_12_head": 0.8401615023612976,
      "loss_layer_18_head": 0.6599575281143188,
      "loss_layer_24_head": 0.42019709944725037,
      "loss_layer_30_head": 0.29990941286087036,
      "loss_layer_36_head": 0.20576226711273193,
      "loss_layer_42_head": 0.14101438224315643,
      "loss_layer_6_head": 1.1898235082626343,
      "step": 740
    },
    {
      "epoch": 9.536,
      "grad_norm": 0.9745719874860632,
      "learning_rate": 0.004775641025641026,
      "loss": 3.9571,
      "loss_layer_12_head": 0.9243666529655457,
      "loss_layer_18_head": 0.6275273561477661,
      "loss_layer_24_head": 0.41716575622558594,
      "loss_layer_30_head": 0.32748299837112427,
      "loss_layer_36_head": 0.20611253380775452,
      "loss_layer_42_head": 0.14104749262332916,
      "loss_layer_6_head": 1.117631435394287,
      "step": 745
    },
    {
      "epoch": 9.6,
      "grad_norm": 0.6544354987982257,
      "learning_rate": 0.004807692307692308,
      "loss": 4.0578,
      "loss_layer_12_head": 1.0271931886672974,
      "loss_layer_18_head": 0.6758699417114258,
      "loss_layer_24_head": 0.45379161834716797,
      "loss_layer_30_head": 0.37005171179771423,
      "loss_layer_36_head": 0.2151561677455902,
      "loss_layer_42_head": 0.1704704314470291,
      "loss_layer_6_head": 1.1697505712509155,
      "step": 750
    },
    {
      "epoch": 9.664,
      "grad_norm": 0.8535242374622737,
      "learning_rate": 0.0048397435897435895,
      "loss": 4.1287,
      "loss_layer_12_head": 0.9570671916007996,
      "loss_layer_18_head": 0.6683294177055359,
      "loss_layer_24_head": 0.5884752869606018,
      "loss_layer_30_head": 0.3260679841041565,
      "loss_layer_36_head": 0.23440642654895782,
      "loss_layer_42_head": 0.1572146862745285,
      "loss_layer_6_head": 1.1262012720108032,
      "step": 755
    },
    {
      "epoch": 9.728,
      "grad_norm": 0.7511329878792825,
      "learning_rate": 0.004871794871794872,
      "loss": 4.2804,
      "loss_layer_12_head": 0.9420676231384277,
      "loss_layer_18_head": 0.6989115476608276,
      "loss_layer_24_head": 0.67049241065979,
      "loss_layer_30_head": 0.32886701822280884,
      "loss_layer_36_head": 0.2956869602203369,
      "loss_layer_42_head": 0.14903786778450012,
      "loss_layer_6_head": 1.1558923721313477,
      "step": 760
    },
    {
      "epoch": 9.792,
      "grad_norm": 0.6705090103704959,
      "learning_rate": 0.004903846153846154,
      "loss": 4.1366,
      "loss_layer_12_head": 0.9071542024612427,
      "loss_layer_18_head": 0.6990827322006226,
      "loss_layer_24_head": 0.640920102596283,
      "loss_layer_30_head": 0.30766063928604126,
      "loss_layer_36_head": 0.2701354920864105,
      "loss_layer_42_head": 0.13981254398822784,
      "loss_layer_6_head": 1.1638771295547485,
      "step": 765
    },
    {
      "epoch": 9.856,
      "grad_norm": 0.6896417204537261,
      "learning_rate": 0.004935897435897436,
      "loss": 4.0965,
      "loss_layer_12_head": 0.9019530415534973,
      "loss_layer_18_head": 0.71703040599823,
      "loss_layer_24_head": 0.6173619031906128,
      "loss_layer_30_head": 0.30312564969062805,
      "loss_layer_36_head": 0.2544689476490021,
      "loss_layer_42_head": 0.13770273327827454,
      "loss_layer_6_head": 1.193363070487976,
      "step": 770
    },
    {
      "epoch": 9.92,
      "grad_norm": 0.5902894213768881,
      "learning_rate": 0.0049679487179487185,
      "loss": 3.9413,
      "loss_layer_12_head": 0.8495297431945801,
      "loss_layer_18_head": 0.6878644227981567,
      "loss_layer_24_head": 0.5319159626960754,
      "loss_layer_30_head": 0.2897586226463318,
      "loss_layer_36_head": 0.22791822254657745,
      "loss_layer_42_head": 0.13464471697807312,
      "loss_layer_6_head": 1.156378149986267,
      "step": 775
    },
    {
      "epoch": 9.984,
      "grad_norm": 0.5234203935107921,
      "learning_rate": 0.005,
      "loss": 3.9196,
      "loss_layer_12_head": 0.8882707357406616,
      "loss_layer_18_head": 0.7308486700057983,
      "loss_layer_24_head": 0.5361499190330505,
      "loss_layer_30_head": 0.31083592772483826,
      "loss_layer_36_head": 0.23185530304908752,
      "loss_layer_42_head": 0.13899096846580505,
      "loss_layer_6_head": 1.1995540857315063,
      "step": 780
    },
    {
      "epoch": 10.048,
      "grad_norm": 0.7913427415962071,
      "learning_rate": 0.004999993741426433,
      "loss": 3.609,
      "loss_layer_12_head": 0.7875979542732239,
      "loss_layer_18_head": 0.6391259431838989,
      "loss_layer_24_head": 0.45563459396362305,
      "loss_layer_30_head": 0.28283217549324036,
      "loss_layer_36_head": 0.21338777244091034,
      "loss_layer_42_head": 0.14084963500499725,
      "loss_layer_6_head": 1.0909838676452637,
      "step": 785
    },
    {
      "epoch": 10.112,
      "grad_norm": 0.6485858961875729,
      "learning_rate": 0.004999974965737065,
      "loss": 3.513,
      "loss_layer_12_head": 0.7797413468360901,
      "loss_layer_18_head": 0.6495454907417297,
      "loss_layer_24_head": 0.4232507646083832,
      "loss_layer_30_head": 0.2750047743320465,
      "loss_layer_36_head": 0.2055617868900299,
      "loss_layer_42_head": 0.13488493859767914,
      "loss_layer_6_head": 1.072763442993164,
      "step": 790
    },
    {
      "epoch": 10.176,
      "grad_norm": 0.6069753191568229,
      "learning_rate": 0.004999943673025905,
      "loss": 3.5494,
      "loss_layer_12_head": 0.7835508584976196,
      "loss_layer_18_head": 0.6683422923088074,
      "loss_layer_24_head": 0.4202864170074463,
      "loss_layer_30_head": 0.3021675944328308,
      "loss_layer_36_head": 0.21884259581565857,
      "loss_layer_42_head": 0.13419374823570251,
      "loss_layer_6_head": 1.100640058517456,
      "step": 795
    },
    {
      "epoch": 10.24,
      "grad_norm": 0.5294126949457005,
      "learning_rate": 0.004999899863449631,
      "loss": 3.5585,
      "loss_layer_12_head": 0.7869202494621277,
      "loss_layer_18_head": 0.6362560391426086,
      "loss_layer_24_head": 0.40147289633750916,
      "loss_layer_30_head": 0.2632017731666565,
      "loss_layer_36_head": 0.20447704195976257,
      "loss_layer_42_head": 0.12183183431625366,
      "loss_layer_6_head": 1.0817599296569824,
      "step": 800
    },
    {
      "epoch": 10.24,
      "eval_loss": 5.882894992828369,
      "eval_loss_layer_12_head": 1.3382538557052612,
      "eval_loss_layer_18_head": 1.138518214225769,
      "eval_loss_layer_24_head": 0.7602499723434448,
      "eval_loss_layer_30_head": 0.5903175473213196,
      "eval_loss_layer_36_head": 0.36771634221076965,
      "eval_loss_layer_42_head": 0.222148135304451,
      "eval_loss_layer_6_head": 1.6598612070083618,
      "eval_runtime": 33.0895,
      "eval_samples_per_second": 9.671,
      "eval_steps_per_second": 0.604,
      "step": 800
    },
    {
      "epoch": 10.304,
      "grad_norm": 0.6468142018655761,
      "learning_rate": 0.004999843537227591,
      "loss": 3.6506,
      "loss_layer_12_head": 0.8227441906929016,
      "loss_layer_18_head": 0.6552620530128479,
      "loss_layer_24_head": 0.4054376184940338,
      "loss_layer_30_head": 0.27477455139160156,
      "loss_layer_36_head": 0.21480974555015564,
      "loss_layer_42_head": 0.11899278312921524,
      "loss_layer_6_head": 1.1323211193084717,
      "step": 805
    },
    {
      "epoch": 10.368,
      "grad_norm": 1.2374152642800478,
      "learning_rate": 0.004999774694641803,
      "loss": 3.9997,
      "loss_layer_12_head": 0.9236928820610046,
      "loss_layer_18_head": 0.6624023914337158,
      "loss_layer_24_head": 0.40048861503601074,
      "loss_layer_30_head": 0.26759856939315796,
      "loss_layer_36_head": 0.21244657039642334,
      "loss_layer_42_head": 0.11748615652322769,
      "loss_layer_6_head": 1.2908852100372314,
      "step": 810
    },
    {
      "epoch": 10.432,
      "grad_norm": 0.8024183052445998,
      "learning_rate": 0.004999693336036951,
      "loss": 3.9851,
      "loss_layer_12_head": 0.9720361828804016,
      "loss_layer_18_head": 0.666892945766449,
      "loss_layer_24_head": 0.4019949436187744,
      "loss_layer_30_head": 0.26828405261039734,
      "loss_layer_36_head": 0.19670316576957703,
      "loss_layer_42_head": 0.116876520216465,
      "loss_layer_6_head": 1.333050012588501,
      "step": 815
    },
    {
      "epoch": 10.496,
      "grad_norm": 0.6704032401067405,
      "learning_rate": 0.004999599461820387,
      "loss": 3.948,
      "loss_layer_12_head": 0.9675441980361938,
      "loss_layer_18_head": 0.692890465259552,
      "loss_layer_24_head": 0.4455263614654541,
      "loss_layer_30_head": 0.31320804357528687,
      "loss_layer_36_head": 0.2434219866991043,
      "loss_layer_42_head": 0.12184138596057892,
      "loss_layer_6_head": 1.3387033939361572,
      "step": 820
    },
    {
      "epoch": 10.56,
      "grad_norm": 0.7554297858455521,
      "learning_rate": 0.0049994930724621255,
      "loss": 3.8667,
      "loss_layer_12_head": 0.9173803329467773,
      "loss_layer_18_head": 0.7038418650627136,
      "loss_layer_24_head": 0.4315706789493561,
      "loss_layer_30_head": 0.27121514081954956,
      "loss_layer_36_head": 0.18684476613998413,
      "loss_layer_42_head": 0.12130451202392578,
      "loss_layer_6_head": 1.2730941772460938,
      "step": 825
    },
    {
      "epoch": 10.624,
      "grad_norm": 0.4475077079478471,
      "learning_rate": 0.004999374168494843,
      "loss": 3.7607,
      "loss_layer_12_head": 0.8306806683540344,
      "loss_layer_18_head": 0.6765504479408264,
      "loss_layer_24_head": 0.42198801040649414,
      "loss_layer_30_head": 0.26795485615730286,
      "loss_layer_36_head": 0.2055717408657074,
      "loss_layer_42_head": 0.113437220454216,
      "loss_layer_6_head": 1.162183165550232,
      "step": 830
    },
    {
      "epoch": 10.688,
      "grad_norm": 0.5485556921310942,
      "learning_rate": 0.004999242750513875,
      "loss": 3.7091,
      "loss_layer_12_head": 0.8276138305664062,
      "loss_layer_18_head": 0.6838864088058472,
      "loss_layer_24_head": 0.43842750787734985,
      "loss_layer_30_head": 0.2773090600967407,
      "loss_layer_36_head": 0.20196489989757538,
      "loss_layer_42_head": 0.11639173328876495,
      "loss_layer_6_head": 1.1702423095703125,
      "step": 835
    },
    {
      "epoch": 10.752,
      "grad_norm": 0.5945409931021391,
      "learning_rate": 0.0049990988191772135,
      "loss": 3.7538,
      "loss_layer_12_head": 0.8156588673591614,
      "loss_layer_18_head": 0.672545313835144,
      "loss_layer_24_head": 0.4866648316383362,
      "loss_layer_30_head": 0.28981298208236694,
      "loss_layer_36_head": 0.17222259938716888,
      "loss_layer_42_head": 0.12276073545217514,
      "loss_layer_6_head": 1.1391099691390991,
      "step": 840
    },
    {
      "epoch": 10.816,
      "grad_norm": 0.5843620944895394,
      "learning_rate": 0.004998942375205502,
      "loss": 3.7886,
      "loss_layer_12_head": 0.7917013764381409,
      "loss_layer_18_head": 0.6679895520210266,
      "loss_layer_24_head": 0.5169625878334045,
      "loss_layer_30_head": 0.2832443118095398,
      "loss_layer_36_head": 0.16928339004516602,
      "loss_layer_42_head": 0.11932657659053802,
      "loss_layer_6_head": 1.1084778308868408,
      "step": 845
    },
    {
      "epoch": 10.88,
      "grad_norm": 0.7474542838092315,
      "learning_rate": 0.004998773419382032,
      "loss": 3.703,
      "loss_layer_12_head": 0.8292906880378723,
      "loss_layer_18_head": 0.6983738541603088,
      "loss_layer_24_head": 0.5246380567550659,
      "loss_layer_30_head": 0.30518367886543274,
      "loss_layer_36_head": 0.18770478665828705,
      "loss_layer_42_head": 0.12145034968852997,
      "loss_layer_6_head": 1.1638790369033813,
      "step": 850
    },
    {
      "epoch": 10.943999999999999,
      "grad_norm": 0.6923797303302748,
      "learning_rate": 0.004998591952552743,
      "loss": 3.8102,
      "loss_layer_12_head": 0.8115772008895874,
      "loss_layer_18_head": 0.7400087118148804,
      "loss_layer_24_head": 0.500815212726593,
      "loss_layer_30_head": 0.29992857575416565,
      "loss_layer_36_head": 0.1747281700372696,
      "loss_layer_42_head": 0.1261713206768036,
      "loss_layer_6_head": 1.1336292028427124,
      "step": 855
    },
    {
      "epoch": 11.008,
      "grad_norm": 0.9900583303882464,
      "learning_rate": 0.004998397975626213,
      "loss": 3.9496,
      "loss_layer_12_head": 0.8063982725143433,
      "loss_layer_18_head": 0.8219122886657715,
      "loss_layer_24_head": 0.4829619526863098,
      "loss_layer_30_head": 0.34369713068008423,
      "loss_layer_36_head": 0.18863904476165771,
      "loss_layer_42_head": 0.12803509831428528,
      "loss_layer_6_head": 1.1153267621994019,
      "step": 860
    },
    {
      "epoch": 11.072,
      "grad_norm": 0.7000454457479166,
      "learning_rate": 0.004998191489573658,
      "loss": 3.7525,
      "loss_layer_12_head": 0.7168534398078918,
      "loss_layer_18_head": 0.8682465553283691,
      "loss_layer_24_head": 0.41980594396591187,
      "loss_layer_30_head": 0.3854736089706421,
      "loss_layer_36_head": 0.17978918552398682,
      "loss_layer_42_head": 0.12647852301597595,
      "loss_layer_6_head": 1.0120512247085571,
      "step": 865
    },
    {
      "epoch": 11.136,
      "grad_norm": 0.5464392472083748,
      "learning_rate": 0.004997972495428924,
      "loss": 3.6326,
      "loss_layer_12_head": 0.7337355017662048,
      "loss_layer_18_head": 0.8039522171020508,
      "loss_layer_24_head": 0.4050355851650238,
      "loss_layer_30_head": 0.33392971754074097,
      "loss_layer_36_head": 0.1819077581167221,
      "loss_layer_42_head": 0.1179690808057785,
      "loss_layer_6_head": 1.0289865732192993,
      "step": 870
    },
    {
      "epoch": 11.2,
      "grad_norm": 0.6441186662116861,
      "learning_rate": 0.004997740994288484,
      "loss": 3.6305,
      "loss_layer_12_head": 0.7131476402282715,
      "loss_layer_18_head": 0.7375887632369995,
      "loss_layer_24_head": 0.3756333887577057,
      "loss_layer_30_head": 0.302691251039505,
      "loss_layer_36_head": 0.19120857119560242,
      "loss_layer_42_head": 0.1272311955690384,
      "loss_layer_6_head": 0.9893538355827332,
      "step": 875
    },
    {
      "epoch": 11.264,
      "grad_norm": 0.7578554180380498,
      "learning_rate": 0.00499749698731143,
      "loss": 3.5718,
      "loss_layer_12_head": 0.7484856843948364,
      "loss_layer_18_head": 0.7354165315628052,
      "loss_layer_24_head": 0.4240771234035492,
      "loss_layer_30_head": 0.3184972107410431,
      "loss_layer_36_head": 0.20635369420051575,
      "loss_layer_42_head": 0.1232929676771164,
      "loss_layer_6_head": 1.0856561660766602,
      "step": 880
    },
    {
      "epoch": 11.328,
      "grad_norm": 0.7875572072621716,
      "learning_rate": 0.004997240475719473,
      "loss": 3.6196,
      "loss_layer_12_head": 0.7762835025787354,
      "loss_layer_18_head": 0.6921391487121582,
      "loss_layer_24_head": 0.3905457854270935,
      "loss_layer_30_head": 0.28671813011169434,
      "loss_layer_36_head": 0.2018081247806549,
      "loss_layer_42_head": 0.13844159245491028,
      "loss_layer_6_head": 1.0779507160186768,
      "step": 885
    },
    {
      "epoch": 11.392,
      "grad_norm": 0.8839642056210026,
      "learning_rate": 0.0049969714607969295,
      "loss": 3.5761,
      "loss_layer_12_head": 0.7560387253761292,
      "loss_layer_18_head": 0.6469532251358032,
      "loss_layer_24_head": 0.3736235499382019,
      "loss_layer_30_head": 0.2680806517601013,
      "loss_layer_36_head": 0.19140441715717316,
      "loss_layer_42_head": 0.14330454170703888,
      "loss_layer_6_head": 1.0871533155441284,
      "step": 890
    },
    {
      "epoch": 11.456,
      "grad_norm": 0.8721985462371473,
      "learning_rate": 0.004996689943890718,
      "loss": 3.7619,
      "loss_layer_12_head": 0.8740240931510925,
      "loss_layer_18_head": 0.6426118612289429,
      "loss_layer_24_head": 0.38512954115867615,
      "loss_layer_30_head": 0.26182329654693604,
      "loss_layer_36_head": 0.1825040876865387,
      "loss_layer_42_head": 0.15867795050144196,
      "loss_layer_6_head": 1.1908880472183228,
      "step": 895
    },
    {
      "epoch": 11.52,
      "grad_norm": 0.7021087985286999,
      "learning_rate": 0.0049963959264103545,
      "loss": 3.7236,
      "loss_layer_12_head": 0.8948087692260742,
      "loss_layer_18_head": 0.6564909815788269,
      "loss_layer_24_head": 0.4051855504512787,
      "loss_layer_30_head": 0.28841087222099304,
      "loss_layer_36_head": 0.19624382257461548,
      "loss_layer_42_head": 0.1766609251499176,
      "loss_layer_6_head": 1.1841399669647217,
      "step": 900
    },
    {
      "epoch": 11.584,
      "grad_norm": 0.5885183860440354,
      "learning_rate": 0.004996089409827944,
      "loss": 3.6413,
      "loss_layer_12_head": 0.8821050524711609,
      "loss_layer_18_head": 0.6303891539573669,
      "loss_layer_24_head": 0.38617196679115295,
      "loss_layer_30_head": 0.25283628702163696,
      "loss_layer_36_head": 0.17324580252170563,
      "loss_layer_42_head": 0.14827527105808258,
      "loss_layer_6_head": 1.173156499862671,
      "step": 905
    },
    {
      "epoch": 11.648,
      "grad_norm": 0.6582972779376169,
      "learning_rate": 0.004995770395678171,
      "loss": 3.5876,
      "loss_layer_12_head": 0.8242586851119995,
      "loss_layer_18_head": 0.6043913960456848,
      "loss_layer_24_head": 0.4045889377593994,
      "loss_layer_30_head": 0.25077739357948303,
      "loss_layer_36_head": 0.16636748611927032,
      "loss_layer_42_head": 0.13331210613250732,
      "loss_layer_6_head": 1.1119669675827026,
      "step": 910
    },
    {
      "epoch": 11.712,
      "grad_norm": 0.5175160742683517,
      "learning_rate": 0.004995438885558294,
      "loss": 3.7105,
      "loss_layer_12_head": 0.8833678960800171,
      "loss_layer_18_head": 0.6702718138694763,
      "loss_layer_24_head": 0.4672086834907532,
      "loss_layer_30_head": 0.27043429017066956,
      "loss_layer_36_head": 0.17605915665626526,
      "loss_layer_42_head": 0.1416662484407425,
      "loss_layer_6_head": 1.1802916526794434,
      "step": 915
    },
    {
      "epoch": 11.776,
      "grad_norm": 0.5039469866391701,
      "learning_rate": 0.004995094881128138,
      "loss": 3.6069,
      "loss_layer_12_head": 0.8093682527542114,
      "loss_layer_18_head": 0.6275688409805298,
      "loss_layer_24_head": 0.4318751394748688,
      "loss_layer_30_head": 0.2539094090461731,
      "loss_layer_36_head": 0.16552020609378815,
      "loss_layer_42_head": 0.1224556714296341,
      "loss_layer_6_head": 1.0909665822982788,
      "step": 920
    },
    {
      "epoch": 11.84,
      "grad_norm": 0.4462295768349692,
      "learning_rate": 0.004994738384110084,
      "loss": 3.5635,
      "loss_layer_12_head": 0.8099982142448425,
      "loss_layer_18_head": 0.6287972927093506,
      "loss_layer_24_head": 0.42529717087745667,
      "loss_layer_30_head": 0.2500496506690979,
      "loss_layer_36_head": 0.16674992442131042,
      "loss_layer_42_head": 0.12000541388988495,
      "loss_layer_6_head": 1.0971558094024658,
      "step": 925
    },
    {
      "epoch": 11.904,
      "grad_norm": 0.727753105442928,
      "learning_rate": 0.004994369396289063,
      "loss": 3.5484,
      "loss_layer_12_head": 0.8151998519897461,
      "loss_layer_18_head": 0.6257910132408142,
      "loss_layer_24_head": 0.42311739921569824,
      "loss_layer_30_head": 0.24520912766456604,
      "loss_layer_36_head": 0.162801593542099,
      "loss_layer_42_head": 0.1296689510345459,
      "loss_layer_6_head": 1.08476722240448,
      "step": 930
    },
    {
      "epoch": 11.968,
      "grad_norm": 0.7092336566500927,
      "learning_rate": 0.004993987919512545,
      "loss": 3.6437,
      "loss_layer_12_head": 0.870212197303772,
      "loss_layer_18_head": 0.6820809841156006,
      "loss_layer_24_head": 0.4426618218421936,
      "loss_layer_30_head": 0.25865188241004944,
      "loss_layer_36_head": 0.17032727599143982,
      "loss_layer_42_head": 0.13718058168888092,
      "loss_layer_6_head": 1.1282765865325928,
      "step": 935
    },
    {
      "epoch": 12.032,
      "grad_norm": 0.7222560963901075,
      "learning_rate": 0.0049935939556905295,
      "loss": 3.6487,
      "loss_layer_12_head": 0.962159276008606,
      "loss_layer_18_head": 0.6417534947395325,
      "loss_layer_24_head": 0.4140968918800354,
      "loss_layer_30_head": 0.25313547253608704,
      "loss_layer_36_head": 0.1646692454814911,
      "loss_layer_42_head": 0.14386749267578125,
      "loss_layer_6_head": 1.0626068115234375,
      "step": 940
    },
    {
      "epoch": 12.096,
      "grad_norm": 0.7630450240344818,
      "learning_rate": 0.004993187506795538,
      "loss": 3.4508,
      "loss_layer_12_head": 0.8663468360900879,
      "loss_layer_18_head": 0.6510176062583923,
      "loss_layer_24_head": 0.39164796471595764,
      "loss_layer_30_head": 0.24641506373882294,
      "loss_layer_36_head": 0.16527552902698517,
      "loss_layer_42_head": 0.13015350699424744,
      "loss_layer_6_head": 0.9934841394424438,
      "step": 945
    },
    {
      "epoch": 12.16,
      "grad_norm": 0.6987697986212847,
      "learning_rate": 0.004992768574862603,
      "loss": 3.6342,
      "loss_layer_12_head": 0.8568476438522339,
      "loss_layer_18_head": 0.7414189577102661,
      "loss_layer_24_head": 0.39979448914527893,
      "loss_layer_30_head": 0.2642044425010681,
      "loss_layer_36_head": 0.16585540771484375,
      "loss_layer_42_head": 0.12148065865039825,
      "loss_layer_6_head": 1.0142256021499634,
      "step": 950
    },
    {
      "epoch": 12.224,
      "grad_norm": 0.7057516391581583,
      "learning_rate": 0.004992337161989257,
      "loss": 3.5965,
      "loss_layer_12_head": 0.8435189127922058,
      "loss_layer_18_head": 0.7561205625534058,
      "loss_layer_24_head": 0.4250868856906891,
      "loss_layer_30_head": 0.26793041825294495,
      "loss_layer_36_head": 0.17376108467578888,
      "loss_layer_42_head": 0.12712249159812927,
      "loss_layer_6_head": 1.0281429290771484,
      "step": 955
    },
    {
      "epoch": 12.288,
      "grad_norm": 0.6703718970129543,
      "learning_rate": 0.0049918932703355255,
      "loss": 3.6228,
      "loss_layer_12_head": 0.811074435710907,
      "loss_layer_18_head": 0.7310625314712524,
      "loss_layer_24_head": 0.4820745587348938,
      "loss_layer_30_head": 0.24346427619457245,
      "loss_layer_36_head": 0.1732034832239151,
      "loss_layer_42_head": 0.11513793468475342,
      "loss_layer_6_head": 1.0396512746810913,
      "step": 960
    },
    {
      "epoch": 12.352,
      "grad_norm": 0.6714543408419269,
      "learning_rate": 0.004991436902123908,
      "loss": 3.5438,
      "loss_layer_12_head": 0.8118473291397095,
      "loss_layer_18_head": 0.6970846652984619,
      "loss_layer_24_head": 0.49596279859542847,
      "loss_layer_30_head": 0.2532462775707245,
      "loss_layer_36_head": 0.18058347702026367,
      "loss_layer_42_head": 0.11702495813369751,
      "loss_layer_6_head": 1.0946464538574219,
      "step": 965
    },
    {
      "epoch": 12.416,
      "grad_norm": 0.7752398352276996,
      "learning_rate": 0.004990968059639379,
      "loss": 3.5337,
      "loss_layer_12_head": 0.8104971051216125,
      "loss_layer_18_head": 0.6889949440956116,
      "loss_layer_24_head": 0.49680590629577637,
      "loss_layer_30_head": 0.25002971291542053,
      "loss_layer_36_head": 0.17103442549705505,
      "loss_layer_42_head": 0.10418820381164551,
      "loss_layer_6_head": 1.139098882675171,
      "step": 970
    },
    {
      "epoch": 12.48,
      "grad_norm": 0.8114682972703338,
      "learning_rate": 0.004990486745229364,
      "loss": 3.4976,
      "loss_layer_12_head": 0.7499794960021973,
      "loss_layer_18_head": 0.6315177083015442,
      "loss_layer_24_head": 0.46894869208335876,
      "loss_layer_30_head": 0.24252791702747345,
      "loss_layer_36_head": 0.16321976482868195,
      "loss_layer_42_head": 0.09979903697967529,
      "loss_layer_6_head": 1.135143756866455,
      "step": 975
    },
    {
      "epoch": 12.544,
      "grad_norm": 0.4848201722076279,
      "learning_rate": 0.0049899929613037375,
      "loss": 3.554,
      "loss_layer_12_head": 0.7848585844039917,
      "loss_layer_18_head": 0.6557260751724243,
      "loss_layer_24_head": 0.4702858328819275,
      "loss_layer_30_head": 0.25672072172164917,
      "loss_layer_36_head": 0.18087394535541534,
      "loss_layer_42_head": 0.10268770158290863,
      "loss_layer_6_head": 1.2365235090255737,
      "step": 980
    },
    {
      "epoch": 12.608,
      "grad_norm": 0.592526280961882,
      "learning_rate": 0.004989486710334806,
      "loss": 3.683,
      "loss_layer_12_head": 0.7318404316902161,
      "loss_layer_18_head": 0.6104401350021362,
      "loss_layer_24_head": 0.42001619935035706,
      "loss_layer_30_head": 0.24403110146522522,
      "loss_layer_36_head": 0.17308169603347778,
      "loss_layer_42_head": 0.09923659265041351,
      "loss_layer_6_head": 1.3666105270385742,
      "step": 985
    },
    {
      "epoch": 12.672,
      "grad_norm": 0.511764158494744,
      "learning_rate": 0.004988967994857297,
      "loss": 3.5968,
      "loss_layer_12_head": 0.7282203435897827,
      "loss_layer_18_head": 0.596890926361084,
      "loss_layer_24_head": 0.40133294463157654,
      "loss_layer_30_head": 0.24455273151397705,
      "loss_layer_36_head": 0.16770371794700623,
      "loss_layer_42_head": 0.09156014025211334,
      "loss_layer_6_head": 1.2598038911819458,
      "step": 990
    },
    {
      "epoch": 12.736,
      "grad_norm": 0.49894636704485484,
      "learning_rate": 0.0049884368174683445,
      "loss": 3.5244,
      "loss_layer_12_head": 0.782880425453186,
      "loss_layer_18_head": 0.6296457648277283,
      "loss_layer_24_head": 0.41355735063552856,
      "loss_layer_30_head": 0.2625717222690582,
      "loss_layer_36_head": 0.17686952650547028,
      "loss_layer_42_head": 0.09636244922876358,
      "loss_layer_6_head": 1.2363524436950684,
      "step": 995
    },
    {
      "epoch": 12.8,
      "grad_norm": 0.48783038135734846,
      "learning_rate": 0.00498789318082748,
      "loss": 3.5251,
      "loss_layer_12_head": 0.7548606991767883,
      "loss_layer_18_head": 0.6160756349563599,
      "loss_layer_24_head": 0.40164703130722046,
      "loss_layer_30_head": 0.26370325684547424,
      "loss_layer_36_head": 0.18664014339447021,
      "loss_layer_42_head": 0.10964316129684448,
      "loss_layer_6_head": 1.1515281200408936,
      "step": 1000
    },
    {
      "epoch": 12.8,
      "eval_loss": 5.699979305267334,
      "eval_loss_layer_12_head": 1.2993661165237427,
      "eval_loss_layer_18_head": 1.0979113578796387,
      "eval_loss_layer_24_head": 0.7252463102340698,
      "eval_loss_layer_30_head": 0.5119324922561646,
      "eval_loss_layer_36_head": 0.34375834465026855,
      "eval_loss_layer_42_head": 0.2164229452610016,
      "eval_loss_layer_6_head": 1.649012565612793,
      "eval_runtime": 33.0973,
      "eval_samples_per_second": 9.668,
      "eval_steps_per_second": 0.604,
      "step": 1000
    },
    {
      "epoch": 12.864,
      "grad_norm": 0.6700594264324334,
      "learning_rate": 0.004987337087656613,
      "loss": 3.5157,
      "loss_layer_12_head": 0.7961663603782654,
      "loss_layer_18_head": 0.6058277487754822,
      "loss_layer_24_head": 0.38322770595550537,
      "loss_layer_30_head": 0.2624596953392029,
      "loss_layer_36_head": 0.1715574413537979,
      "loss_layer_42_head": 0.09740383177995682,
      "loss_layer_6_head": 1.1046103239059448,
      "step": 1005
    },
    {
      "epoch": 12.928,
      "grad_norm": 0.7460804816522585,
      "learning_rate": 0.004986768540740027,
      "loss": 3.5273,
      "loss_layer_12_head": 0.8814705014228821,
      "loss_layer_18_head": 0.6103768348693848,
      "loss_layer_24_head": 0.3821176290512085,
      "loss_layer_30_head": 0.2683340013027191,
      "loss_layer_36_head": 0.1720578670501709,
      "loss_layer_42_head": 0.10446105897426605,
      "loss_layer_6_head": 1.0786540508270264,
      "step": 1010
    },
    {
      "epoch": 12.992,
      "grad_norm": 0.6042386527450659,
      "learning_rate": 0.004986187542924355,
      "loss": 3.7587,
      "loss_layer_12_head": 1.0378557443618774,
      "loss_layer_18_head": 0.6542109251022339,
      "loss_layer_24_head": 0.39738351106643677,
      "loss_layer_30_head": 0.3031170964241028,
      "loss_layer_36_head": 0.16943764686584473,
      "loss_layer_42_head": 0.093946173787117,
      "loss_layer_6_head": 1.121674656867981,
      "step": 1015
    },
    {
      "epoch": 13.056,
      "grad_norm": 0.5910091287383138,
      "learning_rate": 0.00498559409711857,
      "loss": 3.4469,
      "loss_layer_12_head": 0.9428478479385376,
      "loss_layer_18_head": 0.6204007267951965,
      "loss_layer_24_head": 0.3856988549232483,
      "loss_layer_30_head": 0.31170153617858887,
      "loss_layer_36_head": 0.16824871301651,
      "loss_layer_42_head": 0.09604080021381378,
      "loss_layer_6_head": 1.0742381811141968,
      "step": 1020
    },
    {
      "epoch": 13.12,
      "grad_norm": 0.5351787635841635,
      "learning_rate": 0.004984988206293972,
      "loss": 3.4175,
      "loss_layer_12_head": 0.8628122210502625,
      "loss_layer_18_head": 0.5796699523925781,
      "loss_layer_24_head": 0.3653489649295807,
      "loss_layer_30_head": 0.3351936936378479,
      "loss_layer_36_head": 0.15915483236312866,
      "loss_layer_42_head": 0.0989261269569397,
      "loss_layer_6_head": 1.0250425338745117,
      "step": 1025
    },
    {
      "epoch": 13.184,
      "grad_norm": 0.5190457080028751,
      "learning_rate": 0.00498436987348417,
      "loss": 3.3304,
      "loss_layer_12_head": 0.7954726815223694,
      "loss_layer_18_head": 0.5772978067398071,
      "loss_layer_24_head": 0.36007919907569885,
      "loss_layer_30_head": 0.304819256067276,
      "loss_layer_36_head": 0.18556824326515198,
      "loss_layer_42_head": 0.10523227602243423,
      "loss_layer_6_head": 1.0020222663879395,
      "step": 1030
    },
    {
      "epoch": 13.248,
      "grad_norm": 0.4398709736630784,
      "learning_rate": 0.004983739101785071,
      "loss": 3.3184,
      "loss_layer_12_head": 0.7838091850280762,
      "loss_layer_18_head": 0.5958616733551025,
      "loss_layer_24_head": 0.3747026324272156,
      "loss_layer_30_head": 0.2868528366088867,
      "loss_layer_36_head": 0.21156159043312073,
      "loss_layer_42_head": 0.11176309734582901,
      "loss_layer_6_head": 1.0199041366577148,
      "step": 1035
    },
    {
      "epoch": 13.312,
      "grad_norm": 0.5437980713477997,
      "learning_rate": 0.004983095894354857,
      "loss": 3.3719,
      "loss_layer_12_head": 0.7722082138061523,
      "loss_layer_18_head": 0.6397719383239746,
      "loss_layer_24_head": 0.3792113661766052,
      "loss_layer_30_head": 0.2772682309150696,
      "loss_layer_36_head": 0.21843251585960388,
      "loss_layer_42_head": 0.10822826623916626,
      "loss_layer_6_head": 1.0376275777816772,
      "step": 1040
    },
    {
      "epoch": 13.376,
      "grad_norm": 0.6582910627742266,
      "learning_rate": 0.00498244025441398,
      "loss": 3.357,
      "loss_layer_12_head": 0.738771915435791,
      "loss_layer_18_head": 0.6946569681167603,
      "loss_layer_24_head": 0.3759083151817322,
      "loss_layer_30_head": 0.2572627067565918,
      "loss_layer_36_head": 0.19886913895606995,
      "loss_layer_42_head": 0.1124260425567627,
      "loss_layer_6_head": 1.0125656127929688,
      "step": 1045
    },
    {
      "epoch": 13.44,
      "grad_norm": 0.5889537405838476,
      "learning_rate": 0.004981772185245135,
      "loss": 3.4023,
      "loss_layer_12_head": 0.7286524772644043,
      "loss_layer_18_head": 0.7508174180984497,
      "loss_layer_24_head": 0.395038902759552,
      "loss_layer_30_head": 0.2534017264842987,
      "loss_layer_36_head": 0.19118908047676086,
      "loss_layer_42_head": 0.11285771429538727,
      "loss_layer_6_head": 1.0196412801742554,
      "step": 1050
    },
    {
      "epoch": 13.504,
      "grad_norm": 0.4462035674331681,
      "learning_rate": 0.00498109169019325,
      "loss": 3.3762,
      "loss_layer_12_head": 0.7459102869033813,
      "loss_layer_18_head": 0.7173882722854614,
      "loss_layer_24_head": 0.3867571949958801,
      "loss_layer_30_head": 0.2455999106168747,
      "loss_layer_36_head": 0.18299970030784607,
      "loss_layer_42_head": 0.10762417316436768,
      "loss_layer_6_head": 1.062795639038086,
      "step": 1055
    },
    {
      "epoch": 13.568,
      "grad_norm": 0.37099943692064596,
      "learning_rate": 0.0049803987726654685,
      "loss": 3.368,
      "loss_layer_12_head": 0.7342067956924438,
      "loss_layer_18_head": 0.6730878353118896,
      "loss_layer_24_head": 0.38590073585510254,
      "loss_layer_30_head": 0.24184656143188477,
      "loss_layer_36_head": 0.16803565621376038,
      "loss_layer_42_head": 0.10017059743404388,
      "loss_layer_6_head": 1.0649298429489136,
      "step": 1060
    },
    {
      "epoch": 13.632,
      "grad_norm": 0.6478129492587109,
      "learning_rate": 0.00497969343613113,
      "loss": 3.3161,
      "loss_layer_12_head": 0.7512027025222778,
      "loss_layer_18_head": 0.6762269735336304,
      "loss_layer_24_head": 0.3929465413093567,
      "loss_layer_30_head": 0.25318464636802673,
      "loss_layer_36_head": 0.1764782965183258,
      "loss_layer_42_head": 0.10358822345733643,
      "loss_layer_6_head": 1.1124573945999146,
      "step": 1065
    },
    {
      "epoch": 13.696,
      "grad_norm": 0.8146130180464151,
      "learning_rate": 0.004978975684121754,
      "loss": 3.3838,
      "loss_layer_12_head": 0.7403011918067932,
      "loss_layer_18_head": 0.647377610206604,
      "loss_layer_24_head": 0.3945668041706085,
      "loss_layer_30_head": 0.25004884600639343,
      "loss_layer_36_head": 0.16671442985534668,
      "loss_layer_42_head": 0.09793929010629654,
      "loss_layer_6_head": 1.136136770248413,
      "step": 1070
    },
    {
      "epoch": 13.76,
      "grad_norm": 0.8701572245167457,
      "learning_rate": 0.004978245520231026,
      "loss": 3.4597,
      "loss_layer_12_head": 0.7073389291763306,
      "loss_layer_18_head": 0.6073203086853027,
      "loss_layer_24_head": 0.3763437867164612,
      "loss_layer_30_head": 0.23771211504936218,
      "loss_layer_36_head": 0.1610027253627777,
      "loss_layer_42_head": 0.0988539382815361,
      "loss_layer_6_head": 1.2122056484222412,
      "step": 1075
    },
    {
      "epoch": 13.824,
      "grad_norm": 0.6443834582695529,
      "learning_rate": 0.0049775029481147716,
      "loss": 3.523,
      "loss_layer_12_head": 0.7137944102287292,
      "loss_layer_18_head": 0.5965902805328369,
      "loss_layer_24_head": 0.385064959526062,
      "loss_layer_30_head": 0.2293461263179779,
      "loss_layer_36_head": 0.15438520908355713,
      "loss_layer_42_head": 0.09393082559108734,
      "loss_layer_6_head": 1.265026330947876,
      "step": 1080
    },
    {
      "epoch": 13.888,
      "grad_norm": 0.6565263372397392,
      "learning_rate": 0.004976747971490945,
      "loss": 3.5756,
      "loss_layer_12_head": 0.754848837852478,
      "loss_layer_18_head": 0.6088674664497375,
      "loss_layer_24_head": 0.42070087790489197,
      "loss_layer_30_head": 0.2425309717655182,
      "loss_layer_36_head": 0.15607061982154846,
      "loss_layer_42_head": 0.09453573077917099,
      "loss_layer_6_head": 1.2209789752960205,
      "step": 1085
    },
    {
      "epoch": 13.952,
      "grad_norm": 0.6338487391411252,
      "learning_rate": 0.004975980594139608,
      "loss": 3.5709,
      "loss_layer_12_head": 0.7938184142112732,
      "loss_layer_18_head": 0.6214991807937622,
      "loss_layer_24_head": 0.4322379529476166,
      "loss_layer_30_head": 0.2405668944120407,
      "loss_layer_36_head": 0.15487752854824066,
      "loss_layer_42_head": 0.0899648666381836,
      "loss_layer_6_head": 1.2087949514389038,
      "step": 1090
    },
    {
      "epoch": 14.016,
      "grad_norm": 0.5757779648796327,
      "learning_rate": 0.00497520081990291,
      "loss": 3.455,
      "loss_layer_12_head": 0.7624772787094116,
      "loss_layer_18_head": 0.6114548444747925,
      "loss_layer_24_head": 0.42166009545326233,
      "loss_layer_30_head": 0.23516900837421417,
      "loss_layer_36_head": 0.14838233590126038,
      "loss_layer_42_head": 0.10038051754236221,
      "loss_layer_6_head": 1.1508773565292358,
      "step": 1095
    },
    {
      "epoch": 14.08,
      "grad_norm": 0.5299509784660359,
      "learning_rate": 0.004974408652685071,
      "loss": 3.1902,
      "loss_layer_12_head": 0.7056769132614136,
      "loss_layer_18_head": 0.5630340576171875,
      "loss_layer_24_head": 0.3848210573196411,
      "loss_layer_30_head": 0.2582586407661438,
      "loss_layer_36_head": 0.14832940697669983,
      "loss_layer_42_head": 0.10141785442829132,
      "loss_layer_6_head": 1.0479910373687744,
      "step": 1100
    },
    {
      "epoch": 14.144,
      "grad_norm": 0.4665210961255684,
      "learning_rate": 0.004973604096452361,
      "loss": 3.2448,
      "loss_layer_12_head": 0.6931835412979126,
      "loss_layer_18_head": 0.5475801229476929,
      "loss_layer_24_head": 0.36137092113494873,
      "loss_layer_30_head": 0.2624973654747009,
      "loss_layer_36_head": 0.14943203330039978,
      "loss_layer_42_head": 0.09534861147403717,
      "loss_layer_6_head": 1.0066078901290894,
      "step": 1105
    },
    {
      "epoch": 14.208,
      "grad_norm": 0.750710988398109,
      "learning_rate": 0.004972787155233079,
      "loss": 3.3204,
      "loss_layer_12_head": 0.7884098887443542,
      "loss_layer_18_head": 0.5912284851074219,
      "loss_layer_24_head": 0.3815678060054779,
      "loss_layer_30_head": 0.27999284863471985,
      "loss_layer_36_head": 0.15887503325939178,
      "loss_layer_42_head": 0.10960929095745087,
      "loss_layer_6_head": 1.0481228828430176,
      "step": 1110
    },
    {
      "epoch": 14.272,
      "grad_norm": 0.6351634456244593,
      "learning_rate": 0.004971957833117534,
      "loss": 3.3065,
      "loss_layer_12_head": 0.7795000076293945,
      "loss_layer_18_head": 0.6063762903213501,
      "loss_layer_24_head": 0.35088902711868286,
      "loss_layer_30_head": 0.254025399684906,
      "loss_layer_36_head": 0.14552141726016998,
      "loss_layer_42_head": 0.10806427150964737,
      "loss_layer_6_head": 0.9759188890457153,
      "step": 1115
    },
    {
      "epoch": 14.336,
      "grad_norm": 0.7454363124282164,
      "learning_rate": 0.004971116134258025,
      "loss": 3.5333,
      "loss_layer_12_head": 0.7915444374084473,
      "loss_layer_18_head": 0.749876856803894,
      "loss_layer_24_head": 0.3633958697319031,
      "loss_layer_30_head": 0.2544538080692291,
      "loss_layer_36_head": 0.15091346204280853,
      "loss_layer_42_head": 0.1601913571357727,
      "loss_layer_6_head": 0.9883192181587219,
      "step": 1120
    },
    {
      "epoch": 14.4,
      "grad_norm": 0.7615166892017682,
      "learning_rate": 0.00497026206286882,
      "loss": 3.6056,
      "loss_layer_12_head": 0.8301437497138977,
      "loss_layer_18_head": 0.8514488339424133,
      "loss_layer_24_head": 0.38463133573532104,
      "loss_layer_30_head": 0.2521565854549408,
      "loss_layer_36_head": 0.16258808970451355,
      "loss_layer_42_head": 0.17077982425689697,
      "loss_layer_6_head": 1.0490601062774658,
      "step": 1125
    },
    {
      "epoch": 14.464,
      "grad_norm": 0.6867061400711191,
      "learning_rate": 0.004969395623226133,
      "loss": 3.5742,
      "loss_layer_12_head": 0.8164698481559753,
      "loss_layer_18_head": 0.7292622327804565,
      "loss_layer_24_head": 0.36151689291000366,
      "loss_layer_30_head": 0.2422943115234375,
      "loss_layer_36_head": 0.163498193025589,
      "loss_layer_42_head": 0.1550932228565216,
      "loss_layer_6_head": 0.990157425403595,
      "step": 1130
    },
    {
      "epoch": 14.528,
      "grad_norm": 0.47860352387950567,
      "learning_rate": 0.004968516819668105,
      "loss": 3.4556,
      "loss_layer_12_head": 0.8495017886161804,
      "loss_layer_18_head": 0.7350481748580933,
      "loss_layer_24_head": 0.39126425981521606,
      "loss_layer_30_head": 0.24367788434028625,
      "loss_layer_36_head": 0.16477090120315552,
      "loss_layer_42_head": 0.13474634289741516,
      "loss_layer_6_head": 1.0453661680221558,
      "step": 1135
    },
    {
      "epoch": 14.592,
      "grad_norm": 0.4686644429666745,
      "learning_rate": 0.004967625656594782,
      "loss": 3.3806,
      "loss_layer_12_head": 0.8133857846260071,
      "loss_layer_18_head": 0.6815980672836304,
      "loss_layer_24_head": 0.386465847492218,
      "loss_layer_30_head": 0.23411008715629578,
      "loss_layer_36_head": 0.16067293286323547,
      "loss_layer_42_head": 0.11982128769159317,
      "loss_layer_6_head": 1.0258219242095947,
      "step": 1140
    },
    {
      "epoch": 14.656,
      "grad_norm": 0.47753029790017887,
      "learning_rate": 0.004966722138468092,
      "loss": 3.402,
      "loss_layer_12_head": 0.7901675701141357,
      "loss_layer_18_head": 0.6485341787338257,
      "loss_layer_24_head": 0.3868171274662018,
      "loss_layer_30_head": 0.23003344237804413,
      "loss_layer_36_head": 0.1585492193698883,
      "loss_layer_42_head": 0.1129513755440712,
      "loss_layer_6_head": 1.0243456363677979,
      "step": 1145
    },
    {
      "epoch": 14.72,
      "grad_norm": 0.41487162104371816,
      "learning_rate": 0.004965806269811821,
      "loss": 3.3389,
      "loss_layer_12_head": 0.7756093740463257,
      "loss_layer_18_head": 0.6403516530990601,
      "loss_layer_24_head": 0.3819546699523926,
      "loss_layer_30_head": 0.22707819938659668,
      "loss_layer_36_head": 0.1615157425403595,
      "loss_layer_42_head": 0.11443561315536499,
      "loss_layer_6_head": 1.0147532224655151,
      "step": 1150
    },
    {
      "epoch": 14.784,
      "grad_norm": 0.5889765762584342,
      "learning_rate": 0.004964878055211597,
      "loss": 3.3233,
      "loss_layer_12_head": 0.753922164440155,
      "loss_layer_18_head": 0.6092627048492432,
      "loss_layer_24_head": 0.3604854941368103,
      "loss_layer_30_head": 0.22794711589813232,
      "loss_layer_36_head": 0.15462544560432434,
      "loss_layer_42_head": 0.10322277247905731,
      "loss_layer_6_head": 0.9830999374389648,
      "step": 1155
    },
    {
      "epoch": 14.848,
      "grad_norm": 0.8345581064851518,
      "learning_rate": 0.004963937499314857,
      "loss": 3.4544,
      "loss_layer_12_head": 0.7836810350418091,
      "loss_layer_18_head": 0.670457124710083,
      "loss_layer_24_head": 0.3604394793510437,
      "loss_layer_30_head": 0.22445884346961975,
      "loss_layer_36_head": 0.15584328770637512,
      "loss_layer_42_head": 0.0979786217212677,
      "loss_layer_6_head": 0.9851406216621399,
      "step": 1160
    },
    {
      "epoch": 14.912,
      "grad_norm": 0.665605216399136,
      "learning_rate": 0.004962984606830831,
      "loss": 3.4646,
      "loss_layer_12_head": 0.8306870460510254,
      "loss_layer_18_head": 0.6783283352851868,
      "loss_layer_24_head": 0.38808220624923706,
      "loss_layer_30_head": 0.24880416691303253,
      "loss_layer_36_head": 0.16605809330940247,
      "loss_layer_42_head": 0.10300717502832413,
      "loss_layer_6_head": 1.036123514175415,
      "step": 1165
    },
    {
      "epoch": 14.975999999999999,
      "grad_norm": 0.5237110520254066,
      "learning_rate": 0.00496201938253052,
      "loss": 3.3808,
      "loss_layer_12_head": 0.8197532892227173,
      "loss_layer_18_head": 0.6646395921707153,
      "loss_layer_24_head": 0.3825700283050537,
      "loss_layer_30_head": 0.24767646193504333,
      "loss_layer_36_head": 0.16340544819831848,
      "loss_layer_42_head": 0.10074009746313095,
      "loss_layer_6_head": 1.0488249063491821,
      "step": 1170
    },
    {
      "epoch": 15.04,
      "grad_norm": 0.5691141599616598,
      "learning_rate": 0.004961041831246665,
      "loss": 3.1916,
      "loss_layer_12_head": 0.7693406939506531,
      "loss_layer_18_head": 0.6334480047225952,
      "loss_layer_24_head": 0.36191526055336,
      "loss_layer_30_head": 0.2351757287979126,
      "loss_layer_36_head": 0.16698212921619415,
      "loss_layer_42_head": 0.0941908210515976,
      "loss_layer_6_head": 1.009463906288147,
      "step": 1175
    },
    {
      "epoch": 15.104,
      "grad_norm": 0.48677348717474805,
      "learning_rate": 0.004960051957873725,
      "loss": 3.1017,
      "loss_layer_12_head": 0.7114187479019165,
      "loss_layer_18_head": 0.5862293839454651,
      "loss_layer_24_head": 0.34804868698120117,
      "loss_layer_30_head": 0.2377994954586029,
      "loss_layer_36_head": 0.16968996822834015,
      "loss_layer_42_head": 0.09139184653759003,
      "loss_layer_6_head": 0.9709873199462891,
      "step": 1180
    },
    {
      "epoch": 15.168,
      "grad_norm": 0.638138039664249,
      "learning_rate": 0.0049590497673678595,
      "loss": 3.1734,
      "loss_layer_12_head": 0.6779589653015137,
      "loss_layer_18_head": 0.5614650249481201,
      "loss_layer_24_head": 0.35107797384262085,
      "loss_layer_30_head": 0.258102685213089,
      "loss_layer_36_head": 0.16775578260421753,
      "loss_layer_42_head": 0.08884912729263306,
      "loss_layer_6_head": 0.9450022578239441,
      "step": 1185
    },
    {
      "epoch": 15.232,
      "grad_norm": 0.6236679276582957,
      "learning_rate": 0.004958035264746892,
      "loss": 3.2177,
      "loss_layer_12_head": 0.7044528722763062,
      "loss_layer_18_head": 0.5770634412765503,
      "loss_layer_24_head": 0.37361830472946167,
      "loss_layer_30_head": 0.32047295570373535,
      "loss_layer_36_head": 0.16556866466999054,
      "loss_layer_42_head": 0.08639300614595413,
      "loss_layer_6_head": 1.001249074935913,
      "step": 1190
    },
    {
      "epoch": 15.296,
      "grad_norm": 0.66616709684256,
      "learning_rate": 0.004957008455090296,
      "loss": 3.2428,
      "loss_layer_12_head": 0.7080108523368835,
      "loss_layer_18_head": 0.6047711968421936,
      "loss_layer_24_head": 0.37771105766296387,
      "loss_layer_30_head": 0.3046354651451111,
      "loss_layer_36_head": 0.1633838713169098,
      "loss_layer_42_head": 0.08688460290431976,
      "loss_layer_6_head": 1.0304388999938965,
      "step": 1195
    },
    {
      "epoch": 15.36,
      "grad_norm": 0.6478234449927864,
      "learning_rate": 0.004955969343539162,
      "loss": 3.1679,
      "loss_layer_12_head": 0.6620894074440002,
      "loss_layer_18_head": 0.6078165769577026,
      "loss_layer_24_head": 0.35161152482032776,
      "loss_layer_30_head": 0.272592693567276,
      "loss_layer_36_head": 0.14646513760089874,
      "loss_layer_42_head": 0.08043842017650604,
      "loss_layer_6_head": 0.9763358235359192,
      "step": 1200
    },
    {
      "epoch": 15.36,
      "eval_loss": 5.653632164001465,
      "eval_loss_layer_12_head": 1.2553300857543945,
      "eval_loss_layer_18_head": 1.1685264110565186,
      "eval_loss_layer_24_head": 0.7246644496917725,
      "eval_loss_layer_30_head": 0.5291981101036072,
      "eval_loss_layer_36_head": 0.31247785687446594,
      "eval_loss_layer_42_head": 0.18734292685985565,
      "eval_loss_layer_6_head": 1.6223541498184204,
      "eval_runtime": 33.0746,
      "eval_samples_per_second": 9.675,
      "eval_steps_per_second": 0.605,
      "step": 1200
    },
    {
      "epoch": 15.424,
      "grad_norm": 0.6867058048629272,
      "learning_rate": 0.004954917935296175,
      "loss": 3.3087,
      "loss_layer_12_head": 0.7135751843452454,
      "loss_layer_18_head": 0.6804697513580322,
      "loss_layer_24_head": 0.3701918423175812,
      "loss_layer_30_head": 0.2702871859073639,
      "loss_layer_36_head": 0.15693072974681854,
      "loss_layer_42_head": 0.08752115070819855,
      "loss_layer_6_head": 1.0398051738739014,
      "step": 1205
    },
    {
      "epoch": 15.488,
      "grad_norm": 0.6090389318476087,
      "learning_rate": 0.0049538542356255865,
      "loss": 3.2807,
      "loss_layer_12_head": 0.7006480097770691,
      "loss_layer_18_head": 0.6693123579025269,
      "loss_layer_24_head": 0.3649710416793823,
      "loss_layer_30_head": 0.25089147686958313,
      "loss_layer_36_head": 0.154684379696846,
      "loss_layer_42_head": 0.08712951093912125,
      "loss_layer_6_head": 1.036698579788208,
      "step": 1210
    },
    {
      "epoch": 15.552,
      "grad_norm": 0.9658639913270239,
      "learning_rate": 0.004952778249853192,
      "loss": 3.3874,
      "loss_layer_12_head": 0.7338878512382507,
      "loss_layer_18_head": 0.6605514883995056,
      "loss_layer_24_head": 0.38610729575157166,
      "loss_layer_30_head": 0.2480933666229248,
      "loss_layer_36_head": 0.1597428023815155,
      "loss_layer_42_head": 0.08830897510051727,
      "loss_layer_6_head": 1.156058430671692,
      "step": 1215
    },
    {
      "epoch": 15.616,
      "grad_norm": 0.7794831771375103,
      "learning_rate": 0.004951689983366299,
      "loss": 3.3331,
      "loss_layer_12_head": 0.7002662420272827,
      "loss_layer_18_head": 0.6172714829444885,
      "loss_layer_24_head": 0.37848401069641113,
      "loss_layer_30_head": 0.23044779896736145,
      "loss_layer_36_head": 0.15394659340381622,
      "loss_layer_42_head": 0.0856502428650856,
      "loss_layer_6_head": 1.1114778518676758,
      "step": 1220
    },
    {
      "epoch": 15.68,
      "grad_norm": 0.6901947123503279,
      "learning_rate": 0.004950589441613707,
      "loss": 3.3754,
      "loss_layer_12_head": 0.7352136373519897,
      "loss_layer_18_head": 0.6146910786628723,
      "loss_layer_24_head": 0.392516553401947,
      "loss_layer_30_head": 0.24169793725013733,
      "loss_layer_36_head": 0.16307665407657623,
      "loss_layer_42_head": 0.08708196133375168,
      "loss_layer_6_head": 1.1177246570587158,
      "step": 1225
    },
    {
      "epoch": 15.744,
      "grad_norm": 0.8458469954515039,
      "learning_rate": 0.004949476630105669,
      "loss": 3.3321,
      "loss_layer_12_head": 0.729667067527771,
      "loss_layer_18_head": 0.5730844736099243,
      "loss_layer_24_head": 0.3574075698852539,
      "loss_layer_30_head": 0.22016894817352295,
      "loss_layer_36_head": 0.15608440339565277,
      "loss_layer_42_head": 0.09126514941453934,
      "loss_layer_6_head": 1.0882729291915894,
      "step": 1230
    },
    {
      "epoch": 15.808,
      "grad_norm": 0.6046771355258104,
      "learning_rate": 0.004948351554413879,
      "loss": 3.483,
      "loss_layer_12_head": 0.8392592668533325,
      "loss_layer_18_head": 0.6292867064476013,
      "loss_layer_24_head": 0.40893298387527466,
      "loss_layer_30_head": 0.23542578518390656,
      "loss_layer_36_head": 0.16896067559719086,
      "loss_layer_42_head": 0.10560699552297592,
      "loss_layer_6_head": 1.192775011062622,
      "step": 1235
    },
    {
      "epoch": 15.872,
      "grad_norm": 0.8530891513247477,
      "learning_rate": 0.00494721422017143,
      "loss": 3.4788,
      "loss_layer_12_head": 0.8105120658874512,
      "loss_layer_18_head": 0.5674842596054077,
      "loss_layer_24_head": 0.36804115772247314,
      "loss_layer_30_head": 0.21658475697040558,
      "loss_layer_36_head": 0.17330919206142426,
      "loss_layer_42_head": 0.12003310769796371,
      "loss_layer_6_head": 1.111560583114624,
      "step": 1240
    },
    {
      "epoch": 15.936,
      "grad_norm": 0.6218558612306648,
      "learning_rate": 0.004946064633072795,
      "loss": 3.6155,
      "loss_layer_12_head": 0.8941036462783813,
      "loss_layer_18_head": 0.6137522459030151,
      "loss_layer_24_head": 0.40153074264526367,
      "loss_layer_30_head": 0.2411966323852539,
      "loss_layer_36_head": 0.2290516346693039,
      "loss_layer_42_head": 0.15114183723926544,
      "loss_layer_6_head": 1.1253597736358643,
      "step": 1245
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.5884215368805477,
      "learning_rate": 0.004944902798873794,
      "loss": 3.5069,
      "loss_layer_12_head": 0.8023141622543335,
      "loss_layer_18_head": 0.5734208822250366,
      "loss_layer_24_head": 0.3805721402168274,
      "loss_layer_30_head": 0.21649625897407532,
      "loss_layer_36_head": 0.18713167309761047,
      "loss_layer_42_head": 0.15947124361991882,
      "loss_layer_6_head": 1.0516259670257568,
      "step": 1250
    },
    {
      "epoch": 16.064,
      "grad_norm": 0.5225892357327171,
      "learning_rate": 0.004943728723391566,
      "loss": 3.1741,
      "loss_layer_12_head": 0.7241431474685669,
      "loss_layer_18_head": 0.5220152139663696,
      "loss_layer_24_head": 0.3623393774032593,
      "loss_layer_30_head": 0.20867891609668732,
      "loss_layer_36_head": 0.16728238761425018,
      "loss_layer_42_head": 0.13852593302726746,
      "loss_layer_6_head": 0.9629136919975281,
      "step": 1255
    },
    {
      "epoch": 16.128,
      "grad_norm": 0.45337071122114286,
      "learning_rate": 0.004942542412504543,
      "loss": 3.1201,
      "loss_layer_12_head": 0.7606677412986755,
      "loss_layer_18_head": 0.5765603184700012,
      "loss_layer_24_head": 0.4028806686401367,
      "loss_layer_30_head": 0.2328529804944992,
      "loss_layer_36_head": 0.17835351824760437,
      "loss_layer_42_head": 0.13164518773555756,
      "loss_layer_6_head": 1.0228877067565918,
      "step": 1260
    },
    {
      "epoch": 16.192,
      "grad_norm": 0.4247709899601678,
      "learning_rate": 0.0049413438721524145,
      "loss": 3.1085,
      "loss_layer_12_head": 0.7224582433700562,
      "loss_layer_18_head": 0.5543040633201599,
      "loss_layer_24_head": 0.40800055861473083,
      "loss_layer_30_head": 0.22409161925315857,
      "loss_layer_36_head": 0.16312387585639954,
      "loss_layer_42_head": 0.1117904782295227,
      "loss_layer_6_head": 0.9960142970085144,
      "step": 1265
    },
    {
      "epoch": 16.256,
      "grad_norm": 0.3659032460367919,
      "learning_rate": 0.004940133108336104,
      "loss": 3.112,
      "loss_layer_12_head": 0.7004276514053345,
      "loss_layer_18_head": 0.5516552329063416,
      "loss_layer_24_head": 0.4227422773838043,
      "loss_layer_30_head": 0.23043136298656464,
      "loss_layer_36_head": 0.15917718410491943,
      "loss_layer_42_head": 0.10694196075201035,
      "loss_layer_6_head": 0.981259822845459,
      "step": 1270
    },
    {
      "epoch": 16.32,
      "grad_norm": 0.42356293418599666,
      "learning_rate": 0.004938910127117735,
      "loss": 3.1477,
      "loss_layer_12_head": 0.7414632439613342,
      "loss_layer_18_head": 0.5998217463493347,
      "loss_layer_24_head": 0.4339497983455658,
      "loss_layer_30_head": 0.24939019978046417,
      "loss_layer_36_head": 0.16011786460876465,
      "loss_layer_42_head": 0.10240612179040909,
      "loss_layer_6_head": 1.0301176309585571,
      "step": 1275
    },
    {
      "epoch": 16.384,
      "grad_norm": 0.5289552253872069,
      "learning_rate": 0.004937674934620601,
      "loss": 3.0975,
      "loss_layer_12_head": 0.7156280279159546,
      "loss_layer_18_head": 0.5668956637382507,
      "loss_layer_24_head": 0.4238418638706207,
      "loss_layer_30_head": 0.23876997828483582,
      "loss_layer_36_head": 0.1519106775522232,
      "loss_layer_42_head": 0.09453745186328888,
      "loss_layer_6_head": 1.023458480834961,
      "step": 1280
    },
    {
      "epoch": 16.448,
      "grad_norm": 0.44103017606094197,
      "learning_rate": 0.004936427537029137,
      "loss": 3.1774,
      "loss_layer_12_head": 0.6973890066146851,
      "loss_layer_18_head": 0.5626018643379211,
      "loss_layer_24_head": 0.4451088011264801,
      "loss_layer_30_head": 0.2413700819015503,
      "loss_layer_36_head": 0.147749125957489,
      "loss_layer_42_head": 0.0862850621342659,
      "loss_layer_6_head": 0.9939961433410645,
      "step": 1285
    },
    {
      "epoch": 16.512,
      "grad_norm": 0.4347148877270318,
      "learning_rate": 0.004935167940588887,
      "loss": 3.1551,
      "loss_layer_12_head": 0.6765448451042175,
      "loss_layer_18_head": 0.5564993023872375,
      "loss_layer_24_head": 0.41178983449935913,
      "loss_layer_30_head": 0.24470603466033936,
      "loss_layer_36_head": 0.14604239165782928,
      "loss_layer_42_head": 0.08493504673242569,
      "loss_layer_6_head": 0.9738994836807251,
      "step": 1290
    },
    {
      "epoch": 16.576,
      "grad_norm": 0.5244049055470625,
      "learning_rate": 0.004933896151606471,
      "loss": 3.2202,
      "loss_layer_12_head": 0.699276864528656,
      "loss_layer_18_head": 0.5723982453346252,
      "loss_layer_24_head": 0.40463772416114807,
      "loss_layer_30_head": 0.2263565957546234,
      "loss_layer_36_head": 0.14647449553012848,
      "loss_layer_42_head": 0.08172450214624405,
      "loss_layer_6_head": 0.9878841638565063,
      "step": 1295
    },
    {
      "epoch": 16.64,
      "grad_norm": 0.4881412481769761,
      "learning_rate": 0.004932612176449559,
      "loss": 3.1399,
      "loss_layer_12_head": 0.6982871890068054,
      "loss_layer_18_head": 0.567693829536438,
      "loss_layer_24_head": 0.40264391899108887,
      "loss_layer_30_head": 0.22491240501403809,
      "loss_layer_36_head": 0.1454627960920334,
      "loss_layer_42_head": 0.0813308134675026,
      "loss_layer_6_head": 0.9824591875076294,
      "step": 1300
    },
    {
      "epoch": 16.704,
      "grad_norm": 0.5353039773218903,
      "learning_rate": 0.004931316021546834,
      "loss": 3.1687,
      "loss_layer_12_head": 0.7318192720413208,
      "loss_layer_18_head": 0.5941206216812134,
      "loss_layer_24_head": 0.4133567810058594,
      "loss_layer_30_head": 0.2314300537109375,
      "loss_layer_36_head": 0.1498340666294098,
      "loss_layer_42_head": 0.08335960656404495,
      "loss_layer_6_head": 1.0306583642959595,
      "step": 1305
    },
    {
      "epoch": 16.768,
      "grad_norm": 0.5320263781100686,
      "learning_rate": 0.004930007693387957,
      "loss": 3.2323,
      "loss_layer_12_head": 0.6972874402999878,
      "loss_layer_18_head": 0.5647767782211304,
      "loss_layer_24_head": 0.3870491683483124,
      "loss_layer_30_head": 0.2178010642528534,
      "loss_layer_36_head": 0.15132862329483032,
      "loss_layer_42_head": 0.07852266728878021,
      "loss_layer_6_head": 1.0007141828536987,
      "step": 1310
    },
    {
      "epoch": 16.832,
      "grad_norm": 0.7008622484075332,
      "learning_rate": 0.004928687198523546,
      "loss": 3.2172,
      "loss_layer_12_head": 0.7122496366500854,
      "loss_layer_18_head": 0.5724424719810486,
      "loss_layer_24_head": 0.38043099641799927,
      "loss_layer_30_head": 0.22405891120433807,
      "loss_layer_36_head": 0.15329545736312866,
      "loss_layer_42_head": 0.09241309762001038,
      "loss_layer_6_head": 1.0291364192962646,
      "step": 1315
    },
    {
      "epoch": 16.896,
      "grad_norm": 0.4144580154082241,
      "learning_rate": 0.0049273545435651305,
      "loss": 3.1641,
      "loss_layer_12_head": 0.7149626016616821,
      "loss_layer_18_head": 0.5869017839431763,
      "loss_layer_24_head": 0.37773579359054565,
      "loss_layer_30_head": 0.22526881098747253,
      "loss_layer_36_head": 0.15522436797618866,
      "loss_layer_42_head": 0.08087652176618576,
      "loss_layer_6_head": 1.0109407901763916,
      "step": 1320
    },
    {
      "epoch": 16.96,
      "grad_norm": 0.4787289608145743,
      "learning_rate": 0.004926009735185127,
      "loss": 3.1964,
      "loss_layer_12_head": 0.7091538906097412,
      "loss_layer_18_head": 0.571030855178833,
      "loss_layer_24_head": 0.3655551075935364,
      "loss_layer_30_head": 0.21990999579429626,
      "loss_layer_36_head": 0.14906707406044006,
      "loss_layer_42_head": 0.08713117986917496,
      "loss_layer_6_head": 1.0153952836990356,
      "step": 1325
    },
    {
      "epoch": 17.024,
      "grad_norm": 0.5519771590135708,
      "learning_rate": 0.004924652780116799,
      "loss": 3.1006,
      "loss_layer_12_head": 0.7275198698043823,
      "loss_layer_18_head": 0.5749965906143188,
      "loss_layer_24_head": 0.36105626821517944,
      "loss_layer_30_head": 0.22089333832263947,
      "loss_layer_36_head": 0.15601114928722382,
      "loss_layer_42_head": 0.08505455404520035,
      "loss_layer_6_head": 1.017714023590088,
      "step": 1330
    },
    {
      "epoch": 17.088,
      "grad_norm": 0.6759962159833123,
      "learning_rate": 0.004923283685154232,
      "loss": 3.0572,
      "loss_layer_12_head": 0.7011330723762512,
      "loss_layer_18_head": 0.5382241010665894,
      "loss_layer_24_head": 0.3375994563102722,
      "loss_layer_30_head": 0.2189328670501709,
      "loss_layer_36_head": 0.15959127247333527,
      "loss_layer_42_head": 0.09193147718906403,
      "loss_layer_6_head": 0.9696317911148071,
      "step": 1335
    },
    {
      "epoch": 17.152,
      "grad_norm": 0.7492066406224508,
      "learning_rate": 0.004921902457152289,
      "loss": 3.236,
      "loss_layer_12_head": 0.7362578511238098,
      "loss_layer_18_head": 0.5638151168823242,
      "loss_layer_24_head": 0.3446405827999115,
      "loss_layer_30_head": 0.23062515258789062,
      "loss_layer_36_head": 0.1608099639415741,
      "loss_layer_42_head": 0.08730962872505188,
      "loss_layer_6_head": 1.2179874181747437,
      "step": 1340
    },
    {
      "epoch": 17.216,
      "grad_norm": 0.831622911888283,
      "learning_rate": 0.004920509103026584,
      "loss": 3.4759,
      "loss_layer_12_head": 0.7534016370773315,
      "loss_layer_18_head": 0.6363294124603271,
      "loss_layer_24_head": 0.3466118276119232,
      "loss_layer_30_head": 0.28126975893974304,
      "loss_layer_36_head": 0.1647748053073883,
      "loss_layer_42_head": 0.09097732603549957,
      "loss_layer_6_head": 1.3024110794067383,
      "step": 1345
    },
    {
      "epoch": 17.28,
      "grad_norm": 0.7884825070908287,
      "learning_rate": 0.004919103629753445,
      "loss": 3.6842,
      "loss_layer_12_head": 0.8099075555801392,
      "loss_layer_18_head": 0.8278264999389648,
      "loss_layer_24_head": 0.34443217515945435,
      "loss_layer_30_head": 0.30316975712776184,
      "loss_layer_36_head": 0.16976439952850342,
      "loss_layer_42_head": 0.0895252376794815,
      "loss_layer_6_head": 1.1807701587677002,
      "step": 1350
    },
    {
      "epoch": 17.344,
      "grad_norm": 0.4796602541321931,
      "learning_rate": 0.004917686044369879,
      "loss": 3.7467,
      "loss_layer_12_head": 0.8182764053344727,
      "loss_layer_18_head": 0.8977873921394348,
      "loss_layer_24_head": 0.3436835706233978,
      "loss_layer_30_head": 0.28306490182876587,
      "loss_layer_36_head": 0.16302275657653809,
      "loss_layer_42_head": 0.08982818573713303,
      "loss_layer_6_head": 1.1481602191925049,
      "step": 1355
    },
    {
      "epoch": 17.408,
      "grad_norm": 0.5197048313422988,
      "learning_rate": 0.004916256353973535,
      "loss": 3.6486,
      "loss_layer_12_head": 0.8608118891716003,
      "loss_layer_18_head": 0.8750579953193665,
      "loss_layer_24_head": 0.37789979577064514,
      "loss_layer_30_head": 0.2897063195705414,
      "loss_layer_36_head": 0.15753884613513947,
      "loss_layer_42_head": 0.09018051624298096,
      "loss_layer_6_head": 1.1552715301513672,
      "step": 1360
    },
    {
      "epoch": 17.472,
      "grad_norm": 0.4278508880437451,
      "learning_rate": 0.004914814565722671,
      "loss": 3.5154,
      "loss_layer_12_head": 0.7820771336555481,
      "loss_layer_18_head": 0.7514595985412598,
      "loss_layer_24_head": 0.36200058460235596,
      "loss_layer_30_head": 0.2529972195625305,
      "loss_layer_36_head": 0.14967811107635498,
      "loss_layer_42_head": 0.08785318583250046,
      "loss_layer_6_head": 1.0975428819656372,
      "step": 1365
    },
    {
      "epoch": 17.536,
      "grad_norm": 0.4961000093382288,
      "learning_rate": 0.004913360686836117,
      "loss": 3.4199,
      "loss_layer_12_head": 0.7563498616218567,
      "loss_layer_18_head": 0.7089719176292419,
      "loss_layer_24_head": 0.4126870036125183,
      "loss_layer_30_head": 0.245447039604187,
      "loss_layer_36_head": 0.15485070645809174,
      "loss_layer_42_head": 0.0907430425286293,
      "loss_layer_6_head": 1.060893177986145,
      "step": 1370
    },
    {
      "epoch": 17.6,
      "grad_norm": 0.4149503638553699,
      "learning_rate": 0.0049118947245932396,
      "loss": 3.4495,
      "loss_layer_12_head": 0.7734986543655396,
      "loss_layer_18_head": 0.6840087175369263,
      "loss_layer_24_head": 0.48642176389694214,
      "loss_layer_30_head": 0.2610875964164734,
      "loss_layer_36_head": 0.16970601677894592,
      "loss_layer_42_head": 0.0951908528804779,
      "loss_layer_6_head": 1.0771145820617676,
      "step": 1375
    },
    {
      "epoch": 17.664,
      "grad_norm": 0.3501951043297786,
      "learning_rate": 0.004910416686333906,
      "loss": 3.3416,
      "loss_layer_12_head": 0.7466601133346558,
      "loss_layer_18_head": 0.6449626088142395,
      "loss_layer_24_head": 0.4465230107307434,
      "loss_layer_30_head": 0.23542122542858124,
      "loss_layer_36_head": 0.1598082184791565,
      "loss_layer_42_head": 0.09479852020740509,
      "loss_layer_6_head": 1.0352535247802734,
      "step": 1380
    },
    {
      "epoch": 17.728,
      "grad_norm": 0.30086630107210066,
      "learning_rate": 0.004908926579458444,
      "loss": 3.2883,
      "loss_layer_12_head": 0.7241634130477905,
      "loss_layer_18_head": 0.5994696021080017,
      "loss_layer_24_head": 0.4371631145477295,
      "loss_layer_30_head": 0.2204952985048294,
      "loss_layer_36_head": 0.1530158668756485,
      "loss_layer_42_head": 0.09420449286699295,
      "loss_layer_6_head": 1.008998155593872,
      "step": 1385
    },
    {
      "epoch": 17.792,
      "grad_norm": 0.2707735684593116,
      "learning_rate": 0.004907424411427608,
      "loss": 3.2703,
      "loss_layer_12_head": 0.7208462357521057,
      "loss_layer_18_head": 0.5975139141082764,
      "loss_layer_24_head": 0.4319986402988434,
      "loss_layer_30_head": 0.2214580774307251,
      "loss_layer_36_head": 0.15389080345630646,
      "loss_layer_42_head": 0.09249377250671387,
      "loss_layer_6_head": 1.00188410282135,
      "step": 1390
    },
    {
      "epoch": 17.856,
      "grad_norm": 0.3206142107005092,
      "learning_rate": 0.004905910189762542,
      "loss": 3.2018,
      "loss_layer_12_head": 0.7149311900138855,
      "loss_layer_18_head": 0.5904548168182373,
      "loss_layer_24_head": 0.41963744163513184,
      "loss_layer_30_head": 0.22388502955436707,
      "loss_layer_36_head": 0.1542077213525772,
      "loss_layer_42_head": 0.09287864714860916,
      "loss_layer_6_head": 0.9961978793144226,
      "step": 1395
    },
    {
      "epoch": 17.92,
      "grad_norm": 0.42783985386525714,
      "learning_rate": 0.00490438392204474,
      "loss": 3.2193,
      "loss_layer_12_head": 0.7198162078857422,
      "loss_layer_18_head": 0.582438588142395,
      "loss_layer_24_head": 0.3966544568538666,
      "loss_layer_30_head": 0.23883943259716034,
      "loss_layer_36_head": 0.15754231810569763,
      "loss_layer_42_head": 0.09034500271081924,
      "loss_layer_6_head": 1.001526951789856,
      "step": 1400
    },
    {
      "epoch": 17.92,
      "eval_loss": 5.5505900382995605,
      "eval_loss_layer_12_head": 1.2720930576324463,
      "eval_loss_layer_18_head": 1.0924803018569946,
      "eval_loss_layer_24_head": 0.7382358312606812,
      "eval_loss_layer_30_head": 0.4849087595939636,
      "eval_loss_layer_36_head": 0.3224385678768158,
      "eval_loss_layer_42_head": 0.19694986939430237,
      "eval_loss_layer_6_head": 1.5900121927261353,
      "eval_runtime": 33.1274,
      "eval_samples_per_second": 9.66,
      "eval_steps_per_second": 0.604,
      "step": 1400
    },
    {
      "epoch": 17.984,
      "grad_norm": 0.35950707411963784,
      "learning_rate": 0.004902845615916009,
      "loss": 3.2034,
      "loss_layer_12_head": 0.7266178131103516,
      "loss_layer_18_head": 0.5792479515075684,
      "loss_layer_24_head": 0.3833567500114441,
      "loss_layer_30_head": 0.21955418586730957,
      "loss_layer_36_head": 0.15840497612953186,
      "loss_layer_42_head": 0.09206746518611908,
      "loss_layer_6_head": 1.0011776685714722,
      "step": 1405
    },
    {
      "epoch": 18.048,
      "grad_norm": 0.2995367087506173,
      "learning_rate": 0.004901295279078431,
      "loss": 2.963,
      "loss_layer_12_head": 0.7038646340370178,
      "loss_layer_18_head": 0.5575783848762512,
      "loss_layer_24_head": 0.3671122193336487,
      "loss_layer_30_head": 0.2151305377483368,
      "loss_layer_36_head": 0.1534932255744934,
      "loss_layer_42_head": 0.08721452206373215,
      "loss_layer_6_head": 0.9791160821914673,
      "step": 1410
    },
    {
      "epoch": 18.112,
      "grad_norm": 0.30613184563275353,
      "learning_rate": 0.004899732919294323,
      "loss": 2.9008,
      "loss_layer_12_head": 0.6466768980026245,
      "loss_layer_18_head": 0.5196658372879028,
      "loss_layer_24_head": 0.33654212951660156,
      "loss_layer_30_head": 0.19856718182563782,
      "loss_layer_36_head": 0.1476348340511322,
      "loss_layer_42_head": 0.08416076749563217,
      "loss_layer_6_head": 0.9082804918289185,
      "step": 1415
    },
    {
      "epoch": 18.176,
      "grad_norm": 0.22707138032705035,
      "learning_rate": 0.004898158544386201,
      "loss": 2.9398,
      "loss_layer_12_head": 0.6891641020774841,
      "loss_layer_18_head": 0.5542203187942505,
      "loss_layer_24_head": 0.35374900698661804,
      "loss_layer_30_head": 0.2147204875946045,
      "loss_layer_36_head": 0.1551155298948288,
      "loss_layer_42_head": 0.08717845380306244,
      "loss_layer_6_head": 0.9759851694107056,
      "step": 1420
    },
    {
      "epoch": 18.24,
      "grad_norm": 0.371875239803756,
      "learning_rate": 0.004896572162236736,
      "loss": 2.9919,
      "loss_layer_12_head": 0.6962164640426636,
      "loss_layer_18_head": 0.562719464302063,
      "loss_layer_24_head": 0.35642528533935547,
      "loss_layer_30_head": 0.21888470649719238,
      "loss_layer_36_head": 0.15511459112167358,
      "loss_layer_42_head": 0.0979718342423439,
      "loss_layer_6_head": 0.9729306101799011,
      "step": 1425
    },
    {
      "epoch": 18.304,
      "grad_norm": 0.2965210945351107,
      "learning_rate": 0.004894973780788722,
      "loss": 2.9599,
      "loss_layer_12_head": 0.6927074193954468,
      "loss_layer_18_head": 0.5536083579063416,
      "loss_layer_24_head": 0.3483261168003082,
      "loss_layer_30_head": 0.22005967795848846,
      "loss_layer_36_head": 0.14609557390213013,
      "loss_layer_42_head": 0.08893860876560211,
      "loss_layer_6_head": 0.9660736322402954,
      "step": 1430
    },
    {
      "epoch": 18.368,
      "grad_norm": 0.36466014468246505,
      "learning_rate": 0.004893363408045029,
      "loss": 2.9764,
      "loss_layer_12_head": 0.7174627184867859,
      "loss_layer_18_head": 0.5583192706108093,
      "loss_layer_24_head": 0.3550580143928528,
      "loss_layer_30_head": 0.2209354192018509,
      "loss_layer_36_head": 0.14910905063152313,
      "loss_layer_42_head": 0.08668553829193115,
      "loss_layer_6_head": 0.9841427803039551,
      "step": 1435
    },
    {
      "epoch": 18.432,
      "grad_norm": 0.5284625993010669,
      "learning_rate": 0.004891741052068564,
      "loss": 2.9953,
      "loss_layer_12_head": 0.7047576904296875,
      "loss_layer_18_head": 0.5404755473136902,
      "loss_layer_24_head": 0.344508558511734,
      "loss_layer_30_head": 0.21836546063423157,
      "loss_layer_36_head": 0.14187093079090118,
      "loss_layer_42_head": 0.0838368684053421,
      "loss_layer_6_head": 0.9445589780807495,
      "step": 1440
    },
    {
      "epoch": 18.496,
      "grad_norm": 0.5715507374326825,
      "learning_rate": 0.004890106720982235,
      "loss": 3.1298,
      "loss_layer_12_head": 0.8469120860099792,
      "loss_layer_18_head": 0.5620571374893188,
      "loss_layer_24_head": 0.3470451533794403,
      "loss_layer_30_head": 0.22352853417396545,
      "loss_layer_36_head": 0.1448194682598114,
      "loss_layer_42_head": 0.08771485835313797,
      "loss_layer_6_head": 0.9724265336990356,
      "step": 1445
    },
    {
      "epoch": 18.56,
      "grad_norm": 0.47536316821713864,
      "learning_rate": 0.004888460422968908,
      "loss": 3.1709,
      "loss_layer_12_head": 0.8255361318588257,
      "loss_layer_18_head": 0.5529444813728333,
      "loss_layer_24_head": 0.34804078936576843,
      "loss_layer_30_head": 0.22453705966472626,
      "loss_layer_36_head": 0.142146497964859,
      "loss_layer_42_head": 0.0897841602563858,
      "loss_layer_6_head": 0.9733034372329712,
      "step": 1450
    },
    {
      "epoch": 18.624,
      "grad_norm": 0.31618789432141486,
      "learning_rate": 0.004886802166271364,
      "loss": 3.1788,
      "loss_layer_12_head": 0.8118365406990051,
      "loss_layer_18_head": 0.579054057598114,
      "loss_layer_24_head": 0.35920998454093933,
      "loss_layer_30_head": 0.22656838595867157,
      "loss_layer_36_head": 0.14317438006401062,
      "loss_layer_42_head": 0.08472011238336563,
      "loss_layer_6_head": 1.0096265077590942,
      "step": 1455
    },
    {
      "epoch": 18.688,
      "grad_norm": 0.43266873664773453,
      "learning_rate": 0.004885131959192261,
      "loss": 3.1383,
      "loss_layer_12_head": 0.7924840450286865,
      "loss_layer_18_head": 0.5741826295852661,
      "loss_layer_24_head": 0.3526131510734558,
      "loss_layer_30_head": 0.21853236854076385,
      "loss_layer_36_head": 0.14850430190563202,
      "loss_layer_42_head": 0.08624231815338135,
      "loss_layer_6_head": 1.0062892436981201,
      "step": 1460
    },
    {
      "epoch": 18.752,
      "grad_norm": 0.31051893836907096,
      "learning_rate": 0.0048834498100940905,
      "loss": 3.1173,
      "loss_layer_12_head": 0.7378137111663818,
      "loss_layer_18_head": 0.5532205104827881,
      "loss_layer_24_head": 0.3348642587661743,
      "loss_layer_30_head": 0.212578684091568,
      "loss_layer_36_head": 0.1419658362865448,
      "loss_layer_42_head": 0.08181919157505035,
      "loss_layer_6_head": 0.9646576642990112,
      "step": 1465
    },
    {
      "epoch": 18.816,
      "grad_norm": 0.34108977454035466,
      "learning_rate": 0.004881755727399134,
      "loss": 3.1228,
      "loss_layer_12_head": 0.7150170207023621,
      "loss_layer_18_head": 0.5414386987686157,
      "loss_layer_24_head": 0.3492732644081116,
      "loss_layer_30_head": 0.212129145860672,
      "loss_layer_36_head": 0.15533356368541718,
      "loss_layer_42_head": 0.09902982413768768,
      "loss_layer_6_head": 0.9647548794746399,
      "step": 1470
    },
    {
      "epoch": 18.88,
      "grad_norm": 0.533394501232972,
      "learning_rate": 0.004880049719589425,
      "loss": 3.1948,
      "loss_layer_12_head": 0.7482460737228394,
      "loss_layer_18_head": 0.5848065614700317,
      "loss_layer_24_head": 0.3878377377986908,
      "loss_layer_30_head": 0.23633889853954315,
      "loss_layer_36_head": 0.17814373970031738,
      "loss_layer_42_head": 0.10173021256923676,
      "loss_layer_6_head": 1.0336793661117554,
      "step": 1475
    },
    {
      "epoch": 18.944,
      "grad_norm": 0.4564722942736434,
      "learning_rate": 0.004878331795206705,
      "loss": 3.1566,
      "loss_layer_12_head": 0.7343682050704956,
      "loss_layer_18_head": 0.5963757634162903,
      "loss_layer_24_head": 0.38881877064704895,
      "loss_layer_30_head": 0.23519356548786163,
      "loss_layer_36_head": 0.16722872853279114,
      "loss_layer_42_head": 0.10227765887975693,
      "loss_layer_6_head": 1.028169870376587,
      "step": 1480
    },
    {
      "epoch": 19.008,
      "grad_norm": 0.4613371778809834,
      "learning_rate": 0.004876601962852377,
      "loss": 3.1503,
      "loss_layer_12_head": 0.7205085158348083,
      "loss_layer_18_head": 0.6032729148864746,
      "loss_layer_24_head": 0.3639681935310364,
      "loss_layer_30_head": 0.2198045253753662,
      "loss_layer_36_head": 0.1575845181941986,
      "loss_layer_42_head": 0.10175911337137222,
      "loss_layer_6_head": 1.0105431079864502,
      "step": 1485
    },
    {
      "epoch": 19.072,
      "grad_norm": 0.3631369627115455,
      "learning_rate": 0.004874860231187469,
      "loss": 2.9361,
      "loss_layer_12_head": 0.6539273262023926,
      "loss_layer_18_head": 0.5349099636077881,
      "loss_layer_24_head": 0.3321605920791626,
      "loss_layer_30_head": 0.21559560298919678,
      "loss_layer_36_head": 0.14331617951393127,
      "loss_layer_42_head": 0.11375503242015839,
      "loss_layer_6_head": 0.9071669578552246,
      "step": 1490
    },
    {
      "epoch": 19.136,
      "grad_norm": 0.4094587211478041,
      "learning_rate": 0.004873106608932585,
      "loss": 2.9995,
      "loss_layer_12_head": 0.6697855591773987,
      "loss_layer_18_head": 0.5577580332756042,
      "loss_layer_24_head": 0.34684833884239197,
      "loss_layer_30_head": 0.22804339230060577,
      "loss_layer_36_head": 0.15083782374858856,
      "loss_layer_42_head": 0.114118792116642,
      "loss_layer_6_head": 0.926591694355011,
      "step": 1495
    },
    {
      "epoch": 19.2,
      "grad_norm": 0.5197962545279547,
      "learning_rate": 0.004871341104867864,
      "loss": 2.979,
      "loss_layer_12_head": 0.6665138006210327,
      "loss_layer_18_head": 0.5379384160041809,
      "loss_layer_24_head": 0.33327096700668335,
      "loss_layer_30_head": 0.21521063148975372,
      "loss_layer_36_head": 0.14225339889526367,
      "loss_layer_42_head": 0.10413273423910141,
      "loss_layer_6_head": 0.936945915222168,
      "step": 1500
    },
    {
      "epoch": 19.264,
      "grad_norm": 0.42814712707455815,
      "learning_rate": 0.004869563727832936,
      "loss": 3.0093,
      "loss_layer_12_head": 0.6911715269088745,
      "loss_layer_18_head": 0.5504287481307983,
      "loss_layer_24_head": 0.34287288784980774,
      "loss_layer_30_head": 0.22563442587852478,
      "loss_layer_36_head": 0.1424397975206375,
      "loss_layer_42_head": 0.10573281347751617,
      "loss_layer_6_head": 0.9500272870063782,
      "step": 1505
    },
    {
      "epoch": 19.328,
      "grad_norm": 0.49533018557438857,
      "learning_rate": 0.004867774486726877,
      "loss": 2.9883,
      "loss_layer_12_head": 0.7024285197257996,
      "loss_layer_18_head": 0.5612101554870605,
      "loss_layer_24_head": 0.34151691198349,
      "loss_layer_30_head": 0.227279931306839,
      "loss_layer_36_head": 0.13940644264221191,
      "loss_layer_42_head": 0.09808440506458282,
      "loss_layer_6_head": 0.9625148773193359,
      "step": 1510
    },
    {
      "epoch": 19.392,
      "grad_norm": 0.6816390077872695,
      "learning_rate": 0.004865973390508163,
      "loss": 3.1667,
      "loss_layer_12_head": 0.7772215604782104,
      "loss_layer_18_head": 0.5524588227272034,
      "loss_layer_24_head": 0.3325223922729492,
      "loss_layer_30_head": 0.2255311906337738,
      "loss_layer_36_head": 0.1461932361125946,
      "loss_layer_42_head": 0.10088052600622177,
      "loss_layer_6_head": 0.9478126764297485,
      "step": 1515
    },
    {
      "epoch": 19.456,
      "grad_norm": 0.7925747282267671,
      "learning_rate": 0.004864160448194631,
      "loss": 3.2879,
      "loss_layer_12_head": 0.8161728978157043,
      "loss_layer_18_head": 0.5632739663124084,
      "loss_layer_24_head": 0.3616149425506592,
      "loss_layer_30_head": 0.24557864665985107,
      "loss_layer_36_head": 0.1972227692604065,
      "loss_layer_42_head": 0.09971685707569122,
      "loss_layer_6_head": 1.0158379077911377,
      "step": 1520
    },
    {
      "epoch": 19.52,
      "grad_norm": 0.5415075699333366,
      "learning_rate": 0.004862335668863427,
      "loss": 3.3306,
      "loss_layer_12_head": 0.8345929384231567,
      "loss_layer_18_head": 0.5822216868400574,
      "loss_layer_24_head": 0.35487115383148193,
      "loss_layer_30_head": 0.23379254341125488,
      "loss_layer_36_head": 0.15698441863059998,
      "loss_layer_42_head": 0.10201475769281387,
      "loss_layer_6_head": 1.0900990962982178,
      "step": 1525
    },
    {
      "epoch": 19.584,
      "grad_norm": 0.5939143161731206,
      "learning_rate": 0.004860499061650961,
      "loss": 3.2933,
      "loss_layer_12_head": 0.8178718686103821,
      "loss_layer_18_head": 0.6001588106155396,
      "loss_layer_24_head": 0.3719843327999115,
      "loss_layer_30_head": 0.23490682244300842,
      "loss_layer_36_head": 0.15630753338336945,
      "loss_layer_42_head": 0.10899871587753296,
      "loss_layer_6_head": 1.0860679149627686,
      "step": 1530
    },
    {
      "epoch": 19.648,
      "grad_norm": 0.5778848289848845,
      "learning_rate": 0.00485865063575287,
      "loss": 3.2372,
      "loss_layer_12_head": 0.7766883969306946,
      "loss_layer_18_head": 0.5595159530639648,
      "loss_layer_24_head": 0.36884570121765137,
      "loss_layer_30_head": 0.22227194905281067,
      "loss_layer_36_head": 0.14947186410427094,
      "loss_layer_42_head": 0.09630294144153595,
      "loss_layer_6_head": 1.0391987562179565,
      "step": 1535
    },
    {
      "epoch": 19.712,
      "grad_norm": 0.4686243563151677,
      "learning_rate": 0.004856790400423958,
      "loss": 3.2512,
      "loss_layer_12_head": 0.7561641931533813,
      "loss_layer_18_head": 0.5596798658370972,
      "loss_layer_24_head": 0.3865264058113098,
      "loss_layer_30_head": 0.23176749050617218,
      "loss_layer_36_head": 0.1816871166229248,
      "loss_layer_42_head": 0.1074337363243103,
      "loss_layer_6_head": 1.0577130317687988,
      "step": 1540
    },
    {
      "epoch": 19.776,
      "grad_norm": 0.76115242136125,
      "learning_rate": 0.004854918364978163,
      "loss": 3.2278,
      "loss_layer_12_head": 0.7542145252227783,
      "loss_layer_18_head": 0.5497565269470215,
      "loss_layer_24_head": 0.3975273668766022,
      "loss_layer_30_head": 0.2130308449268341,
      "loss_layer_36_head": 0.14324040710926056,
      "loss_layer_42_head": 0.08736969530582428,
      "loss_layer_6_head": 1.0337458848953247,
      "step": 1545
    },
    {
      "epoch": 19.84,
      "grad_norm": 0.6283045000748732,
      "learning_rate": 0.0048530345387885,
      "loss": 3.2438,
      "loss_layer_12_head": 0.8078277707099915,
      "loss_layer_18_head": 0.5854156613349915,
      "loss_layer_24_head": 0.4033629894256592,
      "loss_layer_30_head": 0.22831375896930695,
      "loss_layer_36_head": 0.14858314394950867,
      "loss_layer_42_head": 0.09449674934148788,
      "loss_layer_6_head": 1.0714917182922363,
      "step": 1550
    },
    {
      "epoch": 19.904,
      "grad_norm": 0.7351693245330702,
      "learning_rate": 0.004851138931287024,
      "loss": 3.3901,
      "loss_layer_12_head": 0.7661847472190857,
      "loss_layer_18_head": 0.6042229533195496,
      "loss_layer_24_head": 0.43115559220314026,
      "loss_layer_30_head": 0.24249839782714844,
      "loss_layer_36_head": 0.14992274343967438,
      "loss_layer_42_head": 0.09131094813346863,
      "loss_layer_6_head": 1.0724594593048096,
      "step": 1555
    },
    {
      "epoch": 19.968,
      "grad_norm": 0.7941202531638374,
      "learning_rate": 0.0048492315519647715,
      "loss": 3.3655,
      "loss_layer_12_head": 0.7465814352035522,
      "loss_layer_18_head": 0.5866889953613281,
      "loss_layer_24_head": 0.4172203540802002,
      "loss_layer_30_head": 0.26989370584487915,
      "loss_layer_36_head": 0.15543322265148163,
      "loss_layer_42_head": 0.10856852680444717,
      "loss_layer_6_head": 1.0785472393035889,
      "step": 1560
    },
    {
      "epoch": 20.032,
      "grad_norm": 0.33464629065893486,
      "learning_rate": 0.004847312410371722,
      "loss": 3.2122,
      "loss_layer_12_head": 0.7360634803771973,
      "loss_layer_18_head": 0.5760800838470459,
      "loss_layer_24_head": 0.4085420072078705,
      "loss_layer_30_head": 0.2828262746334076,
      "loss_layer_36_head": 0.15188473463058472,
      "loss_layer_42_head": 0.10583782196044922,
      "loss_layer_6_head": 1.0441932678222656,
      "step": 1565
    },
    {
      "epoch": 20.096,
      "grad_norm": 0.5730615921795303,
      "learning_rate": 0.0048453815161167485,
      "loss": 3.1013,
      "loss_layer_12_head": 0.6980878114700317,
      "loss_layer_18_head": 0.534099280834198,
      "loss_layer_24_head": 0.38472673296928406,
      "loss_layer_30_head": 0.2557997703552246,
      "loss_layer_36_head": 0.14928656816482544,
      "loss_layer_42_head": 0.0986810177564621,
      "loss_layer_6_head": 0.9572297930717468,
      "step": 1570
    },
    {
      "epoch": 20.16,
      "grad_norm": 0.626696603059949,
      "learning_rate": 0.004843438878867563,
      "loss": 3.0858,
      "loss_layer_12_head": 0.7495425343513489,
      "loss_layer_18_head": 0.5503208041191101,
      "loss_layer_24_head": 0.40107831358909607,
      "loss_layer_30_head": 0.2483876645565033,
      "loss_layer_36_head": 0.14450868964195251,
      "loss_layer_42_head": 0.08518800139427185,
      "loss_layer_6_head": 0.9990106821060181,
      "step": 1575
    },
    {
      "epoch": 20.224,
      "grad_norm": 0.4426088809368434,
      "learning_rate": 0.0048414845083506785,
      "loss": 3.0301,
      "loss_layer_12_head": 0.681336522102356,
      "loss_layer_18_head": 0.5108008980751038,
      "loss_layer_24_head": 0.3683341145515442,
      "loss_layer_30_head": 0.23685534298419952,
      "loss_layer_36_head": 0.13728925585746765,
      "loss_layer_42_head": 0.08627957105636597,
      "loss_layer_6_head": 0.9287586212158203,
      "step": 1580
    },
    {
      "epoch": 20.288,
      "grad_norm": 0.5057744658130514,
      "learning_rate": 0.004839518414351352,
      "loss": 3.009,
      "loss_layer_12_head": 0.6958004236221313,
      "loss_layer_18_head": 0.533592700958252,
      "loss_layer_24_head": 0.36192968487739563,
      "loss_layer_30_head": 0.22564208507537842,
      "loss_layer_36_head": 0.1390310823917389,
      "loss_layer_42_head": 0.0878349170088768,
      "loss_layer_6_head": 0.9484044313430786,
      "step": 1585
    },
    {
      "epoch": 20.352,
      "grad_norm": 0.4345087186972502,
      "learning_rate": 0.0048375406067135375,
      "loss": 3.0051,
      "loss_layer_12_head": 0.716814398765564,
      "loss_layer_18_head": 0.5648194551467896,
      "loss_layer_24_head": 0.37631911039352417,
      "loss_layer_30_head": 0.23107901215553284,
      "loss_layer_36_head": 0.14717121422290802,
      "loss_layer_42_head": 0.09897182136774063,
      "loss_layer_6_head": 0.9635587930679321,
      "step": 1590
    },
    {
      "epoch": 20.416,
      "grad_norm": 0.3100534717481353,
      "learning_rate": 0.004835551095339839,
      "loss": 3.0741,
      "loss_layer_12_head": 0.7481504082679749,
      "loss_layer_18_head": 0.5717365741729736,
      "loss_layer_24_head": 0.37495532631874084,
      "loss_layer_30_head": 0.2240494042634964,
      "loss_layer_36_head": 0.14183567464351654,
      "loss_layer_42_head": 0.08657179027795792,
      "loss_layer_6_head": 1.0129997730255127,
      "step": 1595
    },
    {
      "epoch": 20.48,
      "grad_norm": 0.6071272427474294,
      "learning_rate": 0.00483354989019146,
      "loss": 3.0832,
      "loss_layer_12_head": 0.7416980862617493,
      "loss_layer_18_head": 0.5456105470657349,
      "loss_layer_24_head": 0.36494144797325134,
      "loss_layer_30_head": 0.22586658596992493,
      "loss_layer_36_head": 0.13822290301322937,
      "loss_layer_42_head": 0.0847085565328598,
      "loss_layer_6_head": 0.9393714666366577,
      "step": 1600
    },
    {
      "epoch": 20.48,
      "eval_loss": 5.563967704772949,
      "eval_loss_layer_12_head": 1.2975389957427979,
      "eval_loss_layer_18_head": 1.1011850833892822,
      "eval_loss_layer_24_head": 0.731925368309021,
      "eval_loss_layer_30_head": 0.4884292185306549,
      "eval_loss_layer_36_head": 0.30645832419395447,
      "eval_loss_layer_42_head": 0.18912078440189362,
      "eval_loss_layer_6_head": 1.597771406173706,
      "eval_runtime": 33.0637,
      "eval_samples_per_second": 9.678,
      "eval_steps_per_second": 0.605,
      "step": 1600
    },
    {
      "epoch": 20.544,
      "grad_norm": 0.6134705952127156,
      "learning_rate": 0.004831537001288151,
      "loss": 3.0681,
      "loss_layer_12_head": 0.7579014301300049,
      "loss_layer_18_head": 0.5624231100082397,
      "loss_layer_24_head": 0.375941663980484,
      "loss_layer_30_head": 0.23204576969146729,
      "loss_layer_36_head": 0.140121191740036,
      "loss_layer_42_head": 0.08910927176475525,
      "loss_layer_6_head": 0.9663691520690918,
      "step": 1605
    },
    {
      "epoch": 20.608,
      "grad_norm": 0.5952627139264144,
      "learning_rate": 0.004829512438708164,
      "loss": 3.1886,
      "loss_layer_12_head": 0.7661651372909546,
      "loss_layer_18_head": 0.5942111611366272,
      "loss_layer_24_head": 0.41996508836746216,
      "loss_layer_30_head": 0.28046655654907227,
      "loss_layer_36_head": 0.14358152449131012,
      "loss_layer_42_head": 0.08481258153915405,
      "loss_layer_6_head": 1.010411262512207,
      "step": 1610
    },
    {
      "epoch": 20.672,
      "grad_norm": 0.618251781408709,
      "learning_rate": 0.004827476212588196,
      "loss": 3.2635,
      "loss_layer_12_head": 0.7308121919631958,
      "loss_layer_18_head": 0.5874748229980469,
      "loss_layer_24_head": 0.4182962477207184,
      "loss_layer_30_head": 0.27959316968917847,
      "loss_layer_36_head": 0.14219971001148224,
      "loss_layer_42_head": 0.08691873401403427,
      "loss_layer_6_head": 0.992128849029541,
      "step": 1615
    },
    {
      "epoch": 20.736,
      "grad_norm": 0.5131676731500275,
      "learning_rate": 0.004825428333123346,
      "loss": 3.1828,
      "loss_layer_12_head": 0.6917648315429688,
      "loss_layer_18_head": 0.5770801305770874,
      "loss_layer_24_head": 0.37391766905784607,
      "loss_layer_30_head": 0.2577366530895233,
      "loss_layer_36_head": 0.15651835501194,
      "loss_layer_42_head": 0.08447730541229248,
      "loss_layer_6_head": 0.9514881372451782,
      "step": 1620
    },
    {
      "epoch": 20.8,
      "grad_norm": 0.5585705585576392,
      "learning_rate": 0.004823368810567056,
      "loss": 3.2928,
      "loss_layer_12_head": 0.7000068426132202,
      "loss_layer_18_head": 0.653378963470459,
      "loss_layer_24_head": 0.3638749122619629,
      "loss_layer_30_head": 0.24381215870380402,
      "loss_layer_36_head": 0.16137631237506866,
      "loss_layer_42_head": 0.09685395658016205,
      "loss_layer_6_head": 0.9746227264404297,
      "step": 1625
    },
    {
      "epoch": 20.864,
      "grad_norm": 0.4291249601618325,
      "learning_rate": 0.004821297655231066,
      "loss": 3.3249,
      "loss_layer_12_head": 0.7149084210395813,
      "loss_layer_18_head": 0.7570079565048218,
      "loss_layer_24_head": 0.38511911034584045,
      "loss_layer_30_head": 0.25151458382606506,
      "loss_layer_36_head": 0.18747413158416748,
      "loss_layer_42_head": 0.10756643861532211,
      "loss_layer_6_head": 0.9964591860771179,
      "step": 1630
    },
    {
      "epoch": 20.928,
      "grad_norm": 0.4084580420610398,
      "learning_rate": 0.004819214877485357,
      "loss": 3.2692,
      "loss_layer_12_head": 0.7349473237991333,
      "loss_layer_18_head": 0.7351863980293274,
      "loss_layer_24_head": 0.37549442052841187,
      "loss_layer_30_head": 0.23866935074329376,
      "loss_layer_36_head": 0.17290076613426208,
      "loss_layer_42_head": 0.09471454471349716,
      "loss_layer_6_head": 1.0317671298980713,
      "step": 1635
    },
    {
      "epoch": 20.992,
      "grad_norm": 0.40605354688883716,
      "learning_rate": 0.004817120487758104,
      "loss": 3.1896,
      "loss_layer_12_head": 0.7176186442375183,
      "loss_layer_18_head": 0.6813039779663086,
      "loss_layer_24_head": 0.3717449903488159,
      "loss_layer_30_head": 0.2592929005622864,
      "loss_layer_36_head": 0.17080166935920715,
      "loss_layer_42_head": 0.10529599338769913,
      "loss_layer_6_head": 1.0234200954437256,
      "step": 1640
    },
    {
      "epoch": 21.056,
      "grad_norm": 0.4552971890219544,
      "learning_rate": 0.004815014496535621,
      "loss": 3.0337,
      "loss_layer_12_head": 0.638253390789032,
      "loss_layer_18_head": 0.6015220284461975,
      "loss_layer_24_head": 0.3225458860397339,
      "loss_layer_30_head": 0.2156144082546234,
      "loss_layer_36_head": 0.14704987406730652,
      "loss_layer_42_head": 0.09176966547966003,
      "loss_layer_6_head": 0.9370401501655579,
      "step": 1645
    },
    {
      "epoch": 21.12,
      "grad_norm": 0.36513298758151375,
      "learning_rate": 0.0048128969143623095,
      "loss": 3.0092,
      "loss_layer_12_head": 0.6517444849014282,
      "loss_layer_18_head": 0.5819887518882751,
      "loss_layer_24_head": 0.32912614941596985,
      "loss_layer_30_head": 0.24032971262931824,
      "loss_layer_36_head": 0.14910313487052917,
      "loss_layer_42_head": 0.09510574489831924,
      "loss_layer_6_head": 0.9613767862319946,
      "step": 1650
    },
    {
      "epoch": 21.184,
      "grad_norm": 0.29406281079098257,
      "learning_rate": 0.004810767751840602,
      "loss": 2.8994,
      "loss_layer_12_head": 0.6251100301742554,
      "loss_layer_18_head": 0.543388843536377,
      "loss_layer_24_head": 0.3159942924976349,
      "loss_layer_30_head": 0.21306219696998596,
      "loss_layer_36_head": 0.1424252688884735,
      "loss_layer_42_head": 0.08935324847698212,
      "loss_layer_6_head": 0.9264875650405884,
      "step": 1655
    },
    {
      "epoch": 21.248,
      "grad_norm": 0.41404413294279196,
      "learning_rate": 0.004808627019630917,
      "loss": 2.974,
      "loss_layer_12_head": 0.6551018953323364,
      "loss_layer_18_head": 0.5522714257240295,
      "loss_layer_24_head": 0.3275538980960846,
      "loss_layer_30_head": 0.21536001563072205,
      "loss_layer_36_head": 0.140096515417099,
      "loss_layer_42_head": 0.0885264128446579,
      "loss_layer_6_head": 0.9863259196281433,
      "step": 1660
    },
    {
      "epoch": 21.312,
      "grad_norm": 0.5718881858264676,
      "learning_rate": 0.004806474728451597,
      "loss": 2.9686,
      "loss_layer_12_head": 0.6139146089553833,
      "loss_layer_18_head": 0.49676713347435,
      "loss_layer_24_head": 0.29922953248023987,
      "loss_layer_30_head": 0.1944054514169693,
      "loss_layer_36_head": 0.12710222601890564,
      "loss_layer_42_head": 0.07917798310518265,
      "loss_layer_6_head": 0.9382314682006836,
      "step": 1665
    },
    {
      "epoch": 21.376,
      "grad_norm": 0.6175257722347135,
      "learning_rate": 0.004804310889078861,
      "loss": 2.9894,
      "loss_layer_12_head": 0.6765131950378418,
      "loss_layer_18_head": 0.5418305397033691,
      "loss_layer_24_head": 0.33013349771499634,
      "loss_layer_30_head": 0.21335947513580322,
      "loss_layer_36_head": 0.13631169497966766,
      "loss_layer_42_head": 0.08206924051046371,
      "loss_layer_6_head": 1.0135917663574219,
      "step": 1670
    },
    {
      "epoch": 21.44,
      "grad_norm": 0.49476703524186777,
      "learning_rate": 0.004802135512346747,
      "loss": 3.0844,
      "loss_layer_12_head": 0.7152357697486877,
      "loss_layer_18_head": 0.5490128993988037,
      "loss_layer_24_head": 0.3345467448234558,
      "loss_layer_30_head": 0.21705731749534607,
      "loss_layer_36_head": 0.1405579000711441,
      "loss_layer_42_head": 0.0834902971982956,
      "loss_layer_6_head": 1.0803035497665405,
      "step": 1675
    },
    {
      "epoch": 21.504,
      "grad_norm": 0.6785143367917855,
      "learning_rate": 0.004799948609147061,
      "loss": 3.1484,
      "loss_layer_12_head": 0.8115327954292297,
      "loss_layer_18_head": 0.5425211787223816,
      "loss_layer_24_head": 0.32740694284439087,
      "loss_layer_30_head": 0.20602674782276154,
      "loss_layer_36_head": 0.13740526139736176,
      "loss_layer_42_head": 0.08350611478090286,
      "loss_layer_6_head": 1.0295675992965698,
      "step": 1680
    },
    {
      "epoch": 21.568,
      "grad_norm": 0.5319574641898781,
      "learning_rate": 0.0047977501904293155,
      "loss": 3.1424,
      "loss_layer_12_head": 0.8004010319709778,
      "loss_layer_18_head": 0.5671056509017944,
      "loss_layer_24_head": 0.3398260474205017,
      "loss_layer_30_head": 0.2099180519580841,
      "loss_layer_36_head": 0.14159265160560608,
      "loss_layer_42_head": 0.08886562287807465,
      "loss_layer_6_head": 1.0578364133834839,
      "step": 1685
    },
    {
      "epoch": 21.632,
      "grad_norm": 0.55972269641195,
      "learning_rate": 0.004795540267200685,
      "loss": 3.0953,
      "loss_layer_12_head": 0.7616764903068542,
      "loss_layer_18_head": 0.5534841418266296,
      "loss_layer_24_head": 0.3312055468559265,
      "loss_layer_30_head": 0.20728769898414612,
      "loss_layer_36_head": 0.14260080456733704,
      "loss_layer_42_head": 0.08669861406087875,
      "loss_layer_6_head": 1.0139487981796265,
      "step": 1690
    },
    {
      "epoch": 21.696,
      "grad_norm": 0.5434461090550678,
      "learning_rate": 0.004793318850525943,
      "loss": 3.1026,
      "loss_layer_12_head": 0.7260487079620361,
      "loss_layer_18_head": 0.532487154006958,
      "loss_layer_24_head": 0.33136191964149475,
      "loss_layer_30_head": 0.20234759151935577,
      "loss_layer_36_head": 0.14332114160060883,
      "loss_layer_42_head": 0.08317707479000092,
      "loss_layer_6_head": 0.9699150323867798,
      "step": 1695
    },
    {
      "epoch": 21.76,
      "grad_norm": 0.3789054503084818,
      "learning_rate": 0.004791085951527408,
      "loss": 3.1203,
      "loss_layer_12_head": 0.708491325378418,
      "loss_layer_18_head": 0.5417188405990601,
      "loss_layer_24_head": 0.34097760915756226,
      "loss_layer_30_head": 0.19857382774353027,
      "loss_layer_36_head": 0.1453758329153061,
      "loss_layer_42_head": 0.0807570219039917,
      "loss_layer_6_head": 0.952672004699707,
      "step": 1700
    },
    {
      "epoch": 21.824,
      "grad_norm": 0.4241726194876586,
      "learning_rate": 0.004788841581384891,
      "loss": 3.0639,
      "loss_layer_12_head": 0.7187703847885132,
      "loss_layer_18_head": 0.5506891012191772,
      "loss_layer_24_head": 0.3504531681537628,
      "loss_layer_30_head": 0.20287339389324188,
      "loss_layer_36_head": 0.1603461652994156,
      "loss_layer_42_head": 0.08160021156072617,
      "loss_layer_6_head": 0.9732823371887207,
      "step": 1705
    },
    {
      "epoch": 21.888,
      "grad_norm": 0.4605764464468742,
      "learning_rate": 0.004786585751335636,
      "loss": 3.0689,
      "loss_layer_12_head": 0.7443972229957581,
      "loss_layer_18_head": 0.5518706440925598,
      "loss_layer_24_head": 0.35241708159446716,
      "loss_layer_30_head": 0.20425565540790558,
      "loss_layer_36_head": 0.15290476381778717,
      "loss_layer_42_head": 0.08435919135808945,
      "loss_layer_6_head": 0.9795257449150085,
      "step": 1710
    },
    {
      "epoch": 21.951999999999998,
      "grad_norm": 0.5003864638953475,
      "learning_rate": 0.004784318472674267,
      "loss": 3.1347,
      "loss_layer_12_head": 0.7615423798561096,
      "loss_layer_18_head": 0.5583871006965637,
      "loss_layer_24_head": 0.34288641810417175,
      "loss_layer_30_head": 0.19951409101486206,
      "loss_layer_36_head": 0.1486385315656662,
      "loss_layer_42_head": 0.07823002338409424,
      "loss_layer_6_head": 0.9533261060714722,
      "step": 1715
    },
    {
      "epoch": 22.016,
      "grad_norm": 0.5916376605304198,
      "learning_rate": 0.004782039756752727,
      "loss": 3.0459,
      "loss_layer_12_head": 0.738690197467804,
      "loss_layer_18_head": 0.5495492219924927,
      "loss_layer_24_head": 0.33271872997283936,
      "loss_layer_30_head": 0.2009279429912567,
      "loss_layer_36_head": 0.14565247297286987,
      "loss_layer_42_head": 0.07559780776500702,
      "loss_layer_6_head": 0.9331909418106079,
      "step": 1720
    },
    {
      "epoch": 22.08,
      "grad_norm": 0.3698221329954725,
      "learning_rate": 0.004779749614980225,
      "loss": 2.9059,
      "loss_layer_12_head": 0.6851332187652588,
      "loss_layer_18_head": 0.5198072195053101,
      "loss_layer_24_head": 0.32271355390548706,
      "loss_layer_30_head": 0.1980070173740387,
      "loss_layer_36_head": 0.1409723460674286,
      "loss_layer_42_head": 0.07337971776723862,
      "loss_layer_6_head": 0.9018647074699402,
      "step": 1725
    },
    {
      "epoch": 22.144,
      "grad_norm": 0.3522827408349662,
      "learning_rate": 0.004777448058823178,
      "loss": 2.8889,
      "loss_layer_12_head": 0.688713550567627,
      "loss_layer_18_head": 0.5382566452026367,
      "loss_layer_24_head": 0.3284897804260254,
      "loss_layer_30_head": 0.20808129012584686,
      "loss_layer_36_head": 0.13935129344463348,
      "loss_layer_42_head": 0.08263488858938217,
      "loss_layer_6_head": 0.9334216117858887,
      "step": 1730
    },
    {
      "epoch": 22.208,
      "grad_norm": 0.4786946472540282,
      "learning_rate": 0.0047751350998051535,
      "loss": 2.9466,
      "loss_layer_12_head": 0.7182289361953735,
      "loss_layer_18_head": 0.5816367268562317,
      "loss_layer_24_head": 0.35359448194503784,
      "loss_layer_30_head": 0.21156230568885803,
      "loss_layer_36_head": 0.14046403765678406,
      "loss_layer_42_head": 0.08700470626354218,
      "loss_layer_6_head": 0.9829589128494263,
      "step": 1735
    },
    {
      "epoch": 22.272,
      "grad_norm": 0.49674088089498564,
      "learning_rate": 0.00477281074950681,
      "loss": 2.9761,
      "loss_layer_12_head": 0.6763153672218323,
      "loss_layer_18_head": 0.5377945303916931,
      "loss_layer_24_head": 0.34007275104522705,
      "loss_layer_30_head": 0.20980516076087952,
      "loss_layer_36_head": 0.1342969834804535,
      "loss_layer_42_head": 0.09058590233325958,
      "loss_layer_6_head": 0.9105340838432312,
      "step": 1740
    },
    {
      "epoch": 22.336,
      "grad_norm": 0.6371500688805153,
      "learning_rate": 0.00477047501956584,
      "loss": 2.9641,
      "loss_layer_12_head": 0.7299153804779053,
      "loss_layer_18_head": 0.5415923595428467,
      "loss_layer_24_head": 0.3394579291343689,
      "loss_layer_30_head": 0.2095079869031906,
      "loss_layer_36_head": 0.13529720902442932,
      "loss_layer_42_head": 0.08852270990610123,
      "loss_layer_6_head": 0.932408332824707,
      "step": 1745
    },
    {
      "epoch": 22.4,
      "grad_norm": 0.5159131387785424,
      "learning_rate": 0.004768127921676916,
      "loss": 3.0178,
      "loss_layer_12_head": 0.7179011106491089,
      "loss_layer_18_head": 0.5388921499252319,
      "loss_layer_24_head": 0.3411840498447418,
      "loss_layer_30_head": 0.21252965927124023,
      "loss_layer_36_head": 0.1366848200559616,
      "loss_layer_42_head": 0.09100259840488434,
      "loss_layer_6_head": 0.9402325749397278,
      "step": 1750
    },
    {
      "epoch": 22.464,
      "grad_norm": 0.32963622881971,
      "learning_rate": 0.004765769467591626,
      "loss": 3.0298,
      "loss_layer_12_head": 0.7254591584205627,
      "loss_layer_18_head": 0.5756968259811401,
      "loss_layer_24_head": 0.3554520010948181,
      "loss_layer_30_head": 0.21753406524658203,
      "loss_layer_36_head": 0.14242665469646454,
      "loss_layer_42_head": 0.08691893517971039,
      "loss_layer_6_head": 0.9921077489852905,
      "step": 1755
    },
    {
      "epoch": 22.528,
      "grad_norm": 0.471368963454448,
      "learning_rate": 0.004763399669118413,
      "loss": 2.9455,
      "loss_layer_12_head": 0.6884724497795105,
      "loss_layer_18_head": 0.540235161781311,
      "loss_layer_24_head": 0.3398441970348358,
      "loss_layer_30_head": 0.21145418286323547,
      "loss_layer_36_head": 0.1303374469280243,
      "loss_layer_42_head": 0.08503861725330353,
      "loss_layer_6_head": 0.9523102641105652,
      "step": 1760
    },
    {
      "epoch": 22.592,
      "grad_norm": 0.4856062055504192,
      "learning_rate": 0.004761018538122528,
      "loss": 2.9438,
      "loss_layer_12_head": 0.6778172254562378,
      "loss_layer_18_head": 0.5443695783615112,
      "loss_layer_24_head": 0.3373316824436188,
      "loss_layer_30_head": 0.21453002095222473,
      "loss_layer_36_head": 0.13375157117843628,
      "loss_layer_42_head": 0.08410833775997162,
      "loss_layer_6_head": 0.970514178276062,
      "step": 1765
    },
    {
      "epoch": 22.656,
      "grad_norm": 0.36790846594258825,
      "learning_rate": 0.004758626086525956,
      "loss": 2.9732,
      "loss_layer_12_head": 0.672629714012146,
      "loss_layer_18_head": 0.547683835029602,
      "loss_layer_24_head": 0.3490315079689026,
      "loss_layer_30_head": 0.2288057804107666,
      "loss_layer_36_head": 0.13497015833854675,
      "loss_layer_42_head": 0.08837074786424637,
      "loss_layer_6_head": 0.957310676574707,
      "step": 1770
    },
    {
      "epoch": 22.72,
      "grad_norm": 0.38845062098378175,
      "learning_rate": 0.004756222326307364,
      "loss": 2.9591,
      "loss_layer_12_head": 0.6653910279273987,
      "loss_layer_18_head": 0.5317212343215942,
      "loss_layer_24_head": 0.33674490451812744,
      "loss_layer_30_head": 0.21031756699085236,
      "loss_layer_36_head": 0.1310308873653412,
      "loss_layer_42_head": 0.08323374390602112,
      "loss_layer_6_head": 0.945144534111023,
      "step": 1775
    },
    {
      "epoch": 22.784,
      "grad_norm": 0.2770673473342903,
      "learning_rate": 0.004753807269502041,
      "loss": 3.0412,
      "loss_layer_12_head": 0.7038198709487915,
      "loss_layer_18_head": 0.5736119747161865,
      "loss_layer_24_head": 0.3526606261730194,
      "loss_layer_30_head": 0.21843290328979492,
      "loss_layer_36_head": 0.14482656121253967,
      "loss_layer_42_head": 0.07941797375679016,
      "loss_layer_6_head": 0.991602897644043,
      "step": 1780
    },
    {
      "epoch": 22.848,
      "grad_norm": 0.3974819444140201,
      "learning_rate": 0.0047513809282018335,
      "loss": 3.0678,
      "loss_layer_12_head": 0.6813812255859375,
      "loss_layer_18_head": 0.5556277632713318,
      "loss_layer_24_head": 0.35258346796035767,
      "loss_layer_30_head": 0.20963843166828156,
      "loss_layer_36_head": 0.14229050278663635,
      "loss_layer_42_head": 0.08167450875043869,
      "loss_layer_6_head": 0.9670661687850952,
      "step": 1785
    },
    {
      "epoch": 22.912,
      "grad_norm": 0.3713322455444316,
      "learning_rate": 0.004748943314555093,
      "loss": 3.0245,
      "loss_layer_12_head": 0.684149980545044,
      "loss_layer_18_head": 0.5738719701766968,
      "loss_layer_24_head": 0.36741724610328674,
      "loss_layer_30_head": 0.20626941323280334,
      "loss_layer_36_head": 0.13893644511699677,
      "loss_layer_42_head": 0.08503827452659607,
      "loss_layer_6_head": 0.9790215492248535,
      "step": 1790
    },
    {
      "epoch": 22.976,
      "grad_norm": 0.37659926298929114,
      "learning_rate": 0.004746494440766605,
      "loss": 3.045,
      "loss_layer_12_head": 0.6893389225006104,
      "loss_layer_18_head": 0.5775801539421082,
      "loss_layer_24_head": 0.36834976077079773,
      "loss_layer_30_head": 0.21297776699066162,
      "loss_layer_36_head": 0.14115816354751587,
      "loss_layer_42_head": 0.08701495081186295,
      "loss_layer_6_head": 0.9853302240371704,
      "step": 1795
    },
    {
      "epoch": 23.04,
      "grad_norm": 0.3950278991507192,
      "learning_rate": 0.004744034319097535,
      "loss": 2.9621,
      "loss_layer_12_head": 0.629054605960846,
      "loss_layer_18_head": 0.5164355635643005,
      "loss_layer_24_head": 0.33957016468048096,
      "loss_layer_30_head": 0.2046838104724884,
      "loss_layer_36_head": 0.12630127370357513,
      "loss_layer_42_head": 0.08438660949468613,
      "loss_layer_6_head": 0.9039801359176636,
      "step": 1800
    },
    {
      "epoch": 23.04,
      "eval_loss": 5.568202972412109,
      "eval_loss_layer_12_head": 1.2700358629226685,
      "eval_loss_layer_18_head": 1.1179884672164917,
      "eval_loss_layer_24_head": 0.737288236618042,
      "eval_loss_layer_30_head": 0.46151334047317505,
      "eval_loss_layer_36_head": 0.2985280156135559,
      "eval_loss_layer_42_head": 0.2074306756258011,
      "eval_loss_layer_6_head": 1.6053718328475952,
      "eval_runtime": 33.0856,
      "eval_samples_per_second": 9.672,
      "eval_steps_per_second": 0.604,
      "step": 1800
    },
    {
      "epoch": 23.104,
      "grad_norm": 0.3236940985463961,
      "learning_rate": 0.0047415629618653655,
      "loss": 2.8887,
      "loss_layer_12_head": 0.6388901472091675,
      "loss_layer_18_head": 0.5137020349502563,
      "loss_layer_24_head": 0.3546183407306671,
      "loss_layer_30_head": 0.2111816108226776,
      "loss_layer_36_head": 0.14080731570720673,
      "loss_layer_42_head": 0.09509092569351196,
      "loss_layer_6_head": 0.9157737493515015,
      "step": 1805
    },
    {
      "epoch": 23.168,
      "grad_norm": 0.40450140511010296,
      "learning_rate": 0.004739080381443834,
      "loss": 2.9065,
      "loss_layer_12_head": 0.6531544923782349,
      "loss_layer_18_head": 0.5227010250091553,
      "loss_layer_24_head": 0.34976404905319214,
      "loss_layer_30_head": 0.2207869291305542,
      "loss_layer_36_head": 0.13684092462062836,
      "loss_layer_42_head": 0.09872017800807953,
      "loss_layer_6_head": 0.9294286966323853,
      "step": 1810
    },
    {
      "epoch": 23.232,
      "grad_norm": 0.38161649319776825,
      "learning_rate": 0.004736586590262868,
      "loss": 2.9279,
      "loss_layer_12_head": 0.6576783061027527,
      "loss_layer_18_head": 0.544166624546051,
      "loss_layer_24_head": 0.36016446352005005,
      "loss_layer_30_head": 0.23141531646251678,
      "loss_layer_36_head": 0.1462727189064026,
      "loss_layer_42_head": 0.10991980135440826,
      "loss_layer_6_head": 0.9398279190063477,
      "step": 1815
    },
    {
      "epoch": 23.296,
      "grad_norm": 0.3579195678996897,
      "learning_rate": 0.004734081600808531,
      "loss": 2.9612,
      "loss_layer_12_head": 0.6567124724388123,
      "loss_layer_18_head": 0.5400344133377075,
      "loss_layer_24_head": 0.3514339327812195,
      "loss_layer_30_head": 0.22242406010627747,
      "loss_layer_36_head": 0.15010282397270203,
      "loss_layer_42_head": 0.09956856071949005,
      "loss_layer_6_head": 0.9415644407272339,
      "step": 1820
    },
    {
      "epoch": 23.36,
      "grad_norm": 0.6560099240590554,
      "learning_rate": 0.004731565425622949,
      "loss": 2.9958,
      "loss_layer_12_head": 0.6743842363357544,
      "loss_layer_18_head": 0.5798726081848145,
      "loss_layer_24_head": 0.351627916097641,
      "loss_layer_30_head": 0.2305908203125,
      "loss_layer_36_head": 0.14780136942863464,
      "loss_layer_42_head": 0.09136875718832016,
      "loss_layer_6_head": 0.960567831993103,
      "step": 1825
    },
    {
      "epoch": 23.424,
      "grad_norm": 0.6239241628199141,
      "learning_rate": 0.004729038077304257,
      "loss": 3.123,
      "loss_layer_12_head": 0.6951879858970642,
      "loss_layer_18_head": 0.6438666582107544,
      "loss_layer_24_head": 0.32944342494010925,
      "loss_layer_30_head": 0.219035342335701,
      "loss_layer_36_head": 0.15711084008216858,
      "loss_layer_42_head": 0.08476213365793228,
      "loss_layer_6_head": 0.9157019853591919,
      "step": 1830
    },
    {
      "epoch": 23.488,
      "grad_norm": 0.5405246938900362,
      "learning_rate": 0.004726499568506531,
      "loss": 3.2297,
      "loss_layer_12_head": 0.7834885716438293,
      "loss_layer_18_head": 0.6808453798294067,
      "loss_layer_24_head": 0.3531964421272278,
      "loss_layer_30_head": 0.22429104149341583,
      "loss_layer_36_head": 0.17609432339668274,
      "loss_layer_42_head": 0.08292917162179947,
      "loss_layer_6_head": 0.977643609046936,
      "step": 1835
    },
    {
      "epoch": 23.552,
      "grad_norm": 0.5239699032008917,
      "learning_rate": 0.004723949911939728,
      "loss": 3.1461,
      "loss_layer_12_head": 0.7686442136764526,
      "loss_layer_18_head": 0.6159401535987854,
      "loss_layer_24_head": 0.3291798532009125,
      "loss_layer_30_head": 0.2103312462568283,
      "loss_layer_36_head": 0.15625818073749542,
      "loss_layer_42_head": 0.08098305761814117,
      "loss_layer_6_head": 0.9246538281440735,
      "step": 1840
    },
    {
      "epoch": 23.616,
      "grad_norm": 0.5170274394471907,
      "learning_rate": 0.0047213891203696165,
      "loss": 3.1036,
      "loss_layer_12_head": 0.7760995030403137,
      "loss_layer_18_head": 0.636306643486023,
      "loss_layer_24_head": 0.356485515832901,
      "loss_layer_30_head": 0.22722971439361572,
      "loss_layer_36_head": 0.15972909331321716,
      "loss_layer_42_head": 0.08121715486049652,
      "loss_layer_6_head": 0.9980652928352356,
      "step": 1845
    },
    {
      "epoch": 23.68,
      "grad_norm": 0.3433044889890816,
      "learning_rate": 0.004718817206617718,
      "loss": 3.0383,
      "loss_layer_12_head": 0.740127444267273,
      "loss_layer_18_head": 0.604573130607605,
      "loss_layer_24_head": 0.3500204384326935,
      "loss_layer_30_head": 0.2343074381351471,
      "loss_layer_36_head": 0.15824632346630096,
      "loss_layer_42_head": 0.08342351019382477,
      "loss_layer_6_head": 0.9855822324752808,
      "step": 1850
    },
    {
      "epoch": 23.744,
      "grad_norm": 0.35627059296077773,
      "learning_rate": 0.004716234183561244,
      "loss": 3.0691,
      "loss_layer_12_head": 0.750816822052002,
      "loss_layer_18_head": 0.5981661081314087,
      "loss_layer_24_head": 0.3520575165748596,
      "loss_layer_30_head": 0.22234377264976501,
      "loss_layer_36_head": 0.14646562933921814,
      "loss_layer_42_head": 0.076685830950737,
      "loss_layer_6_head": 1.006516695022583,
      "step": 1855
    },
    {
      "epoch": 23.808,
      "grad_norm": 0.5567684313492611,
      "learning_rate": 0.004713640064133024,
      "loss": 3.0672,
      "loss_layer_12_head": 0.758025050163269,
      "loss_layer_18_head": 0.5640559792518616,
      "loss_layer_24_head": 0.34767717123031616,
      "loss_layer_30_head": 0.21540379524230957,
      "loss_layer_36_head": 0.14160935580730438,
      "loss_layer_42_head": 0.07396423071622849,
      "loss_layer_6_head": 0.9792525172233582,
      "step": 1860
    },
    {
      "epoch": 23.872,
      "grad_norm": 0.4840669217461741,
      "learning_rate": 0.00471103486132145,
      "loss": 3.0862,
      "loss_layer_12_head": 0.71921306848526,
      "loss_layer_18_head": 0.5363633632659912,
      "loss_layer_24_head": 0.34492865204811096,
      "loss_layer_30_head": 0.2067732810974121,
      "loss_layer_36_head": 0.13777290284633636,
      "loss_layer_42_head": 0.07972152531147003,
      "loss_layer_6_head": 0.9511731266975403,
      "step": 1865
    },
    {
      "epoch": 23.936,
      "grad_norm": 0.9064723368197312,
      "learning_rate": 0.004708418588170404,
      "loss": 3.2252,
      "loss_layer_12_head": 0.7186329364776611,
      "loss_layer_18_head": 0.5588528513908386,
      "loss_layer_24_head": 0.3825071454048157,
      "loss_layer_30_head": 0.21649746596813202,
      "loss_layer_36_head": 0.14201994240283966,
      "loss_layer_42_head": 0.07602548599243164,
      "loss_layer_6_head": 1.058148980140686,
      "step": 1870
    },
    {
      "epoch": 24.0,
      "grad_norm": 0.42101773249252994,
      "learning_rate": 0.004705791257779195,
      "loss": 3.1705,
      "loss_layer_12_head": 0.7624877691268921,
      "loss_layer_18_head": 0.5995546579360962,
      "loss_layer_24_head": 0.4108217656612396,
      "loss_layer_30_head": 0.22984914481639862,
      "loss_layer_36_head": 0.1438816338777542,
      "loss_layer_42_head": 0.08263182640075684,
      "loss_layer_6_head": 1.1131511926651,
      "step": 1875
    },
    {
      "epoch": 24.064,
      "grad_norm": 0.687757920394935,
      "learning_rate": 0.0047031528833024975,
      "loss": 2.9374,
      "loss_layer_12_head": 0.6059186458587646,
      "loss_layer_18_head": 0.48163285851478577,
      "loss_layer_24_head": 0.3405088186264038,
      "loss_layer_30_head": 0.19482308626174927,
      "loss_layer_36_head": 0.13261628150939941,
      "loss_layer_42_head": 0.07388617098331451,
      "loss_layer_6_head": 0.91693514585495,
      "step": 1880
    },
    {
      "epoch": 24.128,
      "grad_norm": 0.6341483372663932,
      "learning_rate": 0.004700503477950277,
      "loss": 2.9019,
      "loss_layer_12_head": 0.642470121383667,
      "loss_layer_18_head": 0.5155765414237976,
      "loss_layer_24_head": 0.35391703248023987,
      "loss_layer_30_head": 0.20020286738872528,
      "loss_layer_36_head": 0.13319741189479828,
      "loss_layer_42_head": 0.07559539377689362,
      "loss_layer_6_head": 0.9833466410636902,
      "step": 1885
    },
    {
      "epoch": 24.192,
      "grad_norm": 0.4273991889495586,
      "learning_rate": 0.004697843054987737,
      "loss": 2.9711,
      "loss_layer_12_head": 0.6445605158805847,
      "loss_layer_18_head": 0.5413626432418823,
      "loss_layer_24_head": 0.3457954525947571,
      "loss_layer_30_head": 0.2062971144914627,
      "loss_layer_36_head": 0.13670064508914948,
      "loss_layer_42_head": 0.0781421884894371,
      "loss_layer_6_head": 0.9752801060676575,
      "step": 1890
    },
    {
      "epoch": 24.256,
      "grad_norm": 0.6264953799105228,
      "learning_rate": 0.004695171627735235,
      "loss": 3.0644,
      "loss_layer_12_head": 0.6723799109458923,
      "loss_layer_18_head": 0.5722543001174927,
      "loss_layer_24_head": 0.3585831820964813,
      "loss_layer_30_head": 0.2131202220916748,
      "loss_layer_36_head": 0.14144279062747955,
      "loss_layer_42_head": 0.07747422903776169,
      "loss_layer_6_head": 1.0644537210464478,
      "step": 1895
    },
    {
      "epoch": 24.32,
      "grad_norm": 0.7078466072664457,
      "learning_rate": 0.004692489209568234,
      "loss": 3.0917,
      "loss_layer_12_head": 0.6268420815467834,
      "loss_layer_18_head": 0.6354551911354065,
      "loss_layer_24_head": 0.3283812701702118,
      "loss_layer_30_head": 0.2039872705936432,
      "loss_layer_36_head": 0.14215025305747986,
      "loss_layer_42_head": 0.08367790281772614,
      "loss_layer_6_head": 0.9903539419174194,
      "step": 1900
    },
    {
      "epoch": 24.384,
      "grad_norm": 0.512597665795083,
      "learning_rate": 0.00468979581391722,
      "loss": 3.0488,
      "loss_layer_12_head": 0.6374137997627258,
      "loss_layer_18_head": 0.6513009071350098,
      "loss_layer_24_head": 0.33201009035110474,
      "loss_layer_30_head": 0.20343251526355743,
      "loss_layer_36_head": 0.13328376412391663,
      "loss_layer_42_head": 0.08123214542865753,
      "loss_layer_6_head": 0.9907103776931763,
      "step": 1905
    },
    {
      "epoch": 24.448,
      "grad_norm": 0.3600674296397683,
      "learning_rate": 0.004687091454267646,
      "loss": 3.0838,
      "loss_layer_12_head": 0.6559634208679199,
      "loss_layer_18_head": 0.6601884365081787,
      "loss_layer_24_head": 0.3469889163970947,
      "loss_layer_30_head": 0.2183941900730133,
      "loss_layer_36_head": 0.1625228375196457,
      "loss_layer_42_head": 0.09797803312540054,
      "loss_layer_6_head": 0.9831121563911438,
      "step": 1910
    },
    {
      "epoch": 24.512,
      "grad_norm": 0.294961316534889,
      "learning_rate": 0.004684376144159861,
      "loss": 2.9928,
      "loss_layer_12_head": 0.6603584289550781,
      "loss_layer_18_head": 0.6234963536262512,
      "loss_layer_24_head": 0.33703741431236267,
      "loss_layer_30_head": 0.20308807492256165,
      "loss_layer_36_head": 0.1387360394001007,
      "loss_layer_42_head": 0.08298148959875107,
      "loss_layer_6_head": 0.9852527379989624,
      "step": 1915
    },
    {
      "epoch": 24.576,
      "grad_norm": 0.29178682650980986,
      "learning_rate": 0.004681649897189036,
      "loss": 2.9437,
      "loss_layer_12_head": 0.6581500768661499,
      "loss_layer_18_head": 0.5933951735496521,
      "loss_layer_24_head": 0.3342687487602234,
      "loss_layer_30_head": 0.20333118736743927,
      "loss_layer_36_head": 0.13772818446159363,
      "loss_layer_42_head": 0.08354105055332184,
      "loss_layer_6_head": 0.9644285440444946,
      "step": 1920
    },
    {
      "epoch": 24.64,
      "grad_norm": 0.26095495153784665,
      "learning_rate": 0.004678912727005107,
      "loss": 3.026,
      "loss_layer_12_head": 0.6825097799301147,
      "loss_layer_18_head": 0.5814059972763062,
      "loss_layer_24_head": 0.34336110949516296,
      "loss_layer_30_head": 0.2012273073196411,
      "loss_layer_36_head": 0.1347159892320633,
      "loss_layer_42_head": 0.08002408593893051,
      "loss_layer_6_head": 0.9646388292312622,
      "step": 1925
    },
    {
      "epoch": 24.704,
      "grad_norm": 0.2647216601196533,
      "learning_rate": 0.004676164647312698,
      "loss": 2.94,
      "loss_layer_12_head": 0.6632307767868042,
      "loss_layer_18_head": 0.5546755194664001,
      "loss_layer_24_head": 0.33305445313453674,
      "loss_layer_30_head": 0.20298342406749725,
      "loss_layer_36_head": 0.1351158320903778,
      "loss_layer_42_head": 0.08000093698501587,
      "loss_layer_6_head": 0.940167248249054,
      "step": 1930
    },
    {
      "epoch": 24.768,
      "grad_norm": 0.29632796677932066,
      "learning_rate": 0.004673405671871057,
      "loss": 2.9521,
      "loss_layer_12_head": 0.6683317422866821,
      "loss_layer_18_head": 0.5510379076004028,
      "loss_layer_24_head": 0.339195191860199,
      "loss_layer_30_head": 0.20363128185272217,
      "loss_layer_36_head": 0.1321253776550293,
      "loss_layer_42_head": 0.08001138269901276,
      "loss_layer_6_head": 0.9321775436401367,
      "step": 1935
    },
    {
      "epoch": 24.832,
      "grad_norm": 0.3096951416128564,
      "learning_rate": 0.004670635814493985,
      "loss": 2.9655,
      "loss_layer_12_head": 0.6587310433387756,
      "loss_layer_18_head": 0.5377337336540222,
      "loss_layer_24_head": 0.3281893730163574,
      "loss_layer_30_head": 0.1988770067691803,
      "loss_layer_36_head": 0.13438665866851807,
      "loss_layer_42_head": 0.07810235768556595,
      "loss_layer_6_head": 0.9387785792350769,
      "step": 1940
    },
    {
      "epoch": 24.896,
      "grad_norm": 0.4240328752475281,
      "learning_rate": 0.004667855089049764,
      "loss": 2.9667,
      "loss_layer_12_head": 0.6732971668243408,
      "loss_layer_18_head": 0.555254340171814,
      "loss_layer_24_head": 0.3389227092266083,
      "loss_layer_30_head": 0.20529255270957947,
      "loss_layer_36_head": 0.13950686156749725,
      "loss_layer_42_head": 0.08232109248638153,
      "loss_layer_6_head": 0.9318910837173462,
      "step": 1945
    },
    {
      "epoch": 24.96,
      "grad_norm": 0.5308924561068813,
      "learning_rate": 0.004665063509461097,
      "loss": 3.0316,
      "loss_layer_12_head": 0.7027512788772583,
      "loss_layer_18_head": 0.5731266736984253,
      "loss_layer_24_head": 0.35123974084854126,
      "loss_layer_30_head": 0.2191992700099945,
      "loss_layer_36_head": 0.17243464291095734,
      "loss_layer_42_head": 0.10528196394443512,
      "loss_layer_6_head": 0.9696947336196899,
      "step": 1950
    },
    {
      "epoch": 25.024,
      "grad_norm": 0.466381369769412,
      "learning_rate": 0.004662261089705027,
      "loss": 2.9785,
      "loss_layer_12_head": 0.6600301861763,
      "loss_layer_18_head": 0.5271055698394775,
      "loss_layer_24_head": 0.33537960052490234,
      "loss_layer_30_head": 0.20436692237854004,
      "loss_layer_36_head": 0.15316472947597504,
      "loss_layer_42_head": 0.08902554959058762,
      "loss_layer_6_head": 0.8979424238204956,
      "step": 1955
    },
    {
      "epoch": 25.088,
      "grad_norm": 0.3577564677250191,
      "learning_rate": 0.004659447843812876,
      "loss": 2.8664,
      "loss_layer_12_head": 0.6688176989555359,
      "loss_layer_18_head": 0.5335163474082947,
      "loss_layer_24_head": 0.37035638093948364,
      "loss_layer_30_head": 0.22202622890472412,
      "loss_layer_36_head": 0.16433492302894592,
      "loss_layer_42_head": 0.09961771965026855,
      "loss_layer_6_head": 0.9240397214889526,
      "step": 1960
    },
    {
      "epoch": 25.152,
      "grad_norm": 0.37951986327115844,
      "learning_rate": 0.004656623785870167,
      "loss": 2.8435,
      "loss_layer_12_head": 0.6609958410263062,
      "loss_layer_18_head": 0.5315054655075073,
      "loss_layer_24_head": 0.3473655581474304,
      "loss_layer_30_head": 0.2254653424024582,
      "loss_layer_36_head": 0.14256532490253448,
      "loss_layer_42_head": 0.08825518935918808,
      "loss_layer_6_head": 0.9284340143203735,
      "step": 1965
    },
    {
      "epoch": 25.216,
      "grad_norm": 0.43457959993937495,
      "learning_rate": 0.004653788930016562,
      "loss": 2.8821,
      "loss_layer_12_head": 0.6403862237930298,
      "loss_layer_18_head": 0.5175502896308899,
      "loss_layer_24_head": 0.3623273968696594,
      "loss_layer_30_head": 0.24095407128334045,
      "loss_layer_36_head": 0.1499880999326706,
      "loss_layer_42_head": 0.09598751366138458,
      "loss_layer_6_head": 0.9038140177726746,
      "step": 1970
    },
    {
      "epoch": 25.28,
      "grad_norm": 0.3268640738594013,
      "learning_rate": 0.004650943290445781,
      "loss": 2.8314,
      "loss_layer_12_head": 0.6393994688987732,
      "loss_layer_18_head": 0.5103954076766968,
      "loss_layer_24_head": 0.3503054976463318,
      "loss_layer_30_head": 0.21978874504566193,
      "loss_layer_36_head": 0.14244893193244934,
      "loss_layer_42_head": 0.07929064333438873,
      "loss_layer_6_head": 0.9143568277359009,
      "step": 1975
    },
    {
      "epoch": 25.344,
      "grad_norm": 0.4273214069008089,
      "learning_rate": 0.004648086881405542,
      "loss": 2.944,
      "loss_layer_12_head": 0.6338635683059692,
      "loss_layer_18_head": 0.5149096250534058,
      "loss_layer_24_head": 0.34061554074287415,
      "loss_layer_30_head": 0.21972429752349854,
      "loss_layer_36_head": 0.1441533863544464,
      "loss_layer_42_head": 0.0847097635269165,
      "loss_layer_6_head": 0.8959451913833618,
      "step": 1980
    },
    {
      "epoch": 25.408,
      "grad_norm": 0.6731955240448945,
      "learning_rate": 0.004645219717197482,
      "loss": 2.9407,
      "loss_layer_12_head": 0.6205049157142639,
      "loss_layer_18_head": 0.5120773315429688,
      "loss_layer_24_head": 0.3636119067668915,
      "loss_layer_30_head": 0.20753642916679382,
      "loss_layer_36_head": 0.13663505017757416,
      "loss_layer_42_head": 0.08178742229938507,
      "loss_layer_6_head": 0.8831892013549805,
      "step": 1985
    },
    {
      "epoch": 25.472,
      "grad_norm": 0.7607500989846088,
      "learning_rate": 0.0046423418121770855,
      "loss": 3.0138,
      "loss_layer_12_head": 0.7108671069145203,
      "loss_layer_18_head": 0.559550404548645,
      "loss_layer_24_head": 0.4524900019168854,
      "loss_layer_30_head": 0.21820411086082458,
      "loss_layer_36_head": 0.1445203274488449,
      "loss_layer_42_head": 0.08608443289995193,
      "loss_layer_6_head": 0.9818849563598633,
      "step": 1990
    },
    {
      "epoch": 25.536,
      "grad_norm": 0.5551945471810279,
      "learning_rate": 0.004639453180753619,
      "loss": 3.0864,
      "loss_layer_12_head": 0.6945182085037231,
      "loss_layer_18_head": 0.5390790700912476,
      "loss_layer_24_head": 0.39790382981300354,
      "loss_layer_30_head": 0.19965913891792297,
      "loss_layer_36_head": 0.13391347229480743,
      "loss_layer_42_head": 0.08197671920061111,
      "loss_layer_6_head": 0.9382524490356445,
      "step": 1995
    },
    {
      "epoch": 25.6,
      "grad_norm": 0.5969935608879392,
      "learning_rate": 0.00463655383739005,
      "loss": 3.0878,
      "loss_layer_12_head": 0.7555915713310242,
      "loss_layer_18_head": 0.5696674585342407,
      "loss_layer_24_head": 0.3942463994026184,
      "loss_layer_30_head": 0.20785459876060486,
      "loss_layer_36_head": 0.1389370560646057,
      "loss_layer_42_head": 0.09127404540777206,
      "loss_layer_6_head": 0.999057948589325,
      "step": 2000
    },
    {
      "epoch": 25.6,
      "eval_loss": 5.722364902496338,
      "eval_loss_layer_12_head": 1.4047225713729858,
      "eval_loss_layer_18_head": 1.12978994846344,
      "eval_loss_layer_24_head": 0.7445850372314453,
      "eval_loss_layer_30_head": 0.4840705990791321,
      "eval_loss_layer_36_head": 0.31088677048683167,
      "eval_loss_layer_42_head": 0.18901023268699646,
      "eval_loss_layer_6_head": 1.6020418405532837,
      "eval_runtime": 33.0729,
      "eval_samples_per_second": 9.676,
      "eval_steps_per_second": 0.605,
      "step": 2000
    },
    {
      "epoch": 25.664,
      "grad_norm": 0.5303238252293723,
      "learning_rate": 0.004633643796602985,
      "loss": 3.0822,
      "loss_layer_12_head": 0.8055858612060547,
      "loss_layer_18_head": 0.5460436940193176,
      "loss_layer_24_head": 0.36327728629112244,
      "loss_layer_30_head": 0.20192968845367432,
      "loss_layer_36_head": 0.13539138436317444,
      "loss_layer_42_head": 0.0914485976099968,
      "loss_layer_6_head": 0.9463146924972534,
      "step": 2005
    },
    {
      "epoch": 25.728,
      "grad_norm": 0.3619423314728626,
      "learning_rate": 0.004630723072962584,
      "loss": 3.0955,
      "loss_layer_12_head": 0.776841402053833,
      "loss_layer_18_head": 0.5522428750991821,
      "loss_layer_24_head": 0.35818570852279663,
      "loss_layer_30_head": 0.2079772651195526,
      "loss_layer_36_head": 0.13512223958969116,
      "loss_layer_42_head": 0.08908144384622574,
      "loss_layer_6_head": 0.9621550440788269,
      "step": 2010
    },
    {
      "epoch": 25.792,
      "grad_norm": 0.38612322029297563,
      "learning_rate": 0.004627791681092499,
      "loss": 3.0637,
      "loss_layer_12_head": 0.7377291917800903,
      "loss_layer_18_head": 0.545174241065979,
      "loss_layer_24_head": 0.35134202241897583,
      "loss_layer_30_head": 0.22256764769554138,
      "loss_layer_36_head": 0.1411951631307602,
      "loss_layer_42_head": 0.09762486070394516,
      "loss_layer_6_head": 0.9333661794662476,
      "step": 2015
    },
    {
      "epoch": 25.856,
      "grad_norm": 0.3524085002930232,
      "learning_rate": 0.004624849635669797,
      "loss": 3.0162,
      "loss_layer_12_head": 0.7451778650283813,
      "loss_layer_18_head": 0.5710796117782593,
      "loss_layer_24_head": 0.355121374130249,
      "loss_layer_30_head": 0.22390589118003845,
      "loss_layer_36_head": 0.13880571722984314,
      "loss_layer_42_head": 0.09811806678771973,
      "loss_layer_6_head": 0.9747943878173828,
      "step": 2020
    },
    {
      "epoch": 25.92,
      "grad_norm": 0.31255683997386563,
      "learning_rate": 0.004621896951424882,
      "loss": 3.0143,
      "loss_layer_12_head": 0.689067006111145,
      "loss_layer_18_head": 0.534162700176239,
      "loss_layer_24_head": 0.33647626638412476,
      "loss_layer_30_head": 0.22423723340034485,
      "loss_layer_36_head": 0.1323883980512619,
      "loss_layer_42_head": 0.09736645966768265,
      "loss_layer_6_head": 0.9303882718086243,
      "step": 2025
    },
    {
      "epoch": 25.984,
      "grad_norm": 0.29245920760282296,
      "learning_rate": 0.004618933643141428,
      "loss": 3.0616,
      "loss_layer_12_head": 0.69266277551651,
      "loss_layer_18_head": 0.5485231280326843,
      "loss_layer_24_head": 0.3458479940891266,
      "loss_layer_30_head": 0.22592835128307343,
      "loss_layer_36_head": 0.13432851433753967,
      "loss_layer_42_head": 0.10265462100505829,
      "loss_layer_6_head": 0.9485656023025513,
      "step": 2030
    },
    {
      "epoch": 26.048,
      "grad_norm": 0.3271357188126486,
      "learning_rate": 0.004615959725656301,
      "loss": 2.8889,
      "loss_layer_12_head": 0.6622344255447388,
      "loss_layer_18_head": 0.5278757810592651,
      "loss_layer_24_head": 0.329339861869812,
      "loss_layer_30_head": 0.2364305555820465,
      "loss_layer_36_head": 0.1320003867149353,
      "loss_layer_42_head": 0.09957057982683182,
      "loss_layer_6_head": 0.9246984720230103,
      "step": 2035
    },
    {
      "epoch": 26.112,
      "grad_norm": 0.3356198308135893,
      "learning_rate": 0.0046129752138594875,
      "loss": 2.7963,
      "loss_layer_12_head": 0.6362636685371399,
      "loss_layer_18_head": 0.503398597240448,
      "loss_layer_24_head": 0.3143315613269806,
      "loss_layer_30_head": 0.21860018372535706,
      "loss_layer_36_head": 0.12349758297204971,
      "loss_layer_42_head": 0.0846986174583435,
      "loss_layer_6_head": 0.9112758636474609,
      "step": 2040
    },
    {
      "epoch": 26.176,
      "grad_norm": 0.46294810662075864,
      "learning_rate": 0.004609980122694015,
      "loss": 2.8502,
      "loss_layer_12_head": 0.6139417886734009,
      "loss_layer_18_head": 0.524552047252655,
      "loss_layer_24_head": 0.3115992844104767,
      "loss_layer_30_head": 0.21854886412620544,
      "loss_layer_36_head": 0.13659927248954773,
      "loss_layer_42_head": 0.08243266493082047,
      "loss_layer_6_head": 0.8913648724555969,
      "step": 2045
    },
    {
      "epoch": 26.24,
      "grad_norm": 0.346722771165039,
      "learning_rate": 0.004606974467155883,
      "loss": 2.864,
      "loss_layer_12_head": 0.5966392755508423,
      "loss_layer_18_head": 0.5326512455940247,
      "loss_layer_24_head": 0.30123579502105713,
      "loss_layer_30_head": 0.21306905150413513,
      "loss_layer_36_head": 0.13040505349636078,
      "loss_layer_42_head": 0.07549242675304413,
      "loss_layer_6_head": 0.8691840171813965,
      "step": 2050
    },
    {
      "epoch": 26.304,
      "grad_norm": 0.35325239600660613,
      "learning_rate": 0.004603958262293985,
      "loss": 2.8705,
      "loss_layer_12_head": 0.6183648705482483,
      "loss_layer_18_head": 0.5306669473648071,
      "loss_layer_24_head": 0.30991846323013306,
      "loss_layer_30_head": 0.20817609131336212,
      "loss_layer_36_head": 0.13762615621089935,
      "loss_layer_42_head": 0.07640819251537323,
      "loss_layer_6_head": 0.8957975506782532,
      "step": 2055
    },
    {
      "epoch": 26.368,
      "grad_norm": 0.5096732917618784,
      "learning_rate": 0.0046009315232100325,
      "loss": 2.9014,
      "loss_layer_12_head": 0.6459457278251648,
      "loss_layer_18_head": 0.604252278804779,
      "loss_layer_24_head": 0.3227326273918152,
      "loss_layer_30_head": 0.21349342167377472,
      "loss_layer_36_head": 0.14750833809375763,
      "loss_layer_42_head": 0.07837899774312973,
      "loss_layer_6_head": 0.9368564486503601,
      "step": 2060
    },
    {
      "epoch": 26.432,
      "grad_norm": 0.5141892734094963,
      "learning_rate": 0.004597894265058481,
      "loss": 2.937,
      "loss_layer_12_head": 0.6551240682601929,
      "loss_layer_18_head": 0.6271176338195801,
      "loss_layer_24_head": 0.3266449272632599,
      "loss_layer_30_head": 0.20855514705181122,
      "loss_layer_36_head": 0.15309588611125946,
      "loss_layer_42_head": 0.07607389986515045,
      "loss_layer_6_head": 0.9417223930358887,
      "step": 2065
    },
    {
      "epoch": 26.496,
      "grad_norm": 0.4361916393079167,
      "learning_rate": 0.004594846503046453,
      "loss": 2.9597,
      "loss_layer_12_head": 0.6418355703353882,
      "loss_layer_18_head": 0.6027976274490356,
      "loss_layer_24_head": 0.31365588307380676,
      "loss_layer_30_head": 0.19588688015937805,
      "loss_layer_36_head": 0.14614388346672058,
      "loss_layer_42_head": 0.07882354408502579,
      "loss_layer_6_head": 0.9202559590339661,
      "step": 2070
    },
    {
      "epoch": 26.56,
      "grad_norm": 0.5508807573244089,
      "learning_rate": 0.004591788252433664,
      "loss": 2.9319,
      "loss_layer_12_head": 0.650751531124115,
      "loss_layer_18_head": 0.5883651375770569,
      "loss_layer_24_head": 0.32289326190948486,
      "loss_layer_30_head": 0.19792437553405762,
      "loss_layer_36_head": 0.14756618440151215,
      "loss_layer_42_head": 0.07378596067428589,
      "loss_layer_6_head": 0.9382122755050659,
      "step": 2075
    },
    {
      "epoch": 26.624,
      "grad_norm": 0.5088650102059447,
      "learning_rate": 0.004588719528532341,
      "loss": 3.0074,
      "loss_layer_12_head": 0.6641827821731567,
      "loss_layer_18_head": 0.5771077871322632,
      "loss_layer_24_head": 0.33071082830429077,
      "loss_layer_30_head": 0.20514056086540222,
      "loss_layer_36_head": 0.1566210836172104,
      "loss_layer_42_head": 0.07869430631399155,
      "loss_layer_6_head": 0.9419099688529968,
      "step": 2080
    },
    {
      "epoch": 26.688,
      "grad_norm": 0.5727060132523473,
      "learning_rate": 0.004585640346707154,
      "loss": 3.056,
      "loss_layer_12_head": 0.6951443552970886,
      "loss_layer_18_head": 0.5986614227294922,
      "loss_layer_24_head": 0.34166979789733887,
      "loss_layer_30_head": 0.21002443134784698,
      "loss_layer_36_head": 0.1486164629459381,
      "loss_layer_42_head": 0.08392110466957092,
      "loss_layer_6_head": 1.0333703756332397,
      "step": 2085
    },
    {
      "epoch": 26.752,
      "grad_norm": 0.692422902268322,
      "learning_rate": 0.00458255072237513,
      "loss": 3.1342,
      "loss_layer_12_head": 0.6765012741088867,
      "loss_layer_18_head": 0.5820575952529907,
      "loss_layer_24_head": 0.33604031801223755,
      "loss_layer_30_head": 0.20902781188488007,
      "loss_layer_36_head": 0.1397251933813095,
      "loss_layer_42_head": 0.07510069757699966,
      "loss_layer_6_head": 1.0613892078399658,
      "step": 2090
    },
    {
      "epoch": 26.816,
      "grad_norm": 0.6931445119428886,
      "learning_rate": 0.0045794506710055815,
      "loss": 3.0809,
      "loss_layer_12_head": 0.6440457105636597,
      "loss_layer_18_head": 0.5909689664840698,
      "loss_layer_24_head": 0.3242195248603821,
      "loss_layer_30_head": 0.19809284806251526,
      "loss_layer_36_head": 0.1336321234703064,
      "loss_layer_42_head": 0.07267114520072937,
      "loss_layer_6_head": 1.0365307331085205,
      "step": 2095
    },
    {
      "epoch": 26.88,
      "grad_norm": 0.509877543768777,
      "learning_rate": 0.00457634020812003,
      "loss": 3.1963,
      "loss_layer_12_head": 0.7147582769393921,
      "loss_layer_18_head": 0.6956864595413208,
      "loss_layer_24_head": 0.3706192076206207,
      "loss_layer_30_head": 0.2205020636320114,
      "loss_layer_36_head": 0.14474917948246002,
      "loss_layer_42_head": 0.07950203865766525,
      "loss_layer_6_head": 1.202239751815796,
      "step": 2100
    },
    {
      "epoch": 26.944,
      "grad_norm": 0.4560124866359943,
      "learning_rate": 0.004573219349292122,
      "loss": 3.1631,
      "loss_layer_12_head": 0.703018069267273,
      "loss_layer_18_head": 0.6414461135864258,
      "loss_layer_24_head": 0.36159080266952515,
      "loss_layer_30_head": 0.21555331349372864,
      "loss_layer_36_head": 0.1438833624124527,
      "loss_layer_42_head": 0.08191318809986115,
      "loss_layer_6_head": 1.1449391841888428,
      "step": 2105
    },
    {
      "epoch": 27.008,
      "grad_norm": 0.26251249513984104,
      "learning_rate": 0.0045700881101475585,
      "loss": 3.069,
      "loss_layer_12_head": 0.6652728319168091,
      "loss_layer_18_head": 0.6010472774505615,
      "loss_layer_24_head": 0.34376299381256104,
      "loss_layer_30_head": 0.2011212855577469,
      "loss_layer_36_head": 0.1384899914264679,
      "loss_layer_42_head": 0.08094318956136703,
      "loss_layer_6_head": 1.0663573741912842,
      "step": 2110
    },
    {
      "epoch": 27.072,
      "grad_norm": 0.31899264620690804,
      "learning_rate": 0.004566946506364013,
      "loss": 2.8588,
      "loss_layer_12_head": 0.6228111982345581,
      "loss_layer_18_head": 0.5436338186264038,
      "loss_layer_24_head": 0.32807210087776184,
      "loss_layer_30_head": 0.18760253489017487,
      "loss_layer_36_head": 0.12601293623447418,
      "loss_layer_42_head": 0.07730147242546082,
      "loss_layer_6_head": 0.9528765678405762,
      "step": 2115
    },
    {
      "epoch": 27.136,
      "grad_norm": 0.3750326351568498,
      "learning_rate": 0.00456379455367105,
      "loss": 2.8716,
      "loss_layer_12_head": 0.6243578195571899,
      "loss_layer_18_head": 0.5332263708114624,
      "loss_layer_24_head": 0.3453260362148285,
      "loss_layer_30_head": 0.1960875689983368,
      "loss_layer_36_head": 0.12453963607549667,
      "loss_layer_42_head": 0.08662150055170059,
      "loss_layer_6_head": 0.9377754330635071,
      "step": 2120
    },
    {
      "epoch": 27.2,
      "grad_norm": 0.22290787603625714,
      "learning_rate": 0.004560632267850053,
      "loss": 2.8262,
      "loss_layer_12_head": 0.6181994676589966,
      "loss_layer_18_head": 0.5263110399246216,
      "loss_layer_24_head": 0.32987290620803833,
      "loss_layer_30_head": 0.18940457701683044,
      "loss_layer_36_head": 0.1268617808818817,
      "loss_layer_42_head": 0.08855735510587692,
      "loss_layer_6_head": 0.9013465642929077,
      "step": 2125
    },
    {
      "epoch": 27.264,
      "grad_norm": 0.5302758506441183,
      "learning_rate": 0.004557459664734141,
      "loss": 2.8542,
      "loss_layer_12_head": 0.651372492313385,
      "loss_layer_18_head": 0.553097128868103,
      "loss_layer_24_head": 0.34666937589645386,
      "loss_layer_30_head": 0.20660820603370667,
      "loss_layer_36_head": 0.13136355578899384,
      "loss_layer_42_head": 0.10167491436004639,
      "loss_layer_6_head": 0.9175933003425598,
      "step": 2130
    },
    {
      "epoch": 27.328,
      "grad_norm": 0.5257873406021739,
      "learning_rate": 0.0045542767602080895,
      "loss": 3.0233,
      "loss_layer_12_head": 0.681158185005188,
      "loss_layer_18_head": 0.568608283996582,
      "loss_layer_24_head": 0.33333539962768555,
      "loss_layer_30_head": 0.20848457515239716,
      "loss_layer_36_head": 0.13191251456737518,
      "loss_layer_42_head": 0.09854697436094284,
      "loss_layer_6_head": 0.9126516580581665,
      "step": 2135
    },
    {
      "epoch": 27.392,
      "grad_norm": 0.5036075355659602,
      "learning_rate": 0.004551083570208251,
      "loss": 3.0809,
      "loss_layer_12_head": 0.773600697517395,
      "loss_layer_18_head": 0.5883660316467285,
      "loss_layer_24_head": 0.3326999545097351,
      "loss_layer_30_head": 0.21410226821899414,
      "loss_layer_36_head": 0.13079780340194702,
      "loss_layer_42_head": 0.08748898655176163,
      "loss_layer_6_head": 0.9304162859916687,
      "step": 2140
    },
    {
      "epoch": 27.456,
      "grad_norm": 0.3920961751783294,
      "learning_rate": 0.00454788011072248,
      "loss": 3.0579,
      "loss_layer_12_head": 0.7628167867660522,
      "loss_layer_18_head": 0.5618486404418945,
      "loss_layer_24_head": 0.3304392695426941,
      "loss_layer_30_head": 0.20652513206005096,
      "loss_layer_36_head": 0.13650162518024445,
      "loss_layer_42_head": 0.08511413633823395,
      "loss_layer_6_head": 0.9148740768432617,
      "step": 2145
    },
    {
      "epoch": 27.52,
      "grad_norm": 0.4257014252649816,
      "learning_rate": 0.004544666397790042,
      "loss": 2.9982,
      "loss_layer_12_head": 0.7609049677848816,
      "loss_layer_18_head": 0.5664193034172058,
      "loss_layer_24_head": 0.3341578245162964,
      "loss_layer_30_head": 0.2297562062740326,
      "loss_layer_36_head": 0.1418401300907135,
      "loss_layer_42_head": 0.08695818483829498,
      "loss_layer_6_head": 0.929245114326477,
      "step": 2150
    },
    {
      "epoch": 27.584,
      "grad_norm": 0.4668907748783865,
      "learning_rate": 0.004541442447501549,
      "loss": 3.0287,
      "loss_layer_12_head": 0.742009699344635,
      "loss_layer_18_head": 0.5522295236587524,
      "loss_layer_24_head": 0.3286585211753845,
      "loss_layer_30_head": 0.20474526286125183,
      "loss_layer_36_head": 0.1352091133594513,
      "loss_layer_42_head": 0.08259187638759613,
      "loss_layer_6_head": 0.9116083979606628,
      "step": 2155
    },
    {
      "epoch": 27.648,
      "grad_norm": 0.34664217561359134,
      "learning_rate": 0.004538208275998861,
      "loss": 3.01,
      "loss_layer_12_head": 0.7528090476989746,
      "loss_layer_18_head": 0.5679936408996582,
      "loss_layer_24_head": 0.34072890877723694,
      "loss_layer_30_head": 0.21010306477546692,
      "loss_layer_36_head": 0.14381557703018188,
      "loss_layer_42_head": 0.08117672055959702,
      "loss_layer_6_head": 0.9503291845321655,
      "step": 2160
    },
    {
      "epoch": 27.712,
      "grad_norm": 0.3541904504441687,
      "learning_rate": 0.00453496389947502,
      "loss": 2.9903,
      "loss_layer_12_head": 0.7502453923225403,
      "loss_layer_18_head": 0.5729094743728638,
      "loss_layer_24_head": 0.34654325246810913,
      "loss_layer_30_head": 0.21894705295562744,
      "loss_layer_36_head": 0.14152508974075317,
      "loss_layer_42_head": 0.08530915528535843,
      "loss_layer_6_head": 0.9618476629257202,
      "step": 2165
    },
    {
      "epoch": 27.776,
      "grad_norm": 0.4057527845003344,
      "learning_rate": 0.0045317093341741615,
      "loss": 2.9215,
      "loss_layer_12_head": 0.6779439449310303,
      "loss_layer_18_head": 0.5396813154220581,
      "loss_layer_24_head": 0.3163859248161316,
      "loss_layer_30_head": 0.20133265852928162,
      "loss_layer_36_head": 0.12853218615055084,
      "loss_layer_42_head": 0.07760678976774216,
      "loss_layer_6_head": 0.9023246765136719,
      "step": 2170
    },
    {
      "epoch": 27.84,
      "grad_norm": 0.2816415359911579,
      "learning_rate": 0.004528444596391433,
      "loss": 2.9314,
      "loss_layer_12_head": 0.6869233846664429,
      "loss_layer_18_head": 0.5558017492294312,
      "loss_layer_24_head": 0.33123236894607544,
      "loss_layer_30_head": 0.20556628704071045,
      "loss_layer_36_head": 0.13566318154335022,
      "loss_layer_42_head": 0.08006473630666733,
      "loss_layer_6_head": 0.9339157342910767,
      "step": 2175
    },
    {
      "epoch": 27.904,
      "grad_norm": 0.23189135893989643,
      "learning_rate": 0.0045251697024729165,
      "loss": 2.9192,
      "loss_layer_12_head": 0.6643164753913879,
      "loss_layer_18_head": 0.5566911697387695,
      "loss_layer_24_head": 0.3315706253051758,
      "loss_layer_30_head": 0.20314237475395203,
      "loss_layer_36_head": 0.1302090287208557,
      "loss_layer_42_head": 0.07545685768127441,
      "loss_layer_6_head": 0.9062907099723816,
      "step": 2180
    },
    {
      "epoch": 27.968,
      "grad_norm": 0.5490687943759491,
      "learning_rate": 0.004521884668815545,
      "loss": 2.9989,
      "loss_layer_12_head": 0.6815639734268188,
      "loss_layer_18_head": 0.6222435235977173,
      "loss_layer_24_head": 0.3434344530105591,
      "loss_layer_30_head": 0.20724093914031982,
      "loss_layer_36_head": 0.13804176449775696,
      "loss_layer_42_head": 0.08291787654161453,
      "loss_layer_6_head": 0.947675347328186,
      "step": 2185
    },
    {
      "epoch": 28.032,
      "grad_norm": 0.3762423754432878,
      "learning_rate": 0.0045185895118670175,
      "loss": 2.9635,
      "loss_layer_12_head": 0.6596229076385498,
      "loss_layer_18_head": 0.6128451228141785,
      "loss_layer_24_head": 0.3529302477836609,
      "loss_layer_30_head": 0.21032562851905823,
      "loss_layer_36_head": 0.14685119688510895,
      "loss_layer_42_head": 0.0825103223323822,
      "loss_layer_6_head": 0.9303401708602905,
      "step": 2190
    },
    {
      "epoch": 28.096,
      "grad_norm": 0.36868079202630427,
      "learning_rate": 0.004515284248125718,
      "loss": 2.822,
      "loss_layer_12_head": 0.6239296197891235,
      "loss_layer_18_head": 0.5627091526985168,
      "loss_layer_24_head": 0.3314482569694519,
      "loss_layer_30_head": 0.19831296801567078,
      "loss_layer_36_head": 0.1356363445520401,
      "loss_layer_42_head": 0.08486764132976532,
      "loss_layer_6_head": 0.9032891392707825,
      "step": 2195
    },
    {
      "epoch": 28.16,
      "grad_norm": 0.37804310252562595,
      "learning_rate": 0.004511968894140639,
      "loss": 2.8619,
      "loss_layer_12_head": 0.6425489187240601,
      "loss_layer_18_head": 0.5499609708786011,
      "loss_layer_24_head": 0.33730897307395935,
      "loss_layer_30_head": 0.19301500916481018,
      "loss_layer_36_head": 0.13809767365455627,
      "loss_layer_42_head": 0.08502845466136932,
      "loss_layer_6_head": 0.9140973091125488,
      "step": 2200
    },
    {
      "epoch": 28.16,
      "eval_loss": 5.516895294189453,
      "eval_loss_layer_12_head": 1.2565486431121826,
      "eval_loss_layer_18_head": 1.0981875658035278,
      "eval_loss_layer_24_head": 0.7339855432510376,
      "eval_loss_layer_30_head": 0.4624411165714264,
      "eval_loss_layer_36_head": 0.3220587372779846,
      "eval_loss_layer_42_head": 0.2037886083126068,
      "eval_loss_layer_6_head": 1.5917366743087769,
      "eval_runtime": 33.0726,
      "eval_samples_per_second": 9.676,
      "eval_steps_per_second": 0.605,
      "step": 2200
    },
    {
      "epoch": 28.224,
      "grad_norm": 0.3440364980530841,
      "learning_rate": 0.004508643466511287,
      "loss": 2.8448,
      "loss_layer_12_head": 0.6182918548583984,
      "loss_layer_18_head": 0.5268956422805786,
      "loss_layer_24_head": 0.32239672541618347,
      "loss_layer_30_head": 0.1929280012845993,
      "loss_layer_36_head": 0.13488659262657166,
      "loss_layer_42_head": 0.08002984523773193,
      "loss_layer_6_head": 0.9153164029121399,
      "step": 2205
    },
    {
      "epoch": 28.288,
      "grad_norm": 0.2753887493866683,
      "learning_rate": 0.0045053079818876095,
      "loss": 2.8355,
      "loss_layer_12_head": 0.6224581003189087,
      "loss_layer_18_head": 0.5171734690666199,
      "loss_layer_24_head": 0.3236997127532959,
      "loss_layer_30_head": 0.1885015368461609,
      "loss_layer_36_head": 0.13724128901958466,
      "loss_layer_42_head": 0.07483763992786407,
      "loss_layer_6_head": 0.9078689813613892,
      "step": 2210
    },
    {
      "epoch": 28.352,
      "grad_norm": 0.3884858714012794,
      "learning_rate": 0.004501962456969908,
      "loss": 2.8229,
      "loss_layer_12_head": 0.6462548971176147,
      "loss_layer_18_head": 0.53521329164505,
      "loss_layer_24_head": 0.335470587015152,
      "loss_layer_30_head": 0.21482236683368683,
      "loss_layer_36_head": 0.1424712836742401,
      "loss_layer_42_head": 0.07892707735300064,
      "loss_layer_6_head": 0.9373083114624023,
      "step": 2215
    },
    {
      "epoch": 28.416,
      "grad_norm": 0.374904973394686,
      "learning_rate": 0.004498606908508753,
      "loss": 2.8547,
      "loss_layer_12_head": 0.6357049942016602,
      "loss_layer_18_head": 0.5153493881225586,
      "loss_layer_24_head": 0.33098047971725464,
      "loss_layer_30_head": 0.20387883484363556,
      "loss_layer_36_head": 0.14368729293346405,
      "loss_layer_42_head": 0.08135879784822464,
      "loss_layer_6_head": 0.9139909744262695,
      "step": 2220
    },
    {
      "epoch": 28.48,
      "grad_norm": 0.44533008327896784,
      "learning_rate": 0.004495241353304902,
      "loss": 2.8738,
      "loss_layer_12_head": 0.6485244035720825,
      "loss_layer_18_head": 0.5060588717460632,
      "loss_layer_24_head": 0.32497578859329224,
      "loss_layer_30_head": 0.19586384296417236,
      "loss_layer_36_head": 0.1393355429172516,
      "loss_layer_42_head": 0.08122323453426361,
      "loss_layer_6_head": 0.8997598886489868,
      "step": 2225
    },
    {
      "epoch": 28.544,
      "grad_norm": 0.5095686936045166,
      "learning_rate": 0.004491865808209215,
      "loss": 2.8708,
      "loss_layer_12_head": 0.6856842637062073,
      "loss_layer_18_head": 0.515933632850647,
      "loss_layer_24_head": 0.33473676443099976,
      "loss_layer_30_head": 0.19362571835517883,
      "loss_layer_36_head": 0.1368073672056198,
      "loss_layer_42_head": 0.07622112333774567,
      "loss_layer_6_head": 0.9283815622329712,
      "step": 2230
    },
    {
      "epoch": 28.608,
      "grad_norm": 0.3922345025096432,
      "learning_rate": 0.00448848029012257,
      "loss": 2.8946,
      "loss_layer_12_head": 0.706060528755188,
      "loss_layer_18_head": 0.552822470664978,
      "loss_layer_24_head": 0.3506784439086914,
      "loss_layer_30_head": 0.20823974907398224,
      "loss_layer_36_head": 0.14482156932353973,
      "loss_layer_42_head": 0.07634530961513519,
      "loss_layer_6_head": 0.993486762046814,
      "step": 2235
    },
    {
      "epoch": 28.672,
      "grad_norm": 0.5553002344262368,
      "learning_rate": 0.004485084815995778,
      "loss": 2.9412,
      "loss_layer_12_head": 0.6914700865745544,
      "loss_layer_18_head": 0.544337809085846,
      "loss_layer_24_head": 0.341656893491745,
      "loss_layer_30_head": 0.20552317798137665,
      "loss_layer_36_head": 0.14885225892066956,
      "loss_layer_42_head": 0.07566074281930923,
      "loss_layer_6_head": 1.022914171218872,
      "step": 2240
    },
    {
      "epoch": 28.736,
      "grad_norm": 0.3994538686150298,
      "learning_rate": 0.004481679402829499,
      "loss": 2.9482,
      "loss_layer_12_head": 0.6900393962860107,
      "loss_layer_18_head": 0.5475031137466431,
      "loss_layer_24_head": 0.33903297781944275,
      "loss_layer_30_head": 0.21370847523212433,
      "loss_layer_36_head": 0.14300411939620972,
      "loss_layer_42_head": 0.07681452482938766,
      "loss_layer_6_head": 1.0299934148788452,
      "step": 2245
    },
    {
      "epoch": 28.8,
      "grad_norm": 0.48161246150663517,
      "learning_rate": 0.0044782640676741545,
      "loss": 2.944,
      "loss_layer_12_head": 0.6419116258621216,
      "loss_layer_18_head": 0.5092245936393738,
      "loss_layer_24_head": 0.31729525327682495,
      "loss_layer_30_head": 0.22591714560985565,
      "loss_layer_36_head": 0.14146195352077484,
      "loss_layer_42_head": 0.09127528965473175,
      "loss_layer_6_head": 0.9923983812332153,
      "step": 2250
    },
    {
      "epoch": 28.864,
      "grad_norm": 0.47915355921376945,
      "learning_rate": 0.0044748388276298475,
      "loss": 2.9694,
      "loss_layer_12_head": 0.6717289686203003,
      "loss_layer_18_head": 0.5387143492698669,
      "loss_layer_24_head": 0.332899808883667,
      "loss_layer_30_head": 0.25244858860969543,
      "loss_layer_36_head": 0.14744645357131958,
      "loss_layer_42_head": 0.07241874933242798,
      "loss_layer_6_head": 1.0181188583374023,
      "step": 2255
    },
    {
      "epoch": 28.928,
      "grad_norm": 0.4614868111881683,
      "learning_rate": 0.004471403699846272,
      "loss": 3.0079,
      "loss_layer_12_head": 0.652738094329834,
      "loss_layer_18_head": 0.5232523679733276,
      "loss_layer_24_head": 0.3237007260322571,
      "loss_layer_30_head": 0.274820476770401,
      "loss_layer_36_head": 0.14241741597652435,
      "loss_layer_42_head": 0.07316947728395462,
      "loss_layer_6_head": 0.9614084362983704,
      "step": 2260
    },
    {
      "epoch": 28.992,
      "grad_norm": 0.36013533999631564,
      "learning_rate": 0.004467958701522625,
      "loss": 2.9812,
      "loss_layer_12_head": 0.6605039834976196,
      "loss_layer_18_head": 0.5233721137046814,
      "loss_layer_24_head": 0.32634106278419495,
      "loss_layer_30_head": 0.28399187326431274,
      "loss_layer_36_head": 0.1402839869260788,
      "loss_layer_42_head": 0.0863712802529335,
      "loss_layer_6_head": 0.958611786365509,
      "step": 2265
    },
    {
      "epoch": 29.056,
      "grad_norm": 0.35417610798099286,
      "learning_rate": 0.00446450384990753,
      "loss": 2.8319,
      "loss_layer_12_head": 0.6352097988128662,
      "loss_layer_18_head": 0.4998127520084381,
      "loss_layer_24_head": 0.3123530447483063,
      "loss_layer_30_head": 0.2753804624080658,
      "loss_layer_36_head": 0.1382858008146286,
      "loss_layer_42_head": 0.0835491344332695,
      "loss_layer_6_head": 0.9165986776351929,
      "step": 2270
    },
    {
      "epoch": 29.12,
      "grad_norm": 0.4530722625922806,
      "learning_rate": 0.004461039162298939,
      "loss": 2.8157,
      "loss_layer_12_head": 0.6202890276908875,
      "loss_layer_18_head": 0.4909568428993225,
      "loss_layer_24_head": 0.3172857165336609,
      "loss_layer_30_head": 0.2479230910539627,
      "loss_layer_36_head": 0.13565100729465485,
      "loss_layer_42_head": 0.08335566520690918,
      "loss_layer_6_head": 0.9057416915893555,
      "step": 2275
    },
    {
      "epoch": 29.184,
      "grad_norm": 0.35606783918433005,
      "learning_rate": 0.004457564656044056,
      "loss": 2.7806,
      "loss_layer_12_head": 0.6248764991760254,
      "loss_layer_18_head": 0.4954879879951477,
      "loss_layer_24_head": 0.33379194140434265,
      "loss_layer_30_head": 0.2361137866973877,
      "loss_layer_36_head": 0.13553549349308014,
      "loss_layer_42_head": 0.09363728016614914,
      "loss_layer_6_head": 0.9061884880065918,
      "step": 2280
    },
    {
      "epoch": 29.248,
      "grad_norm": 0.316130754771413,
      "learning_rate": 0.004454080348539241,
      "loss": 2.8386,
      "loss_layer_12_head": 0.6361194252967834,
      "loss_layer_18_head": 0.5104520320892334,
      "loss_layer_24_head": 0.3565909266471863,
      "loss_layer_30_head": 0.2286929339170456,
      "loss_layer_36_head": 0.13367272913455963,
      "loss_layer_42_head": 0.09152592718601227,
      "loss_layer_6_head": 0.9204713702201843,
      "step": 2285
    },
    {
      "epoch": 29.312,
      "grad_norm": 0.517090973976984,
      "learning_rate": 0.004450586257229931,
      "loss": 2.9239,
      "loss_layer_12_head": 0.6245343089103699,
      "loss_layer_18_head": 0.48642468452453613,
      "loss_layer_24_head": 0.4241728186607361,
      "loss_layer_30_head": 0.20729169249534607,
      "loss_layer_36_head": 0.12597283720970154,
      "loss_layer_42_head": 0.08959697932004929,
      "loss_layer_6_head": 0.9033238291740417,
      "step": 2290
    },
    {
      "epoch": 29.376,
      "grad_norm": 0.5290676608544598,
      "learning_rate": 0.004447082399610549,
      "loss": 2.891,
      "loss_layer_12_head": 0.6734315752983093,
      "loss_layer_18_head": 0.5117517113685608,
      "loss_layer_24_head": 0.4191058278083801,
      "loss_layer_30_head": 0.21334803104400635,
      "loss_layer_36_head": 0.1311933845281601,
      "loss_layer_42_head": 0.08824415504932404,
      "loss_layer_6_head": 0.9379865527153015,
      "step": 2295
    },
    {
      "epoch": 29.44,
      "grad_norm": 0.4043054944069589,
      "learning_rate": 0.004443568793224415,
      "loss": 2.9442,
      "loss_layer_12_head": 0.6356271505355835,
      "loss_layer_18_head": 0.49112796783447266,
      "loss_layer_24_head": 0.3770146667957306,
      "loss_layer_30_head": 0.19702589511871338,
      "loss_layer_36_head": 0.1228431910276413,
      "loss_layer_42_head": 0.08939050883054733,
      "loss_layer_6_head": 0.9200025796890259,
      "step": 2300
    },
    {
      "epoch": 29.504,
      "grad_norm": 0.46274619269703116,
      "learning_rate": 0.00444004545566366,
      "loss": 2.9209,
      "loss_layer_12_head": 0.7046224474906921,
      "loss_layer_18_head": 0.5418529510498047,
      "loss_layer_24_head": 0.4004139006137848,
      "loss_layer_30_head": 0.2130076140165329,
      "loss_layer_36_head": 0.14110150933265686,
      "loss_layer_42_head": 0.09198819845914841,
      "loss_layer_6_head": 0.9882007837295532,
      "step": 2305
    },
    {
      "epoch": 29.568,
      "grad_norm": 0.4861919053805936,
      "learning_rate": 0.004436512404569136,
      "loss": 2.9858,
      "loss_layer_12_head": 0.690976619720459,
      "loss_layer_18_head": 0.5102543234825134,
      "loss_layer_24_head": 0.37804126739501953,
      "loss_layer_30_head": 0.2011752873659134,
      "loss_layer_36_head": 0.1383427083492279,
      "loss_layer_42_head": 0.08783556520938873,
      "loss_layer_6_head": 0.9309824109077454,
      "step": 2310
    },
    {
      "epoch": 29.632,
      "grad_norm": 0.4730948854817693,
      "learning_rate": 0.004432969657630335,
      "loss": 3.0261,
      "loss_layer_12_head": 0.7884088158607483,
      "loss_layer_18_head": 0.532573938369751,
      "loss_layer_24_head": 0.3804248869419098,
      "loss_layer_30_head": 0.20497103035449982,
      "loss_layer_36_head": 0.1478273719549179,
      "loss_layer_42_head": 0.08182187378406525,
      "loss_layer_6_head": 0.9726240038871765,
      "step": 2315
    },
    {
      "epoch": 29.696,
      "grad_norm": 0.40647179968728386,
      "learning_rate": 0.0044294172325852876,
      "loss": 3.0511,
      "loss_layer_12_head": 0.7857102155685425,
      "loss_layer_18_head": 0.5208314657211304,
      "loss_layer_24_head": 0.36689549684524536,
      "loss_layer_30_head": 0.20250192284584045,
      "loss_layer_36_head": 0.1508600413799286,
      "loss_layer_42_head": 0.07676565647125244,
      "loss_layer_6_head": 0.9397255182266235,
      "step": 2320
    },
    {
      "epoch": 29.76,
      "grad_norm": 0.41322173866723305,
      "learning_rate": 0.004425855147220487,
      "loss": 3.0789,
      "loss_layer_12_head": 0.8226727247238159,
      "loss_layer_18_head": 0.5248513221740723,
      "loss_layer_24_head": 0.37408512830734253,
      "loss_layer_30_head": 0.19583280384540558,
      "loss_layer_36_head": 0.13929159939289093,
      "loss_layer_42_head": 0.08196009695529938,
      "loss_layer_6_head": 0.9313977360725403,
      "step": 2325
    },
    {
      "epoch": 29.824,
      "grad_norm": 0.3103079122653119,
      "learning_rate": 0.004422283419370789,
      "loss": 3.0965,
      "loss_layer_12_head": 0.8031497001647949,
      "loss_layer_18_head": 0.5087049007415771,
      "loss_layer_24_head": 0.3702153265476227,
      "loss_layer_30_head": 0.19278450310230255,
      "loss_layer_36_head": 0.13569281995296478,
      "loss_layer_42_head": 0.0795128270983696,
      "loss_layer_6_head": 0.9110148549079895,
      "step": 2330
    },
    {
      "epoch": 29.888,
      "grad_norm": 0.2413583564998676,
      "learning_rate": 0.004418702066919334,
      "loss": 3.0033,
      "loss_layer_12_head": 0.7847518920898438,
      "loss_layer_18_head": 0.5492113828659058,
      "loss_layer_24_head": 0.3761199116706848,
      "loss_layer_30_head": 0.20826736092567444,
      "loss_layer_36_head": 0.13892602920532227,
      "loss_layer_42_head": 0.07847287505865097,
      "loss_layer_6_head": 0.9586340188980103,
      "step": 2335
    },
    {
      "epoch": 29.951999999999998,
      "grad_norm": 0.4133880205219806,
      "learning_rate": 0.004415111107797445,
      "loss": 2.948,
      "loss_layer_12_head": 0.73412024974823,
      "loss_layer_18_head": 0.534786581993103,
      "loss_layer_24_head": 0.36271148920059204,
      "loss_layer_30_head": 0.20488736033439636,
      "loss_layer_36_head": 0.13886985182762146,
      "loss_layer_42_head": 0.0757991224527359,
      "loss_layer_6_head": 0.9255987405776978,
      "step": 2340
    },
    {
      "epoch": 30.016,
      "grad_norm": 0.45913874500902524,
      "learning_rate": 0.0044115105599845505,
      "loss": 3.0403,
      "loss_layer_12_head": 0.7001451253890991,
      "loss_layer_18_head": 0.6520997285842896,
      "loss_layer_24_head": 0.348941832780838,
      "loss_layer_30_head": 0.19627046585083008,
      "loss_layer_36_head": 0.13534171879291534,
      "loss_layer_42_head": 0.07588890939950943,
      "loss_layer_6_head": 0.9225309491157532,
      "step": 2345
    },
    {
      "epoch": 30.08,
      "grad_norm": 0.4612310966900979,
      "learning_rate": 0.004407900441508084,
      "loss": 3.0542,
      "loss_layer_12_head": 0.6488720178604126,
      "loss_layer_18_head": 0.8304535150527954,
      "loss_layer_24_head": 0.32383593916893005,
      "loss_layer_30_head": 0.18435868620872498,
      "loss_layer_36_head": 0.1260366141796112,
      "loss_layer_42_head": 0.07139839231967926,
      "loss_layer_6_head": 0.8790257573127747,
      "step": 2350
    },
    {
      "epoch": 30.144,
      "grad_norm": 0.46609394618090505,
      "learning_rate": 0.004404280770443398,
      "loss": 3.0136,
      "loss_layer_12_head": 0.6441697478294373,
      "loss_layer_18_head": 0.7356642484664917,
      "loss_layer_24_head": 0.32398709654808044,
      "loss_layer_30_head": 0.18400707840919495,
      "loss_layer_36_head": 0.12492396682500839,
      "loss_layer_42_head": 0.0728595182299614,
      "loss_layer_6_head": 0.8941172361373901,
      "step": 2355
    },
    {
      "epoch": 30.208,
      "grad_norm": 0.4310164536013382,
      "learning_rate": 0.004400651564913676,
      "loss": 2.9341,
      "loss_layer_12_head": 0.6274635791778564,
      "loss_layer_18_head": 0.680252194404602,
      "loss_layer_24_head": 0.31686440110206604,
      "loss_layer_30_head": 0.18745937943458557,
      "loss_layer_36_head": 0.12818244099617004,
      "loss_layer_42_head": 0.07259224355220795,
      "loss_layer_6_head": 0.8922514915466309,
      "step": 2360
    },
    {
      "epoch": 30.272,
      "grad_norm": 0.3727029420829755,
      "learning_rate": 0.004397012843089838,
      "loss": 2.8805,
      "loss_layer_12_head": 0.625210165977478,
      "loss_layer_18_head": 0.6199266314506531,
      "loss_layer_24_head": 0.3131222128868103,
      "loss_layer_30_head": 0.1904037892818451,
      "loss_layer_36_head": 0.1295245736837387,
      "loss_layer_42_head": 0.08052670210599899,
      "loss_layer_6_head": 0.8934969902038574,
      "step": 2365
    },
    {
      "epoch": 30.336,
      "grad_norm": 0.5270563271441383,
      "learning_rate": 0.00439336462319045,
      "loss": 2.9081,
      "loss_layer_12_head": 0.643890917301178,
      "loss_layer_18_head": 0.5988008379936218,
      "loss_layer_24_head": 0.3393189311027527,
      "loss_layer_30_head": 0.2071586549282074,
      "loss_layer_36_head": 0.15210726857185364,
      "loss_layer_42_head": 0.08902803063392639,
      "loss_layer_6_head": 0.9499632716178894,
      "step": 2370
    },
    {
      "epoch": 30.4,
      "grad_norm": 0.43150071867129386,
      "learning_rate": 0.004389706923481633,
      "loss": 2.9338,
      "loss_layer_12_head": 0.6336660385131836,
      "loss_layer_18_head": 0.56342613697052,
      "loss_layer_24_head": 0.31830233335494995,
      "loss_layer_30_head": 0.1939631849527359,
      "loss_layer_36_head": 0.13562741875648499,
      "loss_layer_42_head": 0.07907290756702423,
      "loss_layer_6_head": 0.9891277551651001,
      "step": 2375
    },
    {
      "epoch": 30.464,
      "grad_norm": 0.49374206813234867,
      "learning_rate": 0.0043860397622769755,
      "loss": 2.9113,
      "loss_layer_12_head": 0.6477726697921753,
      "loss_layer_18_head": 0.5559790134429932,
      "loss_layer_24_head": 0.3344138264656067,
      "loss_layer_30_head": 0.20431268215179443,
      "loss_layer_36_head": 0.14482679963111877,
      "loss_layer_42_head": 0.08751516044139862,
      "loss_layer_6_head": 1.0305252075195312,
      "step": 2380
    },
    {
      "epoch": 30.528,
      "grad_norm": 0.5094039069644478,
      "learning_rate": 0.004382363157937435,
      "loss": 2.8671,
      "loss_layer_12_head": 0.6274566650390625,
      "loss_layer_18_head": 0.5259733200073242,
      "loss_layer_24_head": 0.3135523498058319,
      "loss_layer_30_head": 0.1932622492313385,
      "loss_layer_36_head": 0.13374567031860352,
      "loss_layer_42_head": 0.08830223232507706,
      "loss_layer_6_head": 0.9773724675178528,
      "step": 2385
    },
    {
      "epoch": 30.592,
      "grad_norm": 0.3922165828629571,
      "learning_rate": 0.004378677128871251,
      "loss": 2.9799,
      "loss_layer_12_head": 0.6185094714164734,
      "loss_layer_18_head": 0.5089303255081177,
      "loss_layer_24_head": 0.3057869076728821,
      "loss_layer_30_head": 0.1880747377872467,
      "loss_layer_36_head": 0.13187265396118164,
      "loss_layer_42_head": 0.08248680830001831,
      "loss_layer_6_head": 1.0063021183013916,
      "step": 2390
    },
    {
      "epoch": 30.656,
      "grad_norm": 0.3393104691477881,
      "learning_rate": 0.004374981693533848,
      "loss": 2.9536,
      "loss_layer_12_head": 0.6610860824584961,
      "loss_layer_18_head": 0.5352355241775513,
      "loss_layer_24_head": 0.3237577974796295,
      "loss_layer_30_head": 0.19882535934448242,
      "loss_layer_36_head": 0.13875961303710938,
      "loss_layer_42_head": 0.08165042847394943,
      "loss_layer_6_head": 1.023803472518921,
      "step": 2395
    },
    {
      "epoch": 30.72,
      "grad_norm": 0.2485645080823883,
      "learning_rate": 0.004371276870427753,
      "loss": 2.9146,
      "loss_layer_12_head": 0.6586200594902039,
      "loss_layer_18_head": 0.5262429714202881,
      "loss_layer_24_head": 0.32340413331985474,
      "loss_layer_30_head": 0.19818100333213806,
      "loss_layer_36_head": 0.15073995292186737,
      "loss_layer_42_head": 0.08152748644351959,
      "loss_layer_6_head": 0.9807730913162231,
      "step": 2400
    },
    {
      "epoch": 30.72,
      "eval_loss": 5.496026039123535,
      "eval_loss_layer_12_head": 1.2661367654800415,
      "eval_loss_layer_18_head": 1.0883880853652954,
      "eval_loss_layer_24_head": 0.7008371949195862,
      "eval_loss_layer_30_head": 0.4590054154396057,
      "eval_loss_layer_36_head": 0.3065876364707947,
      "eval_loss_layer_42_head": 0.1774936020374298,
      "eval_loss_layer_6_head": 1.6334257125854492,
      "eval_runtime": 33.0648,
      "eval_samples_per_second": 9.678,
      "eval_steps_per_second": 0.605,
      "step": 2400
    },
    {
      "epoch": 30.784,
      "grad_norm": 0.19345710302375985,
      "learning_rate": 0.00436756267810249,
      "loss": 2.8455,
      "loss_layer_12_head": 0.6523048281669617,
      "loss_layer_18_head": 0.5223513841629028,
      "loss_layer_24_head": 0.32487553358078003,
      "loss_layer_30_head": 0.19654437899589539,
      "loss_layer_36_head": 0.14182516932487488,
      "loss_layer_42_head": 0.08124905079603195,
      "loss_layer_6_head": 0.9537304639816284,
      "step": 2405
    },
    {
      "epoch": 30.848,
      "grad_norm": 0.21305336741018652,
      "learning_rate": 0.004363839135154497,
      "loss": 2.8703,
      "loss_layer_12_head": 0.6446037292480469,
      "loss_layer_18_head": 0.5153535008430481,
      "loss_layer_24_head": 0.32086193561553955,
      "loss_layer_30_head": 0.19337591528892517,
      "loss_layer_36_head": 0.13697603344917297,
      "loss_layer_42_head": 0.07871032506227493,
      "loss_layer_6_head": 0.9332365989685059,
      "step": 2410
    },
    {
      "epoch": 30.912,
      "grad_norm": 0.2710584976547138,
      "learning_rate": 0.004360106260227027,
      "loss": 2.9258,
      "loss_layer_12_head": 0.7002941966056824,
      "loss_layer_18_head": 0.5590299367904663,
      "loss_layer_24_head": 0.34282809495925903,
      "loss_layer_30_head": 0.20360009372234344,
      "loss_layer_36_head": 0.13802145421504974,
      "loss_layer_42_head": 0.07697184383869171,
      "loss_layer_6_head": 1.0074399709701538,
      "step": 2415
    },
    {
      "epoch": 30.976,
      "grad_norm": 0.28440373226804616,
      "learning_rate": 0.004356364072010058,
      "loss": 2.8415,
      "loss_layer_12_head": 0.690104603767395,
      "loss_layer_18_head": 0.5517177581787109,
      "loss_layer_24_head": 0.33594030141830444,
      "loss_layer_30_head": 0.19900009036064148,
      "loss_layer_36_head": 0.13719584047794342,
      "loss_layer_42_head": 0.07840054482221603,
      "loss_layer_6_head": 0.9791363477706909,
      "step": 2420
    },
    {
      "epoch": 31.04,
      "grad_norm": 0.23549191673042438,
      "learning_rate": 0.004352612589240199,
      "loss": 2.7575,
      "loss_layer_12_head": 0.6429499387741089,
      "loss_layer_18_head": 0.5060126185417175,
      "loss_layer_24_head": 0.30615440011024475,
      "loss_layer_30_head": 0.19005738198757172,
      "loss_layer_36_head": 0.12762241065502167,
      "loss_layer_42_head": 0.07652322947978973,
      "loss_layer_6_head": 0.9064340591430664,
      "step": 2425
    },
    {
      "epoch": 31.104,
      "grad_norm": 0.20464252464669502,
      "learning_rate": 0.004348851830700593,
      "loss": 2.7087,
      "loss_layer_12_head": 0.6369266510009766,
      "loss_layer_18_head": 0.5097042322158813,
      "loss_layer_24_head": 0.3159019947052002,
      "loss_layer_30_head": 0.2181735336780548,
      "loss_layer_36_head": 0.13391545414924622,
      "loss_layer_42_head": 0.12845589220523834,
      "loss_layer_6_head": 0.9042560458183289,
      "step": 2430
    },
    {
      "epoch": 31.168,
      "grad_norm": 0.28709174766240425,
      "learning_rate": 0.004345081815220829,
      "loss": 2.7089,
      "loss_layer_12_head": 0.6125655770301819,
      "loss_layer_18_head": 0.4851406514644623,
      "loss_layer_24_head": 0.2996302545070648,
      "loss_layer_30_head": 0.2054021656513214,
      "loss_layer_36_head": 0.1227855533361435,
      "loss_layer_42_head": 0.07299292832612991,
      "loss_layer_6_head": 0.8628458976745605,
      "step": 2435
    },
    {
      "epoch": 31.232,
      "grad_norm": 0.4049262450322382,
      "learning_rate": 0.004341302561676842,
      "loss": 2.7515,
      "loss_layer_12_head": 0.6451377868652344,
      "loss_layer_18_head": 0.5012729167938232,
      "loss_layer_24_head": 0.31415989995002747,
      "loss_layer_30_head": 0.22865085303783417,
      "loss_layer_36_head": 0.12529698014259338,
      "loss_layer_42_head": 0.07337956130504608,
      "loss_layer_6_head": 0.8903218507766724,
      "step": 2440
    },
    {
      "epoch": 31.296,
      "grad_norm": 0.27525887357666706,
      "learning_rate": 0.0043375140889908214,
      "loss": 2.7785,
      "loss_layer_12_head": 0.6425033807754517,
      "loss_layer_18_head": 0.4994775354862213,
      "loss_layer_24_head": 0.3115004003047943,
      "loss_layer_30_head": 0.213334321975708,
      "loss_layer_36_head": 0.12447787821292877,
      "loss_layer_42_head": 0.07190011441707611,
      "loss_layer_6_head": 0.8851497769355774,
      "step": 2445
    },
    {
      "epoch": 31.36,
      "grad_norm": 0.34108292183111866,
      "learning_rate": 0.004333716416131114,
      "loss": 2.8161,
      "loss_layer_12_head": 0.6560153961181641,
      "loss_layer_18_head": 0.5116560459136963,
      "loss_layer_24_head": 0.32144251465797424,
      "loss_layer_30_head": 0.23159527778625488,
      "loss_layer_36_head": 0.13238605856895447,
      "loss_layer_42_head": 0.07633794099092484,
      "loss_layer_6_head": 0.9090645909309387,
      "step": 2450
    },
    {
      "epoch": 31.424,
      "grad_norm": 0.4637706684230083,
      "learning_rate": 0.004329909562112134,
      "loss": 2.8576,
      "loss_layer_12_head": 0.659212589263916,
      "loss_layer_18_head": 0.5184289813041687,
      "loss_layer_24_head": 0.32582226395606995,
      "loss_layer_30_head": 0.2336900532245636,
      "loss_layer_36_head": 0.12876638770103455,
      "loss_layer_42_head": 0.07371669262647629,
      "loss_layer_6_head": 0.9223982691764832,
      "step": 2455
    },
    {
      "epoch": 31.488,
      "grad_norm": 0.5138029999554128,
      "learning_rate": 0.004326093545994258,
      "loss": 2.8561,
      "loss_layer_12_head": 0.6420512795448303,
      "loss_layer_18_head": 0.48391443490982056,
      "loss_layer_24_head": 0.30785712599754333,
      "loss_layer_30_head": 0.23524832725524902,
      "loss_layer_36_head": 0.12973396480083466,
      "loss_layer_42_head": 0.07070479542016983,
      "loss_layer_6_head": 0.8700225949287415,
      "step": 2460
    },
    {
      "epoch": 31.552,
      "grad_norm": 0.5177361449617911,
      "learning_rate": 0.0043222683868837436,
      "loss": 2.9564,
      "loss_layer_12_head": 0.7426990866661072,
      "loss_layer_18_head": 0.5188878178596497,
      "loss_layer_24_head": 0.3305582106113434,
      "loss_layer_30_head": 0.24975328147411346,
      "loss_layer_36_head": 0.17330005764961243,
      "loss_layer_42_head": 0.09468719363212585,
      "loss_layer_6_head": 0.9460204243659973,
      "step": 2465
    },
    {
      "epoch": 31.616,
      "grad_norm": 0.5672102707981601,
      "learning_rate": 0.0043184341039326215,
      "loss": 2.9686,
      "loss_layer_12_head": 0.7577100396156311,
      "loss_layer_18_head": 0.5066065788269043,
      "loss_layer_24_head": 0.32139086723327637,
      "loss_layer_30_head": 0.23129050433635712,
      "loss_layer_36_head": 0.1378742754459381,
      "loss_layer_42_head": 0.07693429291248322,
      "loss_layer_6_head": 0.9414851069450378,
      "step": 2470
    },
    {
      "epoch": 31.68,
      "grad_norm": 0.4652768825716275,
      "learning_rate": 0.004314590716338606,
      "loss": 2.9864,
      "loss_layer_12_head": 0.7513552308082581,
      "loss_layer_18_head": 0.5020307898521423,
      "loss_layer_24_head": 0.3221827447414398,
      "loss_layer_30_head": 0.22457358241081238,
      "loss_layer_36_head": 0.20267441868782043,
      "loss_layer_42_head": 0.10499592870473862,
      "loss_layer_6_head": 0.8999527096748352,
      "step": 2475
    },
    {
      "epoch": 31.744,
      "grad_norm": 0.3280626967284921,
      "learning_rate": 0.004310738243344996,
      "loss": 2.9295,
      "loss_layer_12_head": 0.7297580242156982,
      "loss_layer_18_head": 0.5086355209350586,
      "loss_layer_24_head": 0.3144855201244354,
      "loss_layer_30_head": 0.2066270112991333,
      "loss_layer_36_head": 0.13697369396686554,
      "loss_layer_42_head": 0.076801598072052,
      "loss_layer_6_head": 0.907352089881897,
      "step": 2480
    },
    {
      "epoch": 31.808,
      "grad_norm": 0.32653658716469636,
      "learning_rate": 0.0043068767042405785,
      "loss": 2.9217,
      "loss_layer_12_head": 0.7346735000610352,
      "loss_layer_18_head": 0.5066921710968018,
      "loss_layer_24_head": 0.31358814239501953,
      "loss_layer_30_head": 0.20147380232810974,
      "loss_layer_36_head": 0.13516008853912354,
      "loss_layer_42_head": 0.0737566277384758,
      "loss_layer_6_head": 0.9110029935836792,
      "step": 2485
    },
    {
      "epoch": 31.872,
      "grad_norm": 0.2975273929741614,
      "learning_rate": 0.004303006118359536,
      "loss": 3.0035,
      "loss_layer_12_head": 0.7832895517349243,
      "loss_layer_18_head": 0.542451024055481,
      "loss_layer_24_head": 0.33075886964797974,
      "loss_layer_30_head": 0.2067415416240692,
      "loss_layer_36_head": 0.14488841593265533,
      "loss_layer_42_head": 0.07364431768655777,
      "loss_layer_6_head": 0.9686379432678223,
      "step": 2490
    },
    {
      "epoch": 31.936,
      "grad_norm": 0.4264558420236524,
      "learning_rate": 0.004299126505081347,
      "loss": 2.9512,
      "loss_layer_12_head": 0.7383927702903748,
      "loss_layer_18_head": 0.5259587168693542,
      "loss_layer_24_head": 0.32610243558883667,
      "loss_layer_30_head": 0.20045459270477295,
      "loss_layer_36_head": 0.1471794843673706,
      "loss_layer_42_head": 0.08296526968479156,
      "loss_layer_6_head": 0.9408084750175476,
      "step": 2495
    },
    {
      "epoch": 32.0,
      "grad_norm": 0.37743047706745025,
      "learning_rate": 0.004295237883830685,
      "loss": 2.9437,
      "loss_layer_12_head": 0.6970030665397644,
      "loss_layer_18_head": 0.5143242478370667,
      "loss_layer_24_head": 0.316952109336853,
      "loss_layer_30_head": 0.19640633463859558,
      "loss_layer_36_head": 0.1423618644475937,
      "loss_layer_42_head": 0.08230853080749512,
      "loss_layer_6_head": 0.9181687235832214,
      "step": 2500
    },
    {
      "epoch": 32.064,
      "grad_norm": 0.2708836992536499,
      "learning_rate": 0.00429134027407733,
      "loss": 2.7521,
      "loss_layer_12_head": 0.6242228746414185,
      "loss_layer_18_head": 0.4660509526729584,
      "loss_layer_24_head": 0.28793397545814514,
      "loss_layer_30_head": 0.18245425820350647,
      "loss_layer_36_head": 0.13049116730690002,
      "loss_layer_42_head": 0.08296440541744232,
      "loss_layer_6_head": 0.8373792767524719,
      "step": 2505
    },
    {
      "epoch": 32.128,
      "grad_norm": 0.3582767398629366,
      "learning_rate": 0.0042874336953360615,
      "loss": 2.8369,
      "loss_layer_12_head": 0.6843419075012207,
      "loss_layer_18_head": 0.5061351656913757,
      "loss_layer_24_head": 0.3296048045158386,
      "loss_layer_30_head": 0.19601458311080933,
      "loss_layer_36_head": 0.1358725130558014,
      "loss_layer_42_head": 0.0846487283706665,
      "loss_layer_6_head": 0.9186986684799194,
      "step": 2510
    },
    {
      "epoch": 32.192,
      "grad_norm": 0.3848203176507095,
      "learning_rate": 0.0042835181671665706,
      "loss": 2.7905,
      "loss_layer_12_head": 0.7115434408187866,
      "loss_layer_18_head": 0.5478081107139587,
      "loss_layer_24_head": 0.33910664916038513,
      "loss_layer_30_head": 0.19919417798519135,
      "loss_layer_36_head": 0.13211753964424133,
      "loss_layer_42_head": 0.0809602439403534,
      "loss_layer_6_head": 0.9710380434989929,
      "step": 2515
    },
    {
      "epoch": 32.256,
      "grad_norm": 0.3108039419045784,
      "learning_rate": 0.004279593709173352,
      "loss": 2.7476,
      "loss_layer_12_head": 0.6411250233650208,
      "loss_layer_18_head": 0.5007718801498413,
      "loss_layer_24_head": 0.30839818716049194,
      "loss_layer_30_head": 0.1875855028629303,
      "loss_layer_36_head": 0.12580236792564392,
      "loss_layer_42_head": 0.07748179137706757,
      "loss_layer_6_head": 0.8741466403007507,
      "step": 2520
    },
    {
      "epoch": 32.32,
      "grad_norm": 0.33046497277134157,
      "learning_rate": 0.004275660341005614,
      "loss": 2.7825,
      "loss_layer_12_head": 0.6368816494941711,
      "loss_layer_18_head": 0.4925454258918762,
      "loss_layer_24_head": 0.30891555547714233,
      "loss_layer_30_head": 0.18964070081710815,
      "loss_layer_36_head": 0.12500204145908356,
      "loss_layer_42_head": 0.07777388393878937,
      "loss_layer_6_head": 0.8718814849853516,
      "step": 2525
    },
    {
      "epoch": 32.384,
      "grad_norm": 0.3614476656871924,
      "learning_rate": 0.004271718082357175,
      "loss": 2.7914,
      "loss_layer_12_head": 0.6562954187393188,
      "loss_layer_18_head": 0.5181206464767456,
      "loss_layer_24_head": 0.3323201537132263,
      "loss_layer_30_head": 0.1948789656162262,
      "loss_layer_36_head": 0.12602558732032776,
      "loss_layer_42_head": 0.07693319767713547,
      "loss_layer_6_head": 0.9109300374984741,
      "step": 2530
    },
    {
      "epoch": 32.448,
      "grad_norm": 0.3246470633947506,
      "learning_rate": 0.004267766952966369,
      "loss": 2.7977,
      "loss_layer_12_head": 0.650140106678009,
      "loss_layer_18_head": 0.4962919354438782,
      "loss_layer_24_head": 0.3131445646286011,
      "loss_layer_30_head": 0.18616250157356262,
      "loss_layer_36_head": 0.11747406423091888,
      "loss_layer_42_head": 0.07110822945833206,
      "loss_layer_6_head": 0.876452624797821,
      "step": 2535
    },
    {
      "epoch": 32.512,
      "grad_norm": 0.37881837106908534,
      "learning_rate": 0.004263806972615942,
      "loss": 2.8184,
      "loss_layer_12_head": 0.6953346729278564,
      "loss_layer_18_head": 0.5198138952255249,
      "loss_layer_24_head": 0.32559290528297424,
      "loss_layer_30_head": 0.1981888711452484,
      "loss_layer_36_head": 0.12268197536468506,
      "loss_layer_42_head": 0.0749700665473938,
      "loss_layer_6_head": 0.9035722613334656,
      "step": 2540
    },
    {
      "epoch": 32.576,
      "grad_norm": 0.3230004554589516,
      "learning_rate": 0.004259838161132957,
      "loss": 2.8435,
      "loss_layer_12_head": 0.6601161956787109,
      "loss_layer_18_head": 0.5294954180717468,
      "loss_layer_24_head": 0.3287796974182129,
      "loss_layer_30_head": 0.20208649337291718,
      "loss_layer_36_head": 0.12906700372695923,
      "loss_layer_42_head": 0.07746460288763046,
      "loss_layer_6_head": 0.898246169090271,
      "step": 2545
    },
    {
      "epoch": 32.64,
      "grad_norm": 0.4226761221325677,
      "learning_rate": 0.004255860538388694,
      "loss": 2.84,
      "loss_layer_12_head": 0.6569350361824036,
      "loss_layer_18_head": 0.5630866289138794,
      "loss_layer_24_head": 0.32765817642211914,
      "loss_layer_30_head": 0.20026902854442596,
      "loss_layer_36_head": 0.12830641865730286,
      "loss_layer_42_head": 0.07748326659202576,
      "loss_layer_6_head": 0.9160161018371582,
      "step": 2550
    },
    {
      "epoch": 32.704,
      "grad_norm": 0.3585008973394115,
      "learning_rate": 0.004251874124298547,
      "loss": 2.9513,
      "loss_layer_12_head": 0.6700294017791748,
      "loss_layer_18_head": 0.6667214035987854,
      "loss_layer_24_head": 0.34535011649131775,
      "loss_layer_30_head": 0.20549388229846954,
      "loss_layer_36_head": 0.13187505304813385,
      "loss_layer_42_head": 0.08004724234342575,
      "loss_layer_6_head": 0.9469305872917175,
      "step": 2555
    },
    {
      "epoch": 32.768,
      "grad_norm": 0.34663541559085087,
      "learning_rate": 0.004247878938821928,
      "loss": 2.9353,
      "loss_layer_12_head": 0.639850378036499,
      "loss_layer_18_head": 0.6093745827674866,
      "loss_layer_24_head": 0.3191365599632263,
      "loss_layer_30_head": 0.19089816510677338,
      "loss_layer_36_head": 0.12051371484994888,
      "loss_layer_42_head": 0.07992316782474518,
      "loss_layer_6_head": 0.9199528694152832,
      "step": 2560
    },
    {
      "epoch": 32.832,
      "grad_norm": 0.29404190692340176,
      "learning_rate": 0.00424387500196217,
      "loss": 2.9324,
      "loss_layer_12_head": 0.6533123850822449,
      "loss_layer_18_head": 0.6349425911903381,
      "loss_layer_24_head": 0.34083500504493713,
      "loss_layer_30_head": 0.19715335965156555,
      "loss_layer_36_head": 0.12457157671451569,
      "loss_layer_42_head": 0.0798678994178772,
      "loss_layer_6_head": 0.9253924489021301,
      "step": 2565
    },
    {
      "epoch": 32.896,
      "grad_norm": 0.23647260403820128,
      "learning_rate": 0.004239862333766418,
      "loss": 2.9298,
      "loss_layer_12_head": 0.677099347114563,
      "loss_layer_18_head": 0.6167623996734619,
      "loss_layer_24_head": 0.3539769649505615,
      "loss_layer_30_head": 0.20748047530651093,
      "loss_layer_36_head": 0.1371610462665558,
      "loss_layer_42_head": 0.0793168917298317,
      "loss_layer_6_head": 0.9860152006149292,
      "step": 2570
    },
    {
      "epoch": 32.96,
      "grad_norm": 0.2691740783743431,
      "learning_rate": 0.004235840954325534,
      "loss": 2.9045,
      "loss_layer_12_head": 0.6264919638633728,
      "loss_layer_18_head": 0.5633588433265686,
      "loss_layer_24_head": 0.32429039478302,
      "loss_layer_30_head": 0.18746836483478546,
      "loss_layer_36_head": 0.12119843065738678,
      "loss_layer_42_head": 0.07697796821594238,
      "loss_layer_6_head": 0.9139952659606934,
      "step": 2575
    },
    {
      "epoch": 33.024,
      "grad_norm": 0.2783336145554874,
      "learning_rate": 0.004231810883773999,
      "loss": 2.8557,
      "loss_layer_12_head": 0.6530717611312866,
      "loss_layer_18_head": 0.5595777034759521,
      "loss_layer_24_head": 0.3451862037181854,
      "loss_layer_30_head": 0.1954914629459381,
      "loss_layer_36_head": 0.1237841248512268,
      "loss_layer_42_head": 0.0714506283402443,
      "loss_layer_6_head": 0.9539521932601929,
      "step": 2580
    },
    {
      "epoch": 33.088,
      "grad_norm": 0.3226662233669668,
      "learning_rate": 0.004227772142289806,
      "loss": 2.7233,
      "loss_layer_12_head": 0.631679117679596,
      "loss_layer_18_head": 0.5294066667556763,
      "loss_layer_24_head": 0.3387308120727539,
      "loss_layer_30_head": 0.19209739565849304,
      "loss_layer_36_head": 0.1235671266913414,
      "loss_layer_42_head": 0.07081017643213272,
      "loss_layer_6_head": 0.9307260513305664,
      "step": 2585
    },
    {
      "epoch": 33.152,
      "grad_norm": 0.25398345600240996,
      "learning_rate": 0.004223724750094366,
      "loss": 2.7294,
      "loss_layer_12_head": 0.5983085632324219,
      "loss_layer_18_head": 0.5005430579185486,
      "loss_layer_24_head": 0.31896308064460754,
      "loss_layer_30_head": 0.1885015368461609,
      "loss_layer_36_head": 0.12656575441360474,
      "loss_layer_42_head": 0.07045037299394608,
      "loss_layer_6_head": 0.8887253999710083,
      "step": 2590
    },
    {
      "epoch": 33.216,
      "grad_norm": 0.42420976531618027,
      "learning_rate": 0.004219668727452396,
      "loss": 2.7938,
      "loss_layer_12_head": 0.6087580323219299,
      "loss_layer_18_head": 0.5045052170753479,
      "loss_layer_24_head": 0.33872899413108826,
      "loss_layer_30_head": 0.19494816660881042,
      "loss_layer_36_head": 0.12133397907018661,
      "loss_layer_42_head": 0.06789396703243256,
      "loss_layer_6_head": 0.9173173904418945,
      "step": 2595
    },
    {
      "epoch": 33.28,
      "grad_norm": 0.4309514945375266,
      "learning_rate": 0.004215604094671834,
      "loss": 2.8805,
      "loss_layer_12_head": 0.605047345161438,
      "loss_layer_18_head": 0.49785876274108887,
      "loss_layer_24_head": 0.40046581625938416,
      "loss_layer_30_head": 0.19429156184196472,
      "loss_layer_36_head": 0.1238332986831665,
      "loss_layer_42_head": 0.06936744600534439,
      "loss_layer_6_head": 0.9354404211044312,
      "step": 2600
    },
    {
      "epoch": 33.28,
      "eval_loss": 5.7325944900512695,
      "eval_loss_layer_12_head": 1.2473057508468628,
      "eval_loss_layer_18_head": 1.126779317855835,
      "eval_loss_layer_24_head": 0.8571988344192505,
      "eval_loss_layer_30_head": 0.5253663659095764,
      "eval_loss_layer_36_head": 0.3131699562072754,
      "eval_loss_layer_42_head": 0.18893791735172272,
      "eval_loss_layer_6_head": 1.7120240926742554,
      "eval_runtime": 33.0348,
      "eval_samples_per_second": 9.687,
      "eval_steps_per_second": 0.605,
      "step": 2600
    },
    {
      "epoch": 33.344,
      "grad_norm": 0.30905443386356446,
      "learning_rate": 0.00421153087210372,
      "loss": 2.9459,
      "loss_layer_12_head": 0.6238444447517395,
      "loss_layer_18_head": 0.5070158839225769,
      "loss_layer_24_head": 0.4167725145816803,
      "loss_layer_30_head": 0.2011946141719818,
      "loss_layer_36_head": 0.13065120577812195,
      "loss_layer_42_head": 0.07574349641799927,
      "loss_layer_6_head": 0.9456790685653687,
      "step": 2605
    },
    {
      "epoch": 33.408,
      "grad_norm": 0.3357009433037485,
      "learning_rate": 0.004207449080142104,
      "loss": 2.8525,
      "loss_layer_12_head": 0.6190840601921082,
      "loss_layer_18_head": 0.5051413774490356,
      "loss_layer_24_head": 0.3972363770008087,
      "loss_layer_30_head": 0.1992553174495697,
      "loss_layer_36_head": 0.13455288112163544,
      "loss_layer_42_head": 0.07542914897203445,
      "loss_layer_6_head": 0.9394603967666626,
      "step": 2610
    },
    {
      "epoch": 33.472,
      "grad_norm": 0.3943398330372077,
      "learning_rate": 0.004203358739223943,
      "loss": 2.8336,
      "loss_layer_12_head": 0.6307199597358704,
      "loss_layer_18_head": 0.5093135833740234,
      "loss_layer_24_head": 0.37974151968955994,
      "loss_layer_30_head": 0.2001461684703827,
      "loss_layer_36_head": 0.13043363392353058,
      "loss_layer_42_head": 0.07291281223297119,
      "loss_layer_6_head": 0.9364916682243347,
      "step": 2615
    },
    {
      "epoch": 33.536,
      "grad_norm": 0.32266708005338235,
      "learning_rate": 0.004199259869828998,
      "loss": 2.8988,
      "loss_layer_12_head": 0.6301755309104919,
      "loss_layer_18_head": 0.5196683406829834,
      "loss_layer_24_head": 0.35997089743614197,
      "loss_layer_30_head": 0.21205613017082214,
      "loss_layer_36_head": 0.1307404339313507,
      "loss_layer_42_head": 0.08222105354070663,
      "loss_layer_6_head": 0.9948984980583191,
      "step": 2620
    },
    {
      "epoch": 33.6,
      "grad_norm": 0.43788159784246583,
      "learning_rate": 0.004195152492479728,
      "loss": 2.8678,
      "loss_layer_12_head": 0.6431251764297485,
      "loss_layer_18_head": 0.5138626098632812,
      "loss_layer_24_head": 0.35221606492996216,
      "loss_layer_30_head": 0.1962614804506302,
      "loss_layer_36_head": 0.12459486722946167,
      "loss_layer_42_head": 0.08032798022031784,
      "loss_layer_6_head": 0.9510043263435364,
      "step": 2625
    },
    {
      "epoch": 33.664,
      "grad_norm": 0.33672392994214556,
      "learning_rate": 0.00419103662774119,
      "loss": 2.8047,
      "loss_layer_12_head": 0.6278454661369324,
      "loss_layer_18_head": 0.5141494274139404,
      "loss_layer_24_head": 0.3409757912158966,
      "loss_layer_30_head": 0.20382952690124512,
      "loss_layer_36_head": 0.12565001845359802,
      "loss_layer_42_head": 0.07749472558498383,
      "loss_layer_6_head": 0.9439318776130676,
      "step": 2630
    },
    {
      "epoch": 33.728,
      "grad_norm": 0.3469136339061625,
      "learning_rate": 0.004186912296220942,
      "loss": 2.8041,
      "loss_layer_12_head": 0.6184324622154236,
      "loss_layer_18_head": 0.4934358596801758,
      "loss_layer_24_head": 0.3214995861053467,
      "loss_layer_30_head": 0.200851172208786,
      "loss_layer_36_head": 0.11837556213140488,
      "loss_layer_42_head": 0.07741346210241318,
      "loss_layer_6_head": 0.9035131335258484,
      "step": 2635
    },
    {
      "epoch": 33.792,
      "grad_norm": 0.220848719842902,
      "learning_rate": 0.004182779518568926,
      "loss": 2.8305,
      "loss_layer_12_head": 0.6610060930252075,
      "loss_layer_18_head": 0.5364835262298584,
      "loss_layer_24_head": 0.3489699363708496,
      "loss_layer_30_head": 0.2160877287387848,
      "loss_layer_36_head": 0.12945732474327087,
      "loss_layer_42_head": 0.08395260572433472,
      "loss_layer_6_head": 0.9807375073432922,
      "step": 2640
    },
    {
      "epoch": 33.856,
      "grad_norm": 0.282348676843471,
      "learning_rate": 0.004178638315477378,
      "loss": 2.8586,
      "loss_layer_12_head": 0.6648001074790955,
      "loss_layer_18_head": 0.5286170840263367,
      "loss_layer_24_head": 0.33395570516586304,
      "loss_layer_30_head": 0.2018022984266281,
      "loss_layer_36_head": 0.12290839105844498,
      "loss_layer_42_head": 0.0812070444226265,
      "loss_layer_6_head": 0.9461675882339478,
      "step": 2645
    },
    {
      "epoch": 33.92,
      "grad_norm": 0.3578982857137259,
      "learning_rate": 0.004174488707680717,
      "loss": 2.8514,
      "loss_layer_12_head": 0.6302977204322815,
      "loss_layer_18_head": 0.5066864490509033,
      "loss_layer_24_head": 0.3181256949901581,
      "loss_layer_30_head": 0.19834056496620178,
      "loss_layer_36_head": 0.1282840520143509,
      "loss_layer_42_head": 0.07525322586297989,
      "loss_layer_6_head": 0.9191304445266724,
      "step": 2650
    },
    {
      "epoch": 33.984,
      "grad_norm": 0.2702374347694356,
      "learning_rate": 0.004170330715955444,
      "loss": 2.8859,
      "loss_layer_12_head": 0.683801531791687,
      "loss_layer_18_head": 0.5307459831237793,
      "loss_layer_24_head": 0.33235910534858704,
      "loss_layer_30_head": 0.2087191641330719,
      "loss_layer_36_head": 0.12826012074947357,
      "loss_layer_42_head": 0.07463525235652924,
      "loss_layer_6_head": 0.9388282895088196,
      "step": 2655
    },
    {
      "epoch": 34.048,
      "grad_norm": 0.3970629430705295,
      "learning_rate": 0.004166164361120036,
      "loss": 2.8588,
      "loss_layer_12_head": 0.7262111902236938,
      "loss_layer_18_head": 0.519107460975647,
      "loss_layer_24_head": 0.31902632117271423,
      "loss_layer_30_head": 0.20146577060222626,
      "loss_layer_36_head": 0.12503919005393982,
      "loss_layer_42_head": 0.07190229743719101,
      "loss_layer_6_head": 0.896382212638855,
      "step": 2660
    },
    {
      "epoch": 34.112,
      "grad_norm": 0.4586740026124035,
      "learning_rate": 0.004161989664034844,
      "loss": 2.6977,
      "loss_layer_12_head": 0.675337016582489,
      "loss_layer_18_head": 0.4945290684700012,
      "loss_layer_24_head": 0.3024293780326843,
      "loss_layer_30_head": 0.18928822875022888,
      "loss_layer_36_head": 0.12194602191448212,
      "loss_layer_42_head": 0.06846313923597336,
      "loss_layer_6_head": 0.8520544171333313,
      "step": 2665
    },
    {
      "epoch": 34.176,
      "grad_norm": 0.30026890003646073,
      "learning_rate": 0.004157806645601988,
      "loss": 2.8271,
      "loss_layer_12_head": 0.7172795534133911,
      "loss_layer_18_head": 0.5080059170722961,
      "loss_layer_24_head": 0.3281785547733307,
      "loss_layer_30_head": 0.20711426436901093,
      "loss_layer_36_head": 0.12999442219734192,
      "loss_layer_42_head": 0.07491367310285568,
      "loss_layer_6_head": 0.9000323414802551,
      "step": 2670
    },
    {
      "epoch": 34.24,
      "grad_norm": 0.29620226914224357,
      "learning_rate": 0.00415361532676525,
      "loss": 2.8032,
      "loss_layer_12_head": 0.7101647257804871,
      "loss_layer_18_head": 0.519394040107727,
      "loss_layer_24_head": 0.3077928423881531,
      "loss_layer_30_head": 0.19970257580280304,
      "loss_layer_36_head": 0.1204116940498352,
      "loss_layer_42_head": 0.06979154050350189,
      "loss_layer_6_head": 0.8890959024429321,
      "step": 2675
    },
    {
      "epoch": 34.304,
      "grad_norm": 0.3472116841777934,
      "learning_rate": 0.004149415728509971,
      "loss": 2.7721,
      "loss_layer_12_head": 0.6619457006454468,
      "loss_layer_18_head": 0.49889153242111206,
      "loss_layer_24_head": 0.30924513936042786,
      "loss_layer_30_head": 0.20423266291618347,
      "loss_layer_36_head": 0.13139687478542328,
      "loss_layer_42_head": 0.08537609875202179,
      "loss_layer_6_head": 0.8835328221321106,
      "step": 2680
    },
    {
      "epoch": 34.368,
      "grad_norm": 0.2760945033067904,
      "learning_rate": 0.004145207871862947,
      "loss": 2.7574,
      "loss_layer_12_head": 0.6647585034370422,
      "loss_layer_18_head": 0.502260684967041,
      "loss_layer_24_head": 0.30646997690200806,
      "loss_layer_30_head": 0.1958732157945633,
      "loss_layer_36_head": 0.12973220646381378,
      "loss_layer_42_head": 0.07320185750722885,
      "loss_layer_6_head": 0.9091441035270691,
      "step": 2685
    },
    {
      "epoch": 34.432,
      "grad_norm": 0.33433023113591814,
      "learning_rate": 0.004140991777892324,
      "loss": 2.7732,
      "loss_layer_12_head": 0.6402832269668579,
      "loss_layer_18_head": 0.5036174654960632,
      "loss_layer_24_head": 0.3048974871635437,
      "loss_layer_30_head": 0.19459208846092224,
      "loss_layer_36_head": 0.1304369568824768,
      "loss_layer_42_head": 0.07641393691301346,
      "loss_layer_6_head": 0.8845137357711792,
      "step": 2690
    },
    {
      "epoch": 34.496,
      "grad_norm": 0.5052537907567913,
      "learning_rate": 0.004136767467707488,
      "loss": 2.8374,
      "loss_layer_12_head": 0.6500932574272156,
      "loss_layer_18_head": 0.5149675607681274,
      "loss_layer_24_head": 0.31480270624160767,
      "loss_layer_30_head": 0.19728362560272217,
      "loss_layer_36_head": 0.15987873077392578,
      "loss_layer_42_head": 0.078626349568367,
      "loss_layer_6_head": 0.9074395298957825,
      "step": 2695
    },
    {
      "epoch": 34.56,
      "grad_norm": 0.5973009636396164,
      "learning_rate": 0.0041325349624589626,
      "loss": 2.9116,
      "loss_layer_12_head": 0.6577972173690796,
      "loss_layer_18_head": 0.5193386673927307,
      "loss_layer_24_head": 0.3131563663482666,
      "loss_layer_30_head": 0.199286088347435,
      "loss_layer_36_head": 0.1961132436990738,
      "loss_layer_42_head": 0.07936983555555344,
      "loss_layer_6_head": 0.9489824175834656,
      "step": 2700
    },
    {
      "epoch": 34.624,
      "grad_norm": 0.6020530068935358,
      "learning_rate": 0.004128294283338307,
      "loss": 3.0554,
      "loss_layer_12_head": 0.6625782251358032,
      "loss_layer_18_head": 0.5259482264518738,
      "loss_layer_24_head": 0.30995380878448486,
      "loss_layer_30_head": 0.20744295418262482,
      "loss_layer_36_head": 0.19920842349529266,
      "loss_layer_42_head": 0.08036056905984879,
      "loss_layer_6_head": 1.0085796117782593,
      "step": 2705
    },
    {
      "epoch": 34.688,
      "grad_norm": 0.4931075462495317,
      "learning_rate": 0.004124045451578001,
      "loss": 2.9593,
      "loss_layer_12_head": 0.6840303540229797,
      "loss_layer_18_head": 0.5533978939056396,
      "loss_layer_24_head": 0.33094507455825806,
      "loss_layer_30_head": 0.21549472212791443,
      "loss_layer_36_head": 0.18159158527851105,
      "loss_layer_42_head": 0.08155359327793121,
      "loss_layer_6_head": 1.0317232608795166,
      "step": 2710
    },
    {
      "epoch": 34.752,
      "grad_norm": 0.3288466336144575,
      "learning_rate": 0.004119788488451347,
      "loss": 2.9526,
      "loss_layer_12_head": 0.6575234532356262,
      "loss_layer_18_head": 0.545333981513977,
      "loss_layer_24_head": 0.3222271502017975,
      "loss_layer_30_head": 0.21215848624706268,
      "loss_layer_36_head": 0.16841565072536469,
      "loss_layer_42_head": 0.08017448335886002,
      "loss_layer_6_head": 0.9947413206100464,
      "step": 2715
    },
    {
      "epoch": 34.816,
      "grad_norm": 0.47576136189582385,
      "learning_rate": 0.004115523415272358,
      "loss": 2.9682,
      "loss_layer_12_head": 0.7168224453926086,
      "loss_layer_18_head": 0.5571868419647217,
      "loss_layer_24_head": 0.33126991987228394,
      "loss_layer_30_head": 0.2135584056377411,
      "loss_layer_36_head": 0.1608862727880478,
      "loss_layer_42_head": 0.07781721651554108,
      "loss_layer_6_head": 1.0021905899047852,
      "step": 2720
    },
    {
      "epoch": 34.88,
      "grad_norm": 0.3414967543465473,
      "learning_rate": 0.004111250253395652,
      "loss": 2.9163,
      "loss_layer_12_head": 0.6728025674819946,
      "loss_layer_18_head": 0.5010668039321899,
      "loss_layer_24_head": 0.302680104970932,
      "loss_layer_30_head": 0.19560614228248596,
      "loss_layer_36_head": 0.14451155066490173,
      "loss_layer_42_head": 0.07213635742664337,
      "loss_layer_6_head": 0.8992870450019836,
      "step": 2725
    },
    {
      "epoch": 34.944,
      "grad_norm": 0.2742285056520333,
      "learning_rate": 0.004106969024216348,
      "loss": 2.9211,
      "loss_layer_12_head": 0.6819601058959961,
      "loss_layer_18_head": 0.5184147357940674,
      "loss_layer_24_head": 0.312347412109375,
      "loss_layer_30_head": 0.19606636464595795,
      "loss_layer_36_head": 0.14073620736598969,
      "loss_layer_42_head": 0.0740114226937294,
      "loss_layer_6_head": 0.925912082195282,
      "step": 2730
    },
    {
      "epoch": 35.008,
      "grad_norm": 0.3254734814483609,
      "learning_rate": 0.004102679749169958,
      "loss": 2.8349,
      "loss_layer_12_head": 0.6759955286979675,
      "loss_layer_18_head": 0.5229719281196594,
      "loss_layer_24_head": 0.31841421127319336,
      "loss_layer_30_head": 0.19961383938789368,
      "loss_layer_36_head": 0.13458549976348877,
      "loss_layer_42_head": 0.07358081638813019,
      "loss_layer_6_head": 0.9303027987480164,
      "step": 2735
    },
    {
      "epoch": 35.072,
      "grad_norm": 0.2216151024982183,
      "learning_rate": 0.004098382449732275,
      "loss": 2.6815,
      "loss_layer_12_head": 0.6394282579421997,
      "loss_layer_18_head": 0.49850934743881226,
      "loss_layer_24_head": 0.3121407628059387,
      "loss_layer_30_head": 0.19601812958717346,
      "loss_layer_36_head": 0.13068975508213043,
      "loss_layer_42_head": 0.07108043879270554,
      "loss_layer_6_head": 0.8899500966072083,
      "step": 2740
    },
    {
      "epoch": 35.136,
      "grad_norm": 0.28713422012569806,
      "learning_rate": 0.004094077147419271,
      "loss": 2.7057,
      "loss_layer_12_head": 0.6438268423080444,
      "loss_layer_18_head": 0.5035595297813416,
      "loss_layer_24_head": 0.32272475957870483,
      "loss_layer_30_head": 0.20242372155189514,
      "loss_layer_36_head": 0.13149157166481018,
      "loss_layer_42_head": 0.07510354369878769,
      "loss_layer_6_head": 0.8853024244308472,
      "step": 2745
    },
    {
      "epoch": 35.2,
      "grad_norm": 0.3440629167666102,
      "learning_rate": 0.004089763863786987,
      "loss": 2.7521,
      "loss_layer_12_head": 0.7091253995895386,
      "loss_layer_18_head": 0.5312026739120483,
      "loss_layer_24_head": 0.3381216526031494,
      "loss_layer_30_head": 0.21631428599357605,
      "loss_layer_36_head": 0.13534808158874512,
      "loss_layer_42_head": 0.07773688435554504,
      "loss_layer_6_head": 0.9293904304504395,
      "step": 2750
    },
    {
      "epoch": 35.264,
      "grad_norm": 0.44138334725811357,
      "learning_rate": 0.004085442620431427,
      "loss": 2.7456,
      "loss_layer_12_head": 0.6594663858413696,
      "loss_layer_18_head": 0.4866492748260498,
      "loss_layer_24_head": 0.30100399255752563,
      "loss_layer_30_head": 0.19506093859672546,
      "loss_layer_36_head": 0.11895569413900375,
      "loss_layer_42_head": 0.06980771571397781,
      "loss_layer_6_head": 0.8427026867866516,
      "step": 2755
    },
    {
      "epoch": 35.328,
      "grad_norm": 0.3921882652205011,
      "learning_rate": 0.004081113438988443,
      "loss": 2.6939,
      "loss_layer_12_head": 0.6584978103637695,
      "loss_layer_18_head": 0.4839341640472412,
      "loss_layer_24_head": 0.29948127269744873,
      "loss_layer_30_head": 0.19003646075725555,
      "loss_layer_36_head": 0.11641867458820343,
      "loss_layer_42_head": 0.06540873646736145,
      "loss_layer_6_head": 0.8577641248703003,
      "step": 2760
    },
    {
      "epoch": 35.392,
      "grad_norm": 0.3518791921285613,
      "learning_rate": 0.0040767763411336385,
      "loss": 2.7804,
      "loss_layer_12_head": 0.7019029855728149,
      "loss_layer_18_head": 0.49669066071510315,
      "loss_layer_24_head": 0.3114005923271179,
      "loss_layer_30_head": 0.19761569797992706,
      "loss_layer_36_head": 0.12038294225931168,
      "loss_layer_42_head": 0.08017749339342117,
      "loss_layer_6_head": 0.8649842143058777,
      "step": 2765
    },
    {
      "epoch": 35.456,
      "grad_norm": 0.3304697980314363,
      "learning_rate": 0.0040724313485822495,
      "loss": 2.837,
      "loss_layer_12_head": 0.7580502033233643,
      "loss_layer_18_head": 0.5299946069717407,
      "loss_layer_24_head": 0.347163587808609,
      "loss_layer_30_head": 0.20894400775432587,
      "loss_layer_36_head": 0.12395443767309189,
      "loss_layer_42_head": 0.07757647335529327,
      "loss_layer_6_head": 0.9497647285461426,
      "step": 2770
    },
    {
      "epoch": 35.52,
      "grad_norm": 0.29127089337725315,
      "learning_rate": 0.00406807848308904,
      "loss": 2.8376,
      "loss_layer_12_head": 0.7147843241691589,
      "loss_layer_18_head": 0.5227614045143127,
      "loss_layer_24_head": 0.3622294068336487,
      "loss_layer_30_head": 0.20732328295707703,
      "loss_layer_36_head": 0.12222526222467422,
      "loss_layer_42_head": 0.08028804510831833,
      "loss_layer_6_head": 0.9214189648628235,
      "step": 2775
    },
    {
      "epoch": 35.584,
      "grad_norm": 0.3625127629847393,
      "learning_rate": 0.004063717766448194,
      "loss": 2.8683,
      "loss_layer_12_head": 0.7028826475143433,
      "loss_layer_18_head": 0.5158708095550537,
      "loss_layer_24_head": 0.34881502389907837,
      "loss_layer_30_head": 0.20343796908855438,
      "loss_layer_36_head": 0.11835350096225739,
      "loss_layer_42_head": 0.07280631363391876,
      "loss_layer_6_head": 0.9008199572563171,
      "step": 2780
    },
    {
      "epoch": 35.648,
      "grad_norm": 0.39028799702994477,
      "learning_rate": 0.004059349220493202,
      "loss": 2.863,
      "loss_layer_12_head": 0.6956790089607239,
      "loss_layer_18_head": 0.4946354329586029,
      "loss_layer_24_head": 0.3517816662788391,
      "loss_layer_30_head": 0.19228854775428772,
      "loss_layer_36_head": 0.11842534691095352,
      "loss_layer_42_head": 0.0793197900056839,
      "loss_layer_6_head": 0.8835943341255188,
      "step": 2785
    },
    {
      "epoch": 35.712,
      "grad_norm": 0.424153554816628,
      "learning_rate": 0.00405497286709676,
      "loss": 2.8838,
      "loss_layer_12_head": 0.695848822593689,
      "loss_layer_18_head": 0.5249794125556946,
      "loss_layer_24_head": 0.355752170085907,
      "loss_layer_30_head": 0.19872026145458221,
      "loss_layer_36_head": 0.12239561975002289,
      "loss_layer_42_head": 0.08452604711055756,
      "loss_layer_6_head": 0.9289144277572632,
      "step": 2790
    },
    {
      "epoch": 35.776,
      "grad_norm": 0.49859727543941246,
      "learning_rate": 0.00405058872817065,
      "loss": 2.8333,
      "loss_layer_12_head": 0.6486374139785767,
      "loss_layer_18_head": 0.4925224184989929,
      "loss_layer_24_head": 0.35328108072280884,
      "loss_layer_30_head": 0.19105742871761322,
      "loss_layer_36_head": 0.11892338842153549,
      "loss_layer_42_head": 0.09445977210998535,
      "loss_layer_6_head": 0.8915404081344604,
      "step": 2795
    },
    {
      "epoch": 35.84,
      "grad_norm": 0.4096555151785045,
      "learning_rate": 0.004046196825665638,
      "loss": 2.8492,
      "loss_layer_12_head": 0.6294233798980713,
      "loss_layer_18_head": 0.5065455436706543,
      "loss_layer_24_head": 0.3300698399543762,
      "loss_layer_30_head": 0.18601885437965393,
      "loss_layer_36_head": 0.11607158184051514,
      "loss_layer_42_head": 0.08519645035266876,
      "loss_layer_6_head": 0.8977681398391724,
      "step": 2800
    },
    {
      "epoch": 35.84,
      "eval_loss": 5.519309043884277,
      "eval_loss_layer_12_head": 1.2625792026519775,
      "eval_loss_layer_18_head": 1.0868055820465088,
      "eval_loss_layer_24_head": 0.7980098724365234,
      "eval_loss_layer_30_head": 0.4569031596183777,
      "eval_loss_layer_36_head": 0.28965479135513306,
      "eval_loss_layer_42_head": 0.1967446506023407,
      "eval_loss_layer_6_head": 1.6049944162368774,
      "eval_runtime": 33.1222,
      "eval_samples_per_second": 9.661,
      "eval_steps_per_second": 0.604,
      "step": 2800
    },
    {
      "epoch": 35.904,
      "grad_norm": 0.3484678670100771,
      "learning_rate": 0.004041797181571358,
      "loss": 2.8457,
      "loss_layer_12_head": 0.6281895637512207,
      "loss_layer_18_head": 0.5161920785903931,
      "loss_layer_24_head": 0.3226947784423828,
      "loss_layer_30_head": 0.18661637604236603,
      "loss_layer_36_head": 0.11771836131811142,
      "loss_layer_42_head": 0.07798431813716888,
      "loss_layer_6_head": 0.8837979435920715,
      "step": 2805
    },
    {
      "epoch": 35.968,
      "grad_norm": 0.29437277137023293,
      "learning_rate": 0.0040373898179162085,
      "loss": 2.8211,
      "loss_layer_12_head": 0.6273091435432434,
      "loss_layer_18_head": 0.5290659070014954,
      "loss_layer_24_head": 0.35070475935935974,
      "loss_layer_30_head": 0.18738451600074768,
      "loss_layer_36_head": 0.11827759444713593,
      "loss_layer_42_head": 0.08077546209096909,
      "loss_layer_6_head": 0.8786370158195496,
      "step": 2810
    },
    {
      "epoch": 36.032,
      "grad_norm": 0.27696361416844284,
      "learning_rate": 0.0040329747567672365,
      "loss": 2.7239,
      "loss_layer_12_head": 0.6508206129074097,
      "loss_layer_18_head": 0.5429742932319641,
      "loss_layer_24_head": 0.3211159110069275,
      "loss_layer_30_head": 0.1905827820301056,
      "loss_layer_36_head": 0.1192726120352745,
      "loss_layer_42_head": 0.07521289587020874,
      "loss_layer_6_head": 0.9104336500167847,
      "step": 2815
    },
    {
      "epoch": 36.096,
      "grad_norm": 0.2863709772858477,
      "learning_rate": 0.004028552020230031,
      "loss": 2.6671,
      "loss_layer_12_head": 0.630357563495636,
      "loss_layer_18_head": 0.5248185992240906,
      "loss_layer_24_head": 0.3100317716598511,
      "loss_layer_30_head": 0.18742386996746063,
      "loss_layer_36_head": 0.11817511171102524,
      "loss_layer_42_head": 0.07332591712474823,
      "loss_layer_6_head": 0.8797246217727661,
      "step": 2820
    },
    {
      "epoch": 36.16,
      "grad_norm": 0.18695218628226207,
      "learning_rate": 0.0040241216304486085,
      "loss": 2.6298,
      "loss_layer_12_head": 0.6349347829818726,
      "loss_layer_18_head": 0.519250750541687,
      "loss_layer_24_head": 0.3335909843444824,
      "loss_layer_30_head": 0.19021794199943542,
      "loss_layer_36_head": 0.11906565725803375,
      "loss_layer_42_head": 0.07256082445383072,
      "loss_layer_6_head": 0.8877579569816589,
      "step": 2825
    },
    {
      "epoch": 36.224,
      "grad_norm": 0.38971413758236123,
      "learning_rate": 0.004019683609605305,
      "loss": 2.6822,
      "loss_layer_12_head": 0.6506119966506958,
      "loss_layer_18_head": 0.507087767124176,
      "loss_layer_24_head": 0.30513548851013184,
      "loss_layer_30_head": 0.18134480714797974,
      "loss_layer_36_head": 0.11567530781030655,
      "loss_layer_42_head": 0.06888993084430695,
      "loss_layer_6_head": 0.8730639219284058,
      "step": 2830
    },
    {
      "epoch": 36.288,
      "grad_norm": 0.4695293138859214,
      "learning_rate": 0.004015237979920666,
      "loss": 2.7345,
      "loss_layer_12_head": 0.6814704537391663,
      "loss_layer_18_head": 0.5070143938064575,
      "loss_layer_24_head": 0.3050377666950226,
      "loss_layer_30_head": 0.18146221339702606,
      "loss_layer_36_head": 0.119310162961483,
      "loss_layer_42_head": 0.0698452815413475,
      "loss_layer_6_head": 0.8746509552001953,
      "step": 2835
    },
    {
      "epoch": 36.352,
      "grad_norm": 0.30228058088181176,
      "learning_rate": 0.004010784763653331,
      "loss": 2.7935,
      "loss_layer_12_head": 0.7011603713035583,
      "loss_layer_18_head": 0.49776220321655273,
      "loss_layer_24_head": 0.2986297309398651,
      "loss_layer_30_head": 0.17736199498176575,
      "loss_layer_36_head": 0.11473643779754639,
      "loss_layer_42_head": 0.0703994408249855,
      "loss_layer_6_head": 0.8836386799812317,
      "step": 2840
    },
    {
      "epoch": 36.416,
      "grad_norm": 0.23498264443190484,
      "learning_rate": 0.004006323983099925,
      "loss": 2.6877,
      "loss_layer_12_head": 0.6738174557685852,
      "loss_layer_18_head": 0.509238064289093,
      "loss_layer_24_head": 0.3086773753166199,
      "loss_layer_30_head": 0.18869516253471375,
      "loss_layer_36_head": 0.12177370488643646,
      "loss_layer_42_head": 0.0733829140663147,
      "loss_layer_6_head": 0.8831766843795776,
      "step": 2845
    },
    {
      "epoch": 36.48,
      "grad_norm": 0.3227323885745506,
      "learning_rate": 0.004001855660594947,
      "loss": 2.7605,
      "loss_layer_12_head": 0.6675036549568176,
      "loss_layer_18_head": 0.5181068778038025,
      "loss_layer_24_head": 0.3179686963558197,
      "loss_layer_30_head": 0.2065492868423462,
      "loss_layer_36_head": 0.1280774623155594,
      "loss_layer_42_head": 0.07755711674690247,
      "loss_layer_6_head": 0.9092572927474976,
      "step": 2850
    },
    {
      "epoch": 36.544,
      "grad_norm": 0.28039039454666403,
      "learning_rate": 0.003997379818510657,
      "loss": 2.7833,
      "loss_layer_12_head": 0.6330998539924622,
      "loss_layer_18_head": 0.5085466504096985,
      "loss_layer_24_head": 0.31635376811027527,
      "loss_layer_30_head": 0.20436005294322968,
      "loss_layer_36_head": 0.12249630689620972,
      "loss_layer_42_head": 0.07931692153215408,
      "loss_layer_6_head": 0.8710039854049683,
      "step": 2855
    },
    {
      "epoch": 36.608,
      "grad_norm": 0.27571157245054306,
      "learning_rate": 0.003992896479256966,
      "loss": 2.7779,
      "loss_layer_12_head": 0.6463629603385925,
      "loss_layer_18_head": 0.5181699991226196,
      "loss_layer_24_head": 0.32225924730300903,
      "loss_layer_30_head": 0.20113901793956757,
      "loss_layer_36_head": 0.12523993849754333,
      "loss_layer_42_head": 0.0806257575750351,
      "loss_layer_6_head": 0.917407214641571,
      "step": 2860
    },
    {
      "epoch": 36.672,
      "grad_norm": 0.26903694872552414,
      "learning_rate": 0.003988405665281318,
      "loss": 2.7374,
      "loss_layer_12_head": 0.656553328037262,
      "loss_layer_18_head": 0.5332093834877014,
      "loss_layer_24_head": 0.3255995213985443,
      "loss_layer_30_head": 0.20218046009540558,
      "loss_layer_36_head": 0.1243477314710617,
      "loss_layer_42_head": 0.07388947159051895,
      "loss_layer_6_head": 0.9238072633743286,
      "step": 2865
    },
    {
      "epoch": 36.736,
      "grad_norm": 0.2793294742180599,
      "learning_rate": 0.003983907399068586,
      "loss": 2.7904,
      "loss_layer_12_head": 0.6041092872619629,
      "loss_layer_18_head": 0.5082119107246399,
      "loss_layer_24_head": 0.30142349004745483,
      "loss_layer_30_head": 0.18954673409461975,
      "loss_layer_36_head": 0.12908603250980377,
      "loss_layer_42_head": 0.0726025253534317,
      "loss_layer_6_head": 0.8693143129348755,
      "step": 2870
    },
    {
      "epoch": 36.8,
      "grad_norm": 0.3532383902683582,
      "learning_rate": 0.003979401703140955,
      "loss": 2.7536,
      "loss_layer_12_head": 0.5955523252487183,
      "loss_layer_18_head": 0.4933423101902008,
      "loss_layer_24_head": 0.3033868074417114,
      "loss_layer_30_head": 0.18726399540901184,
      "loss_layer_36_head": 0.130640909075737,
      "loss_layer_42_head": 0.07347296923398972,
      "loss_layer_6_head": 0.8591744303703308,
      "step": 2875
    },
    {
      "epoch": 36.864,
      "grad_norm": 0.3560512362240708,
      "learning_rate": 0.0039748886000578075,
      "loss": 2.8124,
      "loss_layer_12_head": 0.6105268597602844,
      "loss_layer_18_head": 0.5070978403091431,
      "loss_layer_24_head": 0.3074779212474823,
      "loss_layer_30_head": 0.1971246749162674,
      "loss_layer_36_head": 0.1377653032541275,
      "loss_layer_42_head": 0.08042235672473907,
      "loss_layer_6_head": 0.9154857397079468,
      "step": 2880
    },
    {
      "epoch": 36.928,
      "grad_norm": 0.31508681829096785,
      "learning_rate": 0.0039703681124156134,
      "loss": 2.8071,
      "loss_layer_12_head": 0.6602622270584106,
      "loss_layer_18_head": 0.5432816743850708,
      "loss_layer_24_head": 0.32941097021102905,
      "loss_layer_30_head": 0.20606282353401184,
      "loss_layer_36_head": 0.1459718942642212,
      "loss_layer_42_head": 0.0817742794752121,
      "loss_layer_6_head": 0.9632779359817505,
      "step": 2885
    },
    {
      "epoch": 36.992,
      "grad_norm": 0.5254221306161467,
      "learning_rate": 0.003965840262847817,
      "loss": 2.8746,
      "loss_layer_12_head": 0.6223253011703491,
      "loss_layer_18_head": 0.5170004367828369,
      "loss_layer_24_head": 0.31799548864364624,
      "loss_layer_30_head": 0.19102899730205536,
      "loss_layer_36_head": 0.1472880095243454,
      "loss_layer_42_head": 0.07866054028272629,
      "loss_layer_6_head": 0.9395764470100403,
      "step": 2890
    },
    {
      "epoch": 37.056,
      "grad_norm": 0.535271158214785,
      "learning_rate": 0.003961305074024722,
      "loss": 2.7842,
      "loss_layer_12_head": 0.6302772760391235,
      "loss_layer_18_head": 0.5122483968734741,
      "loss_layer_24_head": 0.31934016942977905,
      "loss_layer_30_head": 0.1900312751531601,
      "loss_layer_36_head": 0.1507260501384735,
      "loss_layer_42_head": 0.08252908289432526,
      "loss_layer_6_head": 0.9956119656562805,
      "step": 2895
    },
    {
      "epoch": 37.12,
      "grad_norm": 0.4251104620852724,
      "learning_rate": 0.003956762568653378,
      "loss": 2.7614,
      "loss_layer_12_head": 0.5914536714553833,
      "loss_layer_18_head": 0.4827594757080078,
      "loss_layer_24_head": 0.30992990732192993,
      "loss_layer_30_head": 0.1821718066930771,
      "loss_layer_36_head": 0.15167613327503204,
      "loss_layer_42_head": 0.07855264097452164,
      "loss_layer_6_head": 0.933890700340271,
      "step": 2900
    },
    {
      "epoch": 37.184,
      "grad_norm": 0.4346184160256474,
      "learning_rate": 0.00395221276947747,
      "loss": 2.7964,
      "loss_layer_12_head": 0.6392738223075867,
      "loss_layer_18_head": 0.5211517810821533,
      "loss_layer_24_head": 0.34271588921546936,
      "loss_layer_30_head": 0.19711242616176605,
      "loss_layer_36_head": 0.1448635756969452,
      "loss_layer_42_head": 0.08250679820775986,
      "loss_layer_6_head": 0.9882232546806335,
      "step": 2905
    },
    {
      "epoch": 37.248,
      "grad_norm": 0.3384356595521196,
      "learning_rate": 0.003947655699277197,
      "loss": 2.6674,
      "loss_layer_12_head": 0.5950087308883667,
      "loss_layer_18_head": 0.49306178092956543,
      "loss_layer_24_head": 0.3145228922367096,
      "loss_layer_30_head": 0.19296430051326752,
      "loss_layer_36_head": 0.13640573620796204,
      "loss_layer_42_head": 0.07236428558826447,
      "loss_layer_6_head": 0.900347113609314,
      "step": 2910
    },
    {
      "epoch": 37.312,
      "grad_norm": 0.34977051410719623,
      "learning_rate": 0.00394309138086917,
      "loss": 2.7055,
      "loss_layer_12_head": 0.6044061779975891,
      "loss_layer_18_head": 0.49832281470298767,
      "loss_layer_24_head": 0.3224148154258728,
      "loss_layer_30_head": 0.19673606753349304,
      "loss_layer_36_head": 0.14417769014835358,
      "loss_layer_42_head": 0.07465990632772446,
      "loss_layer_6_head": 0.9028128385543823,
      "step": 2915
    },
    {
      "epoch": 37.376,
      "grad_norm": 0.2673343638658159,
      "learning_rate": 0.003938519837106284,
      "loss": 2.7798,
      "loss_layer_12_head": 0.6190887689590454,
      "loss_layer_18_head": 0.5168978571891785,
      "loss_layer_24_head": 0.31386953592300415,
      "loss_layer_30_head": 0.19474616646766663,
      "loss_layer_36_head": 0.1335524022579193,
      "loss_layer_42_head": 0.06800462305545807,
      "loss_layer_6_head": 0.9190707206726074,
      "step": 2920
    },
    {
      "epoch": 37.44,
      "grad_norm": 0.4349376822973637,
      "learning_rate": 0.003933941090877615,
      "loss": 2.7692,
      "loss_layer_12_head": 0.597338080406189,
      "loss_layer_18_head": 0.49876052141189575,
      "loss_layer_24_head": 0.305052250623703,
      "loss_layer_30_head": 0.19075345993041992,
      "loss_layer_36_head": 0.13425257802009583,
      "loss_layer_42_head": 0.06914927065372467,
      "loss_layer_6_head": 0.9105979204177856,
      "step": 2925
    },
    {
      "epoch": 37.504,
      "grad_norm": 0.43910246639515166,
      "learning_rate": 0.003929355165108299,
      "loss": 2.7124,
      "loss_layer_12_head": 0.5987933874130249,
      "loss_layer_18_head": 0.4896179735660553,
      "loss_layer_24_head": 0.2972896099090576,
      "loss_layer_30_head": 0.18183863162994385,
      "loss_layer_36_head": 0.12219484150409698,
      "loss_layer_42_head": 0.06615260243415833,
      "loss_layer_6_head": 0.9115802049636841,
      "step": 2930
    },
    {
      "epoch": 37.568,
      "grad_norm": 0.40121032977346255,
      "learning_rate": 0.003924762082759419,
      "loss": 2.8476,
      "loss_layer_12_head": 0.6522512435913086,
      "loss_layer_18_head": 0.5393657684326172,
      "loss_layer_24_head": 0.32813334465026855,
      "loss_layer_30_head": 0.20502790808677673,
      "loss_layer_36_head": 0.1368117779493332,
      "loss_layer_42_head": 0.0729900598526001,
      "loss_layer_6_head": 0.9917710423469543,
      "step": 2935
    },
    {
      "epoch": 37.632,
      "grad_norm": 0.3363594581102286,
      "learning_rate": 0.003920161866827889,
      "loss": 2.8225,
      "loss_layer_12_head": 0.6471527814865112,
      "loss_layer_18_head": 0.5225013494491577,
      "loss_layer_24_head": 0.31993570923805237,
      "loss_layer_30_head": 0.19789080321788788,
      "loss_layer_36_head": 0.12997117638587952,
      "loss_layer_42_head": 0.06805088371038437,
      "loss_layer_6_head": 0.9570301175117493,
      "step": 2940
    },
    {
      "epoch": 37.696,
      "grad_norm": 0.2917611725253952,
      "learning_rate": 0.003915554540346343,
      "loss": 2.7433,
      "loss_layer_12_head": 0.6423002481460571,
      "loss_layer_18_head": 0.522297739982605,
      "loss_layer_24_head": 0.3259136378765106,
      "loss_layer_30_head": 0.1963416039943695,
      "loss_layer_36_head": 0.12592557072639465,
      "loss_layer_42_head": 0.06712385267019272,
      "loss_layer_6_head": 0.9309896230697632,
      "step": 2945
    },
    {
      "epoch": 37.76,
      "grad_norm": 0.23680768936680086,
      "learning_rate": 0.0039109401263830125,
      "loss": 2.7673,
      "loss_layer_12_head": 0.6251779794692993,
      "loss_layer_18_head": 0.5144183039665222,
      "loss_layer_24_head": 0.3232366740703583,
      "loss_layer_30_head": 0.19386479258537292,
      "loss_layer_36_head": 0.12380131334066391,
      "loss_layer_42_head": 0.06713833659887314,
      "loss_layer_6_head": 0.8969538807868958,
      "step": 2950
    },
    {
      "epoch": 37.824,
      "grad_norm": 0.2978445652949941,
      "learning_rate": 0.003906318648041617,
      "loss": 2.7611,
      "loss_layer_12_head": 0.6185048818588257,
      "loss_layer_18_head": 0.5123646855354309,
      "loss_layer_24_head": 0.32553356885910034,
      "loss_layer_30_head": 0.1840483844280243,
      "loss_layer_36_head": 0.11654611676931381,
      "loss_layer_42_head": 0.06466501206159592,
      "loss_layer_6_head": 0.8873685002326965,
      "step": 2955
    },
    {
      "epoch": 37.888,
      "grad_norm": 0.32488734298650496,
      "learning_rate": 0.003901690128461247,
      "loss": 2.7917,
      "loss_layer_12_head": 0.6432753205299377,
      "loss_layer_18_head": 0.5388669967651367,
      "loss_layer_24_head": 0.35112300515174866,
      "loss_layer_30_head": 0.19298215210437775,
      "loss_layer_36_head": 0.12028638273477554,
      "loss_layer_42_head": 0.06800851970911026,
      "loss_layer_6_head": 0.9137986898422241,
      "step": 2960
    },
    {
      "epoch": 37.952,
      "grad_norm": 0.42747832673200237,
      "learning_rate": 0.003897054590816247,
      "loss": 2.83,
      "loss_layer_12_head": 0.6984372138977051,
      "loss_layer_18_head": 0.5573431253433228,
      "loss_layer_24_head": 0.3615175783634186,
      "loss_layer_30_head": 0.1993928700685501,
      "loss_layer_36_head": 0.12172533571720123,
      "loss_layer_42_head": 0.06766859441995621,
      "loss_layer_6_head": 0.9573516845703125,
      "step": 2965
    },
    {
      "epoch": 38.016,
      "grad_norm": 0.313102352961302,
      "learning_rate": 0.003892412058316098,
      "loss": 2.7925,
      "loss_layer_12_head": 0.6605373024940491,
      "loss_layer_18_head": 0.5378445386886597,
      "loss_layer_24_head": 0.3343183994293213,
      "loss_layer_30_head": 0.18666504323482513,
      "loss_layer_36_head": 0.11678838729858398,
      "loss_layer_42_head": 0.06543638557195663,
      "loss_layer_6_head": 0.8997335433959961,
      "step": 2970
    },
    {
      "epoch": 38.08,
      "grad_norm": 0.3619726560048037,
      "learning_rate": 0.0038877625542053074,
      "loss": 2.7269,
      "loss_layer_12_head": 0.6461249589920044,
      "loss_layer_18_head": 0.5353928804397583,
      "loss_layer_24_head": 0.3177814185619354,
      "loss_layer_30_head": 0.18546761572360992,
      "loss_layer_36_head": 0.11521182954311371,
      "loss_layer_42_head": 0.06592053920030594,
      "loss_layer_6_head": 0.8660628199577332,
      "step": 2975
    },
    {
      "epoch": 38.144,
      "grad_norm": 0.4616193095833356,
      "learning_rate": 0.0038831061017632847,
      "loss": 2.8128,
      "loss_layer_12_head": 0.721613347530365,
      "loss_layer_18_head": 0.5976910591125488,
      "loss_layer_24_head": 0.31533166766166687,
      "loss_layer_30_head": 0.19162192940711975,
      "loss_layer_36_head": 0.11783353984355927,
      "loss_layer_42_head": 0.06723606586456299,
      "loss_layer_6_head": 0.868303120136261,
      "step": 2980
    },
    {
      "epoch": 38.208,
      "grad_norm": 0.47511445503014066,
      "learning_rate": 0.0038784427243042296,
      "loss": 2.9632,
      "loss_layer_12_head": 0.7451907992362976,
      "loss_layer_18_head": 0.659608006477356,
      "loss_layer_24_head": 0.31810373067855835,
      "loss_layer_30_head": 0.19690105319023132,
      "loss_layer_36_head": 0.1215253695845604,
      "loss_layer_42_head": 0.06894634664058685,
      "loss_layer_6_head": 0.8882430791854858,
      "step": 2985
    },
    {
      "epoch": 38.272,
      "grad_norm": 0.36632595068085505,
      "learning_rate": 0.003873772445177015,
      "loss": 2.8738,
      "loss_layer_12_head": 0.7080209851264954,
      "loss_layer_18_head": 0.6352149248123169,
      "loss_layer_24_head": 0.3078046441078186,
      "loss_layer_30_head": 0.18977965414524078,
      "loss_layer_36_head": 0.11699408292770386,
      "loss_layer_42_head": 0.07005299627780914,
      "loss_layer_6_head": 0.8685942888259888,
      "step": 2990
    },
    {
      "epoch": 38.336,
      "grad_norm": 0.29338343059241623,
      "learning_rate": 0.00386909528776507,
      "loss": 2.8244,
      "loss_layer_12_head": 0.6761045455932617,
      "loss_layer_18_head": 0.6085726022720337,
      "loss_layer_24_head": 0.2980590760707855,
      "loss_layer_30_head": 0.18735027313232422,
      "loss_layer_36_head": 0.11242540180683136,
      "loss_layer_42_head": 0.06344935297966003,
      "loss_layer_6_head": 0.8559595346450806,
      "step": 2995
    },
    {
      "epoch": 38.4,
      "grad_norm": 0.251093499173636,
      "learning_rate": 0.0038644112754862613,
      "loss": 2.7414,
      "loss_layer_12_head": 0.660624623298645,
      "loss_layer_18_head": 0.5786865949630737,
      "loss_layer_24_head": 0.29942458868026733,
      "loss_layer_30_head": 0.1839340627193451,
      "loss_layer_36_head": 0.1139974445104599,
      "loss_layer_42_head": 0.06308557838201523,
      "loss_layer_6_head": 0.86597740650177,
      "step": 3000
    },
    {
      "epoch": 38.4,
      "eval_loss": 5.504083633422852,
      "eval_loss_layer_12_head": 1.2721730470657349,
      "eval_loss_layer_18_head": 1.1453802585601807,
      "eval_loss_layer_24_head": 0.6997052431106567,
      "eval_loss_layer_30_head": 0.46458181738853455,
      "eval_loss_layer_36_head": 0.29580703377723694,
      "eval_loss_layer_42_head": 0.17190739512443542,
      "eval_loss_layer_6_head": 1.589505910873413,
      "eval_runtime": 33.0779,
      "eval_samples_per_second": 9.674,
      "eval_steps_per_second": 0.605,
      "step": 3000
    },
    {
      "epoch": 38.464,
      "grad_norm": 0.24592125593629488,
      "learning_rate": 0.0038597204317927774,
      "loss": 2.7648,
      "loss_layer_12_head": 0.6472615003585815,
      "loss_layer_18_head": 0.5578858256340027,
      "loss_layer_24_head": 0.3043323755264282,
      "loss_layer_30_head": 0.19378477334976196,
      "loss_layer_36_head": 0.12248624861240387,
      "loss_layer_42_head": 0.06824394315481186,
      "loss_layer_6_head": 0.8716370463371277,
      "step": 3005
    },
    {
      "epoch": 38.528,
      "grad_norm": 0.22178756486816475,
      "learning_rate": 0.0038550227801710103,
      "loss": 2.7608,
      "loss_layer_12_head": 0.6037587523460388,
      "loss_layer_18_head": 0.5155850648880005,
      "loss_layer_24_head": 0.2917455732822418,
      "loss_layer_30_head": 0.19247809052467346,
      "loss_layer_36_head": 0.11423847824335098,
      "loss_layer_42_head": 0.07640949636697769,
      "loss_layer_6_head": 0.8275066614151001,
      "step": 3010
    },
    {
      "epoch": 38.592,
      "grad_norm": 0.30995694485780406,
      "learning_rate": 0.003850318344141439,
      "loss": 2.7317,
      "loss_layer_12_head": 0.6356690526008606,
      "loss_layer_18_head": 0.5317152142524719,
      "loss_layer_24_head": 0.3057738244533539,
      "loss_layer_30_head": 0.20711472630500793,
      "loss_layer_36_head": 0.12728825211524963,
      "loss_layer_42_head": 0.08383827656507492,
      "loss_layer_6_head": 0.8917452692985535,
      "step": 3015
    },
    {
      "epoch": 38.656,
      "grad_norm": 0.23796827822682345,
      "learning_rate": 0.0038456071472585097,
      "loss": 2.7383,
      "loss_layer_12_head": 0.6493391990661621,
      "loss_layer_18_head": 0.5357434749603271,
      "loss_layer_24_head": 0.31578654050827026,
      "loss_layer_30_head": 0.20513999462127686,
      "loss_layer_36_head": 0.12335816770792007,
      "loss_layer_42_head": 0.09057965874671936,
      "loss_layer_6_head": 0.9141073226928711,
      "step": 3020
    },
    {
      "epoch": 38.72,
      "grad_norm": 0.2524003562113516,
      "learning_rate": 0.003840889213110521,
      "loss": 2.7631,
      "loss_layer_12_head": 0.6360443830490112,
      "loss_layer_18_head": 0.5200978517532349,
      "loss_layer_24_head": 0.30984869599342346,
      "loss_layer_30_head": 0.20839419960975647,
      "loss_layer_36_head": 0.12439700216054916,
      "loss_layer_42_head": 0.09376123547554016,
      "loss_layer_6_head": 0.9096482992172241,
      "step": 3025
    },
    {
      "epoch": 38.784,
      "grad_norm": 0.4397550082516409,
      "learning_rate": 0.0038361645653195026,
      "loss": 2.8331,
      "loss_layer_12_head": 0.5937235951423645,
      "loss_layer_18_head": 0.48814716935157776,
      "loss_layer_24_head": 0.2994617521762848,
      "loss_layer_30_head": 0.21595275402069092,
      "loss_layer_36_head": 0.12268757820129395,
      "loss_layer_42_head": 0.0879332572221756,
      "loss_layer_6_head": 0.8815796971321106,
      "step": 3030
    },
    {
      "epoch": 38.848,
      "grad_norm": 0.3642370145368038,
      "learning_rate": 0.0038314332275411,
      "loss": 2.8818,
      "loss_layer_12_head": 0.6351650357246399,
      "loss_layer_18_head": 0.5192943811416626,
      "loss_layer_24_head": 0.32984572649002075,
      "loss_layer_30_head": 0.25361162424087524,
      "loss_layer_36_head": 0.13206881284713745,
      "loss_layer_42_head": 0.09539458155632019,
      "loss_layer_6_head": 0.9410699605941772,
      "step": 3035
    },
    {
      "epoch": 38.912,
      "grad_norm": 0.5656637548292068,
      "learning_rate": 0.003826695223464454,
      "loss": 2.9067,
      "loss_layer_12_head": 0.5891318917274475,
      "loss_layer_18_head": 0.480643093585968,
      "loss_layer_24_head": 0.31037047505378723,
      "loss_layer_30_head": 0.24467399716377258,
      "loss_layer_36_head": 0.12586024403572083,
      "loss_layer_42_head": 0.09484275430440903,
      "loss_layer_6_head": 0.9491490125656128,
      "step": 3040
    },
    {
      "epoch": 38.976,
      "grad_norm": 0.5597820822775731,
      "learning_rate": 0.003821950576812081,
      "loss": 2.9312,
      "loss_layer_12_head": 0.6734302639961243,
      "loss_layer_18_head": 0.550351619720459,
      "loss_layer_24_head": 0.34866639971733093,
      "loss_layer_30_head": 0.23890872299671173,
      "loss_layer_36_head": 0.13570788502693176,
      "loss_layer_42_head": 0.09604327380657196,
      "loss_layer_6_head": 1.0317109823226929,
      "step": 3045
    },
    {
      "epoch": 39.04,
      "grad_norm": 0.2739812211240054,
      "learning_rate": 0.0038171993113397585,
      "loss": 2.7751,
      "loss_layer_12_head": 0.6038552522659302,
      "loss_layer_18_head": 0.4852770268917084,
      "loss_layer_24_head": 0.3149782419204712,
      "loss_layer_30_head": 0.2067907154560089,
      "loss_layer_36_head": 0.1239745020866394,
      "loss_layer_42_head": 0.08636505901813507,
      "loss_layer_6_head": 0.9519402384757996,
      "step": 3050
    },
    {
      "epoch": 39.104,
      "grad_norm": 0.4322359582975524,
      "learning_rate": 0.0038124414508364,
      "loss": 2.755,
      "loss_layer_12_head": 0.5836786031723022,
      "loss_layer_18_head": 0.46906667947769165,
      "loss_layer_24_head": 0.3156316578388214,
      "loss_layer_30_head": 0.20084905624389648,
      "loss_layer_36_head": 0.12742778658866882,
      "loss_layer_42_head": 0.08565910160541534,
      "loss_layer_6_head": 0.9151412844657898,
      "step": 3055
    },
    {
      "epoch": 39.168,
      "grad_norm": 0.4154160337958281,
      "learning_rate": 0.003807677019123944,
      "loss": 2.6816,
      "loss_layer_12_head": 0.5885897874832153,
      "loss_layer_18_head": 0.4746008813381195,
      "loss_layer_24_head": 0.3267833888530731,
      "loss_layer_30_head": 0.19942393898963928,
      "loss_layer_36_head": 0.12906867265701294,
      "loss_layer_42_head": 0.08297822624444962,
      "loss_layer_6_head": 0.9026381373405457,
      "step": 3060
    },
    {
      "epoch": 39.232,
      "grad_norm": 0.3123452964764486,
      "learning_rate": 0.003802906040057226,
      "loss": 2.7718,
      "loss_layer_12_head": 0.5960981845855713,
      "loss_layer_18_head": 0.4815024435520172,
      "loss_layer_24_head": 0.34586289525032043,
      "loss_layer_30_head": 0.19717426598072052,
      "loss_layer_36_head": 0.1267048865556717,
      "loss_layer_42_head": 0.0830535963177681,
      "loss_layer_6_head": 0.9160143136978149,
      "step": 3065
    },
    {
      "epoch": 39.296,
      "grad_norm": 0.3995518090594903,
      "learning_rate": 0.003798128537523865,
      "loss": 2.809,
      "loss_layer_12_head": 0.5980879068374634,
      "loss_layer_18_head": 0.4828875660896301,
      "loss_layer_24_head": 0.36801692843437195,
      "loss_layer_30_head": 0.19810347259044647,
      "loss_layer_36_head": 0.12252388894557953,
      "loss_layer_42_head": 0.07836957275867462,
      "loss_layer_6_head": 0.9235044717788696,
      "step": 3070
    },
    {
      "epoch": 39.36,
      "grad_norm": 0.42465251034091367,
      "learning_rate": 0.003793344535444142,
      "loss": 2.7858,
      "loss_layer_12_head": 0.5854050517082214,
      "loss_layer_18_head": 0.48178333044052124,
      "loss_layer_24_head": 0.3497649133205414,
      "loss_layer_30_head": 0.20997881889343262,
      "loss_layer_36_head": 0.1221608966588974,
      "loss_layer_42_head": 0.07731282711029053,
      "loss_layer_6_head": 0.886701226234436,
      "step": 3075
    },
    {
      "epoch": 39.424,
      "grad_norm": 0.22819786452553015,
      "learning_rate": 0.0037885540577708805,
      "loss": 2.7591,
      "loss_layer_12_head": 0.6364872455596924,
      "loss_layer_18_head": 0.5202958583831787,
      "loss_layer_24_head": 0.35621073842048645,
      "loss_layer_30_head": 0.22776544094085693,
      "loss_layer_36_head": 0.128972589969635,
      "loss_layer_42_head": 0.07561160624027252,
      "loss_layer_6_head": 0.9372507333755493,
      "step": 3080
    },
    {
      "epoch": 39.488,
      "grad_norm": 0.4233561161631688,
      "learning_rate": 0.003783757128489326,
      "loss": 2.8079,
      "loss_layer_12_head": 0.6367073059082031,
      "loss_layer_18_head": 0.5254141092300415,
      "loss_layer_24_head": 0.3436564803123474,
      "loss_layer_30_head": 0.23009245097637177,
      "loss_layer_36_head": 0.11977871507406235,
      "loss_layer_42_head": 0.07319626957178116,
      "loss_layer_6_head": 0.9569320678710938,
      "step": 3085
    },
    {
      "epoch": 39.552,
      "grad_norm": 0.3881166767057087,
      "learning_rate": 0.0037789537716170253,
      "loss": 2.7855,
      "loss_layer_12_head": 0.613114058971405,
      "loss_layer_18_head": 0.5012235045433044,
      "loss_layer_24_head": 0.3379970192909241,
      "loss_layer_30_head": 0.20913943648338318,
      "loss_layer_36_head": 0.11745939403772354,
      "loss_layer_42_head": 0.0797743946313858,
      "loss_layer_6_head": 0.9097383618354797,
      "step": 3090
    },
    {
      "epoch": 39.616,
      "grad_norm": 0.23769144553666421,
      "learning_rate": 0.0037741440112037095,
      "loss": 2.7262,
      "loss_layer_12_head": 0.6141175031661987,
      "loss_layer_18_head": 0.5087946653366089,
      "loss_layer_24_head": 0.32603663206100464,
      "loss_layer_30_head": 0.19756761193275452,
      "loss_layer_36_head": 0.1153147965669632,
      "loss_layer_42_head": 0.0778246745467186,
      "loss_layer_6_head": 0.9185993075370789,
      "step": 3095
    },
    {
      "epoch": 39.68,
      "grad_norm": 0.4488506984428842,
      "learning_rate": 0.00376932787133117,
      "loss": 2.8164,
      "loss_layer_12_head": 0.6043463945388794,
      "loss_layer_18_head": 0.5077724456787109,
      "loss_layer_24_head": 0.3153781592845917,
      "loss_layer_30_head": 0.19422955811023712,
      "loss_layer_36_head": 0.11776633560657501,
      "loss_layer_42_head": 0.07959893345832825,
      "loss_layer_6_head": 0.9219226837158203,
      "step": 3100
    },
    {
      "epoch": 39.744,
      "grad_norm": 0.4095279666512369,
      "learning_rate": 0.003764505376113138,
      "loss": 2.7765,
      "loss_layer_12_head": 0.6586071848869324,
      "loss_layer_18_head": 0.5443697571754456,
      "loss_layer_24_head": 0.32876789569854736,
      "loss_layer_30_head": 0.2001303732395172,
      "loss_layer_36_head": 0.12329047918319702,
      "loss_layer_42_head": 0.07767374068498611,
      "loss_layer_6_head": 0.9770455360412598,
      "step": 3105
    },
    {
      "epoch": 39.808,
      "grad_norm": 0.1962327961406296,
      "learning_rate": 0.003759676549695168,
      "loss": 2.775,
      "loss_layer_12_head": 0.6400882005691528,
      "loss_layer_18_head": 0.5287758708000183,
      "loss_layer_24_head": 0.3301001787185669,
      "loss_layer_30_head": 0.2020631581544876,
      "loss_layer_36_head": 0.12460029125213623,
      "loss_layer_42_head": 0.08124488592147827,
      "loss_layer_6_head": 0.9413391351699829,
      "step": 3110
    },
    {
      "epoch": 39.872,
      "grad_norm": 0.4102446538269867,
      "learning_rate": 0.003754841416254512,
      "loss": 2.7872,
      "loss_layer_12_head": 0.6157166361808777,
      "loss_layer_18_head": 0.4936802387237549,
      "loss_layer_24_head": 0.3090026378631592,
      "loss_layer_30_head": 0.18421049416065216,
      "loss_layer_36_head": 0.11597124487161636,
      "loss_layer_42_head": 0.07273619621992111,
      "loss_layer_6_head": 0.9062643051147461,
      "step": 3115
    },
    {
      "epoch": 39.936,
      "grad_norm": 0.5436104258619374,
      "learning_rate": 0.00375,
      "loss": 2.7857,
      "loss_layer_12_head": 0.6372242569923401,
      "loss_layer_18_head": 0.5171113014221191,
      "loss_layer_24_head": 0.3244021236896515,
      "loss_layer_30_head": 0.19536344707012177,
      "loss_layer_36_head": 0.12226893752813339,
      "loss_layer_42_head": 0.07427143305540085,
      "loss_layer_6_head": 0.9510096311569214,
      "step": 3120
    },
    {
      "epoch": 40.0,
      "grad_norm": 0.31227124351583835,
      "learning_rate": 0.0037451523251719205,
      "loss": 2.766,
      "loss_layer_12_head": 0.6329264640808105,
      "loss_layer_18_head": 0.5011588931083679,
      "loss_layer_24_head": 0.3094762861728668,
      "loss_layer_30_head": 0.18521365523338318,
      "loss_layer_36_head": 0.11625976860523224,
      "loss_layer_42_head": 0.06869713217020035,
      "loss_layer_6_head": 0.9413679242134094,
      "step": 3125
    },
    {
      "epoch": 40.064,
      "grad_norm": 0.22877349472390357,
      "learning_rate": 0.0037402984160418975,
      "loss": 2.5774,
      "loss_layer_12_head": 0.5927484035491943,
      "loss_layer_18_head": 0.47695010900497437,
      "loss_layer_24_head": 0.2906946539878845,
      "loss_layer_30_head": 0.1755046546459198,
      "loss_layer_36_head": 0.11177919059991837,
      "loss_layer_42_head": 0.06491716206073761,
      "loss_layer_6_head": 0.8760043978691101,
      "step": 3130
    },
    {
      "epoch": 40.128,
      "grad_norm": 0.34644361470939256,
      "learning_rate": 0.0037354382969127676,
      "loss": 2.6075,
      "loss_layer_12_head": 0.6011173129081726,
      "loss_layer_18_head": 0.4818390905857086,
      "loss_layer_24_head": 0.288738876581192,
      "loss_layer_30_head": 0.17289377748966217,
      "loss_layer_36_head": 0.10773468017578125,
      "loss_layer_42_head": 0.06374011933803558,
      "loss_layer_6_head": 0.866104245185852,
      "step": 3135
    },
    {
      "epoch": 40.192,
      "grad_norm": 0.20176471175405114,
      "learning_rate": 0.0037305719921184623,
      "loss": 2.6478,
      "loss_layer_12_head": 0.6031110286712646,
      "loss_layer_18_head": 0.4781804084777832,
      "loss_layer_24_head": 0.29242268204689026,
      "loss_layer_30_head": 0.1809597909450531,
      "loss_layer_36_head": 0.11300389468669891,
      "loss_layer_42_head": 0.08098061382770538,
      "loss_layer_6_head": 0.8655881881713867,
      "step": 3140
    },
    {
      "epoch": 40.256,
      "grad_norm": 0.29021342758894164,
      "learning_rate": 0.003725699526023882,
      "loss": 2.6294,
      "loss_layer_12_head": 0.6017006039619446,
      "loss_layer_18_head": 0.47265100479125977,
      "loss_layer_24_head": 0.28772151470184326,
      "loss_layer_30_head": 0.17351442575454712,
      "loss_layer_36_head": 0.1125485897064209,
      "loss_layer_42_head": 0.06657262146472931,
      "loss_layer_6_head": 0.8663709759712219,
      "step": 3145
    },
    {
      "epoch": 40.32,
      "grad_norm": 0.2465435485716508,
      "learning_rate": 0.003720820923024778,
      "loss": 2.5902,
      "loss_layer_12_head": 0.5941504836082458,
      "loss_layer_18_head": 0.4781847596168518,
      "loss_layer_24_head": 0.2913573682308197,
      "loss_layer_30_head": 0.17666999995708466,
      "loss_layer_36_head": 0.11400370299816132,
      "loss_layer_42_head": 0.07346412539482117,
      "loss_layer_6_head": 0.8549949526786804,
      "step": 3150
    },
    {
      "epoch": 40.384,
      "grad_norm": 0.19977271775041708,
      "learning_rate": 0.0037159362075476253,
      "loss": 2.6372,
      "loss_layer_12_head": 0.6328919529914856,
      "loss_layer_18_head": 0.5075710415840149,
      "loss_layer_24_head": 0.3122320771217346,
      "loss_layer_30_head": 0.1881488710641861,
      "loss_layer_36_head": 0.1179674044251442,
      "loss_layer_42_head": 0.06539580971002579,
      "loss_layer_6_head": 0.9006508588790894,
      "step": 3155
    },
    {
      "epoch": 40.448,
      "grad_norm": 0.2036691752398082,
      "learning_rate": 0.0037110454040495066,
      "loss": 2.66,
      "loss_layer_12_head": 0.6061472296714783,
      "loss_layer_18_head": 0.4970950484275818,
      "loss_layer_24_head": 0.29530489444732666,
      "loss_layer_30_head": 0.17806801199913025,
      "loss_layer_36_head": 0.11323803663253784,
      "loss_layer_42_head": 0.06441748142242432,
      "loss_layer_6_head": 0.8540397882461548,
      "step": 3160
    },
    {
      "epoch": 40.512,
      "grad_norm": 0.24281459699508542,
      "learning_rate": 0.0037061485370179837,
      "loss": 2.6811,
      "loss_layer_12_head": 0.6028560400009155,
      "loss_layer_18_head": 0.5023148655891418,
      "loss_layer_24_head": 0.2961641252040863,
      "loss_layer_30_head": 0.17944392561912537,
      "loss_layer_36_head": 0.11276106536388397,
      "loss_layer_42_head": 0.061495862901210785,
      "loss_layer_6_head": 0.8584586977958679,
      "step": 3165
    },
    {
      "epoch": 40.576,
      "grad_norm": 0.21433340032436105,
      "learning_rate": 0.0037012456309709787,
      "loss": 2.6724,
      "loss_layer_12_head": 0.6242291331291199,
      "loss_layer_18_head": 0.5247893929481506,
      "loss_layer_24_head": 0.31006088852882385,
      "loss_layer_30_head": 0.18574002385139465,
      "loss_layer_36_head": 0.1295963078737259,
      "loss_layer_42_head": 0.06737679243087769,
      "loss_layer_6_head": 0.8814622163772583,
      "step": 3170
    },
    {
      "epoch": 40.64,
      "grad_norm": 0.2188458517267226,
      "learning_rate": 0.00369633671045665,
      "loss": 2.6923,
      "loss_layer_12_head": 0.6200834512710571,
      "loss_layer_18_head": 0.49812978506088257,
      "loss_layer_24_head": 0.29711610078811646,
      "loss_layer_30_head": 0.18047167360782623,
      "loss_layer_36_head": 0.12078557908535004,
      "loss_layer_42_head": 0.06398826092481613,
      "loss_layer_6_head": 0.8670821189880371,
      "step": 3175
    },
    {
      "epoch": 40.704,
      "grad_norm": 0.2869814600143744,
      "learning_rate": 0.0036914218000532696,
      "loss": 2.7244,
      "loss_layer_12_head": 0.6211539506912231,
      "loss_layer_18_head": 0.49396592378616333,
      "loss_layer_24_head": 0.29533591866493225,
      "loss_layer_30_head": 0.17796598374843597,
      "loss_layer_36_head": 0.12061163038015366,
      "loss_layer_42_head": 0.06133504956960678,
      "loss_layer_6_head": 0.859447181224823,
      "step": 3180
    },
    {
      "epoch": 40.768,
      "grad_norm": 0.3057982281592835,
      "learning_rate": 0.003686500924369101,
      "loss": 2.6573,
      "loss_layer_12_head": 0.5977885723114014,
      "loss_layer_18_head": 0.474559485912323,
      "loss_layer_24_head": 0.28045234084129333,
      "loss_layer_30_head": 0.17006340622901917,
      "loss_layer_36_head": 0.12222598493099213,
      "loss_layer_42_head": 0.061814140528440475,
      "loss_layer_6_head": 0.8287792205810547,
      "step": 3185
    },
    {
      "epoch": 40.832,
      "grad_norm": 0.2581366088461307,
      "learning_rate": 0.003681574108042274,
      "loss": 2.7626,
      "loss_layer_12_head": 0.6783490180969238,
      "loss_layer_18_head": 0.5328351259231567,
      "loss_layer_24_head": 0.3365214765071869,
      "loss_layer_30_head": 0.19413094222545624,
      "loss_layer_36_head": 0.14398324489593506,
      "loss_layer_42_head": 0.07372082769870758,
      "loss_layer_6_head": 0.9237712025642395,
      "step": 3190
    },
    {
      "epoch": 40.896,
      "grad_norm": 0.26416086523450144,
      "learning_rate": 0.003676641375740662,
      "loss": 2.7781,
      "loss_layer_12_head": 0.6417745351791382,
      "loss_layer_18_head": 0.5134273767471313,
      "loss_layer_24_head": 0.3316226601600647,
      "loss_layer_30_head": 0.1872214376926422,
      "loss_layer_36_head": 0.18015660345554352,
      "loss_layer_42_head": 0.06879256665706635,
      "loss_layer_6_head": 0.8889487981796265,
      "step": 3195
    },
    {
      "epoch": 40.96,
      "grad_norm": 0.2694959824107292,
      "learning_rate": 0.003671702752161759,
      "loss": 2.8092,
      "loss_layer_12_head": 0.6394708752632141,
      "loss_layer_18_head": 0.5083087682723999,
      "loss_layer_24_head": 0.3230512738227844,
      "loss_layer_30_head": 0.18288075923919678,
      "loss_layer_36_head": 0.17252400517463684,
      "loss_layer_42_head": 0.06645487248897552,
      "loss_layer_6_head": 0.8781023025512695,
      "step": 3200
    },
    {
      "epoch": 40.96,
      "eval_loss": 5.487564563751221,
      "eval_loss_layer_12_head": 1.2511590719223022,
      "eval_loss_layer_18_head": 1.0804758071899414,
      "eval_loss_layer_24_head": 0.7122905850410461,
      "eval_loss_layer_30_head": 0.46017780900001526,
      "eval_loss_layer_36_head": 0.35437604784965515,
      "eval_loss_layer_42_head": 0.17385894060134888,
      "eval_loss_layer_6_head": 1.5899020433425903,
      "eval_runtime": 33.0345,
      "eval_samples_per_second": 9.687,
      "eval_steps_per_second": 0.605,
      "step": 3200
    },
    {
      "epoch": 41.024,
      "grad_norm": 0.3114262873334899,
      "learning_rate": 0.003666758262032558,
      "loss": 2.7544,
      "loss_layer_12_head": 0.6133272051811218,
      "loss_layer_18_head": 0.4982215464115143,
      "loss_layer_24_head": 0.33510226011276245,
      "loss_layer_30_head": 0.18715807795524597,
      "loss_layer_36_head": 0.16388842463493347,
      "loss_layer_42_head": 0.07192866504192352,
      "loss_layer_6_head": 0.8558881878852844,
      "step": 3205
    },
    {
      "epoch": 41.088,
      "grad_norm": 0.24987140170721567,
      "learning_rate": 0.0036618079301094213,
      "loss": 2.5942,
      "loss_layer_12_head": 0.5903779864311218,
      "loss_layer_18_head": 0.47151345014572144,
      "loss_layer_24_head": 0.3202332556247711,
      "loss_layer_30_head": 0.18168288469314575,
      "loss_layer_36_head": 0.14033150672912598,
      "loss_layer_42_head": 0.07468752562999725,
      "loss_layer_6_head": 0.8356053233146667,
      "step": 3210
    },
    {
      "epoch": 41.152,
      "grad_norm": 0.1952465270763384,
      "learning_rate": 0.0036568517811779635,
      "loss": 2.6239,
      "loss_layer_12_head": 0.6300411820411682,
      "loss_layer_18_head": 0.5078171491622925,
      "loss_layer_24_head": 0.3298870027065277,
      "loss_layer_30_head": 0.19359995424747467,
      "loss_layer_36_head": 0.14058391749858856,
      "loss_layer_42_head": 0.07575077563524246,
      "loss_layer_6_head": 0.8838054537773132,
      "step": 3215
    },
    {
      "epoch": 41.216,
      "grad_norm": 0.22931687015244626,
      "learning_rate": 0.0036518898400529215,
      "loss": 2.6257,
      "loss_layer_12_head": 0.5962705612182617,
      "loss_layer_18_head": 0.4712589383125305,
      "loss_layer_24_head": 0.3074672818183899,
      "loss_layer_30_head": 0.1811036616563797,
      "loss_layer_36_head": 0.12424594163894653,
      "loss_layer_42_head": 0.06891561299562454,
      "loss_layer_6_head": 0.8379788398742676,
      "step": 3220
    },
    {
      "epoch": 41.28,
      "grad_norm": 0.1908398766826634,
      "learning_rate": 0.0036469221315780353,
      "loss": 2.6679,
      "loss_layer_12_head": 0.6253092288970947,
      "loss_layer_18_head": 0.5115126371383667,
      "loss_layer_24_head": 0.3210536539554596,
      "loss_layer_30_head": 0.18615934252738953,
      "loss_layer_36_head": 0.12639640271663666,
      "loss_layer_42_head": 0.0724358856678009,
      "loss_layer_6_head": 0.8764041066169739,
      "step": 3225
    },
    {
      "epoch": 41.344,
      "grad_norm": 0.17290736421067632,
      "learning_rate": 0.0036419486806259192,
      "loss": 2.6536,
      "loss_layer_12_head": 0.6199604272842407,
      "loss_layer_18_head": 0.5063698887825012,
      "loss_layer_24_head": 0.3150082230567932,
      "loss_layer_30_head": 0.18595537543296814,
      "loss_layer_36_head": 0.12413308769464493,
      "loss_layer_42_head": 0.07177269458770752,
      "loss_layer_6_head": 0.8738277554512024,
      "step": 3230
    },
    {
      "epoch": 41.408,
      "grad_norm": 0.2072084933355073,
      "learning_rate": 0.0036369695120979403,
      "loss": 2.6723,
      "loss_layer_12_head": 0.623599648475647,
      "loss_layer_18_head": 0.5107603073120117,
      "loss_layer_24_head": 0.31838661432266235,
      "loss_layer_30_head": 0.2249329835176468,
      "loss_layer_36_head": 0.14536790549755096,
      "loss_layer_42_head": 0.09252939373254776,
      "loss_layer_6_head": 0.8676303029060364,
      "step": 3235
    },
    {
      "epoch": 41.472,
      "grad_norm": 0.23130715320359935,
      "learning_rate": 0.0036319846509240937,
      "loss": 2.6614,
      "loss_layer_12_head": 0.6295963525772095,
      "loss_layer_18_head": 0.5039094686508179,
      "loss_layer_24_head": 0.3115098476409912,
      "loss_layer_30_head": 0.18662229180335999,
      "loss_layer_36_head": 0.12044142186641693,
      "loss_layer_42_head": 0.06980974227190018,
      "loss_layer_6_head": 0.8799479603767395,
      "step": 3240
    },
    {
      "epoch": 41.536,
      "grad_norm": 0.23407430491746034,
      "learning_rate": 0.003626994122062874,
      "loss": 2.6196,
      "loss_layer_12_head": 0.6077539324760437,
      "loss_layer_18_head": 0.48642343282699585,
      "loss_layer_24_head": 0.2957361340522766,
      "loss_layer_30_head": 0.17789289355278015,
      "loss_layer_36_head": 0.11458496749401093,
      "loss_layer_42_head": 0.06866320222616196,
      "loss_layer_6_head": 0.8495038151741028,
      "step": 3245
    },
    {
      "epoch": 41.6,
      "grad_norm": 0.1729157230543318,
      "learning_rate": 0.0036219979505011557,
      "loss": 2.677,
      "loss_layer_12_head": 0.6269460916519165,
      "loss_layer_18_head": 0.5104289054870605,
      "loss_layer_24_head": 0.30582308769226074,
      "loss_layer_30_head": 0.187034010887146,
      "loss_layer_36_head": 0.11905916035175323,
      "loss_layer_42_head": 0.06857718527317047,
      "loss_layer_6_head": 0.8876310586929321,
      "step": 3250
    },
    {
      "epoch": 41.664,
      "grad_norm": 0.2367201002064759,
      "learning_rate": 0.0036169961612540647,
      "loss": 2.7307,
      "loss_layer_12_head": 0.6361056566238403,
      "loss_layer_18_head": 0.5186387896537781,
      "loss_layer_24_head": 0.32103487849235535,
      "loss_layer_30_head": 0.2180800884962082,
      "loss_layer_36_head": 0.12485262006521225,
      "loss_layer_42_head": 0.07388447225093842,
      "loss_layer_6_head": 0.8889653086662292,
      "step": 3255
    },
    {
      "epoch": 41.728,
      "grad_norm": 0.18569714844152602,
      "learning_rate": 0.003611988779364853,
      "loss": 2.7098,
      "loss_layer_12_head": 0.6107989549636841,
      "loss_layer_18_head": 0.4909587800502777,
      "loss_layer_24_head": 0.2997896373271942,
      "loss_layer_30_head": 0.18753619492053986,
      "loss_layer_36_head": 0.1123322993516922,
      "loss_layer_42_head": 0.06323741376399994,
      "loss_layer_6_head": 0.8629115223884583,
      "step": 3260
    },
    {
      "epoch": 41.792,
      "grad_norm": 0.21282374899852594,
      "learning_rate": 0.0036069758299047765,
      "loss": 2.6934,
      "loss_layer_12_head": 0.6263232827186584,
      "loss_layer_18_head": 0.5139836668968201,
      "loss_layer_24_head": 0.31315210461616516,
      "loss_layer_30_head": 0.18950895965099335,
      "loss_layer_36_head": 0.11491559445858002,
      "loss_layer_42_head": 0.06470932066440582,
      "loss_layer_6_head": 0.8852592706680298,
      "step": 3265
    },
    {
      "epoch": 41.856,
      "grad_norm": 0.23641977240264073,
      "learning_rate": 0.003601957337972964,
      "loss": 2.7289,
      "loss_layer_12_head": 0.6471854448318481,
      "loss_layer_18_head": 0.5373227596282959,
      "loss_layer_24_head": 0.3180321753025055,
      "loss_layer_30_head": 0.19294044375419617,
      "loss_layer_36_head": 0.11779401451349258,
      "loss_layer_42_head": 0.06497116386890411,
      "loss_layer_6_head": 0.9184333086013794,
      "step": 3270
    },
    {
      "epoch": 41.92,
      "grad_norm": 0.25950378407547764,
      "learning_rate": 0.003596933328696298,
      "loss": 2.7148,
      "loss_layer_12_head": 0.6367674469947815,
      "loss_layer_18_head": 0.5352760553359985,
      "loss_layer_24_head": 0.31659501791000366,
      "loss_layer_30_head": 0.191913440823555,
      "loss_layer_36_head": 0.11999158561229706,
      "loss_layer_42_head": 0.06697367131710052,
      "loss_layer_6_head": 0.9072548747062683,
      "step": 3275
    },
    {
      "epoch": 41.984,
      "grad_norm": 0.22066742834622285,
      "learning_rate": 0.003591903827229282,
      "loss": 2.6877,
      "loss_layer_12_head": 0.5977888703346252,
      "loss_layer_18_head": 0.49888259172439575,
      "loss_layer_24_head": 0.29708096385002136,
      "loss_layer_30_head": 0.17773592472076416,
      "loss_layer_36_head": 0.11139806360006332,
      "loss_layer_42_head": 0.06247534602880478,
      "loss_layer_6_head": 0.8564966917037964,
      "step": 3280
    },
    {
      "epoch": 42.048,
      "grad_norm": 0.3308611581596246,
      "learning_rate": 0.0035868688587539213,
      "loss": 2.6851,
      "loss_layer_12_head": 0.6008946299552917,
      "loss_layer_18_head": 0.4913597106933594,
      "loss_layer_24_head": 0.2957885265350342,
      "loss_layer_30_head": 0.18221071362495422,
      "loss_layer_36_head": 0.14234955608844757,
      "loss_layer_42_head": 0.07343493402004242,
      "loss_layer_6_head": 0.8733687400817871,
      "step": 3285
    },
    {
      "epoch": 42.112,
      "grad_norm": 0.5564192120480225,
      "learning_rate": 0.0035818284484795903,
      "loss": 2.6412,
      "loss_layer_12_head": 0.6164496541023254,
      "loss_layer_18_head": 0.49888402223587036,
      "loss_layer_24_head": 0.3026379644870758,
      "loss_layer_30_head": 0.18238389492034912,
      "loss_layer_36_head": 0.11365504562854767,
      "loss_layer_42_head": 0.06417565792798996,
      "loss_layer_6_head": 0.9298850297927856,
      "step": 3290
    },
    {
      "epoch": 42.176,
      "grad_norm": 0.5444431501129395,
      "learning_rate": 0.0035767826216429144,
      "loss": 2.7036,
      "loss_layer_12_head": 0.5820105075836182,
      "loss_layer_18_head": 0.466066837310791,
      "loss_layer_24_head": 0.28499144315719604,
      "loss_layer_30_head": 0.1713535487651825,
      "loss_layer_36_head": 0.10816244781017303,
      "loss_layer_42_head": 0.06201348453760147,
      "loss_layer_6_head": 0.9386860728263855,
      "step": 3295
    },
    {
      "epoch": 42.24,
      "grad_norm": 0.394975472675338,
      "learning_rate": 0.003571731403507635,
      "loss": 2.6997,
      "loss_layer_12_head": 0.6077699661254883,
      "loss_layer_18_head": 0.4978962540626526,
      "loss_layer_24_head": 0.294658899307251,
      "loss_layer_30_head": 0.1792612075805664,
      "loss_layer_36_head": 0.11060982942581177,
      "loss_layer_42_head": 0.06276079267263412,
      "loss_layer_6_head": 0.9686356782913208,
      "step": 3300
    },
    {
      "epoch": 42.304,
      "grad_norm": 0.3395268479269435,
      "learning_rate": 0.003566674819364489,
      "loss": 2.7201,
      "loss_layer_12_head": 0.6091223955154419,
      "loss_layer_18_head": 0.501832127571106,
      "loss_layer_24_head": 0.30197587609291077,
      "loss_layer_30_head": 0.18997706472873688,
      "loss_layer_36_head": 0.11230902373790741,
      "loss_layer_42_head": 0.06219832971692085,
      "loss_layer_6_head": 0.9561971426010132,
      "step": 3305
    },
    {
      "epoch": 42.368,
      "grad_norm": 0.3329207668012452,
      "learning_rate": 0.00356161289453108,
      "loss": 2.7053,
      "loss_layer_12_head": 0.619464635848999,
      "loss_layer_18_head": 0.5084964036941528,
      "loss_layer_24_head": 0.30682462453842163,
      "loss_layer_30_head": 0.19047123193740845,
      "loss_layer_36_head": 0.11504773795604706,
      "loss_layer_42_head": 0.06501453369855881,
      "loss_layer_6_head": 0.9908730387687683,
      "step": 3310
    },
    {
      "epoch": 42.432,
      "grad_norm": 0.2776731293644964,
      "learning_rate": 0.003556545654351749,
      "loss": 2.7162,
      "loss_layer_12_head": 0.6029380559921265,
      "loss_layer_18_head": 0.48817962408065796,
      "loss_layer_24_head": 0.29165494441986084,
      "loss_layer_30_head": 0.18454787135124207,
      "loss_layer_36_head": 0.1103317141532898,
      "loss_layer_42_head": 0.06725440919399261,
      "loss_layer_6_head": 0.9274389147758484,
      "step": 3315
    },
    {
      "epoch": 42.496,
      "grad_norm": 0.2793921537687215,
      "learning_rate": 0.003551473124197454,
      "loss": 2.6917,
      "loss_layer_12_head": 0.6371121406555176,
      "loss_layer_18_head": 0.5074896216392517,
      "loss_layer_24_head": 0.3121703863143921,
      "loss_layer_30_head": 0.18871934711933136,
      "loss_layer_36_head": 0.11853235960006714,
      "loss_layer_42_head": 0.0696822851896286,
      "loss_layer_6_head": 0.9236465692520142,
      "step": 3320
    },
    {
      "epoch": 42.56,
      "grad_norm": 0.3324938591158146,
      "learning_rate": 0.0035463953294656366,
      "loss": 2.7557,
      "loss_layer_12_head": 0.6599602103233337,
      "loss_layer_18_head": 0.49543362855911255,
      "loss_layer_24_head": 0.310953289270401,
      "loss_layer_30_head": 0.18484032154083252,
      "loss_layer_36_head": 0.11619257926940918,
      "loss_layer_42_head": 0.0703888013958931,
      "loss_layer_6_head": 0.9049943089485168,
      "step": 3325
    },
    {
      "epoch": 42.624,
      "grad_norm": 0.4878986499434331,
      "learning_rate": 0.0035413122955801003,
      "loss": 2.807,
      "loss_layer_12_head": 0.6708885431289673,
      "loss_layer_18_head": 0.49295181035995483,
      "loss_layer_24_head": 0.3126205801963806,
      "loss_layer_30_head": 0.18356665968894958,
      "loss_layer_36_head": 0.12015299499034882,
      "loss_layer_42_head": 0.07860212028026581,
      "loss_layer_6_head": 0.8591581583023071,
      "step": 3330
    },
    {
      "epoch": 42.688,
      "grad_norm": 0.5619352324336725,
      "learning_rate": 0.0035362240479908753,
      "loss": 2.839,
      "loss_layer_12_head": 0.7508238554000854,
      "loss_layer_18_head": 0.520358681678772,
      "loss_layer_24_head": 0.32158663868904114,
      "loss_layer_30_head": 0.18547789752483368,
      "loss_layer_36_head": 0.12168946117162704,
      "loss_layer_42_head": 0.08150520920753479,
      "loss_layer_6_head": 0.907148003578186,
      "step": 3335
    },
    {
      "epoch": 42.752,
      "grad_norm": 0.37095665999119676,
      "learning_rate": 0.0035311306121741017,
      "loss": 2.8992,
      "loss_layer_12_head": 0.7545850872993469,
      "loss_layer_18_head": 0.48756590485572815,
      "loss_layer_24_head": 0.30478015542030334,
      "loss_layer_30_head": 0.186281219124794,
      "loss_layer_36_head": 0.1202142983675003,
      "loss_layer_42_head": 0.08346305787563324,
      "loss_layer_6_head": 0.856026828289032,
      "step": 3340
    },
    {
      "epoch": 42.816,
      "grad_norm": 0.23870189916206414,
      "learning_rate": 0.0035260320136318926,
      "loss": 2.8559,
      "loss_layer_12_head": 0.7044896483421326,
      "loss_layer_18_head": 0.5007058382034302,
      "loss_layer_24_head": 0.31562739610671997,
      "loss_layer_30_head": 0.1902187019586563,
      "loss_layer_36_head": 0.1288297474384308,
      "loss_layer_42_head": 0.08653372526168823,
      "loss_layer_6_head": 0.8557745218276978,
      "step": 3345
    },
    {
      "epoch": 42.88,
      "grad_norm": 0.21035786275528032,
      "learning_rate": 0.00352092827789221,
      "loss": 2.7872,
      "loss_layer_12_head": 0.691085696220398,
      "loss_layer_18_head": 0.5126772522926331,
      "loss_layer_24_head": 0.3212660253047943,
      "loss_layer_30_head": 0.19346901774406433,
      "loss_layer_36_head": 0.13620220124721527,
      "loss_layer_42_head": 0.08402286469936371,
      "loss_layer_6_head": 0.8937976956367493,
      "step": 3350
    },
    {
      "epoch": 42.944,
      "grad_norm": 0.17173056645036675,
      "learning_rate": 0.0035158194305087414,
      "loss": 2.7993,
      "loss_layer_12_head": 0.6631667613983154,
      "loss_layer_18_head": 0.5043042302131653,
      "loss_layer_24_head": 0.31337791681289673,
      "loss_layer_30_head": 0.19264814257621765,
      "loss_layer_36_head": 0.13038626313209534,
      "loss_layer_42_head": 0.07229328900575638,
      "loss_layer_6_head": 0.8761781454086304,
      "step": 3355
    },
    {
      "epoch": 43.008,
      "grad_norm": 0.23085005901597974,
      "learning_rate": 0.003510705497060762,
      "loss": 2.724,
      "loss_layer_12_head": 0.6451908349990845,
      "loss_layer_18_head": 0.5098661184310913,
      "loss_layer_24_head": 0.30940741300582886,
      "loss_layer_30_head": 0.189787358045578,
      "loss_layer_36_head": 0.13171932101249695,
      "loss_layer_42_head": 0.07122714817523956,
      "loss_layer_6_head": 0.8700235486030579,
      "step": 3360
    },
    {
      "epoch": 43.072,
      "grad_norm": 0.23439665380829766,
      "learning_rate": 0.0035055865031530173,
      "loss": 2.59,
      "loss_layer_12_head": 0.6116033792495728,
      "loss_layer_18_head": 0.4850834012031555,
      "loss_layer_24_head": 0.28956371545791626,
      "loss_layer_30_head": 0.1827908158302307,
      "loss_layer_36_head": 0.12408431619405746,
      "loss_layer_42_head": 0.06898702681064606,
      "loss_layer_6_head": 0.8376851081848145,
      "step": 3365
    },
    {
      "epoch": 43.136,
      "grad_norm": 0.3122183972267335,
      "learning_rate": 0.003500462474415584,
      "loss": 2.6079,
      "loss_layer_12_head": 0.5955246686935425,
      "loss_layer_18_head": 0.48465877771377563,
      "loss_layer_24_head": 0.28800445795059204,
      "loss_layer_30_head": 0.18371644616127014,
      "loss_layer_36_head": 0.11737276613712311,
      "loss_layer_42_head": 0.0662488266825676,
      "loss_layer_6_head": 0.837476909160614,
      "step": 3370
    },
    {
      "epoch": 43.2,
      "grad_norm": 0.20928400327653113,
      "learning_rate": 0.0034953334365037526,
      "loss": 2.5933,
      "loss_layer_12_head": 0.6131819486618042,
      "loss_layer_18_head": 0.5050573348999023,
      "loss_layer_24_head": 0.29583871364593506,
      "loss_layer_30_head": 0.18941733241081238,
      "loss_layer_36_head": 0.11936704069375992,
      "loss_layer_42_head": 0.07039276510477066,
      "loss_layer_6_head": 0.8593696355819702,
      "step": 3375
    },
    {
      "epoch": 43.264,
      "grad_norm": 0.23262930507599364,
      "learning_rate": 0.003490199415097892,
      "loss": 2.5417,
      "loss_layer_12_head": 0.5911301970481873,
      "loss_layer_18_head": 0.47820210456848145,
      "loss_layer_24_head": 0.2843300402164459,
      "loss_layer_30_head": 0.1823400855064392,
      "loss_layer_36_head": 0.11363937705755234,
      "loss_layer_42_head": 0.06464181840419769,
      "loss_layer_6_head": 0.8314143419265747,
      "step": 3380
    },
    {
      "epoch": 43.328,
      "grad_norm": 0.14614774566758038,
      "learning_rate": 0.0034850604359033233,
      "loss": 2.6347,
      "loss_layer_12_head": 0.5878945589065552,
      "loss_layer_18_head": 0.4708290994167328,
      "loss_layer_24_head": 0.2824513614177704,
      "loss_layer_30_head": 0.18224017322063446,
      "loss_layer_36_head": 0.11307130753993988,
      "loss_layer_42_head": 0.06809186935424805,
      "loss_layer_6_head": 0.8325718641281128,
      "step": 3385
    },
    {
      "epoch": 43.392,
      "grad_norm": 0.2102910983985316,
      "learning_rate": 0.003479916524650188,
      "loss": 2.6407,
      "loss_layer_12_head": 0.5776407122612,
      "loss_layer_18_head": 0.4656095504760742,
      "loss_layer_24_head": 0.2865152955055237,
      "loss_layer_30_head": 0.19058619439601898,
      "loss_layer_36_head": 0.11238870769739151,
      "loss_layer_42_head": 0.06997496634721756,
      "loss_layer_6_head": 0.822827160358429,
      "step": 3390
    },
    {
      "epoch": 43.456,
      "grad_norm": 0.15182950980705767,
      "learning_rate": 0.0034747677070933254,
      "loss": 2.6478,
      "loss_layer_12_head": 0.598751962184906,
      "loss_layer_18_head": 0.48921307921409607,
      "loss_layer_24_head": 0.2994233965873718,
      "loss_layer_30_head": 0.1854785978794098,
      "loss_layer_36_head": 0.11442265659570694,
      "loss_layer_42_head": 0.06894880533218384,
      "loss_layer_6_head": 0.8510902523994446,
      "step": 3395
    },
    {
      "epoch": 43.52,
      "grad_norm": 0.18846991106887073,
      "learning_rate": 0.0034696140090121376,
      "loss": 2.5986,
      "loss_layer_12_head": 0.5797563791275024,
      "loss_layer_18_head": 0.4830295443534851,
      "loss_layer_24_head": 0.29327329993247986,
      "loss_layer_30_head": 0.19062745571136475,
      "loss_layer_36_head": 0.11116355657577515,
      "loss_layer_42_head": 0.06554026901721954,
      "loss_layer_6_head": 0.8243520855903625,
      "step": 3400
    },
    {
      "epoch": 43.52,
      "eval_loss": 5.426503658294678,
      "eval_loss_layer_12_head": 1.2407386302947998,
      "eval_loss_layer_18_head": 1.0889889001846313,
      "eval_loss_layer_24_head": 0.6998987197875977,
      "eval_loss_layer_30_head": 0.47191429138183594,
      "eval_loss_layer_36_head": 0.29139554500579834,
      "eval_loss_layer_42_head": 0.1743050515651703,
      "eval_loss_layer_6_head": 1.5933241844177246,
      "eval_runtime": 33.0885,
      "eval_samples_per_second": 9.671,
      "eval_steps_per_second": 0.604,
      "step": 3400
    },
    {
      "epoch": 43.584,
      "grad_norm": 0.19985551244648234,
      "learning_rate": 0.0034644554562104635,
      "loss": 2.6617,
      "loss_layer_12_head": 0.6071485280990601,
      "loss_layer_18_head": 0.509914755821228,
      "loss_layer_24_head": 0.30729636549949646,
      "loss_layer_30_head": 0.19266238808631897,
      "loss_layer_36_head": 0.11742794513702393,
      "loss_layer_42_head": 0.06597033143043518,
      "loss_layer_6_head": 0.8575075268745422,
      "step": 3405
    },
    {
      "epoch": 43.648,
      "grad_norm": 0.20738385488203026,
      "learning_rate": 0.003459292074516449,
      "loss": 2.6664,
      "loss_layer_12_head": 0.6195417642593384,
      "loss_layer_18_head": 0.531676173210144,
      "loss_layer_24_head": 0.31342411041259766,
      "loss_layer_30_head": 0.18927304446697235,
      "loss_layer_36_head": 0.11851409822702408,
      "loss_layer_42_head": 0.06854297965765,
      "loss_layer_6_head": 0.8754452466964722,
      "step": 3410
    },
    {
      "epoch": 43.712,
      "grad_norm": 0.2555515043678726,
      "learning_rate": 0.003454123889782418,
      "loss": 2.7314,
      "loss_layer_12_head": 0.6361206769943237,
      "loss_layer_18_head": 0.5496989488601685,
      "loss_layer_24_head": 0.3265947699546814,
      "loss_layer_30_head": 0.19198459386825562,
      "loss_layer_36_head": 0.11904784291982651,
      "loss_layer_42_head": 0.06779046356678009,
      "loss_layer_6_head": 0.8991361856460571,
      "step": 3415
    },
    {
      "epoch": 43.776,
      "grad_norm": 0.2964476344473837,
      "learning_rate": 0.0034489509278847414,
      "loss": 2.7281,
      "loss_layer_12_head": 0.6195502281188965,
      "loss_layer_18_head": 0.5400804281234741,
      "loss_layer_24_head": 0.3158927857875824,
      "loss_layer_30_head": 0.1922754943370819,
      "loss_layer_36_head": 0.1228099837899208,
      "loss_layer_42_head": 0.07026252895593643,
      "loss_layer_6_head": 0.8723239898681641,
      "step": 3420
    },
    {
      "epoch": 43.84,
      "grad_norm": 0.3238830424488282,
      "learning_rate": 0.0034437732147237087,
      "loss": 2.7998,
      "loss_layer_12_head": 0.6126322746276855,
      "loss_layer_18_head": 0.5878234505653381,
      "loss_layer_24_head": 0.31443488597869873,
      "loss_layer_30_head": 0.18358637392520905,
      "loss_layer_36_head": 0.11429297924041748,
      "loss_layer_42_head": 0.06677732616662979,
      "loss_layer_6_head": 0.8799151182174683,
      "step": 3425
    },
    {
      "epoch": 43.904,
      "grad_norm": 0.32283555356237487,
      "learning_rate": 0.0034385907762234,
      "loss": 2.7824,
      "loss_layer_12_head": 0.6423940658569336,
      "loss_layer_18_head": 0.615967869758606,
      "loss_layer_24_head": 0.3297182023525238,
      "loss_layer_30_head": 0.19576650857925415,
      "loss_layer_36_head": 0.12417466938495636,
      "loss_layer_42_head": 0.07015486061573029,
      "loss_layer_6_head": 0.909716010093689,
      "step": 3430
    },
    {
      "epoch": 43.968,
      "grad_norm": 0.22577248456476762,
      "learning_rate": 0.0034334036383315523,
      "loss": 2.7619,
      "loss_layer_12_head": 0.6245972514152527,
      "loss_layer_18_head": 0.5552530884742737,
      "loss_layer_24_head": 0.3097267746925354,
      "loss_layer_30_head": 0.1894182711839676,
      "loss_layer_36_head": 0.11654015630483627,
      "loss_layer_42_head": 0.06578510999679565,
      "loss_layer_6_head": 0.8644906878471375,
      "step": 3435
    },
    {
      "epoch": 44.032,
      "grad_norm": 0.2027271253976208,
      "learning_rate": 0.003428211827019434,
      "loss": 2.6822,
      "loss_layer_12_head": 0.6008732914924622,
      "loss_layer_18_head": 0.5067128539085388,
      "loss_layer_24_head": 0.2902821898460388,
      "loss_layer_30_head": 0.174702450633049,
      "loss_layer_36_head": 0.10679541528224945,
      "loss_layer_42_head": 0.06028091907501221,
      "loss_layer_6_head": 0.8274690508842468,
      "step": 3440
    },
    {
      "epoch": 44.096,
      "grad_norm": 0.3140630195479908,
      "learning_rate": 0.0034230153682817113,
      "loss": 2.585,
      "loss_layer_12_head": 0.6309368014335632,
      "loss_layer_18_head": 0.5060824751853943,
      "loss_layer_24_head": 0.29731327295303345,
      "loss_layer_30_head": 0.17819425463676453,
      "loss_layer_36_head": 0.114010751247406,
      "loss_layer_42_head": 0.06210971623659134,
      "loss_layer_6_head": 0.853227436542511,
      "step": 3445
    },
    {
      "epoch": 44.16,
      "grad_norm": 0.39898636902121026,
      "learning_rate": 0.003417814288136319,
      "loss": 2.6445,
      "loss_layer_12_head": 0.7064390778541565,
      "loss_layer_18_head": 0.499808132648468,
      "loss_layer_24_head": 0.2954602837562561,
      "loss_layer_30_head": 0.1748739778995514,
      "loss_layer_36_head": 0.11549659073352814,
      "loss_layer_42_head": 0.060772139579057693,
      "loss_layer_6_head": 0.8477666974067688,
      "step": 3450
    },
    {
      "epoch": 44.224,
      "grad_norm": 0.31830291791757154,
      "learning_rate": 0.0034126086126243316,
      "loss": 2.6836,
      "loss_layer_12_head": 0.697464644908905,
      "loss_layer_18_head": 0.5111352205276489,
      "loss_layer_24_head": 0.30742961168289185,
      "loss_layer_30_head": 0.18444645404815674,
      "loss_layer_36_head": 0.12110551446676254,
      "loss_layer_42_head": 0.06322896480560303,
      "loss_layer_6_head": 0.8634821772575378,
      "step": 3455
    },
    {
      "epoch": 44.288,
      "grad_norm": 0.27375087201728054,
      "learning_rate": 0.003407398367809832,
      "loss": 2.6632,
      "loss_layer_12_head": 0.6405032873153687,
      "loss_layer_18_head": 0.5075263977050781,
      "loss_layer_24_head": 0.29781174659729004,
      "loss_layer_30_head": 0.18273557722568512,
      "loss_layer_36_head": 0.12652137875556946,
      "loss_layer_42_head": 0.06369291990995407,
      "loss_layer_6_head": 0.8317699432373047,
      "step": 3460
    },
    {
      "epoch": 44.352,
      "grad_norm": 0.36708378182107404,
      "learning_rate": 0.0034021835797797806,
      "loss": 2.7066,
      "loss_layer_12_head": 0.6728273034095764,
      "loss_layer_18_head": 0.566621720790863,
      "loss_layer_24_head": 0.32599636912345886,
      "loss_layer_30_head": 0.19212117791175842,
      "loss_layer_36_head": 0.1262470781803131,
      "loss_layer_42_head": 0.06272374093532562,
      "loss_layer_6_head": 0.9028280973434448,
      "step": 3465
    },
    {
      "epoch": 44.416,
      "grad_norm": 0.387083868381733,
      "learning_rate": 0.0033969642746438833,
      "loss": 2.6994,
      "loss_layer_12_head": 0.6426473259925842,
      "loss_layer_18_head": 0.5595172643661499,
      "loss_layer_24_head": 0.3244446814060211,
      "loss_layer_30_head": 0.18991728127002716,
      "loss_layer_36_head": 0.12399164587259293,
      "loss_layer_42_head": 0.06279143691062927,
      "loss_layer_6_head": 0.8830623626708984,
      "step": 3470
    },
    {
      "epoch": 44.48,
      "grad_norm": 0.28324438886173503,
      "learning_rate": 0.0033917404785344667,
      "loss": 2.7547,
      "loss_layer_12_head": 0.6512131690979004,
      "loss_layer_18_head": 0.5933526158332825,
      "loss_layer_24_head": 0.32273292541503906,
      "loss_layer_30_head": 0.18905283510684967,
      "loss_layer_36_head": 0.12177157402038574,
      "loss_layer_42_head": 0.062835194170475,
      "loss_layer_6_head": 0.8953493237495422,
      "step": 3475
    },
    {
      "epoch": 44.544,
      "grad_norm": 0.2637392174786811,
      "learning_rate": 0.003386512217606339,
      "loss": 2.7526,
      "loss_layer_12_head": 0.6456092596054077,
      "loss_layer_18_head": 0.5604689121246338,
      "loss_layer_24_head": 0.31435900926589966,
      "loss_layer_30_head": 0.18550840020179749,
      "loss_layer_36_head": 0.11789441108703613,
      "loss_layer_42_head": 0.06283041089773178,
      "loss_layer_6_head": 0.9107282757759094,
      "step": 3480
    },
    {
      "epoch": 44.608,
      "grad_norm": 0.2854304084061612,
      "learning_rate": 0.0033812795180366657,
      "loss": 2.6746,
      "loss_layer_12_head": 0.5876504778862,
      "loss_layer_18_head": 0.5063294172286987,
      "loss_layer_24_head": 0.290393590927124,
      "loss_layer_30_head": 0.17507250607013702,
      "loss_layer_36_head": 0.11245021969079971,
      "loss_layer_42_head": 0.06521595269441605,
      "loss_layer_6_head": 0.8214141130447388,
      "step": 3485
    },
    {
      "epoch": 44.672,
      "grad_norm": 0.237898129993107,
      "learning_rate": 0.0033760424060248345,
      "loss": 2.6713,
      "loss_layer_12_head": 0.6307154893875122,
      "loss_layer_18_head": 0.5293525457382202,
      "loss_layer_24_head": 0.31150108575820923,
      "loss_layer_30_head": 0.1840444952249527,
      "loss_layer_36_head": 0.11633744090795517,
      "loss_layer_42_head": 0.07439196109771729,
      "loss_layer_6_head": 0.8883455395698547,
      "step": 3490
    },
    {
      "epoch": 44.736,
      "grad_norm": 0.21087102405953248,
      "learning_rate": 0.003370800907792325,
      "loss": 2.6977,
      "loss_layer_12_head": 0.6065112352371216,
      "loss_layer_18_head": 0.5085740089416504,
      "loss_layer_24_head": 0.29862362146377563,
      "loss_layer_30_head": 0.17864517867565155,
      "loss_layer_36_head": 0.11361835151910782,
      "loss_layer_42_head": 0.06925202906131744,
      "loss_layer_6_head": 0.8597921133041382,
      "step": 3495
    },
    {
      "epoch": 44.8,
      "grad_norm": 0.22176563251123532,
      "learning_rate": 0.003365555049582582,
      "loss": 2.6466,
      "loss_layer_12_head": 0.6222425699234009,
      "loss_layer_18_head": 0.512480616569519,
      "loss_layer_24_head": 0.3054293990135193,
      "loss_layer_30_head": 0.1836685985326767,
      "loss_layer_36_head": 0.1146554946899414,
      "loss_layer_42_head": 0.0660967081785202,
      "loss_layer_6_head": 0.8773821592330933,
      "step": 3500
    },
    {
      "epoch": 44.864,
      "grad_norm": 0.15646742491658608,
      "learning_rate": 0.0033603048576608735,
      "loss": 2.6955,
      "loss_layer_12_head": 0.6543017625808716,
      "loss_layer_18_head": 0.5300337076187134,
      "loss_layer_24_head": 0.3196091055870056,
      "loss_layer_30_head": 0.19745245575904846,
      "loss_layer_36_head": 0.12003966420888901,
      "loss_layer_42_head": 0.06860999017953873,
      "loss_layer_6_head": 0.9202666282653809,
      "step": 3505
    },
    {
      "epoch": 44.928,
      "grad_norm": 0.19555721329641643,
      "learning_rate": 0.003355050358314172,
      "loss": 2.6504,
      "loss_layer_12_head": 0.6062322854995728,
      "loss_layer_18_head": 0.4915919303894043,
      "loss_layer_24_head": 0.2982628047466278,
      "loss_layer_30_head": 0.18734076619148254,
      "loss_layer_36_head": 0.11371280997991562,
      "loss_layer_42_head": 0.0648636519908905,
      "loss_layer_6_head": 0.8568073511123657,
      "step": 3510
    },
    {
      "epoch": 44.992,
      "grad_norm": 0.21053105248050766,
      "learning_rate": 0.0033497915778510122,
      "loss": 2.6292,
      "loss_layer_12_head": 0.606296718120575,
      "loss_layer_18_head": 0.49080556631088257,
      "loss_layer_24_head": 0.29937079548835754,
      "loss_layer_30_head": 0.18982428312301636,
      "loss_layer_36_head": 0.11383094638586044,
      "loss_layer_42_head": 0.06481240689754486,
      "loss_layer_6_head": 0.856545627117157,
      "step": 3515
    },
    {
      "epoch": 45.056,
      "grad_norm": 0.19764558191074075,
      "learning_rate": 0.003344528542601368,
      "loss": 2.5606,
      "loss_layer_12_head": 0.5647932291030884,
      "loss_layer_18_head": 0.4528743326663971,
      "loss_layer_24_head": 0.2837297320365906,
      "loss_layer_30_head": 0.18876434862613678,
      "loss_layer_36_head": 0.10569324344396591,
      "loss_layer_42_head": 0.06143871694803238,
      "loss_layer_6_head": 0.8017005920410156,
      "step": 3520
    },
    {
      "epoch": 45.12,
      "grad_norm": 0.20076799831509978,
      "learning_rate": 0.0033392612789165123,
      "loss": 2.5636,
      "loss_layer_12_head": 0.5941595435142517,
      "loss_layer_18_head": 0.476142019033432,
      "loss_layer_24_head": 0.2947177290916443,
      "loss_layer_30_head": 0.19350551068782806,
      "loss_layer_36_head": 0.11166516691446304,
      "loss_layer_42_head": 0.06437446177005768,
      "loss_layer_6_head": 0.8433893322944641,
      "step": 3525
    },
    {
      "epoch": 45.184,
      "grad_norm": 0.28648601672293544,
      "learning_rate": 0.003333989813168891,
      "loss": 2.5844,
      "loss_layer_12_head": 0.5932879447937012,
      "loss_layer_18_head": 0.46824246644973755,
      "loss_layer_24_head": 0.28885987401008606,
      "loss_layer_30_head": 0.1828027218580246,
      "loss_layer_36_head": 0.10755407810211182,
      "loss_layer_42_head": 0.0614149272441864,
      "loss_layer_6_head": 0.8343443870544434,
      "step": 3530
    },
    {
      "epoch": 45.248,
      "grad_norm": 0.25220256622941467,
      "learning_rate": 0.0033287141717519898,
      "loss": 2.5616,
      "loss_layer_12_head": 0.5707033276557922,
      "loss_layer_18_head": 0.44406405091285706,
      "loss_layer_24_head": 0.27546074986457825,
      "loss_layer_30_head": 0.18051043152809143,
      "loss_layer_36_head": 0.10728339850902557,
      "loss_layer_42_head": 0.06668568402528763,
      "loss_layer_6_head": 0.8035060167312622,
      "step": 3535
    },
    {
      "epoch": 45.312,
      "grad_norm": 0.23003424848522142,
      "learning_rate": 0.003323434381080199,
      "loss": 2.5659,
      "loss_layer_12_head": 0.6289854049682617,
      "loss_layer_18_head": 0.49982887506484985,
      "loss_layer_24_head": 0.3054986596107483,
      "loss_layer_30_head": 0.1914674937725067,
      "loss_layer_36_head": 0.11783891916275024,
      "loss_layer_42_head": 0.06790826469659805,
      "loss_layer_6_head": 0.8943486213684082,
      "step": 3540
    },
    {
      "epoch": 45.376,
      "grad_norm": 0.34381267974515295,
      "learning_rate": 0.0033181504675886876,
      "loss": 2.665,
      "loss_layer_12_head": 0.5921050906181335,
      "loss_layer_18_head": 0.47321105003356934,
      "loss_layer_24_head": 0.2895044684410095,
      "loss_layer_30_head": 0.18383245170116425,
      "loss_layer_36_head": 0.109003446996212,
      "loss_layer_42_head": 0.06069584935903549,
      "loss_layer_6_head": 0.884057343006134,
      "step": 3545
    },
    {
      "epoch": 45.44,
      "grad_norm": 0.3436994655381804,
      "learning_rate": 0.003312862457733263,
      "loss": 2.6515,
      "loss_layer_12_head": 0.6049162149429321,
      "loss_layer_18_head": 0.48592129349708557,
      "loss_layer_24_head": 0.3099120259284973,
      "loss_layer_30_head": 0.22803369164466858,
      "loss_layer_36_head": 0.12101010978221893,
      "loss_layer_42_head": 0.09803657233715057,
      "loss_layer_6_head": 0.8922468423843384,
      "step": 3550
    },
    {
      "epoch": 45.504,
      "grad_norm": 0.3962838004382706,
      "learning_rate": 0.003307570377990245,
      "loss": 2.6458,
      "loss_layer_12_head": 0.5788633227348328,
      "loss_layer_18_head": 0.4638703465461731,
      "loss_layer_24_head": 0.2867237627506256,
      "loss_layer_30_head": 0.18134775757789612,
      "loss_layer_36_head": 0.11431890726089478,
      "loss_layer_42_head": 0.06329449266195297,
      "loss_layer_6_head": 0.9006697535514832,
      "step": 3555
    },
    {
      "epoch": 45.568,
      "grad_norm": 0.20498875148491624,
      "learning_rate": 0.0033022742548563288,
      "loss": 2.6434,
      "loss_layer_12_head": 0.5866985321044922,
      "loss_layer_18_head": 0.4725605845451355,
      "loss_layer_24_head": 0.288006454706192,
      "loss_layer_30_head": 0.18472610414028168,
      "loss_layer_36_head": 0.1221243143081665,
      "loss_layer_42_head": 0.06304088234901428,
      "loss_layer_6_head": 0.8881959915161133,
      "step": 3560
    },
    {
      "epoch": 45.632,
      "grad_norm": 0.24000121626556487,
      "learning_rate": 0.0032969741148484574,
      "loss": 2.7128,
      "loss_layer_12_head": 0.5857793688774109,
      "loss_layer_18_head": 0.4713711738586426,
      "loss_layer_24_head": 0.2932243049144745,
      "loss_layer_30_head": 0.18426427245140076,
      "loss_layer_36_head": 0.14069528877735138,
      "loss_layer_42_head": 0.0649409145116806,
      "loss_layer_6_head": 0.8729990124702454,
      "step": 3565
    },
    {
      "epoch": 45.696,
      "grad_norm": 0.20697474832454318,
      "learning_rate": 0.003291669984503682,
      "loss": 2.6967,
      "loss_layer_12_head": 0.6249872446060181,
      "loss_layer_18_head": 0.4983815550804138,
      "loss_layer_24_head": 0.30525070428848267,
      "loss_layer_30_head": 0.19256457686424255,
      "loss_layer_36_head": 0.1633586287498474,
      "loss_layer_42_head": 0.06847206503152847,
      "loss_layer_6_head": 0.9155941009521484,
      "step": 3570
    },
    {
      "epoch": 45.76,
      "grad_norm": 0.2608723446679453,
      "learning_rate": 0.003286361890379034,
      "loss": 2.7214,
      "loss_layer_12_head": 0.6011894941329956,
      "loss_layer_18_head": 0.4827272891998291,
      "loss_layer_24_head": 0.2954626679420471,
      "loss_layer_30_head": 0.1790604442358017,
      "loss_layer_36_head": 0.14626431465148926,
      "loss_layer_42_head": 0.06320399791002274,
      "loss_layer_6_head": 0.8650292158126831,
      "step": 3575
    },
    {
      "epoch": 45.824,
      "grad_norm": 0.22037068324233733,
      "learning_rate": 0.0032810498590513937,
      "loss": 2.727,
      "loss_layer_12_head": 0.6285165548324585,
      "loss_layer_18_head": 0.504229724407196,
      "loss_layer_24_head": 0.3170621991157532,
      "loss_layer_30_head": 0.22011998295783997,
      "loss_layer_36_head": 0.1480255424976349,
      "loss_layer_42_head": 0.11131776869297028,
      "loss_layer_6_head": 0.8842833638191223,
      "step": 3580
    },
    {
      "epoch": 45.888,
      "grad_norm": 0.23408282308879383,
      "learning_rate": 0.0032757339171173507,
      "loss": 2.7378,
      "loss_layer_12_head": 0.6120829582214355,
      "loss_layer_18_head": 0.4906477928161621,
      "loss_layer_24_head": 0.303697407245636,
      "loss_layer_30_head": 0.1790274828672409,
      "loss_layer_36_head": 0.13265664875507355,
      "loss_layer_42_head": 0.06705978512763977,
      "loss_layer_6_head": 0.865090548992157,
      "step": 3585
    },
    {
      "epoch": 45.952,
      "grad_norm": 0.25247839009623224,
      "learning_rate": 0.0032704140911930767,
      "loss": 2.6983,
      "loss_layer_12_head": 0.6451810002326965,
      "loss_layer_18_head": 0.5143178701400757,
      "loss_layer_24_head": 0.3213422894477844,
      "loss_layer_30_head": 0.18916963040828705,
      "loss_layer_36_head": 0.13195398449897766,
      "loss_layer_42_head": 0.070411816239357,
      "loss_layer_6_head": 0.9173599481582642,
      "step": 3590
    },
    {
      "epoch": 46.016,
      "grad_norm": 0.19646760282460843,
      "learning_rate": 0.0032650904079141885,
      "loss": 2.6567,
      "loss_layer_12_head": 0.5791570544242859,
      "loss_layer_18_head": 0.4614434838294983,
      "loss_layer_24_head": 0.2964390218257904,
      "loss_layer_30_head": 0.17183849215507507,
      "loss_layer_36_head": 0.12297286093235016,
      "loss_layer_42_head": 0.06881777942180634,
      "loss_layer_6_head": 0.8205267786979675,
      "step": 3595
    },
    {
      "epoch": 46.08,
      "grad_norm": 0.24211817496903956,
      "learning_rate": 0.003259762893935617,
      "loss": 2.5645,
      "loss_layer_12_head": 0.5770832300186157,
      "loss_layer_18_head": 0.4530993402004242,
      "loss_layer_24_head": 0.30065345764160156,
      "loss_layer_30_head": 0.16831466555595398,
      "loss_layer_36_head": 0.12193317711353302,
      "loss_layer_42_head": 0.06940454989671707,
      "loss_layer_6_head": 0.8182371258735657,
      "step": 3600
    },
    {
      "epoch": 46.08,
      "eval_loss": 5.463980674743652,
      "eval_loss_layer_12_head": 1.2546415328979492,
      "eval_loss_layer_18_head": 1.0868241786956787,
      "eval_loss_layer_24_head": 0.7156407237052917,
      "eval_loss_layer_30_head": 0.45728546380996704,
      "eval_loss_layer_36_head": 0.30961698293685913,
      "eval_loss_layer_42_head": 0.18091556429862976,
      "eval_loss_layer_6_head": 1.589300513267517,
      "eval_runtime": 33.0604,
      "eval_samples_per_second": 9.679,
      "eval_steps_per_second": 0.605,
      "step": 3600
    },
    {
      "epoch": 46.144,
      "grad_norm": 0.22245876641217782,
      "learning_rate": 0.0032544315759314734,
      "loss": 2.6169,
      "loss_layer_12_head": 0.6139820218086243,
      "loss_layer_18_head": 0.48544663190841675,
      "loss_layer_24_head": 0.31790104508399963,
      "loss_layer_30_head": 0.1812545359134674,
      "loss_layer_36_head": 0.1338702142238617,
      "loss_layer_42_head": 0.0714409351348877,
      "loss_layer_6_head": 0.8522067070007324,
      "step": 3605
    },
    {
      "epoch": 46.208,
      "grad_norm": 0.21551692218979834,
      "learning_rate": 0.003249096480594914,
      "loss": 2.6249,
      "loss_layer_12_head": 0.5906568765640259,
      "loss_layer_18_head": 0.46043235063552856,
      "loss_layer_24_head": 0.32948893308639526,
      "loss_layer_30_head": 0.17134669423103333,
      "loss_layer_36_head": 0.1233082190155983,
      "loss_layer_42_head": 0.06691473722457886,
      "loss_layer_6_head": 0.8205707669258118,
      "step": 3610
    },
    {
      "epoch": 46.272,
      "grad_norm": 0.30433540902020634,
      "learning_rate": 0.003243757634638008,
      "loss": 2.6806,
      "loss_layer_12_head": 0.6037302613258362,
      "loss_layer_18_head": 0.461590439081192,
      "loss_layer_24_head": 0.3717188239097595,
      "loss_layer_30_head": 0.1692582666873932,
      "loss_layer_36_head": 0.12342791259288788,
      "loss_layer_42_head": 0.07041792571544647,
      "loss_layer_6_head": 0.8322204351425171,
      "step": 3615
    },
    {
      "epoch": 46.336,
      "grad_norm": 0.5152245824994891,
      "learning_rate": 0.003238415064791603,
      "loss": 2.7524,
      "loss_layer_12_head": 0.6958147883415222,
      "loss_layer_18_head": 0.47723764181137085,
      "loss_layer_24_head": 0.3758837580680847,
      "loss_layer_30_head": 0.17525388300418854,
      "loss_layer_36_head": 0.12100402265787125,
      "loss_layer_42_head": 0.06960131973028183,
      "loss_layer_6_head": 0.8494621515274048,
      "step": 3620
    },
    {
      "epoch": 46.4,
      "grad_norm": 0.37914975161261466,
      "learning_rate": 0.003233068797805194,
      "loss": 2.7507,
      "loss_layer_12_head": 0.7088296413421631,
      "loss_layer_18_head": 0.4787866175174713,
      "loss_layer_24_head": 0.3671482503414154,
      "loss_layer_30_head": 0.17698104679584503,
      "loss_layer_36_head": 0.1207663044333458,
      "loss_layer_42_head": 0.06776197254657745,
      "loss_layer_6_head": 0.8453279733657837,
      "step": 3625
    },
    {
      "epoch": 46.464,
      "grad_norm": 0.27794170382919214,
      "learning_rate": 0.003227718860446782,
      "loss": 2.6995,
      "loss_layer_12_head": 0.6852826476097107,
      "loss_layer_18_head": 0.4735082685947418,
      "loss_layer_24_head": 0.3443160057067871,
      "loss_layer_30_head": 0.16870731115341187,
      "loss_layer_36_head": 0.11389879137277603,
      "loss_layer_42_head": 0.06614242494106293,
      "loss_layer_6_head": 0.8480418920516968,
      "step": 3630
    },
    {
      "epoch": 46.528,
      "grad_norm": 0.302010129177763,
      "learning_rate": 0.003222365279502752,
      "loss": 2.7413,
      "loss_layer_12_head": 0.6703281998634338,
      "loss_layer_18_head": 0.4773436188697815,
      "loss_layer_24_head": 0.3366287648677826,
      "loss_layer_30_head": 0.1830337643623352,
      "loss_layer_36_head": 0.11957414448261261,
      "loss_layer_42_head": 0.07364897429943085,
      "loss_layer_6_head": 0.8344033360481262,
      "step": 3635
    },
    {
      "epoch": 46.592,
      "grad_norm": 0.3564135776538389,
      "learning_rate": 0.003217008081777726,
      "loss": 2.6804,
      "loss_layer_12_head": 0.666201651096344,
      "loss_layer_18_head": 0.47884249687194824,
      "loss_layer_24_head": 0.3286014199256897,
      "loss_layer_30_head": 0.17591170966625214,
      "loss_layer_36_head": 0.11579371988773346,
      "loss_layer_42_head": 0.07083015888929367,
      "loss_layer_6_head": 0.835128664970398,
      "step": 3640
    },
    {
      "epoch": 46.656,
      "grad_norm": 0.22072497264961913,
      "learning_rate": 0.003211647294094437,
      "loss": 2.7151,
      "loss_layer_12_head": 0.6768962144851685,
      "loss_layer_18_head": 0.5016362071037292,
      "loss_layer_24_head": 0.33407264947891235,
      "loss_layer_30_head": 0.18022257089614868,
      "loss_layer_36_head": 0.116010382771492,
      "loss_layer_42_head": 0.0684981644153595,
      "loss_layer_6_head": 0.8910082578659058,
      "step": 3645
    },
    {
      "epoch": 46.72,
      "grad_norm": 0.2664366137593644,
      "learning_rate": 0.0032062829432935925,
      "loss": 2.6928,
      "loss_layer_12_head": 0.6738896369934082,
      "loss_layer_18_head": 0.49060511589050293,
      "loss_layer_24_head": 0.32154732942581177,
      "loss_layer_30_head": 0.17807415127754211,
      "loss_layer_36_head": 0.1158461943268776,
      "loss_layer_42_head": 0.06484069675207138,
      "loss_layer_6_head": 0.8614140748977661,
      "step": 3650
    },
    {
      "epoch": 46.784,
      "grad_norm": 0.3076356040269577,
      "learning_rate": 0.00320091505623374,
      "loss": 2.7085,
      "loss_layer_12_head": 0.6846492886543274,
      "loss_layer_18_head": 0.5147966146469116,
      "loss_layer_24_head": 0.32849955558776855,
      "loss_layer_30_head": 0.18680869042873383,
      "loss_layer_36_head": 0.11853863298892975,
      "loss_layer_42_head": 0.06691797077655792,
      "loss_layer_6_head": 0.9155701398849487,
      "step": 3655
    },
    {
      "epoch": 46.848,
      "grad_norm": 0.20575209306502124,
      "learning_rate": 0.003195543659791132,
      "loss": 2.716,
      "loss_layer_12_head": 0.6674710512161255,
      "loss_layer_18_head": 0.5117928385734558,
      "loss_layer_24_head": 0.3235340118408203,
      "loss_layer_30_head": 0.18357017636299133,
      "loss_layer_36_head": 0.11627982556819916,
      "loss_layer_42_head": 0.06449062377214432,
      "loss_layer_6_head": 0.8973787426948547,
      "step": 3660
    },
    {
      "epoch": 46.912,
      "grad_norm": 0.3800880283497079,
      "learning_rate": 0.003190168780859592,
      "loss": 2.7232,
      "loss_layer_12_head": 0.6621751189231873,
      "loss_layer_18_head": 0.48557180166244507,
      "loss_layer_24_head": 0.30817294120788574,
      "loss_layer_30_head": 0.18064680695533752,
      "loss_layer_36_head": 0.1116049662232399,
      "loss_layer_42_head": 0.0653185099363327,
      "loss_layer_6_head": 0.8548812866210938,
      "step": 3665
    },
    {
      "epoch": 46.976,
      "grad_norm": 0.4581336490857984,
      "learning_rate": 0.0031847904463503814,
      "loss": 2.7664,
      "loss_layer_12_head": 0.6682694554328918,
      "loss_layer_18_head": 0.4938744604587555,
      "loss_layer_24_head": 0.30783864855766296,
      "loss_layer_30_head": 0.18352296948432922,
      "loss_layer_36_head": 0.11485044658184052,
      "loss_layer_42_head": 0.06489665806293488,
      "loss_layer_6_head": 0.875826358795166,
      "step": 3670
    },
    {
      "epoch": 47.04,
      "grad_norm": 0.28567780251148617,
      "learning_rate": 0.003179408683192061,
      "loss": 2.5891,
      "loss_layer_12_head": 0.6123559474945068,
      "loss_layer_18_head": 0.4606500566005707,
      "loss_layer_24_head": 0.2917476296424866,
      "loss_layer_30_head": 0.17972508072853088,
      "loss_layer_36_head": 0.11118602752685547,
      "loss_layer_42_head": 0.06430713832378387,
      "loss_layer_6_head": 0.8250728845596313,
      "step": 3675
    },
    {
      "epoch": 47.104,
      "grad_norm": 0.2732200376333573,
      "learning_rate": 0.00317402351833036,
      "loss": 2.5939,
      "loss_layer_12_head": 0.6425760984420776,
      "loss_layer_18_head": 0.4785030782222748,
      "loss_layer_24_head": 0.2975092828273773,
      "loss_layer_30_head": 0.17857575416564941,
      "loss_layer_36_head": 0.11105994135141373,
      "loss_layer_42_head": 0.061512283980846405,
      "loss_layer_6_head": 0.8716540336608887,
      "step": 3680
    },
    {
      "epoch": 47.168,
      "grad_norm": 0.5203440518121454,
      "learning_rate": 0.003168634978728037,
      "loss": 2.5701,
      "loss_layer_12_head": 0.6067346930503845,
      "loss_layer_18_head": 0.45778170228004456,
      "loss_layer_24_head": 0.28349268436431885,
      "loss_layer_30_head": 0.17370471358299255,
      "loss_layer_36_head": 0.10759858042001724,
      "loss_layer_42_head": 0.05863229185342789,
      "loss_layer_6_head": 0.8477280735969543,
      "step": 3685
    },
    {
      "epoch": 47.232,
      "grad_norm": 0.19999630770972648,
      "learning_rate": 0.003163243091364752,
      "loss": 2.5917,
      "loss_layer_12_head": 0.5996958017349243,
      "loss_layer_18_head": 0.45250287652015686,
      "loss_layer_24_head": 0.2781546115875244,
      "loss_layer_30_head": 0.17168749868869781,
      "loss_layer_36_head": 0.10689833015203476,
      "loss_layer_42_head": 0.06078004837036133,
      "loss_layer_6_head": 0.8513409495353699,
      "step": 3690
    },
    {
      "epoch": 47.296,
      "grad_norm": 0.2844637558268629,
      "learning_rate": 0.0031578478832369216,
      "loss": 2.6742,
      "loss_layer_12_head": 0.6572837233543396,
      "loss_layer_18_head": 0.49631327390670776,
      "loss_layer_24_head": 0.300764262676239,
      "loss_layer_30_head": 0.1884530484676361,
      "loss_layer_36_head": 0.11238644272089005,
      "loss_layer_42_head": 0.06264229863882065,
      "loss_layer_6_head": 0.9088292121887207,
      "step": 3695
    },
    {
      "epoch": 47.36,
      "grad_norm": 0.3219199979962743,
      "learning_rate": 0.0031524493813575934,
      "loss": 2.6172,
      "loss_layer_12_head": 0.617926299571991,
      "loss_layer_18_head": 0.4819389283657074,
      "loss_layer_24_head": 0.2957989275455475,
      "loss_layer_30_head": 0.18552234768867493,
      "loss_layer_36_head": 0.11202795803546906,
      "loss_layer_42_head": 0.06315083801746368,
      "loss_layer_6_head": 0.8624653816223145,
      "step": 3700
    },
    {
      "epoch": 47.424,
      "grad_norm": 0.19620637237832628,
      "learning_rate": 0.003147047612756302,
      "loss": 2.6235,
      "loss_layer_12_head": 0.6155635118484497,
      "loss_layer_18_head": 0.48290008306503296,
      "loss_layer_24_head": 0.29589149355888367,
      "loss_layer_30_head": 0.1864364743232727,
      "loss_layer_36_head": 0.11440746486186981,
      "loss_layer_42_head": 0.06391565501689911,
      "loss_layer_6_head": 0.8651612401008606,
      "step": 3705
    },
    {
      "epoch": 47.488,
      "grad_norm": 0.2795915260047668,
      "learning_rate": 0.003141642604478942,
      "loss": 2.65,
      "loss_layer_12_head": 0.656493067741394,
      "loss_layer_18_head": 0.5246139764785767,
      "loss_layer_24_head": 0.3109099268913269,
      "loss_layer_30_head": 0.19268842041492462,
      "loss_layer_36_head": 0.11461284011602402,
      "loss_layer_42_head": 0.06842734664678574,
      "loss_layer_6_head": 0.9066864252090454,
      "step": 3710
    },
    {
      "epoch": 47.552,
      "grad_norm": 0.3137083529841953,
      "learning_rate": 0.0031362343835876273,
      "loss": 2.6071,
      "loss_layer_12_head": 0.654264509677887,
      "loss_layer_18_head": 0.516398012638092,
      "loss_layer_24_head": 0.3085891604423523,
      "loss_layer_30_head": 0.19745400547981262,
      "loss_layer_36_head": 0.11714746057987213,
      "loss_layer_42_head": 0.07219770550727844,
      "loss_layer_6_head": 0.9010494351387024,
      "step": 3715
    },
    {
      "epoch": 47.616,
      "grad_norm": 0.24085037283961255,
      "learning_rate": 0.0031308229771605544,
      "loss": 2.6342,
      "loss_layer_12_head": 0.658222496509552,
      "loss_layer_18_head": 0.5066931247711182,
      "loss_layer_24_head": 0.30341845750808716,
      "loss_layer_30_head": 0.20023146271705627,
      "loss_layer_36_head": 0.11578724533319473,
      "loss_layer_42_head": 0.06546033918857574,
      "loss_layer_6_head": 0.8692223429679871,
      "step": 3720
    },
    {
      "epoch": 47.68,
      "grad_norm": 0.319494708089583,
      "learning_rate": 0.0031254084122918735,
      "loss": 2.717,
      "loss_layer_12_head": 0.6715075373649597,
      "loss_layer_18_head": 0.5128521919250488,
      "loss_layer_24_head": 0.30562788248062134,
      "loss_layer_30_head": 0.1996576189994812,
      "loss_layer_36_head": 0.11646630614995956,
      "loss_layer_42_head": 0.06495203077793121,
      "loss_layer_6_head": 0.8817533254623413,
      "step": 3725
    },
    {
      "epoch": 47.744,
      "grad_norm": 0.38559723430259357,
      "learning_rate": 0.0031199907160915463,
      "loss": 2.7272,
      "loss_layer_12_head": 0.6448809504508972,
      "loss_layer_18_head": 0.5153014659881592,
      "loss_layer_24_head": 0.29847392439842224,
      "loss_layer_30_head": 0.19514092803001404,
      "loss_layer_36_head": 0.11077572405338287,
      "loss_layer_42_head": 0.0630265399813652,
      "loss_layer_6_head": 0.8637567758560181,
      "step": 3730
    },
    {
      "epoch": 47.808,
      "grad_norm": 0.2741466694415238,
      "learning_rate": 0.003114569915685213,
      "loss": 2.7235,
      "loss_layer_12_head": 0.6358543634414673,
      "loss_layer_18_head": 0.5306086540222168,
      "loss_layer_24_head": 0.2949868440628052,
      "loss_layer_30_head": 0.1906265914440155,
      "loss_layer_36_head": 0.11215747892856598,
      "loss_layer_42_head": 0.06239762902259827,
      "loss_layer_6_head": 0.8610727190971375,
      "step": 3735
    },
    {
      "epoch": 47.872,
      "grad_norm": 0.3756156827321585,
      "learning_rate": 0.0031091460382140544,
      "loss": 2.7275,
      "loss_layer_12_head": 0.629956841468811,
      "loss_layer_18_head": 0.5974238514900208,
      "loss_layer_24_head": 0.2955223023891449,
      "loss_layer_30_head": 0.18600091338157654,
      "loss_layer_36_head": 0.11211760342121124,
      "loss_layer_42_head": 0.06213392689824104,
      "loss_layer_6_head": 0.8690745234489441,
      "step": 3740
    },
    {
      "epoch": 47.936,
      "grad_norm": 0.29679308299248003,
      "learning_rate": 0.003103719110834662,
      "loss": 2.8361,
      "loss_layer_12_head": 0.629340410232544,
      "loss_layer_18_head": 0.6096481084823608,
      "loss_layer_24_head": 0.29905176162719727,
      "loss_layer_30_head": 0.18502256274223328,
      "loss_layer_36_head": 0.11201824247837067,
      "loss_layer_42_head": 0.06161431223154068,
      "loss_layer_6_head": 0.8824082612991333,
      "step": 3745
    },
    {
      "epoch": 48.0,
      "grad_norm": 0.21980715173075074,
      "learning_rate": 0.0030982891607188945,
      "loss": 2.7309,
      "loss_layer_12_head": 0.6152459383010864,
      "loss_layer_18_head": 0.5698500871658325,
      "loss_layer_24_head": 0.29503631591796875,
      "loss_layer_30_head": 0.17896899580955505,
      "loss_layer_36_head": 0.11054155975580215,
      "loss_layer_42_head": 0.061433762311935425,
      "loss_layer_6_head": 0.8522292971611023,
      "step": 3750
    },
    {
      "epoch": 48.064,
      "grad_norm": 0.23902507605848924,
      "learning_rate": 0.003092856215053744,
      "loss": 2.5425,
      "loss_layer_12_head": 0.5846749544143677,
      "loss_layer_18_head": 0.5188774466514587,
      "loss_layer_24_head": 0.28190645575523376,
      "loss_layer_30_head": 0.1743389070034027,
      "loss_layer_36_head": 0.108132004737854,
      "loss_layer_42_head": 0.05985206365585327,
      "loss_layer_6_head": 0.8156237602233887,
      "step": 3755
    },
    {
      "epoch": 48.128,
      "grad_norm": 0.1397566748746041,
      "learning_rate": 0.0030874203010412054,
      "loss": 2.5486,
      "loss_layer_12_head": 0.5960351228713989,
      "loss_layer_18_head": 0.5154932737350464,
      "loss_layer_24_head": 0.28950104117393494,
      "loss_layer_30_head": 0.17779111862182617,
      "loss_layer_36_head": 0.11277779191732407,
      "loss_layer_42_head": 0.0621732696890831,
      "loss_layer_6_head": 0.8280693888664246,
      "step": 3760
    },
    {
      "epoch": 48.192,
      "grad_norm": 0.18123678046173355,
      "learning_rate": 0.0030819814458981306,
      "loss": 2.4948,
      "loss_layer_12_head": 0.5844622850418091,
      "loss_layer_18_head": 0.4960240423679352,
      "loss_layer_24_head": 0.28074368834495544,
      "loss_layer_30_head": 0.1704888790845871,
      "loss_layer_36_head": 0.10780014097690582,
      "loss_layer_42_head": 0.0574844665825367,
      "loss_layer_6_head": 0.8277271389961243,
      "step": 3765
    },
    {
      "epoch": 48.256,
      "grad_norm": 0.17136055604892816,
      "learning_rate": 0.0030765396768561003,
      "loss": 2.5716,
      "loss_layer_12_head": 0.5866295695304871,
      "loss_layer_18_head": 0.4929826855659485,
      "loss_layer_24_head": 0.2808472514152527,
      "loss_layer_30_head": 0.17178848385810852,
      "loss_layer_36_head": 0.10924514383077621,
      "loss_layer_42_head": 0.05960596352815628,
      "loss_layer_6_head": 0.835018515586853,
      "step": 3770
    },
    {
      "epoch": 48.32,
      "grad_norm": 0.1783626268957663,
      "learning_rate": 0.0030710950211612843,
      "loss": 2.5625,
      "loss_layer_12_head": 0.5640789270401001,
      "loss_layer_18_head": 0.4709322452545166,
      "loss_layer_24_head": 0.27036821842193604,
      "loss_layer_30_head": 0.16599544882774353,
      "loss_layer_36_head": 0.10402043163776398,
      "loss_layer_42_head": 0.06254281103610992,
      "loss_layer_6_head": 0.8085581064224243,
      "step": 3775
    },
    {
      "epoch": 48.384,
      "grad_norm": 0.18533490126518018,
      "learning_rate": 0.003065647506074306,
      "loss": 2.5557,
      "loss_layer_12_head": 0.5693805813789368,
      "loss_layer_18_head": 0.4720279276371002,
      "loss_layer_24_head": 0.279356986284256,
      "loss_layer_30_head": 0.17089149355888367,
      "loss_layer_36_head": 0.10934798419475555,
      "loss_layer_42_head": 0.062064122408628464,
      "loss_layer_6_head": 0.8222864270210266,
      "step": 3780
    },
    {
      "epoch": 48.448,
      "grad_norm": 0.2688068931661317,
      "learning_rate": 0.003060197158870102,
      "loss": 2.5894,
      "loss_layer_12_head": 0.601957380771637,
      "loss_layer_18_head": 0.4948192536830902,
      "loss_layer_24_head": 0.29822683334350586,
      "loss_layer_30_head": 0.1816636174917221,
      "loss_layer_36_head": 0.11730935424566269,
      "loss_layer_42_head": 0.06320096552371979,
      "loss_layer_6_head": 0.8634182810783386,
      "step": 3785
    },
    {
      "epoch": 48.512,
      "grad_norm": 0.19398144644649506,
      "learning_rate": 0.003054744006837794,
      "loss": 2.5673,
      "loss_layer_12_head": 0.587921142578125,
      "loss_layer_18_head": 0.480598509311676,
      "loss_layer_24_head": 0.28960901498794556,
      "loss_layer_30_head": 0.1755875051021576,
      "loss_layer_36_head": 0.11260640621185303,
      "loss_layer_42_head": 0.06586384028196335,
      "loss_layer_6_head": 0.8449599146842957,
      "step": 3790
    },
    {
      "epoch": 48.576,
      "grad_norm": 0.1607645332232192,
      "learning_rate": 0.0030492880772805433,
      "loss": 2.5658,
      "loss_layer_12_head": 0.5569297671318054,
      "loss_layer_18_head": 0.4531589150428772,
      "loss_layer_24_head": 0.2736010253429413,
      "loss_layer_30_head": 0.1658240109682083,
      "loss_layer_36_head": 0.1090262159705162,
      "loss_layer_42_head": 0.06171371787786484,
      "loss_layer_6_head": 0.8091610074043274,
      "step": 3795
    },
    {
      "epoch": 48.64,
      "grad_norm": 0.1987065479083494,
      "learning_rate": 0.0030438293975154187,
      "loss": 2.6286,
      "loss_layer_12_head": 0.5931164026260376,
      "loss_layer_18_head": 0.47854867577552795,
      "loss_layer_24_head": 0.28957146406173706,
      "loss_layer_30_head": 0.17572054266929626,
      "loss_layer_36_head": 0.11583950370550156,
      "loss_layer_42_head": 0.06350217759609222,
      "loss_layer_6_head": 0.8486884832382202,
      "step": 3800
    },
    {
      "epoch": 48.64,
      "eval_loss": 5.407393455505371,
      "eval_loss_layer_12_head": 1.2429840564727783,
      "eval_loss_layer_18_head": 1.0898300409317017,
      "eval_loss_layer_24_head": 0.6973127126693726,
      "eval_loss_layer_30_head": 0.4576980173587799,
      "eval_loss_layer_36_head": 0.2948791980743408,
      "eval_loss_layer_42_head": 0.17572419345378876,
      "eval_loss_layer_6_head": 1.580519676208496,
      "eval_runtime": 33.0459,
      "eval_samples_per_second": 9.683,
      "eval_steps_per_second": 0.605,
      "step": 3800
    },
    {
      "epoch": 48.704,
      "grad_norm": 0.14304467119956213,
      "learning_rate": 0.0030383679948732607,
      "loss": 2.608,
      "loss_layer_12_head": 0.5977441668510437,
      "loss_layer_18_head": 0.4846114218235016,
      "loss_layer_24_head": 0.2911698520183563,
      "loss_layer_30_head": 0.1746290624141693,
      "loss_layer_36_head": 0.11291255801916122,
      "loss_layer_42_head": 0.0636364221572876,
      "loss_layer_6_head": 0.8583235740661621,
      "step": 3805
    },
    {
      "epoch": 48.768,
      "grad_norm": 0.21115339396562557,
      "learning_rate": 0.00303290389669854,
      "loss": 2.5909,
      "loss_layer_12_head": 0.5983569622039795,
      "loss_layer_18_head": 0.48569124937057495,
      "loss_layer_24_head": 0.2918764650821686,
      "loss_layer_30_head": 0.1756625473499298,
      "loss_layer_36_head": 0.11503762006759644,
      "loss_layer_42_head": 0.06335446983575821,
      "loss_layer_6_head": 0.8610029220581055,
      "step": 3810
    },
    {
      "epoch": 48.832,
      "grad_norm": 0.1677393935307316,
      "learning_rate": 0.003027437130349227,
      "loss": 2.6119,
      "loss_layer_12_head": 0.6144179105758667,
      "loss_layer_18_head": 0.49929147958755493,
      "loss_layer_24_head": 0.29889172315597534,
      "loss_layer_30_head": 0.1788996160030365,
      "loss_layer_36_head": 0.11289115250110626,
      "loss_layer_42_head": 0.0615074560046196,
      "loss_layer_6_head": 0.8852108120918274,
      "step": 3815
    },
    {
      "epoch": 48.896,
      "grad_norm": 0.22214412354530014,
      "learning_rate": 0.003021967723196647,
      "loss": 2.6237,
      "loss_layer_12_head": 0.5978437066078186,
      "loss_layer_18_head": 0.4863978922367096,
      "loss_layer_24_head": 0.29241126775741577,
      "loss_layer_30_head": 0.17911753058433533,
      "loss_layer_36_head": 0.11432339251041412,
      "loss_layer_42_head": 0.06493993103504181,
      "loss_layer_6_head": 0.8633782267570496,
      "step": 3820
    },
    {
      "epoch": 48.96,
      "grad_norm": 0.20751812428610375,
      "learning_rate": 0.003016495702625351,
      "loss": 2.6427,
      "loss_layer_12_head": 0.6472228765487671,
      "loss_layer_18_head": 0.52493816614151,
      "loss_layer_24_head": 0.3175273537635803,
      "loss_layer_30_head": 0.1941504031419754,
      "loss_layer_36_head": 0.1266017109155655,
      "loss_layer_42_head": 0.07314454019069672,
      "loss_layer_6_head": 0.9190411567687988,
      "step": 3825
    },
    {
      "epoch": 49.024,
      "grad_norm": 0.14371813193299177,
      "learning_rate": 0.0030110210960329724,
      "loss": 2.6194,
      "loss_layer_12_head": 0.6215782761573792,
      "loss_layer_18_head": 0.5026232004165649,
      "loss_layer_24_head": 0.30278563499450684,
      "loss_layer_30_head": 0.18476101756095886,
      "loss_layer_36_head": 0.11636742204427719,
      "loss_layer_42_head": 0.0709095373749733,
      "loss_layer_6_head": 0.9011163711547852,
      "step": 3830
    },
    {
      "epoch": 49.088,
      "grad_norm": 0.23219253420571848,
      "learning_rate": 0.003005543930830095,
      "loss": 2.5269,
      "loss_layer_12_head": 0.5662212371826172,
      "loss_layer_18_head": 0.45611661672592163,
      "loss_layer_24_head": 0.2798483371734619,
      "loss_layer_30_head": 0.1751675009727478,
      "loss_layer_36_head": 0.11780457198619843,
      "loss_layer_42_head": 0.0784616619348526,
      "loss_layer_6_head": 0.8192461133003235,
      "step": 3835
    },
    {
      "epoch": 49.152,
      "grad_norm": 0.23449011179596188,
      "learning_rate": 0.003000064234440111,
      "loss": 2.5247,
      "loss_layer_12_head": 0.581813633441925,
      "loss_layer_18_head": 0.46889734268188477,
      "loss_layer_24_head": 0.29201188683509827,
      "loss_layer_30_head": 0.18612758815288544,
      "loss_layer_36_head": 0.11958048492670059,
      "loss_layer_42_head": 0.08292384445667267,
      "loss_layer_6_head": 0.8359322547912598,
      "step": 3840
    },
    {
      "epoch": 49.216,
      "grad_norm": 0.2959376697115443,
      "learning_rate": 0.0029945820342990874,
      "loss": 2.5193,
      "loss_layer_12_head": 0.5777071714401245,
      "loss_layer_18_head": 0.46162286400794983,
      "loss_layer_24_head": 0.2870132029056549,
      "loss_layer_30_head": 0.1782182902097702,
      "loss_layer_36_head": 0.11293108761310577,
      "loss_layer_42_head": 0.07494541257619858,
      "loss_layer_6_head": 0.8347188830375671,
      "step": 3845
    },
    {
      "epoch": 49.28,
      "grad_norm": 0.2652960387168858,
      "learning_rate": 0.0029890973578556267,
      "loss": 2.572,
      "loss_layer_12_head": 0.5779236555099487,
      "loss_layer_18_head": 0.4570494592189789,
      "loss_layer_24_head": 0.2846868336200714,
      "loss_layer_30_head": 0.17374704778194427,
      "loss_layer_36_head": 0.11712255328893661,
      "loss_layer_42_head": 0.07375837862491608,
      "loss_layer_6_head": 0.8365662693977356,
      "step": 3850
    },
    {
      "epoch": 49.344,
      "grad_norm": 0.22675147038710933,
      "learning_rate": 0.002983610232570728,
      "loss": 2.5712,
      "loss_layer_12_head": 0.591295599937439,
      "loss_layer_18_head": 0.4802575707435608,
      "loss_layer_24_head": 0.30070728063583374,
      "loss_layer_30_head": 0.1842481791973114,
      "loss_layer_36_head": 0.12863799929618835,
      "loss_layer_42_head": 0.07428416609764099,
      "loss_layer_6_head": 0.8462551832199097,
      "step": 3855
    },
    {
      "epoch": 49.408,
      "grad_norm": 0.25304936711153175,
      "learning_rate": 0.002978120685917656,
      "loss": 2.557,
      "loss_layer_12_head": 0.6003594994544983,
      "loss_layer_18_head": 0.4811546206474304,
      "loss_layer_24_head": 0.2953068017959595,
      "loss_layer_30_head": 0.1780962198972702,
      "loss_layer_36_head": 0.11991509050130844,
      "loss_layer_42_head": 0.0666462779045105,
      "loss_layer_6_head": 0.8767542839050293,
      "step": 3860
    },
    {
      "epoch": 49.472,
      "grad_norm": 0.22586439358659804,
      "learning_rate": 0.0029726287453817933,
      "loss": 2.5957,
      "loss_layer_12_head": 0.572717547416687,
      "loss_layer_18_head": 0.45781564712524414,
      "loss_layer_24_head": 0.28322696685791016,
      "loss_layer_30_head": 0.1775863915681839,
      "loss_layer_36_head": 0.1288948506116867,
      "loss_layer_42_head": 0.06663616746664047,
      "loss_layer_6_head": 0.8259660601615906,
      "step": 3865
    },
    {
      "epoch": 49.536,
      "grad_norm": 0.3563927785071091,
      "learning_rate": 0.0029671344384605124,
      "loss": 2.6182,
      "loss_layer_12_head": 0.5880976915359497,
      "loss_layer_18_head": 0.4668155312538147,
      "loss_layer_24_head": 0.28762251138687134,
      "loss_layer_30_head": 0.19065603613853455,
      "loss_layer_36_head": 0.12724831700325012,
      "loss_layer_42_head": 0.06710107624530792,
      "loss_layer_6_head": 0.8631643056869507,
      "step": 3870
    },
    {
      "epoch": 49.6,
      "grad_norm": 0.2846919464010354,
      "learning_rate": 0.0029616377926630316,
      "loss": 2.6276,
      "loss_layer_12_head": 0.6171402931213379,
      "loss_layer_18_head": 0.49567994475364685,
      "loss_layer_24_head": 0.30561918020248413,
      "loss_layer_30_head": 0.2004934549331665,
      "loss_layer_36_head": 0.12769034504890442,
      "loss_layer_42_head": 0.06769181787967682,
      "loss_layer_6_head": 0.89268958568573,
      "step": 3875
    },
    {
      "epoch": 49.664,
      "grad_norm": 0.24067874051827676,
      "learning_rate": 0.002956138835510282,
      "loss": 2.6276,
      "loss_layer_12_head": 0.6068697571754456,
      "loss_layer_18_head": 0.48779240250587463,
      "loss_layer_24_head": 0.2983923554420471,
      "loss_layer_30_head": 0.19392640888690948,
      "loss_layer_36_head": 0.1249309554696083,
      "loss_layer_42_head": 0.06415416300296783,
      "loss_layer_6_head": 0.8734499216079712,
      "step": 3880
    },
    {
      "epoch": 49.728,
      "grad_norm": 0.17558117532633516,
      "learning_rate": 0.0029506375945347646,
      "loss": 2.6401,
      "loss_layer_12_head": 0.5932475328445435,
      "loss_layer_18_head": 0.48279738426208496,
      "loss_layer_24_head": 0.2960584759712219,
      "loss_layer_30_head": 0.1859237402677536,
      "loss_layer_36_head": 0.12242022901773453,
      "loss_layer_42_head": 0.07089149206876755,
      "loss_layer_6_head": 0.8559579849243164,
      "step": 3885
    },
    {
      "epoch": 49.792,
      "grad_norm": 0.15887193859837823,
      "learning_rate": 0.002945134097280417,
      "loss": 2.5762,
      "loss_layer_12_head": 0.5941755175590515,
      "loss_layer_18_head": 0.48179706931114197,
      "loss_layer_24_head": 0.29653316736221313,
      "loss_layer_30_head": 0.18284103274345398,
      "loss_layer_36_head": 0.1156386286020279,
      "loss_layer_42_head": 0.0644725039601326,
      "loss_layer_6_head": 0.8534826040267944,
      "step": 3890
    },
    {
      "epoch": 49.856,
      "grad_norm": 0.21462485514850052,
      "learning_rate": 0.002939628371302473,
      "loss": 2.6361,
      "loss_layer_12_head": 0.616106390953064,
      "loss_layer_18_head": 0.5008279085159302,
      "loss_layer_24_head": 0.3048247992992401,
      "loss_layer_30_head": 0.18211978673934937,
      "loss_layer_36_head": 0.11130233108997345,
      "loss_layer_42_head": 0.060228943824768066,
      "loss_layer_6_head": 0.887053370475769,
      "step": 3895
    },
    {
      "epoch": 49.92,
      "grad_norm": 0.2546118004887763,
      "learning_rate": 0.0029341204441673263,
      "loss": 2.6037,
      "loss_layer_12_head": 0.6070103049278259,
      "loss_layer_18_head": 0.4914790689945221,
      "loss_layer_24_head": 0.3039492964744568,
      "loss_layer_30_head": 0.18037360906600952,
      "loss_layer_36_head": 0.1117684468626976,
      "loss_layer_42_head": 0.06309590488672256,
      "loss_layer_6_head": 0.8716564178466797,
      "step": 3900
    },
    {
      "epoch": 49.984,
      "grad_norm": 0.19461492509058317,
      "learning_rate": 0.0029286103434523893,
      "loss": 2.6302,
      "loss_layer_12_head": 0.5991615056991577,
      "loss_layer_18_head": 0.488283634185791,
      "loss_layer_24_head": 0.30061087012290955,
      "loss_layer_30_head": 0.17868909239768982,
      "loss_layer_36_head": 0.11220617592334747,
      "loss_layer_42_head": 0.061895549297332764,
      "loss_layer_6_head": 0.8575571179389954,
      "step": 3905
    },
    {
      "epoch": 50.048,
      "grad_norm": 0.1853544483031064,
      "learning_rate": 0.002923098096745959,
      "loss": 2.5126,
      "loss_layer_12_head": 0.5713303685188293,
      "loss_layer_18_head": 0.4585196375846863,
      "loss_layer_24_head": 0.2813715934753418,
      "loss_layer_30_head": 0.16769084334373474,
      "loss_layer_36_head": 0.10547100007534027,
      "loss_layer_42_head": 0.05938064306974411,
      "loss_layer_6_head": 0.8172779083251953,
      "step": 3910
    },
    {
      "epoch": 50.112,
      "grad_norm": 0.14567626525475896,
      "learning_rate": 0.002917583731647077,
      "loss": 2.4723,
      "loss_layer_12_head": 0.5588682889938354,
      "loss_layer_18_head": 0.4500862956047058,
      "loss_layer_24_head": 0.2765876352787018,
      "loss_layer_30_head": 0.16542576253414154,
      "loss_layer_36_head": 0.10534600168466568,
      "loss_layer_42_head": 0.05778098851442337,
      "loss_layer_6_head": 0.8020512461662292,
      "step": 3915
    },
    {
      "epoch": 50.176,
      "grad_norm": 0.1205606803531784,
      "learning_rate": 0.0029120672757653914,
      "loss": 2.4918,
      "loss_layer_12_head": 0.5834987759590149,
      "loss_layer_18_head": 0.4687116742134094,
      "loss_layer_24_head": 0.2915172874927521,
      "loss_layer_30_head": 0.17333875596523285,
      "loss_layer_36_head": 0.11025587469339371,
      "loss_layer_42_head": 0.06261521577835083,
      "loss_layer_6_head": 0.8343191146850586,
      "step": 3920
    },
    {
      "epoch": 50.24,
      "grad_norm": 0.2116788476338717,
      "learning_rate": 0.0029065487567210176,
      "loss": 2.5385,
      "loss_layer_12_head": 0.562193751335144,
      "loss_layer_18_head": 0.4486507475376129,
      "loss_layer_24_head": 0.28606829047203064,
      "loss_layer_30_head": 0.16576553881168365,
      "loss_layer_36_head": 0.10395984351634979,
      "loss_layer_42_head": 0.06385688483715057,
      "loss_layer_6_head": 0.8138986825942993,
      "step": 3925
    },
    {
      "epoch": 50.304,
      "grad_norm": 0.1369746586668492,
      "learning_rate": 0.0029010282021444006,
      "loss": 2.539,
      "loss_layer_12_head": 0.5634893178939819,
      "loss_layer_18_head": 0.4507543444633484,
      "loss_layer_24_head": 0.28509020805358887,
      "loss_layer_30_head": 0.16570807993412018,
      "loss_layer_36_head": 0.10596638917922974,
      "loss_layer_42_head": 0.063723124563694,
      "loss_layer_6_head": 0.8109868764877319,
      "step": 3930
    },
    {
      "epoch": 50.368,
      "grad_norm": 0.14531987788332562,
      "learning_rate": 0.0028955056396761797,
      "loss": 2.5288,
      "loss_layer_12_head": 0.6191779971122742,
      "loss_layer_18_head": 0.49589043855667114,
      "loss_layer_24_head": 0.3119671940803528,
      "loss_layer_30_head": 0.18032006919384003,
      "loss_layer_36_head": 0.11317239701747894,
      "loss_layer_42_head": 0.0666654109954834,
      "loss_layer_6_head": 0.8847519159317017,
      "step": 3935
    },
    {
      "epoch": 50.432,
      "grad_norm": 0.16135479054419447,
      "learning_rate": 0.002889981096967045,
      "loss": 2.5244,
      "loss_layer_12_head": 0.598736584186554,
      "loss_layer_18_head": 0.4821798801422119,
      "loss_layer_24_head": 0.3107772171497345,
      "loss_layer_30_head": 0.17660576105117798,
      "loss_layer_36_head": 0.11084624379873276,
      "loss_layer_42_head": 0.06483382731676102,
      "loss_layer_6_head": 0.8506501913070679,
      "step": 3940
    },
    {
      "epoch": 50.496,
      "grad_norm": 0.22123071251980653,
      "learning_rate": 0.0028844546016776012,
      "loss": 2.5836,
      "loss_layer_12_head": 0.5720422863960266,
      "loss_layer_18_head": 0.4580574929714203,
      "loss_layer_24_head": 0.31002697348594666,
      "loss_layer_30_head": 0.16626882553100586,
      "loss_layer_36_head": 0.10530398786067963,
      "loss_layer_42_head": 0.061948858201503754,
      "loss_layer_6_head": 0.814234733581543,
      "step": 3945
    },
    {
      "epoch": 50.56,
      "grad_norm": 0.19157293994501653,
      "learning_rate": 0.0028789261814782313,
      "loss": 2.5165,
      "loss_layer_12_head": 0.6030136942863464,
      "loss_layer_18_head": 0.4827168881893158,
      "loss_layer_24_head": 0.31043392419815063,
      "loss_layer_30_head": 0.17139145731925964,
      "loss_layer_36_head": 0.10828211158514023,
      "loss_layer_42_head": 0.06416897475719452,
      "loss_layer_6_head": 0.8635629415512085,
      "step": 3950
    },
    {
      "epoch": 50.624,
      "grad_norm": 0.1780763495418053,
      "learning_rate": 0.0028733958640489547,
      "loss": 2.6074,
      "loss_layer_12_head": 0.6210950613021851,
      "loss_layer_18_head": 0.501477062702179,
      "loss_layer_24_head": 0.32331258058547974,
      "loss_layer_30_head": 0.198713019490242,
      "loss_layer_36_head": 0.11738256365060806,
      "loss_layer_42_head": 0.07439212501049042,
      "loss_layer_6_head": 0.8840791583061218,
      "step": 3955
    },
    {
      "epoch": 50.688,
      "grad_norm": 0.22718096847640656,
      "learning_rate": 0.00286786367707929,
      "loss": 2.6234,
      "loss_layer_12_head": 0.5986446142196655,
      "loss_layer_18_head": 0.476602166891098,
      "loss_layer_24_head": 0.30624261498451233,
      "loss_layer_30_head": 0.2036999762058258,
      "loss_layer_36_head": 0.11306273937225342,
      "loss_layer_42_head": 0.08093731105327606,
      "loss_layer_6_head": 0.8438626527786255,
      "step": 3960
    },
    {
      "epoch": 50.752,
      "grad_norm": 0.2072807646339249,
      "learning_rate": 0.0028623296482681165,
      "loss": 2.653,
      "loss_layer_12_head": 0.6093775033950806,
      "loss_layer_18_head": 0.49039140343666077,
      "loss_layer_24_head": 0.30724915862083435,
      "loss_layer_30_head": 0.18167644739151,
      "loss_layer_36_head": 0.11615642160177231,
      "loss_layer_42_head": 0.06702502071857452,
      "loss_layer_6_head": 0.871726393699646,
      "step": 3965
    },
    {
      "epoch": 50.816,
      "grad_norm": 0.260234332097801,
      "learning_rate": 0.002856793805323536,
      "loss": 2.5924,
      "loss_layer_12_head": 0.5930246710777283,
      "loss_layer_18_head": 0.4750140309333801,
      "loss_layer_24_head": 0.2915870249271393,
      "loss_layer_30_head": 0.17037856578826904,
      "loss_layer_36_head": 0.10941748321056366,
      "loss_layer_42_head": 0.06094125658273697,
      "loss_layer_6_head": 0.8588629961013794,
      "step": 3970
    },
    {
      "epoch": 50.88,
      "grad_norm": 0.16130138613039038,
      "learning_rate": 0.002851256175962732,
      "loss": 2.6287,
      "loss_layer_12_head": 0.6207832098007202,
      "loss_layer_18_head": 0.5016980171203613,
      "loss_layer_24_head": 0.30685341358184814,
      "loss_layer_30_head": 0.18744511902332306,
      "loss_layer_36_head": 0.11543668806552887,
      "loss_layer_42_head": 0.06285912543535233,
      "loss_layer_6_head": 0.8878720998764038,
      "step": 3975
    },
    {
      "epoch": 50.944,
      "grad_norm": 0.259041927952478,
      "learning_rate": 0.002845716787911833,
      "loss": 2.6367,
      "loss_layer_12_head": 0.6058130264282227,
      "loss_layer_18_head": 0.4918006956577301,
      "loss_layer_24_head": 0.30855685472488403,
      "loss_layer_30_head": 0.18642035126686096,
      "loss_layer_36_head": 0.11695041507482529,
      "loss_layer_42_head": 0.0687691792845726,
      "loss_layer_6_head": 0.8676468133926392,
      "step": 3980
    },
    {
      "epoch": 51.008,
      "grad_norm": 0.2989182274806169,
      "learning_rate": 0.0028401756689057736,
      "loss": 2.6167,
      "loss_layer_12_head": 0.6080283522605896,
      "loss_layer_18_head": 0.49360188841819763,
      "loss_layer_24_head": 0.29667747020721436,
      "loss_layer_30_head": 0.18041086196899414,
      "loss_layer_36_head": 0.11321131885051727,
      "loss_layer_42_head": 0.06524789333343506,
      "loss_layer_6_head": 0.8650078773498535,
      "step": 3985
    },
    {
      "epoch": 51.072,
      "grad_norm": 0.2487426382782862,
      "learning_rate": 0.002834632846688154,
      "loss": 2.477,
      "loss_layer_12_head": 0.5536460876464844,
      "loss_layer_18_head": 0.4549512267112732,
      "loss_layer_24_head": 0.272408664226532,
      "loss_layer_30_head": 0.18003681302070618,
      "loss_layer_36_head": 0.1091206893324852,
      "loss_layer_42_head": 0.0664164274930954,
      "loss_layer_6_head": 0.7916552424430847,
      "step": 3990
    },
    {
      "epoch": 51.136,
      "grad_norm": 0.18409173942122087,
      "learning_rate": 0.0028290883490111034,
      "loss": 2.5058,
      "loss_layer_12_head": 0.6195579767227173,
      "loss_layer_18_head": 0.5068653225898743,
      "loss_layer_24_head": 0.30737531185150146,
      "loss_layer_30_head": 0.19452834129333496,
      "loss_layer_36_head": 0.12273107469081879,
      "loss_layer_42_head": 0.07189725339412689,
      "loss_layer_6_head": 0.8807812929153442,
      "step": 3995
    },
    {
      "epoch": 51.2,
      "grad_norm": 0.21875616889863783,
      "learning_rate": 0.0028235422036351383,
      "loss": 2.5402,
      "loss_layer_12_head": 0.5758143663406372,
      "loss_layer_18_head": 0.4689738154411316,
      "loss_layer_24_head": 0.28392213582992554,
      "loss_layer_30_head": 0.1782093346118927,
      "loss_layer_36_head": 0.11533632129430771,
      "loss_layer_42_head": 0.0721585750579834,
      "loss_layer_6_head": 0.819286048412323,
      "step": 4000
    },
    {
      "epoch": 51.2,
      "eval_loss": 5.449836254119873,
      "eval_loss_layer_12_head": 1.2550771236419678,
      "eval_loss_layer_18_head": 1.085659384727478,
      "eval_loss_layer_24_head": 0.7044129371643066,
      "eval_loss_layer_30_head": 0.47041597962379456,
      "eval_loss_layer_36_head": 0.2965102791786194,
      "eval_loss_layer_42_head": 0.18325789272785187,
      "eval_loss_layer_6_head": 1.6051485538482666,
      "eval_runtime": 33.1082,
      "eval_samples_per_second": 9.665,
      "eval_steps_per_second": 0.604,
      "step": 4000
    },
    {
      "epoch": 51.264,
      "grad_norm": 0.26113925829852636,
      "learning_rate": 0.0028179944383290275,
      "loss": 2.4765,
      "loss_layer_12_head": 0.5739409327507019,
      "loss_layer_18_head": 0.46336954832077026,
      "loss_layer_24_head": 0.2789846956729889,
      "loss_layer_30_head": 0.1689014583826065,
      "loss_layer_36_head": 0.1080164909362793,
      "loss_layer_42_head": 0.06580491364002228,
      "loss_layer_6_head": 0.8259090185165405,
      "step": 4005
    },
    {
      "epoch": 51.328,
      "grad_norm": 0.32343251548331753,
      "learning_rate": 0.0028124450808696463,
      "loss": 2.555,
      "loss_layer_12_head": 0.5801182985305786,
      "loss_layer_18_head": 0.47447142004966736,
      "loss_layer_24_head": 0.2904179096221924,
      "loss_layer_30_head": 0.17654231190681458,
      "loss_layer_36_head": 0.11467921733856201,
      "loss_layer_42_head": 0.07289145141839981,
      "loss_layer_6_head": 0.8318241238594055,
      "step": 4010
    },
    {
      "epoch": 51.392,
      "grad_norm": 0.24893142554091358,
      "learning_rate": 0.002806894159041846,
      "loss": 2.6077,
      "loss_layer_12_head": 0.6181851625442505,
      "loss_layer_18_head": 0.498310387134552,
      "loss_layer_24_head": 0.2985556125640869,
      "loss_layer_30_head": 0.17962518334388733,
      "loss_layer_36_head": 0.11351066827774048,
      "loss_layer_42_head": 0.06823652237653732,
      "loss_layer_6_head": 0.8830288648605347,
      "step": 4015
    },
    {
      "epoch": 51.456,
      "grad_norm": 0.16300346749978928,
      "learning_rate": 0.0028013417006383075,
      "loss": 2.5309,
      "loss_layer_12_head": 0.5816090106964111,
      "loss_layer_18_head": 0.46991676092147827,
      "loss_layer_24_head": 0.28248077630996704,
      "loss_layer_30_head": 0.16865870356559753,
      "loss_layer_36_head": 0.10862003266811371,
      "loss_layer_42_head": 0.06453286111354828,
      "loss_layer_6_head": 0.8280074000358582,
      "step": 4020
    },
    {
      "epoch": 51.52,
      "grad_norm": 0.18043388114706133,
      "learning_rate": 0.002795787733459408,
      "loss": 2.571,
      "loss_layer_12_head": 0.5912407040596008,
      "loss_layer_18_head": 0.4789276719093323,
      "loss_layer_24_head": 0.2921951413154602,
      "loss_layer_30_head": 0.17339059710502625,
      "loss_layer_36_head": 0.11059196293354034,
      "loss_layer_42_head": 0.06519712507724762,
      "loss_layer_6_head": 0.8469179272651672,
      "step": 4025
    },
    {
      "epoch": 51.584,
      "grad_norm": 0.1529244038886138,
      "learning_rate": 0.002790232285313076,
      "loss": 2.5741,
      "loss_layer_12_head": 0.6114966869354248,
      "loss_layer_18_head": 0.49590006470680237,
      "loss_layer_24_head": 0.3008139431476593,
      "loss_layer_30_head": 0.17820753157138824,
      "loss_layer_36_head": 0.11371765285730362,
      "loss_layer_42_head": 0.06591828167438507,
      "loss_layer_6_head": 0.8713754415512085,
      "step": 4030
    },
    {
      "epoch": 51.648,
      "grad_norm": 0.1257328188216335,
      "learning_rate": 0.002784675384014656,
      "loss": 2.5965,
      "loss_layer_12_head": 0.589821994304657,
      "loss_layer_18_head": 0.4772394299507141,
      "loss_layer_24_head": 0.2857882082462311,
      "loss_layer_30_head": 0.16909539699554443,
      "loss_layer_36_head": 0.10655518621206284,
      "loss_layer_42_head": 0.059574246406555176,
      "loss_layer_6_head": 0.8410855531692505,
      "step": 4035
    },
    {
      "epoch": 51.712,
      "grad_norm": 0.14745710231733009,
      "learning_rate": 0.0027791170573867696,
      "loss": 2.5607,
      "loss_layer_12_head": 0.5805457830429077,
      "loss_layer_18_head": 0.4694159924983978,
      "loss_layer_24_head": 0.2841266393661499,
      "loss_layer_30_head": 0.16650639474391937,
      "loss_layer_36_head": 0.10620055347681046,
      "loss_layer_42_head": 0.05990232899785042,
      "loss_layer_6_head": 0.8255990147590637,
      "step": 4040
    },
    {
      "epoch": 51.776,
      "grad_norm": 0.25945692051596897,
      "learning_rate": 0.0027735573332591723,
      "loss": 2.5784,
      "loss_layer_12_head": 0.6204644441604614,
      "loss_layer_18_head": 0.5010818243026733,
      "loss_layer_24_head": 0.3016743063926697,
      "loss_layer_30_head": 0.1831238567829132,
      "loss_layer_36_head": 0.11487344652414322,
      "loss_layer_42_head": 0.061844177544116974,
      "loss_layer_6_head": 0.8739821314811707,
      "step": 4045
    },
    {
      "epoch": 51.84,
      "grad_norm": 0.20993552456608086,
      "learning_rate": 0.00276799623946862,
      "loss": 2.5981,
      "loss_layer_12_head": 0.6121804714202881,
      "loss_layer_18_head": 0.4921717643737793,
      "loss_layer_24_head": 0.2938535511493683,
      "loss_layer_30_head": 0.17435197532176971,
      "loss_layer_36_head": 0.1113201156258583,
      "loss_layer_42_head": 0.06041323021054268,
      "loss_layer_6_head": 0.8735694885253906,
      "step": 4050
    },
    {
      "epoch": 51.904,
      "grad_norm": 0.12681292870463645,
      "learning_rate": 0.002762433803858722,
      "loss": 2.5855,
      "loss_layer_12_head": 0.6440340280532837,
      "loss_layer_18_head": 0.5140979886054993,
      "loss_layer_24_head": 0.3115968108177185,
      "loss_layer_30_head": 0.18368302285671234,
      "loss_layer_36_head": 0.116519495844841,
      "loss_layer_42_head": 0.06387030333280563,
      "loss_layer_6_head": 0.9058243632316589,
      "step": 4055
    },
    {
      "epoch": 51.968,
      "grad_norm": 0.1845994177154764,
      "learning_rate": 0.002756870054279811,
      "loss": 2.604,
      "loss_layer_12_head": 0.625022292137146,
      "loss_layer_18_head": 0.5018419027328491,
      "loss_layer_24_head": 0.30823636054992676,
      "loss_layer_30_head": 0.18217918276786804,
      "loss_layer_36_head": 0.12235584110021591,
      "loss_layer_42_head": 0.06411074846982956,
      "loss_layer_6_head": 0.8792604207992554,
      "step": 4060
    },
    {
      "epoch": 52.032,
      "grad_norm": 0.16342355559378324,
      "learning_rate": 0.0027513050185887927,
      "loss": 2.5213,
      "loss_layer_12_head": 0.6113088726997375,
      "loss_layer_18_head": 0.47911587357521057,
      "loss_layer_24_head": 0.28909140825271606,
      "loss_layer_30_head": 0.16948358714580536,
      "loss_layer_36_head": 0.10770396888256073,
      "loss_layer_42_head": 0.05739089846611023,
      "loss_layer_6_head": 0.8523110151290894,
      "step": 4065
    },
    {
      "epoch": 52.096,
      "grad_norm": 0.2255938494236085,
      "learning_rate": 0.002745738724649018,
      "loss": 2.4811,
      "loss_layer_12_head": 0.5952485799789429,
      "loss_layer_18_head": 0.4649713635444641,
      "loss_layer_24_head": 0.2827903628349304,
      "loss_layer_30_head": 0.16629065573215485,
      "loss_layer_36_head": 0.10827787220478058,
      "loss_layer_42_head": 0.059494685381650925,
      "loss_layer_6_head": 0.8300880193710327,
      "step": 4070
    },
    {
      "epoch": 52.16,
      "grad_norm": 0.1952069049701419,
      "learning_rate": 0.0027401712003301336,
      "loss": 2.479,
      "loss_layer_12_head": 0.5861165523529053,
      "loss_layer_18_head": 0.45549115538597107,
      "loss_layer_24_head": 0.2770305275917053,
      "loss_layer_30_head": 0.16635438799858093,
      "loss_layer_36_head": 0.10803048312664032,
      "loss_layer_42_head": 0.057335592806339264,
      "loss_layer_6_head": 0.8146300315856934,
      "step": 4075
    },
    {
      "epoch": 52.224,
      "grad_norm": 0.21544022049291522,
      "learning_rate": 0.002734602473507948,
      "loss": 2.5363,
      "loss_layer_12_head": 0.6376689672470093,
      "loss_layer_18_head": 0.5085365772247314,
      "loss_layer_24_head": 0.31134364008903503,
      "loss_layer_30_head": 0.18594060838222504,
      "loss_layer_36_head": 0.13063748180866241,
      "loss_layer_42_head": 0.06533834338188171,
      "loss_layer_6_head": 0.8812043070793152,
      "step": 4080
    },
    {
      "epoch": 52.288,
      "grad_norm": 0.2295885205199677,
      "learning_rate": 0.0027290325720642915,
      "loss": 2.4888,
      "loss_layer_12_head": 0.565386950969696,
      "loss_layer_18_head": 0.4456862807273865,
      "loss_layer_24_head": 0.26999324560165405,
      "loss_layer_30_head": 0.16207921504974365,
      "loss_layer_36_head": 0.11202798038721085,
      "loss_layer_42_head": 0.05963345617055893,
      "loss_layer_6_head": 0.7893596887588501,
      "step": 4085
    },
    {
      "epoch": 52.352,
      "grad_norm": 0.19128513939362501,
      "learning_rate": 0.002723461523886873,
      "loss": 2.5227,
      "loss_layer_12_head": 0.6147063970565796,
      "loss_layer_18_head": 0.4947744905948639,
      "loss_layer_24_head": 0.29306939244270325,
      "loss_layer_30_head": 0.17488324642181396,
      "loss_layer_36_head": 0.11395712196826935,
      "loss_layer_42_head": 0.06508536636829376,
      "loss_layer_6_head": 0.8673809766769409,
      "step": 4090
    },
    {
      "epoch": 52.416,
      "grad_norm": 0.23046587016979683,
      "learning_rate": 0.002717889356869146,
      "loss": 2.5682,
      "loss_layer_12_head": 0.5895868539810181,
      "loss_layer_18_head": 0.4777819514274597,
      "loss_layer_24_head": 0.2957989573478699,
      "loss_layer_30_head": 0.17918741703033447,
      "loss_layer_36_head": 0.12053434550762177,
      "loss_layer_42_head": 0.06754304468631744,
      "loss_layer_6_head": 0.8285537958145142,
      "step": 4095
    },
    {
      "epoch": 52.48,
      "grad_norm": 0.155617353539784,
      "learning_rate": 0.002712316098910162,
      "loss": 2.5288,
      "loss_layer_12_head": 0.5746709108352661,
      "loss_layer_18_head": 0.4716859757900238,
      "loss_layer_24_head": 0.2794245779514313,
      "loss_layer_30_head": 0.17008832097053528,
      "loss_layer_36_head": 0.10967735946178436,
      "loss_layer_42_head": 0.06436165422201157,
      "loss_layer_6_head": 0.8110925555229187,
      "step": 4100
    },
    {
      "epoch": 52.544,
      "grad_norm": 0.22356835208944048,
      "learning_rate": 0.0027067417779144394,
      "loss": 2.539,
      "loss_layer_12_head": 0.6193991303443909,
      "loss_layer_18_head": 0.4975149631500244,
      "loss_layer_24_head": 0.296499103307724,
      "loss_layer_30_head": 0.1766699105501175,
      "loss_layer_36_head": 0.1113186627626419,
      "loss_layer_42_head": 0.06380704045295715,
      "loss_layer_6_head": 0.8678559064865112,
      "step": 4105
    },
    {
      "epoch": 52.608,
      "grad_norm": 0.2219621587558149,
      "learning_rate": 0.002701166421791815,
      "loss": 2.6074,
      "loss_layer_12_head": 0.6238259077072144,
      "loss_layer_18_head": 0.4999205470085144,
      "loss_layer_24_head": 0.3012193441390991,
      "loss_layer_30_head": 0.17912697792053223,
      "loss_layer_36_head": 0.10955314338207245,
      "loss_layer_42_head": 0.06370266526937485,
      "loss_layer_6_head": 0.8873143196105957,
      "step": 4110
    },
    {
      "epoch": 52.672,
      "grad_norm": 0.32555410795082945,
      "learning_rate": 0.00269559005845731,
      "loss": 2.5729,
      "loss_layer_12_head": 0.6157333254814148,
      "loss_layer_18_head": 0.495767205953598,
      "loss_layer_24_head": 0.30918505787849426,
      "loss_layer_30_head": 0.1821192353963852,
      "loss_layer_36_head": 0.1112036481499672,
      "loss_layer_42_head": 0.06373155117034912,
      "loss_layer_6_head": 0.8782930374145508,
      "step": 4115
    },
    {
      "epoch": 52.736,
      "grad_norm": 0.24962098882418002,
      "learning_rate": 0.00269001271583099,
      "loss": 2.605,
      "loss_layer_12_head": 0.586893618106842,
      "loss_layer_18_head": 0.4725547730922699,
      "loss_layer_24_head": 0.29974085092544556,
      "loss_layer_30_head": 0.18005456030368805,
      "loss_layer_36_head": 0.11024041473865509,
      "loss_layer_42_head": 0.062175147235393524,
      "loss_layer_6_head": 0.8452178835868835,
      "step": 4120
    },
    {
      "epoch": 52.8,
      "grad_norm": 0.24375087595555878,
      "learning_rate": 0.0026844344218378206,
      "loss": 2.6117,
      "loss_layer_12_head": 0.6181777715682983,
      "loss_layer_18_head": 0.4985972046852112,
      "loss_layer_24_head": 0.31200075149536133,
      "loss_layer_30_head": 0.1992335170507431,
      "loss_layer_36_head": 0.1136327013373375,
      "loss_layer_42_head": 0.06380222737789154,
      "loss_layer_6_head": 0.8845838308334351,
      "step": 4125
    },
    {
      "epoch": 52.864,
      "grad_norm": 0.21515885646321004,
      "learning_rate": 0.0026788552044075343,
      "loss": 2.6402,
      "loss_layer_12_head": 0.6171239614486694,
      "loss_layer_18_head": 0.49389010667800903,
      "loss_layer_24_head": 0.3038467466831207,
      "loss_layer_30_head": 0.19912859797477722,
      "loss_layer_36_head": 0.11225540935993195,
      "loss_layer_42_head": 0.061090271919965744,
      "loss_layer_6_head": 0.8756195902824402,
      "step": 4130
    },
    {
      "epoch": 52.928,
      "grad_norm": 0.23112012581987948,
      "learning_rate": 0.002673275091474483,
      "loss": 2.6448,
      "loss_layer_12_head": 0.6093727350234985,
      "loss_layer_18_head": 0.48557591438293457,
      "loss_layer_24_head": 0.29639318585395813,
      "loss_layer_30_head": 0.18701736629009247,
      "loss_layer_36_head": 0.10933129489421844,
      "loss_layer_42_head": 0.060083091259002686,
      "loss_layer_6_head": 0.8651523590087891,
      "step": 4135
    },
    {
      "epoch": 52.992,
      "grad_norm": 0.1777053901665474,
      "learning_rate": 0.0026676941109775062,
      "loss": 2.6502,
      "loss_layer_12_head": 0.6324563026428223,
      "loss_layer_18_head": 0.5081391334533691,
      "loss_layer_24_head": 0.3088374137878418,
      "loss_layer_30_head": 0.191867396235466,
      "loss_layer_36_head": 0.11197139322757721,
      "loss_layer_42_head": 0.06063435226678848,
      "loss_layer_6_head": 0.8936551213264465,
      "step": 4140
    },
    {
      "epoch": 53.056,
      "grad_norm": 0.2312053952805846,
      "learning_rate": 0.002662112290859785,
      "loss": 2.5338,
      "loss_layer_12_head": 0.5718913078308105,
      "loss_layer_18_head": 0.4567931592464447,
      "loss_layer_24_head": 0.28400909900665283,
      "loss_layer_30_head": 0.17476297914981842,
      "loss_layer_36_head": 0.10480271279811859,
      "loss_layer_42_head": 0.0575125589966774,
      "loss_layer_6_head": 0.8167108297348022,
      "step": 4145
    },
    {
      "epoch": 53.12,
      "grad_norm": 0.15453442127683986,
      "learning_rate": 0.002656529659068705,
      "loss": 2.4832,
      "loss_layer_12_head": 0.5723087787628174,
      "loss_layer_18_head": 0.46050509810447693,
      "loss_layer_24_head": 0.29029232263565063,
      "loss_layer_30_head": 0.1789209246635437,
      "loss_layer_36_head": 0.1075645461678505,
      "loss_layer_42_head": 0.05890269950032234,
      "loss_layer_6_head": 0.8088299632072449,
      "step": 4150
    },
    {
      "epoch": 53.184,
      "grad_norm": 0.18712831620087156,
      "learning_rate": 0.002650946243555715,
      "loss": 2.4966,
      "loss_layer_12_head": 0.5646978616714478,
      "loss_layer_18_head": 0.45411014556884766,
      "loss_layer_24_head": 0.27876749634742737,
      "loss_layer_30_head": 0.17630381882190704,
      "loss_layer_36_head": 0.1046772375702858,
      "loss_layer_42_head": 0.0567803755402565,
      "loss_layer_6_head": 0.8150539398193359,
      "step": 4155
    },
    {
      "epoch": 53.248,
      "grad_norm": 0.20539971389099676,
      "learning_rate": 0.0026453620722761894,
      "loss": 2.5046,
      "loss_layer_12_head": 0.5762676000595093,
      "loss_layer_18_head": 0.4637802541255951,
      "loss_layer_24_head": 0.29095619916915894,
      "loss_layer_30_head": 0.17579254508018494,
      "loss_layer_36_head": 0.10896308720111847,
      "loss_layer_42_head": 0.058619339019060135,
      "loss_layer_6_head": 0.8186262845993042,
      "step": 4160
    },
    {
      "epoch": 53.312,
      "grad_norm": 0.20523761464581858,
      "learning_rate": 0.0026397771731892857,
      "loss": 2.5271,
      "loss_layer_12_head": 0.5830076932907104,
      "loss_layer_18_head": 0.46479424834251404,
      "loss_layer_24_head": 0.2888796031475067,
      "loss_layer_30_head": 0.1820581555366516,
      "loss_layer_36_head": 0.10618060827255249,
      "loss_layer_42_head": 0.05669588968157768,
      "loss_layer_6_head": 0.8300201296806335,
      "step": 4165
    },
    {
      "epoch": 53.376,
      "grad_norm": 0.16899627374109852,
      "learning_rate": 0.0026341915742578038,
      "loss": 2.5086,
      "loss_layer_12_head": 0.5793240070343018,
      "loss_layer_18_head": 0.46324896812438965,
      "loss_layer_24_head": 0.2885502576828003,
      "loss_layer_30_head": 0.1764083206653595,
      "loss_layer_36_head": 0.10819049179553986,
      "loss_layer_42_head": 0.05780510976910591,
      "loss_layer_6_head": 0.8255365490913391,
      "step": 4170
    },
    {
      "epoch": 53.44,
      "grad_norm": 0.2849227055107151,
      "learning_rate": 0.002628605303448051,
      "loss": 2.5166,
      "loss_layer_12_head": 0.5895195007324219,
      "loss_layer_18_head": 0.4772466719150543,
      "loss_layer_24_head": 0.29167500138282776,
      "loss_layer_30_head": 0.17960961163043976,
      "loss_layer_36_head": 0.11015380918979645,
      "loss_layer_42_head": 0.056556202471256256,
      "loss_layer_6_head": 0.8454585075378418,
      "step": 4175
    },
    {
      "epoch": 53.504,
      "grad_norm": 0.2341984372268443,
      "learning_rate": 0.0026230183887296955,
      "loss": 2.5503,
      "loss_layer_12_head": 0.5901676416397095,
      "loss_layer_18_head": 0.47871047258377075,
      "loss_layer_24_head": 0.31510692834854126,
      "loss_layer_30_head": 0.18425318598747253,
      "loss_layer_36_head": 0.12439726293087006,
      "loss_layer_42_head": 0.06636907905340195,
      "loss_layer_6_head": 0.8517929315567017,
      "step": 4180
    },
    {
      "epoch": 53.568,
      "grad_norm": 0.17213391711287218,
      "learning_rate": 0.002617430858075632,
      "loss": 2.5605,
      "loss_layer_12_head": 0.6022549271583557,
      "loss_layer_18_head": 0.48647451400756836,
      "loss_layer_24_head": 0.2968886196613312,
      "loss_layer_30_head": 0.18030515313148499,
      "loss_layer_36_head": 0.1107364147901535,
      "loss_layer_42_head": 0.05922335386276245,
      "loss_layer_6_head": 0.8712204098701477,
      "step": 4185
    },
    {
      "epoch": 53.632,
      "grad_norm": 0.16010342879847617,
      "learning_rate": 0.0026118427394618356,
      "loss": 2.5758,
      "loss_layer_12_head": 0.6016020774841309,
      "loss_layer_18_head": 0.491417795419693,
      "loss_layer_24_head": 0.3006415069103241,
      "loss_layer_30_head": 0.17931994795799255,
      "loss_layer_36_head": 0.11009039729833603,
      "loss_layer_42_head": 0.06048227474093437,
      "loss_layer_6_head": 0.8646378517150879,
      "step": 4190
    },
    {
      "epoch": 53.696,
      "grad_norm": 0.1928404601711342,
      "learning_rate": 0.0026062540608672298,
      "loss": 2.5852,
      "loss_layer_12_head": 0.5869933366775513,
      "loss_layer_18_head": 0.477443128824234,
      "loss_layer_24_head": 0.2909673750400543,
      "loss_layer_30_head": 0.1773912012577057,
      "loss_layer_36_head": 0.10800890624523163,
      "loss_layer_42_head": 0.060516513884067535,
      "loss_layer_6_head": 0.836473822593689,
      "step": 4195
    },
    {
      "epoch": 53.76,
      "grad_norm": 0.19033300318030275,
      "learning_rate": 0.002600664850273538,
      "loss": 2.6027,
      "loss_layer_12_head": 0.6110284328460693,
      "loss_layer_18_head": 0.49214839935302734,
      "loss_layer_24_head": 0.2969120442867279,
      "loss_layer_30_head": 0.18003904819488525,
      "loss_layer_36_head": 0.11042412370443344,
      "loss_layer_42_head": 0.06315432488918304,
      "loss_layer_6_head": 0.8740862607955933,
      "step": 4200
    },
    {
      "epoch": 53.76,
      "eval_loss": 5.504047393798828,
      "eval_loss_layer_12_head": 1.2576563358306885,
      "eval_loss_layer_18_head": 1.0812591314315796,
      "eval_loss_layer_24_head": 0.7198082208633423,
      "eval_loss_layer_30_head": 0.5051268935203552,
      "eval_loss_layer_36_head": 0.3220524787902832,
      "eval_loss_layer_42_head": 0.1834176778793335,
      "eval_loss_layer_6_head": 1.6329669952392578,
      "eval_runtime": 33.0532,
      "eval_samples_per_second": 9.681,
      "eval_steps_per_second": 0.605,
      "step": 4200
    },
    {
      "epoch": 53.824,
      "grad_norm": 0.1378584140136119,
      "learning_rate": 0.0025950751356651493,
      "loss": 2.5573,
      "loss_layer_12_head": 0.5975342988967896,
      "loss_layer_18_head": 0.4812083840370178,
      "loss_layer_24_head": 0.30054429173469543,
      "loss_layer_30_head": 0.20062777400016785,
      "loss_layer_36_head": 0.12674984335899353,
      "loss_layer_42_head": 0.06301885843276978,
      "loss_layer_6_head": 0.8625162243843079,
      "step": 4205
    },
    {
      "epoch": 53.888,
      "grad_norm": 0.21412283957464254,
      "learning_rate": 0.0025894849450289765,
      "loss": 2.566,
      "loss_layer_12_head": 0.6121038198471069,
      "loss_layer_18_head": 0.4930172562599182,
      "loss_layer_24_head": 0.30286604166030884,
      "loss_layer_30_head": 0.18156865239143372,
      "loss_layer_36_head": 0.11557785421609879,
      "loss_layer_42_head": 0.060135431587696075,
      "loss_layer_6_head": 0.8721855282783508,
      "step": 4210
    },
    {
      "epoch": 53.952,
      "grad_norm": 0.16379473181540846,
      "learning_rate": 0.0025838943063543137,
      "loss": 2.6255,
      "loss_layer_12_head": 0.5971307754516602,
      "loss_layer_18_head": 0.47660866379737854,
      "loss_layer_24_head": 0.2892858386039734,
      "loss_layer_30_head": 0.1731681376695633,
      "loss_layer_36_head": 0.12183620780706406,
      "loss_layer_42_head": 0.06109664589166641,
      "loss_layer_6_head": 0.8482160568237305,
      "step": 4215
    },
    {
      "epoch": 54.016,
      "grad_norm": 0.1497590021870943,
      "learning_rate": 0.0025783032476327005,
      "loss": 2.5949,
      "loss_layer_12_head": 0.5720672607421875,
      "loss_layer_18_head": 0.4573948383331299,
      "loss_layer_24_head": 0.28160470724105835,
      "loss_layer_30_head": 0.16995874047279358,
      "loss_layer_36_head": 0.13293729722499847,
      "loss_layer_42_head": 0.06084040552377701,
      "loss_layer_6_head": 0.8113880157470703,
      "step": 4220
    },
    {
      "epoch": 54.08,
      "grad_norm": 0.2817533021243182,
      "learning_rate": 0.0025727117968577786,
      "loss": 2.4654,
      "loss_layer_12_head": 0.5626724362373352,
      "loss_layer_18_head": 0.44664397835731506,
      "loss_layer_24_head": 0.28214961290359497,
      "loss_layer_30_head": 0.1738344132900238,
      "loss_layer_36_head": 0.12742000818252563,
      "loss_layer_42_head": 0.06509667634963989,
      "loss_layer_6_head": 0.7980576157569885,
      "step": 4225
    },
    {
      "epoch": 54.144,
      "grad_norm": 0.2854803078756864,
      "learning_rate": 0.0025671199820251536,
      "loss": 2.4696,
      "loss_layer_12_head": 0.5517435073852539,
      "loss_layer_18_head": 0.43882378935813904,
      "loss_layer_24_head": 0.2772171199321747,
      "loss_layer_30_head": 0.1624375730752945,
      "loss_layer_36_head": 0.11566152423620224,
      "loss_layer_42_head": 0.06340347230434418,
      "loss_layer_6_head": 0.7964197993278503,
      "step": 4230
    },
    {
      "epoch": 54.208,
      "grad_norm": 0.3672300236239408,
      "learning_rate": 0.0025615278311322508,
      "loss": 2.5067,
      "loss_layer_12_head": 0.5731328725814819,
      "loss_layer_18_head": 0.4585558772087097,
      "loss_layer_24_head": 0.2855757772922516,
      "loss_layer_30_head": 0.16511501371860504,
      "loss_layer_36_head": 0.11526542901992798,
      "loss_layer_42_head": 0.06087537482380867,
      "loss_layer_6_head": 0.8368692398071289,
      "step": 4235
    },
    {
      "epoch": 54.272,
      "grad_norm": 0.33611210054088736,
      "learning_rate": 0.002555935372178183,
      "loss": 2.4827,
      "loss_layer_12_head": 0.5658725500106812,
      "loss_layer_18_head": 0.4537169337272644,
      "loss_layer_24_head": 0.2794841527938843,
      "loss_layer_30_head": 0.16338801383972168,
      "loss_layer_36_head": 0.11097149550914764,
      "loss_layer_42_head": 0.06263978779315948,
      "loss_layer_6_head": 0.816378116607666,
      "step": 4240
    },
    {
      "epoch": 54.336,
      "grad_norm": 0.2880295523803247,
      "learning_rate": 0.002550342633163601,
      "loss": 2.5431,
      "loss_layer_12_head": 0.5447441339492798,
      "loss_layer_18_head": 0.43572989106178284,
      "loss_layer_24_head": 0.2615463137626648,
      "loss_layer_30_head": 0.1540253460407257,
      "loss_layer_36_head": 0.1021452322602272,
      "loss_layer_42_head": 0.05551673099398613,
      "loss_layer_6_head": 0.7833169102668762,
      "step": 4245
    },
    {
      "epoch": 54.4,
      "grad_norm": 0.14627251814244616,
      "learning_rate": 0.0025447496420905606,
      "loss": 2.5581,
      "loss_layer_12_head": 0.606192946434021,
      "loss_layer_18_head": 0.4903246760368347,
      "loss_layer_24_head": 0.2992309033870697,
      "loss_layer_30_head": 0.1780223250389099,
      "loss_layer_36_head": 0.11442434787750244,
      "loss_layer_42_head": 0.06334378570318222,
      "loss_layer_6_head": 0.8601619005203247,
      "step": 4250
    },
    {
      "epoch": 54.464,
      "grad_norm": 0.12795038656662153,
      "learning_rate": 0.002539156426962379,
      "loss": 2.5472,
      "loss_layer_12_head": 0.589502215385437,
      "loss_layer_18_head": 0.47287893295288086,
      "loss_layer_24_head": 0.2872318625450134,
      "loss_layer_30_head": 0.16742676496505737,
      "loss_layer_36_head": 0.10869023948907852,
      "loss_layer_42_head": 0.0593157522380352,
      "loss_layer_6_head": 0.842410683631897,
      "step": 4255
    },
    {
      "epoch": 54.528,
      "grad_norm": 0.1719317646466488,
      "learning_rate": 0.002533563015783494,
      "loss": 2.5387,
      "loss_layer_12_head": 0.5876104831695557,
      "loss_layer_18_head": 0.4701828956604004,
      "loss_layer_24_head": 0.28683847188949585,
      "loss_layer_30_head": 0.16753257811069489,
      "loss_layer_36_head": 0.10782208293676376,
      "loss_layer_42_head": 0.06026170402765274,
      "loss_layer_6_head": 0.8425711393356323,
      "step": 4260
    },
    {
      "epoch": 54.592,
      "grad_norm": 0.12994616742368514,
      "learning_rate": 0.0025279694365593266,
      "loss": 2.5528,
      "loss_layer_12_head": 0.6023968458175659,
      "loss_layer_18_head": 0.48520898818969727,
      "loss_layer_24_head": 0.2934209704399109,
      "loss_layer_30_head": 0.17098964750766754,
      "loss_layer_36_head": 0.1076728105545044,
      "loss_layer_42_head": 0.060300953686237335,
      "loss_layer_6_head": 0.8676518201828003,
      "step": 4265
    },
    {
      "epoch": 54.656,
      "grad_norm": 0.18444500808340436,
      "learning_rate": 0.002522375717296137,
      "loss": 2.5708,
      "loss_layer_12_head": 0.6136434078216553,
      "loss_layer_18_head": 0.49238553643226624,
      "loss_layer_24_head": 0.2979400157928467,
      "loss_layer_30_head": 0.17316976189613342,
      "loss_layer_36_head": 0.1101909652352333,
      "loss_layer_42_head": 0.06378228962421417,
      "loss_layer_6_head": 0.8695393800735474,
      "step": 4270
    },
    {
      "epoch": 54.72,
      "grad_norm": 0.14999597423637015,
      "learning_rate": 0.0025167818860008908,
      "loss": 2.5348,
      "loss_layer_12_head": 0.6097851991653442,
      "loss_layer_18_head": 0.4907146394252777,
      "loss_layer_24_head": 0.29322880506515503,
      "loss_layer_30_head": 0.17171618342399597,
      "loss_layer_36_head": 0.11044476926326752,
      "loss_layer_42_head": 0.05949532240629196,
      "loss_layer_6_head": 0.8678953051567078,
      "step": 4275
    },
    {
      "epoch": 54.784,
      "grad_norm": 0.13536969826382428,
      "learning_rate": 0.0025111879706811086,
      "loss": 2.5831,
      "loss_layer_12_head": 0.5739878416061401,
      "loss_layer_18_head": 0.4611971974372864,
      "loss_layer_24_head": 0.2809790372848511,
      "loss_layer_30_head": 0.16251270473003387,
      "loss_layer_36_head": 0.10358639806509018,
      "loss_layer_42_head": 0.058133430778980255,
      "loss_layer_6_head": 0.8205119371414185,
      "step": 4280
    },
    {
      "epoch": 54.848,
      "grad_norm": 0.14875462600599984,
      "learning_rate": 0.0025055939993447367,
      "loss": 2.5504,
      "loss_layer_12_head": 0.5975229144096375,
      "loss_layer_18_head": 0.4837900698184967,
      "loss_layer_24_head": 0.29424935579299927,
      "loss_layer_30_head": 0.173075333237648,
      "loss_layer_36_head": 0.10958262532949448,
      "loss_layer_42_head": 0.06312854588031769,
      "loss_layer_6_head": 0.8593814969062805,
      "step": 4285
    },
    {
      "epoch": 54.912,
      "grad_norm": 0.1350213519208274,
      "learning_rate": 0.0025,
      "loss": 2.5519,
      "loss_layer_12_head": 0.5911614894866943,
      "loss_layer_18_head": 0.4787793755531311,
      "loss_layer_24_head": 0.2932833731174469,
      "loss_layer_30_head": 0.17296655476093292,
      "loss_layer_36_head": 0.10856993496417999,
      "loss_layer_42_head": 0.06130681186914444,
      "loss_layer_6_head": 0.8465656042098999,
      "step": 4290
    },
    {
      "epoch": 54.976,
      "grad_norm": 0.1770144093056875,
      "learning_rate": 0.0024944060006552634,
      "loss": 2.5348,
      "loss_layer_12_head": 0.6126807928085327,
      "loss_layer_18_head": 0.49186962842941284,
      "loss_layer_24_head": 0.29697713255882263,
      "loss_layer_30_head": 0.1723133623600006,
      "loss_layer_36_head": 0.1078997403383255,
      "loss_layer_42_head": 0.058521904051303864,
      "loss_layer_6_head": 0.868830680847168,
      "step": 4295
    },
    {
      "epoch": 55.04,
      "grad_norm": 0.21486828654316617,
      "learning_rate": 0.0024888120293188915,
      "loss": 2.453,
      "loss_layer_12_head": 0.574276328086853,
      "loss_layer_18_head": 0.46167072653770447,
      "loss_layer_24_head": 0.27901941537857056,
      "loss_layer_30_head": 0.164504274725914,
      "loss_layer_36_head": 0.10399948060512543,
      "loss_layer_42_head": 0.05590296909213066,
      "loss_layer_6_head": 0.822663426399231,
      "step": 4300
    },
    {
      "epoch": 55.104,
      "grad_norm": 0.19820393585972573,
      "learning_rate": 0.00248321811399911,
      "loss": 2.499,
      "loss_layer_12_head": 0.5889183878898621,
      "loss_layer_18_head": 0.471810907125473,
      "loss_layer_24_head": 0.2883966863155365,
      "loss_layer_30_head": 0.1678212285041809,
      "loss_layer_36_head": 0.10585695505142212,
      "loss_layer_42_head": 0.05776315927505493,
      "loss_layer_6_head": 0.8347657322883606,
      "step": 4305
    },
    {
      "epoch": 55.168,
      "grad_norm": 0.18839903356179555,
      "learning_rate": 0.0024776242827038637,
      "loss": 2.4665,
      "loss_layer_12_head": 0.5662781000137329,
      "loss_layer_18_head": 0.45508328080177307,
      "loss_layer_24_head": 0.2767255902290344,
      "loss_layer_30_head": 0.16559450328350067,
      "loss_layer_36_head": 0.10580658912658691,
      "loss_layer_42_head": 0.05919630452990532,
      "loss_layer_6_head": 0.8069319725036621,
      "step": 4310
    },
    {
      "epoch": 55.232,
      "grad_norm": 0.19406571048783108,
      "learning_rate": 0.002472030563440674,
      "loss": 2.4647,
      "loss_layer_12_head": 0.580419659614563,
      "loss_layer_18_head": 0.4661577641963959,
      "loss_layer_24_head": 0.2829776704311371,
      "loss_layer_30_head": 0.1648774892091751,
      "loss_layer_36_head": 0.10569070279598236,
      "loss_layer_42_head": 0.056278377771377563,
      "loss_layer_6_head": 0.8259084820747375,
      "step": 4315
    },
    {
      "epoch": 55.296,
      "grad_norm": 0.20637943684704985,
      "learning_rate": 0.0024664369842165067,
      "loss": 2.4664,
      "loss_layer_12_head": 0.5717100501060486,
      "loss_layer_18_head": 0.4605150818824768,
      "loss_layer_24_head": 0.2770109176635742,
      "loss_layer_30_head": 0.1649463027715683,
      "loss_layer_36_head": 0.1045730859041214,
      "loss_layer_42_head": 0.0562475323677063,
      "loss_layer_6_head": 0.820482611656189,
      "step": 4320
    },
    {
      "epoch": 55.36,
      "grad_norm": 0.16967701418484546,
      "learning_rate": 0.002460843573037622,
      "loss": 2.5476,
      "loss_layer_12_head": 0.5908252596855164,
      "loss_layer_18_head": 0.47885704040527344,
      "loss_layer_24_head": 0.2910996079444885,
      "loss_layer_30_head": 0.1797390878200531,
      "loss_layer_36_head": 0.11385854333639145,
      "loss_layer_42_head": 0.06040918827056885,
      "loss_layer_6_head": 0.8440021276473999,
      "step": 4325
    },
    {
      "epoch": 55.424,
      "grad_norm": 0.15496880496920865,
      "learning_rate": 0.0024552503579094395,
      "loss": 2.523,
      "loss_layer_12_head": 0.594721257686615,
      "loss_layer_18_head": 0.4767284393310547,
      "loss_layer_24_head": 0.293151319026947,
      "loss_layer_30_head": 0.18291689455509186,
      "loss_layer_36_head": 0.11341114342212677,
      "loss_layer_42_head": 0.05842383950948715,
      "loss_layer_6_head": 0.8404184579849243,
      "step": 4330
    },
    {
      "epoch": 55.488,
      "grad_norm": 0.1938471813354179,
      "learning_rate": 0.0024496573668364,
      "loss": 2.4699,
      "loss_layer_12_head": 0.5899877548217773,
      "loss_layer_18_head": 0.4730832576751709,
      "loss_layer_24_head": 0.28621411323547363,
      "loss_layer_30_head": 0.17703649401664734,
      "loss_layer_36_head": 0.11293051391839981,
      "loss_layer_42_head": 0.05805889889597893,
      "loss_layer_6_head": 0.841762900352478,
      "step": 4335
    },
    {
      "epoch": 55.552,
      "grad_norm": 0.1865873693438108,
      "learning_rate": 0.0024440646278218177,
      "loss": 2.5766,
      "loss_layer_12_head": 0.6008879542350769,
      "loss_layer_18_head": 0.48752063512802124,
      "loss_layer_24_head": 0.2983608543872833,
      "loss_layer_30_head": 0.18723224103450775,
      "loss_layer_36_head": 0.11174342781305313,
      "loss_layer_42_head": 0.06110728532075882,
      "loss_layer_6_head": 0.8533975481987,
      "step": 4340
    },
    {
      "epoch": 55.616,
      "grad_norm": 0.182242930635603,
      "learning_rate": 0.0024384721688677493,
      "loss": 2.5766,
      "loss_layer_12_head": 0.5757268071174622,
      "loss_layer_18_head": 0.46423405408859253,
      "loss_layer_24_head": 0.28106364607810974,
      "loss_layer_30_head": 0.1810971051454544,
      "loss_layer_36_head": 0.10616783052682877,
      "loss_layer_42_head": 0.05854790285229683,
      "loss_layer_6_head": 0.8103270530700684,
      "step": 4345
    },
    {
      "epoch": 55.68,
      "grad_norm": 0.2411212854554715,
      "learning_rate": 0.0024328800179748474,
      "loss": 2.5711,
      "loss_layer_12_head": 0.5921677350997925,
      "loss_layer_18_head": 0.46807366609573364,
      "loss_layer_24_head": 0.2794244885444641,
      "loss_layer_30_head": 0.1822454035282135,
      "loss_layer_36_head": 0.10536499321460724,
      "loss_layer_42_head": 0.05920606106519699,
      "loss_layer_6_head": 0.8229131698608398,
      "step": 4350
    },
    {
      "epoch": 55.744,
      "grad_norm": 0.31758624847842987,
      "learning_rate": 0.0024272882031422215,
      "loss": 2.5797,
      "loss_layer_12_head": 0.6548439264297485,
      "loss_layer_18_head": 0.5011368989944458,
      "loss_layer_24_head": 0.2972460389137268,
      "loss_layer_30_head": 0.1880016326904297,
      "loss_layer_36_head": 0.10980691015720367,
      "loss_layer_42_head": 0.06044121831655502,
      "loss_layer_6_head": 0.8695741891860962,
      "step": 4355
    },
    {
      "epoch": 55.808,
      "grad_norm": 0.31807383321048127,
      "learning_rate": 0.0024216967523672996,
      "loss": 2.6056,
      "loss_layer_12_head": 0.6195405125617981,
      "loss_layer_18_head": 0.45054951310157776,
      "loss_layer_24_head": 0.2635253369808197,
      "loss_layer_30_head": 0.166463702917099,
      "loss_layer_36_head": 0.09944634884595871,
      "loss_layer_42_head": 0.05420919507741928,
      "loss_layer_6_head": 0.7999774217605591,
      "step": 4360
    },
    {
      "epoch": 55.872,
      "grad_norm": 0.22625113785278858,
      "learning_rate": 0.0024161056936456873,
      "loss": 2.6702,
      "loss_layer_12_head": 0.6771770119667053,
      "loss_layer_18_head": 0.49992674589157104,
      "loss_layer_24_head": 0.2994401156902313,
      "loss_layer_30_head": 0.184663325548172,
      "loss_layer_36_head": 0.1140771135687828,
      "loss_layer_42_head": 0.062156688421964645,
      "loss_layer_6_head": 0.8659213781356812,
      "step": 4365
    },
    {
      "epoch": 55.936,
      "grad_norm": 0.29593556199521626,
      "learning_rate": 0.0024105150549710236,
      "loss": 2.5857,
      "loss_layer_12_head": 0.6329454183578491,
      "loss_layer_18_head": 0.46634596586227417,
      "loss_layer_24_head": 0.27667075395584106,
      "loss_layer_30_head": 0.1711854487657547,
      "loss_layer_36_head": 0.10577268898487091,
      "loss_layer_42_head": 0.0567166805267334,
      "loss_layer_6_head": 0.8170658349990845,
      "step": 4370
    },
    {
      "epoch": 56.0,
      "grad_norm": 0.20281902106836242,
      "learning_rate": 0.002404924864334851,
      "loss": 2.6556,
      "loss_layer_12_head": 0.6764439344406128,
      "loss_layer_18_head": 0.4930901527404785,
      "loss_layer_24_head": 0.2958858609199524,
      "loss_layer_30_head": 0.18028199672698975,
      "loss_layer_36_head": 0.10674085468053818,
      "loss_layer_42_head": 0.05938177555799484,
      "loss_layer_6_head": 0.8677433729171753,
      "step": 4375
    },
    {
      "epoch": 56.064,
      "grad_norm": 0.27883053137415514,
      "learning_rate": 0.002399335149726463,
      "loss": 2.5374,
      "loss_layer_12_head": 0.6353329420089722,
      "loss_layer_18_head": 0.4442032277584076,
      "loss_layer_24_head": 0.27248871326446533,
      "loss_layer_30_head": 0.16721227765083313,
      "loss_layer_36_head": 0.10267417132854462,
      "loss_layer_42_head": 0.05578412860631943,
      "loss_layer_6_head": 0.7867193818092346,
      "step": 4380
    },
    {
      "epoch": 56.128,
      "grad_norm": 0.278558884611558,
      "learning_rate": 0.0023937459391327708,
      "loss": 2.5198,
      "loss_layer_12_head": 0.6203794479370117,
      "loss_layer_18_head": 0.4551053047180176,
      "loss_layer_24_head": 0.27880704402923584,
      "loss_layer_30_head": 0.16575975716114044,
      "loss_layer_36_head": 0.10273070633411407,
      "loss_layer_42_head": 0.05597027391195297,
      "loss_layer_6_head": 0.801399827003479,
      "step": 4385
    },
    {
      "epoch": 56.192,
      "grad_norm": 0.23992204938638131,
      "learning_rate": 0.002388157260538165,
      "loss": 2.4939,
      "loss_layer_12_head": 0.5918669700622559,
      "loss_layer_18_head": 0.44800859689712524,
      "loss_layer_24_head": 0.2783586382865906,
      "loss_layer_30_head": 0.16176094114780426,
      "loss_layer_36_head": 0.10147212445735931,
      "loss_layer_42_head": 0.05546004697680473,
      "loss_layer_6_head": 0.7897508144378662,
      "step": 4390
    },
    {
      "epoch": 56.256,
      "grad_norm": 0.21495939320740498,
      "learning_rate": 0.0023825691419243696,
      "loss": 2.4973,
      "loss_layer_12_head": 0.5916481018066406,
      "loss_layer_18_head": 0.45686855912208557,
      "loss_layer_24_head": 0.2912493050098419,
      "loss_layer_30_head": 0.16586416959762573,
      "loss_layer_36_head": 0.10762973874807358,
      "loss_layer_42_head": 0.06007847934961319,
      "loss_layer_6_head": 0.8057502508163452,
      "step": 4395
    },
    {
      "epoch": 56.32,
      "grad_norm": 0.17652228297601427,
      "learning_rate": 0.0023769816112703046,
      "loss": 2.4852,
      "loss_layer_12_head": 0.5756695866584778,
      "loss_layer_18_head": 0.45849284529685974,
      "loss_layer_24_head": 0.2879125475883484,
      "loss_layer_30_head": 0.16796058416366577,
      "loss_layer_36_head": 0.10795922577381134,
      "loss_layer_42_head": 0.06022549420595169,
      "loss_layer_6_head": 0.7969522476196289,
      "step": 4400
    },
    {
      "epoch": 56.32,
      "eval_loss": 5.435645580291748,
      "eval_loss_layer_12_head": 1.2525551319122314,
      "eval_loss_layer_18_head": 1.0857855081558228,
      "eval_loss_layer_24_head": 0.7113651037216187,
      "eval_loss_layer_30_head": 0.45802274346351624,
      "eval_loss_layer_36_head": 0.2926025688648224,
      "eval_loss_layer_42_head": 0.18605944514274597,
      "eval_loss_layer_6_head": 1.5925477743148804,
      "eval_runtime": 33.0449,
      "eval_samples_per_second": 9.684,
      "eval_steps_per_second": 0.605,
      "step": 4400
    },
    {
      "epoch": 56.384,
      "grad_norm": 0.1789666151559161,
      "learning_rate": 0.0023713946965519496,
      "loss": 2.5129,
      "loss_layer_12_head": 0.5662041902542114,
      "loss_layer_18_head": 0.455340713262558,
      "loss_layer_24_head": 0.2828534245491028,
      "loss_layer_30_head": 0.16197171807289124,
      "loss_layer_36_head": 0.10114751756191254,
      "loss_layer_42_head": 0.05529598146677017,
      "loss_layer_6_head": 0.7933381199836731,
      "step": 4405
    },
    {
      "epoch": 56.448,
      "grad_norm": 0.17067796897617735,
      "learning_rate": 0.002365808425742196,
      "loss": 2.5402,
      "loss_layer_12_head": 0.606728732585907,
      "loss_layer_18_head": 0.48434457182884216,
      "loss_layer_24_head": 0.3087387681007385,
      "loss_layer_30_head": 0.18298545479774475,
      "loss_layer_36_head": 0.11520683765411377,
      "loss_layer_42_head": 0.07391555607318878,
      "loss_layer_6_head": 0.8525981903076172,
      "step": 4410
    },
    {
      "epoch": 56.512,
      "grad_norm": 0.17430111452054203,
      "learning_rate": 0.0023602228268107144,
      "loss": 2.5367,
      "loss_layer_12_head": 0.5975836515426636,
      "loss_layer_18_head": 0.484306663274765,
      "loss_layer_24_head": 0.30011487007141113,
      "loss_layer_30_head": 0.17803971469402313,
      "loss_layer_36_head": 0.11113481223583221,
      "loss_layer_42_head": 0.06298957765102386,
      "loss_layer_6_head": 0.8569208383560181,
      "step": 4415
    },
    {
      "epoch": 56.576,
      "grad_norm": 0.18055579135731675,
      "learning_rate": 0.0023546379277238107,
      "loss": 2.5804,
      "loss_layer_12_head": 0.6130940318107605,
      "loss_layer_18_head": 0.494082510471344,
      "loss_layer_24_head": 0.30286192893981934,
      "loss_layer_30_head": 0.177048459649086,
      "loss_layer_36_head": 0.11077429354190826,
      "loss_layer_42_head": 0.06453189998865128,
      "loss_layer_6_head": 0.873234748840332,
      "step": 4420
    },
    {
      "epoch": 56.64,
      "grad_norm": 0.13290029236906636,
      "learning_rate": 0.0023490537564442846,
      "loss": 2.5411,
      "loss_layer_12_head": 0.5823470950126648,
      "loss_layer_18_head": 0.47628313302993774,
      "loss_layer_24_head": 0.2881101667881012,
      "loss_layer_30_head": 0.17582383751869202,
      "loss_layer_36_head": 0.10557415336370468,
      "loss_layer_42_head": 0.060631293803453445,
      "loss_layer_6_head": 0.8256770968437195,
      "step": 4425
    },
    {
      "epoch": 56.704,
      "grad_norm": 0.21167757338434726,
      "learning_rate": 0.002343470340931295,
      "loss": 2.5482,
      "loss_layer_12_head": 0.5971741676330566,
      "loss_layer_18_head": 0.48882144689559937,
      "loss_layer_24_head": 0.2949162423610687,
      "loss_layer_30_head": 0.18153618276119232,
      "loss_layer_36_head": 0.1083507165312767,
      "loss_layer_42_head": 0.0650261789560318,
      "loss_layer_6_head": 0.8544259071350098,
      "step": 4430
    },
    {
      "epoch": 56.768,
      "grad_norm": 0.22414169718280336,
      "learning_rate": 0.002337887709140216,
      "loss": 2.548,
      "loss_layer_12_head": 0.5883963704109192,
      "loss_layer_18_head": 0.4787723422050476,
      "loss_layer_24_head": 0.2876366078853607,
      "loss_layer_30_head": 0.175010085105896,
      "loss_layer_36_head": 0.10664020478725433,
      "loss_layer_42_head": 0.061940472573041916,
      "loss_layer_6_head": 0.8392828106880188,
      "step": 4435
    },
    {
      "epoch": 56.832,
      "grad_norm": 0.22157974533993033,
      "learning_rate": 0.002332305889022494,
      "loss": 2.586,
      "loss_layer_12_head": 0.5700522065162659,
      "loss_layer_18_head": 0.4654887318611145,
      "loss_layer_24_head": 0.2783108055591583,
      "loss_layer_30_head": 0.17037798464298248,
      "loss_layer_36_head": 0.10330487787723541,
      "loss_layer_42_head": 0.056980930268764496,
      "loss_layer_6_head": 0.8167275190353394,
      "step": 4440
    },
    {
      "epoch": 56.896,
      "grad_norm": 0.21596927395318613,
      "learning_rate": 0.0023267249085255175,
      "loss": 2.563,
      "loss_layer_12_head": 0.5734266042709351,
      "loss_layer_18_head": 0.47155970335006714,
      "loss_layer_24_head": 0.28131791949272156,
      "loss_layer_30_head": 0.17437875270843506,
      "loss_layer_36_head": 0.10709867626428604,
      "loss_layer_42_head": 0.05814199894666672,
      "loss_layer_6_head": 0.8227623701095581,
      "step": 4445
    },
    {
      "epoch": 56.96,
      "grad_norm": 0.23556151528464955,
      "learning_rate": 0.002321144795592467,
      "loss": 2.5917,
      "loss_layer_12_head": 0.6156028509140015,
      "loss_layer_18_head": 0.5009576082229614,
      "loss_layer_24_head": 0.29606086015701294,
      "loss_layer_30_head": 0.1790095865726471,
      "loss_layer_36_head": 0.1074845939874649,
      "loss_layer_42_head": 0.05882176011800766,
      "loss_layer_6_head": 0.877709686756134,
      "step": 4450
    },
    {
      "epoch": 57.024,
      "grad_norm": 0.2703535158197075,
      "learning_rate": 0.0023155655781621795,
      "loss": 2.5562,
      "loss_layer_12_head": 0.5849881172180176,
      "loss_layer_18_head": 0.4798177182674408,
      "loss_layer_24_head": 0.2866442799568176,
      "loss_layer_30_head": 0.1793004423379898,
      "loss_layer_36_head": 0.10663628578186035,
      "loss_layer_42_head": 0.057594120502471924,
      "loss_layer_6_head": 0.8359482884407043,
      "step": 4455
    },
    {
      "epoch": 57.088,
      "grad_norm": 0.22848062613337086,
      "learning_rate": 0.0023099872841690103,
      "loss": 2.4465,
      "loss_layer_12_head": 0.5971131920814514,
      "loss_layer_18_head": 0.4855070114135742,
      "loss_layer_24_head": 0.2964119613170624,
      "loss_layer_30_head": 0.17941585183143616,
      "loss_layer_36_head": 0.10852830111980438,
      "loss_layer_42_head": 0.057468481361866,
      "loss_layer_6_head": 0.8578914403915405,
      "step": 4460
    },
    {
      "epoch": 57.152,
      "grad_norm": 0.20318936038953098,
      "learning_rate": 0.00230440994154269,
      "loss": 2.4288,
      "loss_layer_12_head": 0.5813121795654297,
      "loss_layer_18_head": 0.4700354039669037,
      "loss_layer_24_head": 0.2864946722984314,
      "loss_layer_30_head": 0.1796506941318512,
      "loss_layer_36_head": 0.10614664852619171,
      "loss_layer_42_head": 0.05745818465948105,
      "loss_layer_6_head": 0.8358052968978882,
      "step": 4465
    },
    {
      "epoch": 57.216,
      "grad_norm": 0.16355127807022987,
      "learning_rate": 0.0022988335782081855,
      "loss": 2.4612,
      "loss_layer_12_head": 0.5898504257202148,
      "loss_layer_18_head": 0.4776988923549652,
      "loss_layer_24_head": 0.3001847267150879,
      "loss_layer_30_head": 0.1852388083934784,
      "loss_layer_36_head": 0.10961882770061493,
      "loss_layer_42_head": 0.06655486673116684,
      "loss_layer_6_head": 0.8333114385604858,
      "step": 4470
    },
    {
      "epoch": 57.28,
      "grad_norm": 0.23662758051666447,
      "learning_rate": 0.002293258222085561,
      "loss": 2.5036,
      "loss_layer_12_head": 0.5678946375846863,
      "loss_layer_18_head": 0.4585058093070984,
      "loss_layer_24_head": 0.2900890111923218,
      "loss_layer_30_head": 0.17641283571720123,
      "loss_layer_36_head": 0.11269589513540268,
      "loss_layer_42_head": 0.06481662392616272,
      "loss_layer_6_head": 0.8220020532608032,
      "step": 4475
    },
    {
      "epoch": 57.344,
      "grad_norm": 0.1724406762560376,
      "learning_rate": 0.0022876839010898377,
      "loss": 2.4707,
      "loss_layer_12_head": 0.5802193284034729,
      "loss_layer_18_head": 0.4710553288459778,
      "loss_layer_24_head": 0.2845357656478882,
      "loss_layer_30_head": 0.17493577301502228,
      "loss_layer_36_head": 0.10898109525442123,
      "loss_layer_42_head": 0.05822696164250374,
      "loss_layer_6_head": 0.8321797251701355,
      "step": 4480
    },
    {
      "epoch": 57.408,
      "grad_norm": 0.18210512808654467,
      "learning_rate": 0.0022821106431308545,
      "loss": 2.5133,
      "loss_layer_12_head": 0.5603395700454712,
      "loss_layer_18_head": 0.45822811126708984,
      "loss_layer_24_head": 0.27792587876319885,
      "loss_layer_30_head": 0.1674807220697403,
      "loss_layer_36_head": 0.10714640468358994,
      "loss_layer_42_head": 0.05493900179862976,
      "loss_layer_6_head": 0.8039437532424927,
      "step": 4485
    },
    {
      "epoch": 57.472,
      "grad_norm": 0.18399510128273905,
      "learning_rate": 0.0022765384761131275,
      "loss": 2.4945,
      "loss_layer_12_head": 0.5696504712104797,
      "loss_layer_18_head": 0.46265560388565063,
      "loss_layer_24_head": 0.28569504618644714,
      "loss_layer_30_head": 0.17213912308216095,
      "loss_layer_36_head": 0.10988461971282959,
      "loss_layer_42_head": 0.05918467044830322,
      "loss_layer_6_head": 0.820097804069519,
      "step": 4490
    },
    {
      "epoch": 57.536,
      "grad_norm": 0.14985598004904502,
      "learning_rate": 0.0022709674279357086,
      "loss": 2.4999,
      "loss_layer_12_head": 0.6064666509628296,
      "loss_layer_18_head": 0.4905788004398346,
      "loss_layer_24_head": 0.2979227304458618,
      "loss_layer_30_head": 0.17861633002758026,
      "loss_layer_36_head": 0.11234886944293976,
      "loss_layer_42_head": 0.05627857893705368,
      "loss_layer_6_head": 0.8598806262016296,
      "step": 4495
    },
    {
      "epoch": 57.6,
      "grad_norm": 0.12469873797206027,
      "learning_rate": 0.002265397526492052,
      "loss": 2.4991,
      "loss_layer_12_head": 0.5830758810043335,
      "loss_layer_18_head": 0.47082266211509705,
      "loss_layer_24_head": 0.2848433554172516,
      "loss_layer_30_head": 0.17296554148197174,
      "loss_layer_36_head": 0.11026298999786377,
      "loss_layer_42_head": 0.0563112273812294,
      "loss_layer_6_head": 0.8314511179924011,
      "step": 4500
    },
    {
      "epoch": 57.664,
      "grad_norm": 0.2112254713787123,
      "learning_rate": 0.002259828799669867,
      "loss": 2.5604,
      "loss_layer_12_head": 0.6022500991821289,
      "loss_layer_18_head": 0.4846401810646057,
      "loss_layer_24_head": 0.29193076491355896,
      "loss_layer_30_head": 0.17449164390563965,
      "loss_layer_36_head": 0.11249242722988129,
      "loss_layer_42_head": 0.05871480703353882,
      "loss_layer_6_head": 0.8656318783760071,
      "step": 4505
    },
    {
      "epoch": 57.728,
      "grad_norm": 0.2290204830870032,
      "learning_rate": 0.002254261275350982,
      "loss": 2.4953,
      "loss_layer_12_head": 0.569220781326294,
      "loss_layer_18_head": 0.4583941102027893,
      "loss_layer_24_head": 0.2774435877799988,
      "loss_layer_30_head": 0.17020829021930695,
      "loss_layer_36_head": 0.11035017669200897,
      "loss_layer_42_head": 0.056515783071517944,
      "loss_layer_6_head": 0.8195785284042358,
      "step": 4510
    },
    {
      "epoch": 57.792,
      "grad_norm": 0.13511250471480823,
      "learning_rate": 0.002248694981411208,
      "loss": 2.5617,
      "loss_layer_12_head": 0.6134846210479736,
      "loss_layer_18_head": 0.49031925201416016,
      "loss_layer_24_head": 0.2977134883403778,
      "loss_layer_30_head": 0.18010742962360382,
      "loss_layer_36_head": 0.11479191482067108,
      "loss_layer_42_head": 0.05903942510485649,
      "loss_layer_6_head": 0.8781617283821106,
      "step": 4515
    },
    {
      "epoch": 57.856,
      "grad_norm": 0.16459794982272416,
      "learning_rate": 0.00224312994572019,
      "loss": 2.5374,
      "loss_layer_12_head": 0.607029914855957,
      "loss_layer_18_head": 0.4904373288154602,
      "loss_layer_24_head": 0.29916366934776306,
      "loss_layer_30_head": 0.18168263137340546,
      "loss_layer_36_head": 0.11601754277944565,
      "loss_layer_42_head": 0.06131085753440857,
      "loss_layer_6_head": 0.867841899394989,
      "step": 4520
    },
    {
      "epoch": 57.92,
      "grad_norm": 0.2165425711746449,
      "learning_rate": 0.002237566196141278,
      "loss": 2.5316,
      "loss_layer_12_head": 0.5727505683898926,
      "loss_layer_18_head": 0.46079221367836,
      "loss_layer_24_head": 0.2788800597190857,
      "loss_layer_30_head": 0.16763320565223694,
      "loss_layer_36_head": 0.11344524472951889,
      "loss_layer_42_head": 0.05814027786254883,
      "loss_layer_6_head": 0.8163038492202759,
      "step": 4525
    },
    {
      "epoch": 57.984,
      "grad_norm": 0.19828070964323538,
      "learning_rate": 0.0022320037605313807,
      "loss": 2.5692,
      "loss_layer_12_head": 0.6003071069717407,
      "loss_layer_18_head": 0.4872647821903229,
      "loss_layer_24_head": 0.29704543948173523,
      "loss_layer_30_head": 0.179453045129776,
      "loss_layer_36_head": 0.11824941635131836,
      "loss_layer_42_head": 0.06139494106173515,
      "loss_layer_6_head": 0.8560463786125183,
      "step": 4530
    },
    {
      "epoch": 58.048,
      "grad_norm": 0.1278230073545603,
      "learning_rate": 0.0022264426667408283,
      "loss": 2.4489,
      "loss_layer_12_head": 0.5764017701148987,
      "loss_layer_18_head": 0.46223655343055725,
      "loss_layer_24_head": 0.2812119424343109,
      "loss_layer_30_head": 0.16856727004051208,
      "loss_layer_36_head": 0.11175708472728729,
      "loss_layer_42_head": 0.05950898677110672,
      "loss_layer_6_head": 0.8265436887741089,
      "step": 4535
    },
    {
      "epoch": 58.112,
      "grad_norm": 0.11239679926515211,
      "learning_rate": 0.0022208829426132305,
      "loss": 2.3956,
      "loss_layer_12_head": 0.5507399439811707,
      "loss_layer_18_head": 0.44255584478378296,
      "loss_layer_24_head": 0.2688181400299072,
      "loss_layer_30_head": 0.1620536744594574,
      "loss_layer_36_head": 0.10867072641849518,
      "loss_layer_42_head": 0.0563640221953392,
      "loss_layer_6_head": 0.7948781251907349,
      "step": 4540
    },
    {
      "epoch": 58.176,
      "grad_norm": 0.17513012622436147,
      "learning_rate": 0.0022153246159853446,
      "loss": 2.4238,
      "loss_layer_12_head": 0.5696214437484741,
      "loss_layer_18_head": 0.4527609348297119,
      "loss_layer_24_head": 0.2730925977230072,
      "loss_layer_30_head": 0.1664622724056244,
      "loss_layer_36_head": 0.10759119689464569,
      "loss_layer_42_head": 0.05722712352871895,
      "loss_layer_6_head": 0.8249085545539856,
      "step": 4545
    },
    {
      "epoch": 58.24,
      "grad_norm": 0.158761190871937,
      "learning_rate": 0.0022097677146869243,
      "loss": 2.4777,
      "loss_layer_12_head": 0.5966030359268188,
      "loss_layer_18_head": 0.4793243408203125,
      "loss_layer_24_head": 0.29062002897262573,
      "loss_layer_30_head": 0.17801925539970398,
      "loss_layer_36_head": 0.11212010681629181,
      "loss_layer_42_head": 0.061325620859861374,
      "loss_layer_6_head": 0.8466509580612183,
      "step": 4550
    },
    {
      "epoch": 58.304,
      "grad_norm": 0.18661457819956426,
      "learning_rate": 0.0022042122665405926,
      "loss": 2.4806,
      "loss_layer_12_head": 0.5805360674858093,
      "loss_layer_18_head": 0.46310240030288696,
      "loss_layer_24_head": 0.27996405959129333,
      "loss_layer_30_head": 0.16960285604000092,
      "loss_layer_36_head": 0.10729736089706421,
      "loss_layer_42_head": 0.05934927612543106,
      "loss_layer_6_head": 0.825975775718689,
      "step": 4555
    },
    {
      "epoch": 58.368,
      "grad_norm": 0.2215008485084806,
      "learning_rate": 0.0021986582993616926,
      "loss": 2.4684,
      "loss_layer_12_head": 0.5754228830337524,
      "loss_layer_18_head": 0.45829153060913086,
      "loss_layer_24_head": 0.2772030234336853,
      "loss_layer_30_head": 0.16725139319896698,
      "loss_layer_36_head": 0.10664784908294678,
      "loss_layer_42_head": 0.05964239686727524,
      "loss_layer_6_head": 0.8192952871322632,
      "step": 4560
    },
    {
      "epoch": 58.432,
      "grad_norm": 0.15239207945626118,
      "learning_rate": 0.0021931058409581546,
      "loss": 2.494,
      "loss_layer_12_head": 0.5930020213127136,
      "loss_layer_18_head": 0.4745727479457855,
      "loss_layer_24_head": 0.28661245107650757,
      "loss_layer_30_head": 0.17128418385982513,
      "loss_layer_36_head": 0.11070281267166138,
      "loss_layer_42_head": 0.061830341815948486,
      "loss_layer_6_head": 0.8477579355239868,
      "step": 4565
    },
    {
      "epoch": 58.496,
      "grad_norm": 0.15040498500543265,
      "learning_rate": 0.0021875549191303543,
      "loss": 2.4722,
      "loss_layer_12_head": 0.5596949458122253,
      "loss_layer_18_head": 0.4448203444480896,
      "loss_layer_24_head": 0.2676331400871277,
      "loss_layer_30_head": 0.1549762487411499,
      "loss_layer_36_head": 0.10021992027759552,
      "loss_layer_42_head": 0.055803991854190826,
      "loss_layer_6_head": 0.7978628277778625,
      "step": 4570
    },
    {
      "epoch": 58.56,
      "grad_norm": 0.13613383258724596,
      "learning_rate": 0.0021820055616709735,
      "loss": 2.512,
      "loss_layer_12_head": 0.5781766176223755,
      "loss_layer_18_head": 0.46287378668785095,
      "loss_layer_24_head": 0.27971380949020386,
      "loss_layer_30_head": 0.16588933765888214,
      "loss_layer_36_head": 0.10516221821308136,
      "loss_layer_42_head": 0.057543493807315826,
      "loss_layer_6_head": 0.8252560496330261,
      "step": 4575
    },
    {
      "epoch": 58.624,
      "grad_norm": 0.14053396816065195,
      "learning_rate": 0.0021764577963648613,
      "loss": 2.4914,
      "loss_layer_12_head": 0.6176890134811401,
      "loss_layer_18_head": 0.49333256483078003,
      "loss_layer_24_head": 0.29561707377433777,
      "loss_layer_30_head": 0.17136751115322113,
      "loss_layer_36_head": 0.1076594740152359,
      "loss_layer_42_head": 0.05871790647506714,
      "loss_layer_6_head": 0.8856876492500305,
      "step": 4580
    },
    {
      "epoch": 58.688,
      "grad_norm": 0.16313439889587297,
      "learning_rate": 0.0021709116509888967,
      "loss": 2.4731,
      "loss_layer_12_head": 0.5726068615913391,
      "loss_layer_18_head": 0.45508474111557007,
      "loss_layer_24_head": 0.2728608250617981,
      "loss_layer_30_head": 0.16124099493026733,
      "loss_layer_36_head": 0.1005663275718689,
      "loss_layer_42_head": 0.05565021559596062,
      "loss_layer_6_head": 0.8281255960464478,
      "step": 4585
    },
    {
      "epoch": 58.752,
      "grad_norm": 0.14616705244436776,
      "learning_rate": 0.002165367153311847,
      "loss": 2.4816,
      "loss_layer_12_head": 0.6009113192558289,
      "loss_layer_18_head": 0.48359233140945435,
      "loss_layer_24_head": 0.31056347489356995,
      "loss_layer_30_head": 0.17175769805908203,
      "loss_layer_36_head": 0.1081700325012207,
      "loss_layer_42_head": 0.05965833738446236,
      "loss_layer_6_head": 0.8554708361625671,
      "step": 4590
    },
    {
      "epoch": 58.816,
      "grad_norm": 0.21664120468625417,
      "learning_rate": 0.0021598243310942265,
      "loss": 2.5532,
      "loss_layer_12_head": 0.5780547857284546,
      "loss_layer_18_head": 0.46487903594970703,
      "loss_layer_24_head": 0.28046971559524536,
      "loss_layer_30_head": 0.16261973977088928,
      "loss_layer_36_head": 0.10132857412099838,
      "loss_layer_42_head": 0.05539955571293831,
      "loss_layer_6_head": 0.8330074548721313,
      "step": 4595
    },
    {
      "epoch": 58.88,
      "grad_norm": 0.16815466997915046,
      "learning_rate": 0.0021542832120881677,
      "loss": 2.4804,
      "loss_layer_12_head": 0.5650031566619873,
      "loss_layer_18_head": 0.45689597725868225,
      "loss_layer_24_head": 0.2741110920906067,
      "loss_layer_30_head": 0.16062983870506287,
      "loss_layer_36_head": 0.1005447655916214,
      "loss_layer_42_head": 0.055652182549238205,
      "loss_layer_6_head": 0.8087980151176453,
      "step": 4600
    },
    {
      "epoch": 58.88,
      "eval_loss": 5.417877197265625,
      "eval_loss_layer_12_head": 1.241725206375122,
      "eval_loss_layer_18_head": 1.0781893730163574,
      "eval_loss_layer_24_head": 0.7667786478996277,
      "eval_loss_layer_30_head": 0.4487524628639221,
      "eval_loss_layer_36_head": 0.28704124689102173,
      "eval_loss_layer_42_head": 0.17080830037593842,
      "eval_loss_layer_6_head": 1.5894994735717773,
      "eval_runtime": 33.0959,
      "eval_samples_per_second": 9.669,
      "eval_steps_per_second": 0.604,
      "step": 4600
    },
    {
      "epoch": 58.944,
      "grad_norm": 0.18974005197119617,
      "learning_rate": 0.002148743824037269,
      "loss": 2.5428,
      "loss_layer_12_head": 0.571753203868866,
      "loss_layer_18_head": 0.45762911438941956,
      "loss_layer_24_head": 0.2757968008518219,
      "loss_layer_30_head": 0.15843194723129272,
      "loss_layer_36_head": 0.09751478582620621,
      "loss_layer_42_head": 0.05398407578468323,
      "loss_layer_6_head": 0.8199464082717896,
      "step": 4605
    },
    {
      "epoch": 59.008,
      "grad_norm": 0.17195820171450818,
      "learning_rate": 0.0021432061946764645,
      "loss": 2.5712,
      "loss_layer_12_head": 0.6037660837173462,
      "loss_layer_18_head": 0.48311740159988403,
      "loss_layer_24_head": 0.29109007120132446,
      "loss_layer_30_head": 0.1673159897327423,
      "loss_layer_36_head": 0.10588864237070084,
      "loss_layer_42_head": 0.05747683718800545,
      "loss_layer_6_head": 0.8654545545578003,
      "step": 4610
    },
    {
      "epoch": 59.072,
      "grad_norm": 0.14548162634171288,
      "learning_rate": 0.0021376703517318836,
      "loss": 2.4484,
      "loss_layer_12_head": 0.5612436532974243,
      "loss_layer_18_head": 0.4474439024925232,
      "loss_layer_24_head": 0.2753118574619293,
      "loss_layer_30_head": 0.15982148051261902,
      "loss_layer_36_head": 0.10203397274017334,
      "loss_layer_42_head": 0.058989621698856354,
      "loss_layer_6_head": 0.7993270754814148,
      "step": 4615
    },
    {
      "epoch": 59.136,
      "grad_norm": 0.14741548254669526,
      "learning_rate": 0.00213213632292071,
      "loss": 2.3968,
      "loss_layer_12_head": 0.5600190162658691,
      "loss_layer_18_head": 0.44927310943603516,
      "loss_layer_24_head": 0.2729829251766205,
      "loss_layer_30_head": 0.1584458202123642,
      "loss_layer_36_head": 0.10046267509460449,
      "loss_layer_42_head": 0.06051095575094223,
      "loss_layer_6_head": 0.7968991994857788,
      "step": 4620
    },
    {
      "epoch": 59.2,
      "grad_norm": 0.13361883868375918,
      "learning_rate": 0.0021266041359510454,
      "loss": 2.4383,
      "loss_layer_12_head": 0.5436227321624756,
      "loss_layer_18_head": 0.43597620725631714,
      "loss_layer_24_head": 0.2673892378807068,
      "loss_layer_30_head": 0.15652170777320862,
      "loss_layer_36_head": 0.09849027544260025,
      "loss_layer_42_head": 0.05849454551935196,
      "loss_layer_6_head": 0.7828616499900818,
      "step": 4625
    },
    {
      "epoch": 59.264,
      "grad_norm": 0.14559337570265102,
      "learning_rate": 0.002121073818521769,
      "loss": 2.4277,
      "loss_layer_12_head": 0.54389488697052,
      "loss_layer_18_head": 0.4395785927772522,
      "loss_layer_24_head": 0.319366991519928,
      "loss_layer_30_head": 0.17695581912994385,
      "loss_layer_36_head": 0.11806805431842804,
      "loss_layer_42_head": 0.05958566814661026,
      "loss_layer_6_head": 0.7731888294219971,
      "step": 4630
    },
    {
      "epoch": 59.328,
      "grad_norm": 0.1292795207115396,
      "learning_rate": 0.002115545398322399,
      "loss": 2.4209,
      "loss_layer_12_head": 0.5490065813064575,
      "loss_layer_18_head": 0.43888816237449646,
      "loss_layer_24_head": 0.2669915556907654,
      "loss_layer_30_head": 0.15464283525943756,
      "loss_layer_36_head": 0.09872941672801971,
      "loss_layer_42_head": 0.05715802311897278,
      "loss_layer_6_head": 0.7812833189964294,
      "step": 4635
    },
    {
      "epoch": 59.392,
      "grad_norm": 0.13177997210980216,
      "learning_rate": 0.0021100189030329557,
      "loss": 2.5282,
      "loss_layer_12_head": 0.5800895094871521,
      "loss_layer_18_head": 0.4661039710044861,
      "loss_layer_24_head": 0.2862584888935089,
      "loss_layer_30_head": 0.16687318682670593,
      "loss_layer_36_head": 0.1051797866821289,
      "loss_layer_42_head": 0.05943337082862854,
      "loss_layer_6_head": 0.8314909934997559,
      "step": 4640
    },
    {
      "epoch": 59.456,
      "grad_norm": 0.14476999804327303,
      "learning_rate": 0.002104494360323821,
      "loss": 2.454,
      "loss_layer_12_head": 0.5774174332618713,
      "loss_layer_18_head": 0.4637104868888855,
      "loss_layer_24_head": 0.2896471917629242,
      "loss_layer_30_head": 0.16324514150619507,
      "loss_layer_36_head": 0.10484713315963745,
      "loss_layer_42_head": 0.056944578886032104,
      "loss_layer_6_head": 0.8227840662002563,
      "step": 4645
    },
    {
      "epoch": 59.52,
      "grad_norm": 0.26866411748165125,
      "learning_rate": 0.002098971797855599,
      "loss": 2.5221,
      "loss_layer_12_head": 0.5708309412002563,
      "loss_layer_18_head": 0.4626857340335846,
      "loss_layer_24_head": 0.31038445234298706,
      "loss_layer_30_head": 0.16171947121620178,
      "loss_layer_36_head": 0.10384635627269745,
      "loss_layer_42_head": 0.05671144649386406,
      "loss_layer_6_head": 0.8197504281997681,
      "step": 4650
    },
    {
      "epoch": 59.584,
      "grad_norm": 0.26556589455538565,
      "learning_rate": 0.002093451243278983,
      "loss": 2.5715,
      "loss_layer_12_head": 0.5944358110427856,
      "loss_layer_18_head": 0.48179641366004944,
      "loss_layer_24_head": 0.3444107174873352,
      "loss_layer_30_head": 0.17053712904453278,
      "loss_layer_36_head": 0.10785571485757828,
      "loss_layer_42_head": 0.058969058096408844,
      "loss_layer_6_head": 0.8406673669815063,
      "step": 4655
    },
    {
      "epoch": 59.648,
      "grad_norm": 0.1990040463376391,
      "learning_rate": 0.0020879327242346096,
      "loss": 2.6092,
      "loss_layer_12_head": 0.6168867349624634,
      "loss_layer_18_head": 0.49590378999710083,
      "loss_layer_24_head": 0.3730694353580475,
      "loss_layer_30_head": 0.17376163601875305,
      "loss_layer_36_head": 0.1086416020989418,
      "loss_layer_42_head": 0.060128748416900635,
      "loss_layer_6_head": 0.870803952217102,
      "step": 4660
    },
    {
      "epoch": 59.712,
      "grad_norm": 0.1735568237333485,
      "learning_rate": 0.0020824162683529225,
      "loss": 2.5152,
      "loss_layer_12_head": 0.5705769658088684,
      "loss_layer_18_head": 0.4603880047798157,
      "loss_layer_24_head": 0.33475393056869507,
      "loss_layer_30_head": 0.15986992418766022,
      "loss_layer_36_head": 0.10103785991668701,
      "loss_layer_42_head": 0.05503993481397629,
      "loss_layer_6_head": 0.8102024793624878,
      "step": 4665
    },
    {
      "epoch": 59.776,
      "grad_norm": 0.1892398716464447,
      "learning_rate": 0.0020769019032540414,
      "loss": 2.5531,
      "loss_layer_12_head": 0.6034451723098755,
      "loss_layer_18_head": 0.4848301410675049,
      "loss_layer_24_head": 0.339438259601593,
      "loss_layer_30_head": 0.1713165044784546,
      "loss_layer_36_head": 0.10806481540203094,
      "loss_layer_42_head": 0.057984620332717896,
      "loss_layer_6_head": 0.8535141944885254,
      "step": 4670
    },
    {
      "epoch": 59.84,
      "grad_norm": 0.16677665618348805,
      "learning_rate": 0.0020713896565476113,
      "loss": 2.5536,
      "loss_layer_12_head": 0.5853129625320435,
      "loss_layer_18_head": 0.472302109003067,
      "loss_layer_24_head": 0.31986209750175476,
      "loss_layer_30_head": 0.17473192512989044,
      "loss_layer_36_head": 0.11199700832366943,
      "loss_layer_42_head": 0.05941686034202576,
      "loss_layer_6_head": 0.8250142335891724,
      "step": 4675
    },
    {
      "epoch": 59.904,
      "grad_norm": 0.15131867045737488,
      "learning_rate": 0.0020658795558326742,
      "loss": 2.5769,
      "loss_layer_12_head": 0.5732026100158691,
      "loss_layer_18_head": 0.4630556106567383,
      "loss_layer_24_head": 0.31059780716896057,
      "loss_layer_30_head": 0.1959940642118454,
      "loss_layer_36_head": 0.13501323759555817,
      "loss_layer_42_head": 0.056332312524318695,
      "loss_layer_6_head": 0.8275019526481628,
      "step": 4680
    },
    {
      "epoch": 59.968,
      "grad_norm": 0.1336822253745635,
      "learning_rate": 0.0020603716286975273,
      "loss": 2.553,
      "loss_layer_12_head": 0.5759366154670715,
      "loss_layer_18_head": 0.4614599347114563,
      "loss_layer_24_head": 0.29864490032196045,
      "loss_layer_30_head": 0.1643674671649933,
      "loss_layer_36_head": 0.10434909164905548,
      "loss_layer_42_head": 0.056179195642471313,
      "loss_layer_6_head": 0.8206602334976196,
      "step": 4685
    },
    {
      "epoch": 60.032,
      "grad_norm": 0.10557126398003093,
      "learning_rate": 0.002054865902719584,
      "loss": 2.484,
      "loss_layer_12_head": 0.6034387350082397,
      "loss_layer_18_head": 0.4799923896789551,
      "loss_layer_24_head": 0.30696067214012146,
      "loss_layer_30_head": 0.1680414229631424,
      "loss_layer_36_head": 0.10632847249507904,
      "loss_layer_42_head": 0.0554925873875618,
      "loss_layer_6_head": 0.856594443321228,
      "step": 4690
    },
    {
      "epoch": 60.096,
      "grad_norm": 0.13728141145752415,
      "learning_rate": 0.002049362405465236,
      "loss": 2.4026,
      "loss_layer_12_head": 0.5635955333709717,
      "loss_layer_18_head": 0.4489367604255676,
      "loss_layer_24_head": 0.29117846488952637,
      "loss_layer_30_head": 0.172239750623703,
      "loss_layer_36_head": 0.1156499832868576,
      "loss_layer_42_head": 0.05590670555830002,
      "loss_layer_6_head": 0.8053801655769348,
      "step": 4695
    },
    {
      "epoch": 60.16,
      "grad_norm": 0.16733025338353047,
      "learning_rate": 0.002043861164489719,
      "loss": 2.452,
      "loss_layer_12_head": 0.5522740483283997,
      "loss_layer_18_head": 0.4394863247871399,
      "loss_layer_24_head": 0.2836727201938629,
      "loss_layer_30_head": 0.16016581654548645,
      "loss_layer_36_head": 0.10404539108276367,
      "loss_layer_42_head": 0.05397038534283638,
      "loss_layer_6_head": 0.783711850643158,
      "step": 4700
    },
    {
      "epoch": 60.224,
      "grad_norm": 0.13565327316682316,
      "learning_rate": 0.002038362207336968,
      "loss": 2.4429,
      "loss_layer_12_head": 0.5636254549026489,
      "loss_layer_18_head": 0.4494708180427551,
      "loss_layer_24_head": 0.2944619059562683,
      "loss_layer_30_head": 0.16209246218204498,
      "loss_layer_36_head": 0.10295188426971436,
      "loss_layer_42_head": 0.05457041785120964,
      "loss_layer_6_head": 0.8019887208938599,
      "step": 4705
    },
    {
      "epoch": 60.288,
      "grad_norm": 0.21631134177247294,
      "learning_rate": 0.002032865561539488,
      "loss": 2.4002,
      "loss_layer_12_head": 0.5547246932983398,
      "loss_layer_18_head": 0.4414152204990387,
      "loss_layer_24_head": 0.28859683871269226,
      "loss_layer_30_head": 0.15403515100479126,
      "loss_layer_36_head": 0.10015028715133667,
      "loss_layer_42_head": 0.052341412752866745,
      "loss_layer_6_head": 0.7943946719169617,
      "step": 4710
    },
    {
      "epoch": 60.352,
      "grad_norm": 0.19413229766060092,
      "learning_rate": 0.0020273712546182076,
      "loss": 2.4722,
      "loss_layer_12_head": 0.605604350566864,
      "loss_layer_18_head": 0.48442739248275757,
      "loss_layer_24_head": 0.31784483790397644,
      "loss_layer_30_head": 0.1707189977169037,
      "loss_layer_36_head": 0.10727381706237793,
      "loss_layer_42_head": 0.05650920793414116,
      "loss_layer_6_head": 0.8595749139785767,
      "step": 4715
    },
    {
      "epoch": 60.416,
      "grad_norm": 0.1292688143936891,
      "learning_rate": 0.002021879314082344,
      "loss": 2.5151,
      "loss_layer_12_head": 0.6001886129379272,
      "loss_layer_18_head": 0.4783724844455719,
      "loss_layer_24_head": 0.30472320318222046,
      "loss_layer_30_head": 0.16966404020786285,
      "loss_layer_36_head": 0.10604814440011978,
      "loss_layer_42_head": 0.055479537695646286,
      "loss_layer_6_head": 0.8526209592819214,
      "step": 4720
    },
    {
      "epoch": 60.48,
      "grad_norm": 0.1496519876507065,
      "learning_rate": 0.002016389767429272,
      "loss": 2.4903,
      "loss_layer_12_head": 0.5903602242469788,
      "loss_layer_18_head": 0.4677322506904602,
      "loss_layer_24_head": 0.29523545503616333,
      "loss_layer_30_head": 0.1676800400018692,
      "loss_layer_36_head": 0.10412196069955826,
      "loss_layer_42_head": 0.05471722409129143,
      "loss_layer_6_head": 0.8452634811401367,
      "step": 4725
    },
    {
      "epoch": 60.544,
      "grad_norm": 0.1518343159374307,
      "learning_rate": 0.0020109026421443743,
      "loss": 2.4814,
      "loss_layer_12_head": 0.6067124605178833,
      "loss_layer_18_head": 0.485752671957016,
      "loss_layer_24_head": 0.30708232522010803,
      "loss_layer_30_head": 0.1731034368276596,
      "loss_layer_36_head": 0.10860159248113632,
      "loss_layer_42_head": 0.056604206562042236,
      "loss_layer_6_head": 0.8624668121337891,
      "step": 4730
    },
    {
      "epoch": 60.608,
      "grad_norm": 0.11203833018055016,
      "learning_rate": 0.0020054179657009127,
      "loss": 2.5086,
      "loss_layer_12_head": 0.5633145570755005,
      "loss_layer_18_head": 0.45005232095718384,
      "loss_layer_24_head": 0.2810554504394531,
      "loss_layer_30_head": 0.15952235460281372,
      "loss_layer_36_head": 0.10255016386508942,
      "loss_layer_42_head": 0.055212073028087616,
      "loss_layer_6_head": 0.8018887639045715,
      "step": 4735
    },
    {
      "epoch": 60.672,
      "grad_norm": 0.1706847849710609,
      "learning_rate": 0.0019999357655598893,
      "loss": 2.4978,
      "loss_layer_12_head": 0.5754832029342651,
      "loss_layer_18_head": 0.46011972427368164,
      "loss_layer_24_head": 0.2909855246543884,
      "loss_layer_30_head": 0.1638341099023819,
      "loss_layer_36_head": 0.10160362720489502,
      "loss_layer_42_head": 0.05265159532427788,
      "loss_layer_6_head": 0.821864902973175,
      "step": 4740
    },
    {
      "epoch": 60.736,
      "grad_norm": 0.17713336231245194,
      "learning_rate": 0.001994456069169906,
      "loss": 2.4816,
      "loss_layer_12_head": 0.574891984462738,
      "loss_layer_18_head": 0.4629650115966797,
      "loss_layer_24_head": 0.2953246235847473,
      "loss_layer_30_head": 0.16892166435718536,
      "loss_layer_36_head": 0.10652611404657364,
      "loss_layer_42_head": 0.05506289750337601,
      "loss_layer_6_head": 0.8155174255371094,
      "step": 4745
    },
    {
      "epoch": 60.8,
      "grad_norm": 0.14017861606286444,
      "learning_rate": 0.0019889789039670277,
      "loss": 2.5398,
      "loss_layer_12_head": 0.5929597616195679,
      "loss_layer_18_head": 0.47338899970054626,
      "loss_layer_24_head": 0.29471173882484436,
      "loss_layer_30_head": 0.16643309593200684,
      "loss_layer_36_head": 0.10540518909692764,
      "loss_layer_42_head": 0.05425422266125679,
      "loss_layer_6_head": 0.8420042991638184,
      "step": 4750
    },
    {
      "epoch": 60.864,
      "grad_norm": 0.2158839016891175,
      "learning_rate": 0.0019835042973746497,
      "loss": 2.5373,
      "loss_layer_12_head": 0.6356561779975891,
      "loss_layer_18_head": 0.5105929374694824,
      "loss_layer_24_head": 0.3242155909538269,
      "loss_layer_30_head": 0.19542737305164337,
      "loss_layer_36_head": 0.1217864379286766,
      "loss_layer_42_head": 0.060808759182691574,
      "loss_layer_6_head": 0.9040555953979492,
      "step": 4755
    },
    {
      "epoch": 60.928,
      "grad_norm": 0.14982030005089433,
      "learning_rate": 0.001978032276803354,
      "loss": 2.4904,
      "loss_layer_12_head": 0.6067327260971069,
      "loss_layer_18_head": 0.4860507547855377,
      "loss_layer_24_head": 0.3007921278476715,
      "loss_layer_30_head": 0.1687561571598053,
      "loss_layer_36_head": 0.10589154064655304,
      "loss_layer_42_head": 0.05776595324277878,
      "loss_layer_6_head": 0.8610495328903198,
      "step": 4760
    },
    {
      "epoch": 60.992,
      "grad_norm": 0.12768584360175617,
      "learning_rate": 0.0019725628696507735,
      "loss": 2.5135,
      "loss_layer_12_head": 0.6120956540107727,
      "loss_layer_18_head": 0.4891830384731293,
      "loss_layer_24_head": 0.30174148082733154,
      "loss_layer_30_head": 0.16993722319602966,
      "loss_layer_36_head": 0.10652657598257065,
      "loss_layer_42_head": 0.0591597855091095,
      "loss_layer_6_head": 0.8677686452865601,
      "step": 4765
    },
    {
      "epoch": 61.056,
      "grad_norm": 0.1272755598187253,
      "learning_rate": 0.0019670961033014605,
      "loss": 2.4108,
      "loss_layer_12_head": 0.5623937845230103,
      "loss_layer_18_head": 0.44778114557266235,
      "loss_layer_24_head": 0.2766125798225403,
      "loss_layer_30_head": 0.1579383909702301,
      "loss_layer_36_head": 0.10054762661457062,
      "loss_layer_42_head": 0.05596218630671501,
      "loss_layer_6_head": 0.8033598065376282,
      "step": 4770
    },
    {
      "epoch": 61.12,
      "grad_norm": 0.14340429952677106,
      "learning_rate": 0.0019616320051267394,
      "loss": 2.4122,
      "loss_layer_12_head": 0.5714733004570007,
      "loss_layer_18_head": 0.4535956382751465,
      "loss_layer_24_head": 0.2774640917778015,
      "loss_layer_30_head": 0.16028742492198944,
      "loss_layer_36_head": 0.101193867623806,
      "loss_layer_42_head": 0.05486568063497543,
      "loss_layer_6_head": 0.8142407536506653,
      "step": 4775
    },
    {
      "epoch": 61.184,
      "grad_norm": 0.13182262246028364,
      "learning_rate": 0.001956170602484582,
      "loss": 2.3706,
      "loss_layer_12_head": 0.5490120053291321,
      "loss_layer_18_head": 0.4420822560787201,
      "loss_layer_24_head": 0.2703676223754883,
      "loss_layer_30_head": 0.15814340114593506,
      "loss_layer_36_head": 0.09886299073696136,
      "loss_layer_42_head": 0.05549837276339531,
      "loss_layer_6_head": 0.7848033308982849,
      "step": 4780
    },
    {
      "epoch": 61.248,
      "grad_norm": 0.14475947433235006,
      "learning_rate": 0.0019507119227194579,
      "loss": 2.4297,
      "loss_layer_12_head": 0.5703462362289429,
      "loss_layer_18_head": 0.4581505358219147,
      "loss_layer_24_head": 0.27731621265411377,
      "loss_layer_30_head": 0.16295596957206726,
      "loss_layer_36_head": 0.10114892572164536,
      "loss_layer_42_head": 0.055361486971378326,
      "loss_layer_6_head": 0.8143447637557983,
      "step": 4785
    },
    {
      "epoch": 61.312,
      "grad_norm": 0.1555180878994307,
      "learning_rate": 0.0019452559931622067,
      "loss": 2.437,
      "loss_layer_12_head": 0.5484186410903931,
      "loss_layer_18_head": 0.4400172829627991,
      "loss_layer_24_head": 0.2661481499671936,
      "loss_layer_30_head": 0.16012583673000336,
      "loss_layer_36_head": 0.099065400660038,
      "loss_layer_42_head": 0.05457884073257446,
      "loss_layer_6_head": 0.7794829607009888,
      "step": 4790
    },
    {
      "epoch": 61.376,
      "grad_norm": 0.14930465470591092,
      "learning_rate": 0.0019398028411298984,
      "loss": 2.4314,
      "loss_layer_12_head": 0.586336076259613,
      "loss_layer_18_head": 0.4656570553779602,
      "loss_layer_24_head": 0.2803557813167572,
      "loss_layer_30_head": 0.17169049382209778,
      "loss_layer_36_head": 0.10425577312707901,
      "loss_layer_42_head": 0.05571814253926277,
      "loss_layer_6_head": 0.8332621455192566,
      "step": 4795
    },
    {
      "epoch": 61.44,
      "grad_norm": 0.1581253484623268,
      "learning_rate": 0.0019343524939256951,
      "loss": 2.4591,
      "loss_layer_12_head": 0.5908767580986023,
      "loss_layer_18_head": 0.4685834050178528,
      "loss_layer_24_head": 0.28369641304016113,
      "loss_layer_30_head": 0.17189720273017883,
      "loss_layer_36_head": 0.1051347628235817,
      "loss_layer_42_head": 0.05556752160191536,
      "loss_layer_6_head": 0.8403329849243164,
      "step": 4800
    },
    {
      "epoch": 61.44,
      "eval_loss": 5.384252071380615,
      "eval_loss_layer_12_head": 1.2437020540237427,
      "eval_loss_layer_18_head": 1.0749523639678955,
      "eval_loss_layer_24_head": 0.6884297728538513,
      "eval_loss_layer_30_head": 0.4508899748325348,
      "eval_loss_layer_36_head": 0.2912006676197052,
      "eval_loss_layer_42_head": 0.17076215147972107,
      "eval_loss_layer_6_head": 1.5924973487854004,
      "eval_runtime": 33.057,
      "eval_samples_per_second": 9.68,
      "eval_steps_per_second": 0.605,
      "step": 4800
    },
    {
      "epoch": 61.504,
      "grad_norm": 0.163033139166545,
      "learning_rate": 0.0019289049788387155,
      "loss": 2.4512,
      "loss_layer_12_head": 0.5738240480422974,
      "loss_layer_18_head": 0.45751112699508667,
      "loss_layer_24_head": 0.2741357088088989,
      "loss_layer_30_head": 0.16528570652008057,
      "loss_layer_36_head": 0.10738436877727509,
      "loss_layer_42_head": 0.05739639326930046,
      "loss_layer_6_head": 0.8149957656860352,
      "step": 4805
    },
    {
      "epoch": 61.568,
      "grad_norm": 0.1669621527060116,
      "learning_rate": 0.0019234603231438995,
      "loss": 2.4893,
      "loss_layer_12_head": 0.6006470322608948,
      "loss_layer_18_head": 0.4759623408317566,
      "loss_layer_24_head": 0.2869727611541748,
      "loss_layer_30_head": 0.17264726758003235,
      "loss_layer_36_head": 0.10882623493671417,
      "loss_layer_42_head": 0.05849475413560867,
      "loss_layer_6_head": 0.8391639590263367,
      "step": 4810
    },
    {
      "epoch": 61.632,
      "grad_norm": 0.13609686660295559,
      "learning_rate": 0.0019180185541018697,
      "loss": 2.4725,
      "loss_layer_12_head": 0.5831413865089417,
      "loss_layer_18_head": 0.4625890254974365,
      "loss_layer_24_head": 0.2769843637943268,
      "loss_layer_30_head": 0.16496029496192932,
      "loss_layer_36_head": 0.1042642742395401,
      "loss_layer_42_head": 0.057401012629270554,
      "loss_layer_6_head": 0.8156934976577759,
      "step": 4815
    },
    {
      "epoch": 61.696,
      "grad_norm": 0.19719594675653265,
      "learning_rate": 0.0019125796989587947,
      "loss": 2.4714,
      "loss_layer_12_head": 0.5874515771865845,
      "loss_layer_18_head": 0.46620598435401917,
      "loss_layer_24_head": 0.27870503067970276,
      "loss_layer_30_head": 0.1652490645647049,
      "loss_layer_36_head": 0.10313926637172699,
      "loss_layer_42_head": 0.05640377476811409,
      "loss_layer_6_head": 0.8299869298934937,
      "step": 4820
    },
    {
      "epoch": 61.76,
      "grad_norm": 0.1483543351897196,
      "learning_rate": 0.0019071437849462558,
      "loss": 2.5231,
      "loss_layer_12_head": 0.5891705751419067,
      "loss_layer_18_head": 0.4661490023136139,
      "loss_layer_24_head": 0.27575749158859253,
      "loss_layer_30_head": 0.16233739256858826,
      "loss_layer_36_head": 0.10248257964849472,
      "loss_layer_42_head": 0.056567560881376266,
      "loss_layer_6_head": 0.823656439781189,
      "step": 4825
    },
    {
      "epoch": 61.824,
      "grad_norm": 0.11936764690369309,
      "learning_rate": 0.0019017108392811065,
      "loss": 2.4899,
      "loss_layer_12_head": 0.5691090822219849,
      "loss_layer_18_head": 0.4538057744503021,
      "loss_layer_24_head": 0.27002280950546265,
      "loss_layer_30_head": 0.15910547971725464,
      "loss_layer_36_head": 0.10122839361429214,
      "loss_layer_42_head": 0.05467410013079643,
      "loss_layer_6_head": 0.8097670674324036,
      "step": 4830
    },
    {
      "epoch": 61.888,
      "grad_norm": 0.13008407591510837,
      "learning_rate": 0.0018962808891653377,
      "loss": 2.4856,
      "loss_layer_12_head": 0.5910561084747314,
      "loss_layer_18_head": 0.4753972887992859,
      "loss_layer_24_head": 0.284925639629364,
      "loss_layer_30_head": 0.16883495450019836,
      "loss_layer_36_head": 0.1056467667222023,
      "loss_layer_42_head": 0.057128049433231354,
      "loss_layer_6_head": 0.8352652788162231,
      "step": 4835
    },
    {
      "epoch": 61.952,
      "grad_norm": 0.12866680573906716,
      "learning_rate": 0.0018908539617859454,
      "loss": 2.5051,
      "loss_layer_12_head": 0.5818908214569092,
      "loss_layer_18_head": 0.4666762948036194,
      "loss_layer_24_head": 0.28178682923316956,
      "loss_layer_30_head": 0.16651077568531036,
      "loss_layer_36_head": 0.10539817810058594,
      "loss_layer_42_head": 0.05577533692121506,
      "loss_layer_6_head": 0.8205731511116028,
      "step": 4840
    },
    {
      "epoch": 62.016,
      "grad_norm": 0.1281549737341574,
      "learning_rate": 0.0018854300843147876,
      "loss": 2.4621,
      "loss_layer_12_head": 0.5784858465194702,
      "loss_layer_18_head": 0.4655452370643616,
      "loss_layer_24_head": 0.2757534980773926,
      "loss_layer_30_head": 0.16239607334136963,
      "loss_layer_36_head": 0.10191822052001953,
      "loss_layer_42_head": 0.054289620369672775,
      "loss_layer_6_head": 0.8259713053703308,
      "step": 4845
    },
    {
      "epoch": 62.08,
      "grad_norm": 0.10808668464091709,
      "learning_rate": 0.001880009283908454,
      "loss": 2.3831,
      "loss_layer_12_head": 0.540663480758667,
      "loss_layer_18_head": 0.4349795877933502,
      "loss_layer_24_head": 0.27232903242111206,
      "loss_layer_30_head": 0.1569068729877472,
      "loss_layer_36_head": 0.10301442444324493,
      "loss_layer_42_head": 0.05393258482217789,
      "loss_layer_6_head": 0.7661340236663818,
      "step": 4850
    },
    {
      "epoch": 62.144,
      "grad_norm": 0.10990788548259127,
      "learning_rate": 0.0018745915877081266,
      "loss": 2.4032,
      "loss_layer_12_head": 0.5595546364784241,
      "loss_layer_18_head": 0.44903936982154846,
      "loss_layer_24_head": 0.2643570303916931,
      "loss_layer_30_head": 0.1549614816904068,
      "loss_layer_36_head": 0.0964282900094986,
      "loss_layer_42_head": 0.05185719579458237,
      "loss_layer_6_head": 0.7997180223464966,
      "step": 4855
    },
    {
      "epoch": 62.208,
      "grad_norm": 0.14378075753726619,
      "learning_rate": 0.0018691770228394455,
      "loss": 2.4259,
      "loss_layer_12_head": 0.5589427351951599,
      "loss_layer_18_head": 0.4495798945426941,
      "loss_layer_24_head": 0.26686573028564453,
      "loss_layer_30_head": 0.1557522565126419,
      "loss_layer_36_head": 0.09702153503894806,
      "loss_layer_42_head": 0.05313504487276077,
      "loss_layer_6_head": 0.792573094367981,
      "step": 4860
    },
    {
      "epoch": 62.272,
      "grad_norm": 0.11035459154903463,
      "learning_rate": 0.0018637656164123735,
      "loss": 2.397,
      "loss_layer_12_head": 0.576178252696991,
      "loss_layer_18_head": 0.4664303660392761,
      "loss_layer_24_head": 0.27633383870124817,
      "loss_layer_30_head": 0.1647893786430359,
      "loss_layer_36_head": 0.10174153745174408,
      "loss_layer_42_head": 0.054617784917354584,
      "loss_layer_6_head": 0.82830411195755,
      "step": 4865
    },
    {
      "epoch": 62.336,
      "grad_norm": 0.1140202246215926,
      "learning_rate": 0.0018583573955210581,
      "loss": 2.4062,
      "loss_layer_12_head": 0.5478945374488831,
      "loss_layer_18_head": 0.44644078612327576,
      "loss_layer_24_head": 0.26377469301223755,
      "loss_layer_30_head": 0.15540629625320435,
      "loss_layer_36_head": 0.096404530107975,
      "loss_layer_42_head": 0.05191885679960251,
      "loss_layer_6_head": 0.7778886556625366,
      "step": 4870
    },
    {
      "epoch": 62.4,
      "grad_norm": 0.13491813386735607,
      "learning_rate": 0.001852952387243698,
      "loss": 2.4427,
      "loss_layer_12_head": 0.5638699531555176,
      "loss_layer_18_head": 0.45520129799842834,
      "loss_layer_24_head": 0.2703298330307007,
      "loss_layer_30_head": 0.16153238713741302,
      "loss_layer_36_head": 0.09928220510482788,
      "loss_layer_42_head": 0.05339984968304634,
      "loss_layer_6_head": 0.8040329813957214,
      "step": 4875
    },
    {
      "epoch": 62.464,
      "grad_norm": 0.14794986069195154,
      "learning_rate": 0.0018475506186424074,
      "loss": 2.4391,
      "loss_layer_12_head": 0.5860821008682251,
      "loss_layer_18_head": 0.4720079302787781,
      "loss_layer_24_head": 0.277862012386322,
      "loss_layer_30_head": 0.16631029546260834,
      "loss_layer_36_head": 0.10179787874221802,
      "loss_layer_42_head": 0.05404890328645706,
      "loss_layer_6_head": 0.8409948348999023,
      "step": 4880
    },
    {
      "epoch": 62.528,
      "grad_norm": 0.13299140254403005,
      "learning_rate": 0.001842152116763079,
      "loss": 2.4236,
      "loss_layer_12_head": 0.580651044845581,
      "loss_layer_18_head": 0.4686204791069031,
      "loss_layer_24_head": 0.2796538472175598,
      "loss_layer_30_head": 0.16732069849967957,
      "loss_layer_36_head": 0.10161809623241425,
      "loss_layer_42_head": 0.053827594965696335,
      "loss_layer_6_head": 0.8191978335380554,
      "step": 4885
    },
    {
      "epoch": 62.592,
      "grad_norm": 0.1514569976974009,
      "learning_rate": 0.0018367569086352481,
      "loss": 2.4622,
      "loss_layer_12_head": 0.5877529978752136,
      "loss_layer_18_head": 0.47509437799453735,
      "loss_layer_24_head": 0.28425687551498413,
      "loss_layer_30_head": 0.1735060065984726,
      "loss_layer_36_head": 0.10594642162322998,
      "loss_layer_42_head": 0.05782978981733322,
      "loss_layer_6_head": 0.8334298133850098,
      "step": 4890
    },
    {
      "epoch": 62.656,
      "grad_norm": 0.13594108451456424,
      "learning_rate": 0.0018313650212719629,
      "loss": 2.4623,
      "loss_layer_12_head": 0.5774968266487122,
      "loss_layer_18_head": 0.46520406007766724,
      "loss_layer_24_head": 0.2774277329444885,
      "loss_layer_30_head": 0.16315141320228577,
      "loss_layer_36_head": 0.10083387792110443,
      "loss_layer_42_head": 0.053237371146678925,
      "loss_layer_6_head": 0.8245152235031128,
      "step": 4895
    },
    {
      "epoch": 62.72,
      "grad_norm": 0.13580118670762711,
      "learning_rate": 0.0018259764816696412,
      "loss": 2.4677,
      "loss_layer_12_head": 0.607758641242981,
      "loss_layer_18_head": 0.4884551465511322,
      "loss_layer_24_head": 0.2939067482948303,
      "loss_layer_30_head": 0.17491258680820465,
      "loss_layer_36_head": 0.11092191934585571,
      "loss_layer_42_head": 0.05910903215408325,
      "loss_layer_6_head": 0.8550692796707153,
      "step": 4900
    },
    {
      "epoch": 62.784,
      "grad_norm": 0.11871349398206012,
      "learning_rate": 0.0018205913168079391,
      "loss": 2.4925,
      "loss_layer_12_head": 0.5720194578170776,
      "loss_layer_18_head": 0.4593378007411957,
      "loss_layer_24_head": 0.27713900804519653,
      "loss_layer_30_head": 0.16544651985168457,
      "loss_layer_36_head": 0.10486842691898346,
      "loss_layer_42_head": 0.05619387701153755,
      "loss_layer_6_head": 0.8148472905158997,
      "step": 4905
    },
    {
      "epoch": 62.848,
      "grad_norm": 0.1282119210577911,
      "learning_rate": 0.001815209553649619,
      "loss": 2.4589,
      "loss_layer_12_head": 0.5756368637084961,
      "loss_layer_18_head": 0.4577915668487549,
      "loss_layer_24_head": 0.27379152178764343,
      "loss_layer_30_head": 0.1591828465461731,
      "loss_layer_36_head": 0.09888018667697906,
      "loss_layer_42_head": 0.0521562397480011,
      "loss_layer_6_head": 0.8130771517753601,
      "step": 4910
    },
    {
      "epoch": 62.912,
      "grad_norm": 0.11235395562278948,
      "learning_rate": 0.0018098312191404079,
      "loss": 2.5058,
      "loss_layer_12_head": 0.5908940434455872,
      "loss_layer_18_head": 0.47112831473350525,
      "loss_layer_24_head": 0.2850990295410156,
      "loss_layer_30_head": 0.16485543549060822,
      "loss_layer_36_head": 0.10270430147647858,
      "loss_layer_42_head": 0.054346732795238495,
      "loss_layer_6_head": 0.8342084884643555,
      "step": 4915
    },
    {
      "epoch": 62.976,
      "grad_norm": 0.09539881879047904,
      "learning_rate": 0.0018044563402088685,
      "loss": 2.4935,
      "loss_layer_12_head": 0.5843280553817749,
      "loss_layer_18_head": 0.4666789472103119,
      "loss_layer_24_head": 0.2784744203090668,
      "loss_layer_30_head": 0.1616848260164261,
      "loss_layer_36_head": 0.10051804780960083,
      "loss_layer_42_head": 0.05251885578036308,
      "loss_layer_6_head": 0.8379327058792114,
      "step": 4920
    },
    {
      "epoch": 63.04,
      "grad_norm": 0.10592216986136348,
      "learning_rate": 0.0017990849437662607,
      "loss": 2.4566,
      "loss_layer_12_head": 0.573088526725769,
      "loss_layer_18_head": 0.4633301794528961,
      "loss_layer_24_head": 0.27617064118385315,
      "loss_layer_30_head": 0.16111892461776733,
      "loss_layer_36_head": 0.10043790191411972,
      "loss_layer_42_head": 0.05375634506344795,
      "loss_layer_6_head": 0.8088246583938599,
      "step": 4925
    },
    {
      "epoch": 63.104,
      "grad_norm": 0.12844326593809988,
      "learning_rate": 0.0017937170567064076,
      "loss": 2.3945,
      "loss_layer_12_head": 0.5444973707199097,
      "loss_layer_18_head": 0.4341188967227936,
      "loss_layer_24_head": 0.25981441140174866,
      "loss_layer_30_head": 0.15140146017074585,
      "loss_layer_36_head": 0.09630300849676132,
      "loss_layer_42_head": 0.05097171664237976,
      "loss_layer_6_head": 0.7794933319091797,
      "step": 4930
    },
    {
      "epoch": 63.168,
      "grad_norm": 0.1569587111760659,
      "learning_rate": 0.0017883527059055632,
      "loss": 2.383,
      "loss_layer_12_head": 0.5313379764556885,
      "loss_layer_18_head": 0.4243772029876709,
      "loss_layer_24_head": 0.2539359927177429,
      "loss_layer_30_head": 0.15044912695884705,
      "loss_layer_36_head": 0.09551897644996643,
      "loss_layer_42_head": 0.05222855880856514,
      "loss_layer_6_head": 0.7660560011863708,
      "step": 4935
    },
    {
      "epoch": 63.232,
      "grad_norm": 0.18032384576498123,
      "learning_rate": 0.0017829919182222752,
      "loss": 2.3628,
      "loss_layer_12_head": 0.5429480075836182,
      "loss_layer_18_head": 0.43525591492652893,
      "loss_layer_24_head": 0.26386719942092896,
      "loss_layer_30_head": 0.15403945744037628,
      "loss_layer_36_head": 0.09761422127485275,
      "loss_layer_42_head": 0.053725581616163254,
      "loss_layer_6_head": 0.7785869240760803,
      "step": 4940
    },
    {
      "epoch": 63.296,
      "grad_norm": 0.14305539732950992,
      "learning_rate": 0.001777634720497248,
      "loss": 2.3927,
      "loss_layer_12_head": 0.5798231959342957,
      "loss_layer_18_head": 0.4626556932926178,
      "loss_layer_24_head": 0.2760562300682068,
      "loss_layer_30_head": 0.1591934859752655,
      "loss_layer_36_head": 0.10025303065776825,
      "loss_layer_42_head": 0.05513770505785942,
      "loss_layer_6_head": 0.8271686434745789,
      "step": 4945
    },
    {
      "epoch": 63.36,
      "grad_norm": 0.17483258345016983,
      "learning_rate": 0.0017722811395532178,
      "loss": 2.4024,
      "loss_layer_12_head": 0.5582070350646973,
      "loss_layer_18_head": 0.4473714828491211,
      "loss_layer_24_head": 0.2708691954612732,
      "loss_layer_30_head": 0.15889310836791992,
      "loss_layer_36_head": 0.09960553795099258,
      "loss_layer_42_head": 0.05434397980570793,
      "loss_layer_6_head": 0.8001024127006531,
      "step": 4950
    },
    {
      "epoch": 63.424,
      "grad_norm": 0.10434028247863024,
      "learning_rate": 0.0017669312021948076,
      "loss": 2.4598,
      "loss_layer_12_head": 0.5746174454689026,
      "loss_layer_18_head": 0.4625338912010193,
      "loss_layer_24_head": 0.2781594395637512,
      "loss_layer_30_head": 0.16268286108970642,
      "loss_layer_36_head": 0.10314687341451645,
      "loss_layer_42_head": 0.0551738366484642,
      "loss_layer_6_head": 0.8273215293884277,
      "step": 4955
    },
    {
      "epoch": 63.488,
      "grad_norm": 0.1311543964826378,
      "learning_rate": 0.0017615849352083973,
      "loss": 2.427,
      "loss_layer_12_head": 0.5739254951477051,
      "loss_layer_18_head": 0.4579378068447113,
      "loss_layer_24_head": 0.2729737162590027,
      "loss_layer_30_head": 0.15855176746845245,
      "loss_layer_36_head": 0.10179802030324936,
      "loss_layer_42_head": 0.05431380867958069,
      "loss_layer_6_head": 0.8193160891532898,
      "step": 4960
    },
    {
      "epoch": 63.552,
      "grad_norm": 0.10131106297381683,
      "learning_rate": 0.001756242365361993,
      "loss": 2.4653,
      "loss_layer_12_head": 0.5512405037879944,
      "loss_layer_18_head": 0.441876083612442,
      "loss_layer_24_head": 0.266448438167572,
      "loss_layer_30_head": 0.15739835798740387,
      "loss_layer_36_head": 0.10281649976968765,
      "loss_layer_42_head": 0.057243842631578445,
      "loss_layer_6_head": 0.7850488424301147,
      "step": 4965
    },
    {
      "epoch": 63.616,
      "grad_norm": 0.10050337859903076,
      "learning_rate": 0.0017509035194050867,
      "loss": 2.405,
      "loss_layer_12_head": 0.5544155240058899,
      "loss_layer_18_head": 0.4436910152435303,
      "loss_layer_24_head": 0.26363083720207214,
      "loss_layer_30_head": 0.154893159866333,
      "loss_layer_36_head": 0.09756343811750412,
      "loss_layer_42_head": 0.05263711139559746,
      "loss_layer_6_head": 0.7955571413040161,
      "step": 4970
    },
    {
      "epoch": 63.68,
      "grad_norm": 0.11193348550701465,
      "learning_rate": 0.0017455684240685265,
      "loss": 2.4369,
      "loss_layer_12_head": 0.5562783479690552,
      "loss_layer_18_head": 0.4457663595676422,
      "loss_layer_24_head": 0.26744014024734497,
      "loss_layer_30_head": 0.15589284896850586,
      "loss_layer_36_head": 0.0976102203130722,
      "loss_layer_42_head": 0.05263698101043701,
      "loss_layer_6_head": 0.7911028861999512,
      "step": 4975
    },
    {
      "epoch": 63.744,
      "grad_norm": 0.17642464287183685,
      "learning_rate": 0.001740237106064383,
      "loss": 2.491,
      "loss_layer_12_head": 0.5973817110061646,
      "loss_layer_18_head": 0.4730873703956604,
      "loss_layer_24_head": 0.2836846709251404,
      "loss_layer_30_head": 0.16459247469902039,
      "loss_layer_36_head": 0.10222555696964264,
      "loss_layer_42_head": 0.05491171404719353,
      "loss_layer_6_head": 0.8541342616081238,
      "step": 4980
    },
    {
      "epoch": 63.808,
      "grad_norm": 0.18284637337434523,
      "learning_rate": 0.0017349095920858116,
      "loss": 2.5053,
      "loss_layer_12_head": 0.5630054473876953,
      "loss_layer_18_head": 0.45046311616897583,
      "loss_layer_24_head": 0.26939699053764343,
      "loss_layer_30_head": 0.157556414604187,
      "loss_layer_36_head": 0.10005861520767212,
      "loss_layer_42_head": 0.053111713379621506,
      "loss_layer_6_head": 0.8042359352111816,
      "step": 4985
    },
    {
      "epoch": 63.872,
      "grad_norm": 0.14292036080265852,
      "learning_rate": 0.0017295859088069232,
      "loss": 2.5125,
      "loss_layer_12_head": 0.5845515727996826,
      "loss_layer_18_head": 0.46609964966773987,
      "loss_layer_24_head": 0.2780998945236206,
      "loss_layer_30_head": 0.16039426624774933,
      "loss_layer_36_head": 0.09934006631374359,
      "loss_layer_42_head": 0.05256739258766174,
      "loss_layer_6_head": 0.8295612335205078,
      "step": 4990
    },
    {
      "epoch": 63.936,
      "grad_norm": 0.09659297032848871,
      "learning_rate": 0.0017242660828826499,
      "loss": 2.4419,
      "loss_layer_12_head": 0.5924203991889954,
      "loss_layer_18_head": 0.47634977102279663,
      "loss_layer_24_head": 0.28391599655151367,
      "loss_layer_30_head": 0.16416728496551514,
      "loss_layer_36_head": 0.10228274017572403,
      "loss_layer_42_head": 0.05518922954797745,
      "loss_layer_6_head": 0.8418809771537781,
      "step": 4995
    },
    {
      "epoch": 64.0,
      "grad_norm": 0.10475104796074512,
      "learning_rate": 0.001718950140948606,
      "loss": 2.4773,
      "loss_layer_12_head": 0.6103941202163696,
      "loss_layer_18_head": 0.4899115562438965,
      "loss_layer_24_head": 0.29159194231033325,
      "loss_layer_30_head": 0.16905245184898376,
      "loss_layer_36_head": 0.10602307319641113,
      "loss_layer_42_head": 0.05590132623910904,
      "loss_layer_6_head": 0.8663412928581238,
      "step": 5000
    },
    {
      "epoch": 64.0,
      "eval_loss": 5.403820514678955,
      "eval_loss_layer_12_head": 1.2449629306793213,
      "eval_loss_layer_18_head": 1.0797317028045654,
      "eval_loss_layer_24_head": 0.6914545893669128,
      "eval_loss_layer_30_head": 0.44862064719200134,
      "eval_loss_layer_36_head": 0.2932533025741577,
      "eval_loss_layer_42_head": 0.19942812621593475,
      "eval_loss_layer_6_head": 1.595217227935791,
      "eval_runtime": 33.111,
      "eval_samples_per_second": 9.664,
      "eval_steps_per_second": 0.604,
      "step": 5000
    },
    {
      "epoch": 64.064,
      "grad_norm": 0.128642087041745,
      "learning_rate": 0.0017136381096209664,
      "loss": 2.4042,
      "loss_layer_12_head": 0.5721402168273926,
      "loss_layer_18_head": 0.46097904443740845,
      "loss_layer_24_head": 0.2737685739994049,
      "loss_layer_30_head": 0.1601933091878891,
      "loss_layer_36_head": 0.100108802318573,
      "loss_layer_42_head": 0.05328350514173508,
      "loss_layer_6_head": 0.817008376121521,
      "step": 5005
    },
    {
      "epoch": 64.128,
      "grad_norm": 0.101907628429597,
      "learning_rate": 0.0017083300154963194,
      "loss": 2.3611,
      "loss_layer_12_head": 0.5511847734451294,
      "loss_layer_18_head": 0.4411926865577698,
      "loss_layer_24_head": 0.264930784702301,
      "loss_layer_30_head": 0.154744952917099,
      "loss_layer_36_head": 0.09842383861541748,
      "loss_layer_42_head": 0.05187402293086052,
      "loss_layer_6_head": 0.7908003926277161,
      "step": 5010
    },
    {
      "epoch": 64.192,
      "grad_norm": 0.09953456826856782,
      "learning_rate": 0.0017030258851515434,
      "loss": 2.3995,
      "loss_layer_12_head": 0.6225600838661194,
      "loss_layer_18_head": 0.4977831244468689,
      "loss_layer_24_head": 0.29791945219039917,
      "loss_layer_30_head": 0.1716182380914688,
      "loss_layer_36_head": 0.10804762691259384,
      "loss_layer_42_head": 0.05729670077562332,
      "loss_layer_6_head": 0.8813173174858093,
      "step": 5015
    },
    {
      "epoch": 64.256,
      "grad_norm": 0.11257021573597246,
      "learning_rate": 0.0016977257451436711,
      "loss": 2.3644,
      "loss_layer_12_head": 0.5380385518074036,
      "loss_layer_18_head": 0.42969316244125366,
      "loss_layer_24_head": 0.25889626145362854,
      "loss_layer_30_head": 0.1528262197971344,
      "loss_layer_36_head": 0.09693945944309235,
      "loss_layer_42_head": 0.05195866897702217,
      "loss_layer_6_head": 0.771964967250824,
      "step": 5020
    },
    {
      "epoch": 64.32,
      "grad_norm": 0.10531720036301002,
      "learning_rate": 0.0016924296220097556,
      "loss": 2.3621,
      "loss_layer_12_head": 0.5415047407150269,
      "loss_layer_18_head": 0.43147581815719604,
      "loss_layer_24_head": 0.2594066262245178,
      "loss_layer_30_head": 0.15202584862709045,
      "loss_layer_36_head": 0.09576857089996338,
      "loss_layer_42_head": 0.051868002861738205,
      "loss_layer_6_head": 0.7704850435256958,
      "step": 5025
    },
    {
      "epoch": 64.384,
      "grad_norm": 0.10289080960115392,
      "learning_rate": 0.0016871375422667374,
      "loss": 2.399,
      "loss_layer_12_head": 0.5706857442855835,
      "loss_layer_18_head": 0.45535969734191895,
      "loss_layer_24_head": 0.2742857038974762,
      "loss_layer_30_head": 0.15920689702033997,
      "loss_layer_36_head": 0.09963327646255493,
      "loss_layer_42_head": 0.05345886945724487,
      "loss_layer_6_head": 0.8048334121704102,
      "step": 5030
    },
    {
      "epoch": 64.448,
      "grad_norm": 0.10943631926806047,
      "learning_rate": 0.0016818495324113128,
      "loss": 2.4118,
      "loss_layer_12_head": 0.5627689361572266,
      "loss_layer_18_head": 0.4535555839538574,
      "loss_layer_24_head": 0.2794666886329651,
      "loss_layer_30_head": 0.16725586354732513,
      "loss_layer_36_head": 0.09953020513057709,
      "loss_layer_42_head": 0.0663003921508789,
      "loss_layer_6_head": 0.8020162582397461,
      "step": 5035
    },
    {
      "epoch": 64.512,
      "grad_norm": 0.11583329280891598,
      "learning_rate": 0.0016765656189198011,
      "loss": 2.4241,
      "loss_layer_12_head": 0.5723711252212524,
      "loss_layer_18_head": 0.4594680368900299,
      "loss_layer_24_head": 0.2791701853275299,
      "loss_layer_30_head": 0.1672666072845459,
      "loss_layer_36_head": 0.10174308717250824,
      "loss_layer_42_head": 0.059270042926073074,
      "loss_layer_6_head": 0.8164860606193542,
      "step": 5040
    },
    {
      "epoch": 64.576,
      "grad_norm": 0.11680702762750492,
      "learning_rate": 0.001671285828248011,
      "loss": 2.4132,
      "loss_layer_12_head": 0.5931552648544312,
      "loss_layer_18_head": 0.4753449857234955,
      "loss_layer_24_head": 0.2816978394985199,
      "loss_layer_30_head": 0.1633681058883667,
      "loss_layer_36_head": 0.10081668198108673,
      "loss_layer_42_head": 0.05322108790278435,
      "loss_layer_6_head": 0.8415107727050781,
      "step": 5045
    },
    {
      "epoch": 64.64,
      "grad_norm": 0.12331083560104557,
      "learning_rate": 0.0016660101868311093,
      "loss": 2.4354,
      "loss_layer_12_head": 0.5459426641464233,
      "loss_layer_18_head": 0.4381128251552582,
      "loss_layer_24_head": 0.2613673508167267,
      "loss_layer_30_head": 0.15246187150478363,
      "loss_layer_36_head": 0.09562890231609344,
      "loss_layer_42_head": 0.051486432552337646,
      "loss_layer_6_head": 0.7798988223075867,
      "step": 5050
    },
    {
      "epoch": 64.704,
      "grad_norm": 0.1251591963247103,
      "learning_rate": 0.0016607387210834886,
      "loss": 2.4702,
      "loss_layer_12_head": 0.5739006996154785,
      "loss_layer_18_head": 0.4592026174068451,
      "loss_layer_24_head": 0.27600473165512085,
      "loss_layer_30_head": 0.16126756370067596,
      "loss_layer_36_head": 0.10064685344696045,
      "loss_layer_42_head": 0.054322730749845505,
      "loss_layer_6_head": 0.8210123181343079,
      "step": 5055
    },
    {
      "epoch": 64.768,
      "grad_norm": 0.1339275106030149,
      "learning_rate": 0.0016554714573986324,
      "loss": 2.4577,
      "loss_layer_12_head": 0.6108711361885071,
      "loss_layer_18_head": 0.4913061261177063,
      "loss_layer_24_head": 0.2931200861930847,
      "loss_layer_30_head": 0.1710003912448883,
      "loss_layer_36_head": 0.10517990589141846,
      "loss_layer_42_head": 0.05645984411239624,
      "loss_layer_6_head": 0.8666002154350281,
      "step": 5060
    },
    {
      "epoch": 64.832,
      "grad_norm": 0.12464204817142154,
      "learning_rate": 0.0016502084221489877,
      "loss": 2.4932,
      "loss_layer_12_head": 0.6041873693466187,
      "loss_layer_18_head": 0.48271435499191284,
      "loss_layer_24_head": 0.2901732623577118,
      "loss_layer_30_head": 0.16825446486473083,
      "loss_layer_36_head": 0.10244061052799225,
      "loss_layer_42_head": 0.05452496558427811,
      "loss_layer_6_head": 0.8564674258232117,
      "step": 5065
    },
    {
      "epoch": 64.896,
      "grad_norm": 0.1576509136678185,
      "learning_rate": 0.0016449496416858283,
      "loss": 2.4696,
      "loss_layer_12_head": 0.5788620114326477,
      "loss_layer_18_head": 0.4612220823764801,
      "loss_layer_24_head": 0.27673593163490295,
      "loss_layer_30_head": 0.16074080765247345,
      "loss_layer_36_head": 0.09926779568195343,
      "loss_layer_42_head": 0.053282104432582855,
      "loss_layer_6_head": 0.8203428983688354,
      "step": 5070
    },
    {
      "epoch": 64.96,
      "grad_norm": 0.1207628361197344,
      "learning_rate": 0.0016396951423391266,
      "loss": 2.5106,
      "loss_layer_12_head": 0.6160336136817932,
      "loss_layer_18_head": 0.4893895089626312,
      "loss_layer_24_head": 0.2925024628639221,
      "loss_layer_30_head": 0.17081183195114136,
      "loss_layer_36_head": 0.10354391485452652,
      "loss_layer_42_head": 0.05477716773748398,
      "loss_layer_6_head": 0.8668672442436218,
      "step": 5075
    },
    {
      "epoch": 65.024,
      "grad_norm": 0.10467152634222947,
      "learning_rate": 0.0016344449504174191,
      "loss": 2.4527,
      "loss_layer_12_head": 0.5904930233955383,
      "loss_layer_18_head": 0.47017210721969604,
      "loss_layer_24_head": 0.28174272179603577,
      "loss_layer_30_head": 0.1648465096950531,
      "loss_layer_36_head": 0.10143059492111206,
      "loss_layer_42_head": 0.05356142669916153,
      "loss_layer_6_head": 0.8350132703781128,
      "step": 5080
    },
    {
      "epoch": 65.088,
      "grad_norm": 0.10763327722014827,
      "learning_rate": 0.0016291990922076745,
      "loss": 2.3361,
      "loss_layer_12_head": 0.5690428018569946,
      "loss_layer_18_head": 0.4541719853878021,
      "loss_layer_24_head": 0.2739066481590271,
      "loss_layer_30_head": 0.15979261696338654,
      "loss_layer_36_head": 0.09753571450710297,
      "loss_layer_42_head": 0.051229000091552734,
      "loss_layer_6_head": 0.8096511960029602,
      "step": 5085
    },
    {
      "epoch": 65.152,
      "grad_norm": 0.1051109417027076,
      "learning_rate": 0.001623957593975166,
      "loss": 2.3464,
      "loss_layer_12_head": 0.5519159436225891,
      "loss_layer_18_head": 0.4385809004306793,
      "loss_layer_24_head": 0.2655315399169922,
      "loss_layer_30_head": 0.15824398398399353,
      "loss_layer_36_head": 0.09882314503192902,
      "loss_layer_42_head": 0.053186558187007904,
      "loss_layer_6_head": 0.7851976156234741,
      "step": 5090
    },
    {
      "epoch": 65.216,
      "grad_norm": 0.15905144414181493,
      "learning_rate": 0.0016187204819633346,
      "loss": 2.3672,
      "loss_layer_12_head": 0.5428670644760132,
      "loss_layer_18_head": 0.4329861104488373,
      "loss_layer_24_head": 0.2601413130760193,
      "loss_layer_30_head": 0.15438255667686462,
      "loss_layer_36_head": 0.09586609899997711,
      "loss_layer_42_head": 0.05113685876131058,
      "loss_layer_6_head": 0.7670931816101074,
      "step": 5095
    },
    {
      "epoch": 65.28,
      "grad_norm": 0.12701339832600245,
      "learning_rate": 0.0016134877823936607,
      "loss": 2.3851,
      "loss_layer_12_head": 0.5497230291366577,
      "loss_layer_18_head": 0.44018998742103577,
      "loss_layer_24_head": 0.26427847146987915,
      "loss_layer_30_head": 0.1574961245059967,
      "loss_layer_36_head": 0.09797327220439911,
      "loss_layer_42_head": 0.0515299029648304,
      "loss_layer_6_head": 0.7827061414718628,
      "step": 5100
    },
    {
      "epoch": 65.344,
      "grad_norm": 0.1078053690013135,
      "learning_rate": 0.0016082595214655334,
      "loss": 2.4008,
      "loss_layer_12_head": 0.5619354248046875,
      "loss_layer_18_head": 0.447976291179657,
      "loss_layer_24_head": 0.2689681649208069,
      "loss_layer_30_head": 0.1574321687221527,
      "loss_layer_36_head": 0.09932412207126617,
      "loss_layer_42_head": 0.05193851515650749,
      "loss_layer_6_head": 0.8003455400466919,
      "step": 5105
    },
    {
      "epoch": 65.408,
      "grad_norm": 0.09583707385302526,
      "learning_rate": 0.0016030357253561172,
      "loss": 2.3865,
      "loss_layer_12_head": 0.581489622592926,
      "loss_layer_18_head": 0.464430034160614,
      "loss_layer_24_head": 0.27872732281684875,
      "loss_layer_30_head": 0.16363981366157532,
      "loss_layer_36_head": 0.10323194414377213,
      "loss_layer_42_head": 0.053651053458452225,
      "loss_layer_6_head": 0.8330544233322144,
      "step": 5110
    },
    {
      "epoch": 65.472,
      "grad_norm": 0.10723657553926003,
      "learning_rate": 0.00159781642022022,
      "loss": 2.4104,
      "loss_layer_12_head": 0.606437087059021,
      "loss_layer_18_head": 0.4847811162471771,
      "loss_layer_24_head": 0.28904566168785095,
      "loss_layer_30_head": 0.1686481535434723,
      "loss_layer_36_head": 0.10446260869503021,
      "loss_layer_42_head": 0.05410463362932205,
      "loss_layer_6_head": 0.8603846430778503,
      "step": 5115
    },
    {
      "epoch": 65.536,
      "grad_norm": 0.14431950901422044,
      "learning_rate": 0.0015926016321901688,
      "loss": 2.4264,
      "loss_layer_12_head": 0.5394851565361023,
      "loss_layer_18_head": 0.4293598234653473,
      "loss_layer_24_head": 0.2538837790489197,
      "loss_layer_30_head": 0.14877048134803772,
      "loss_layer_36_head": 0.09373395144939423,
      "loss_layer_42_head": 0.048886336386203766,
      "loss_layer_6_head": 0.7683518528938293,
      "step": 5120
    },
    {
      "epoch": 65.6,
      "grad_norm": 0.10995364054223856,
      "learning_rate": 0.001587391387375669,
      "loss": 2.41,
      "loss_layer_12_head": 0.5729557275772095,
      "loss_layer_18_head": 0.45518916845321655,
      "loss_layer_24_head": 0.27106016874313354,
      "loss_layer_30_head": 0.1569114327430725,
      "loss_layer_36_head": 0.10276130586862564,
      "loss_layer_42_head": 0.05186569690704346,
      "loss_layer_6_head": 0.8222551345825195,
      "step": 5125
    },
    {
      "epoch": 65.664,
      "grad_norm": 0.13890167492440236,
      "learning_rate": 0.0015821857118636811,
      "loss": 2.407,
      "loss_layer_12_head": 0.5531170964241028,
      "loss_layer_18_head": 0.4407879710197449,
      "loss_layer_24_head": 0.26450806856155396,
      "loss_layer_30_head": 0.1535550206899643,
      "loss_layer_36_head": 0.10284234583377838,
      "loss_layer_42_head": 0.0503159761428833,
      "loss_layer_6_head": 0.7942212224006653,
      "step": 5130
    },
    {
      "epoch": 65.728,
      "grad_norm": 0.12480137049678254,
      "learning_rate": 0.0015769846317182892,
      "loss": 2.4487,
      "loss_layer_12_head": 0.5928313732147217,
      "loss_layer_18_head": 0.4737528860569,
      "loss_layer_24_head": 0.2860558331012726,
      "loss_layer_30_head": 0.1675199568271637,
      "loss_layer_36_head": 0.12181703001260757,
      "loss_layer_42_head": 0.05903315544128418,
      "loss_layer_6_head": 0.841263473033905,
      "step": 5135
    },
    {
      "epoch": 65.792,
      "grad_norm": 0.10545389079583059,
      "learning_rate": 0.0015717881729805658,
      "loss": 2.5018,
      "loss_layer_12_head": 0.5860747694969177,
      "loss_layer_18_head": 0.46626633405685425,
      "loss_layer_24_head": 0.2767810523509979,
      "loss_layer_30_head": 0.16080421209335327,
      "loss_layer_36_head": 0.11126577854156494,
      "loss_layer_42_head": 0.05226144939661026,
      "loss_layer_6_head": 0.8418697118759155,
      "step": 5140
    },
    {
      "epoch": 65.856,
      "grad_norm": 0.10549016360335825,
      "learning_rate": 0.0015665963616684475,
      "loss": 2.4617,
      "loss_layer_12_head": 0.6157280206680298,
      "loss_layer_18_head": 0.4955260753631592,
      "loss_layer_24_head": 0.2971367835998535,
      "loss_layer_30_head": 0.17143255472183228,
      "loss_layer_36_head": 0.11340661346912384,
      "loss_layer_42_head": 0.05461735650897026,
      "loss_layer_6_head": 0.8681467175483704,
      "step": 5145
    },
    {
      "epoch": 65.92,
      "grad_norm": 0.13262402585679864,
      "learning_rate": 0.0015614092237766007,
      "loss": 2.475,
      "loss_layer_12_head": 0.5893651247024536,
      "loss_layer_18_head": 0.47160211205482483,
      "loss_layer_24_head": 0.28590986132621765,
      "loss_layer_30_head": 0.16392435133457184,
      "loss_layer_36_head": 0.10815385729074478,
      "loss_layer_42_head": 0.053535331040620804,
      "loss_layer_6_head": 0.8383434414863586,
      "step": 5150
    },
    {
      "epoch": 65.984,
      "grad_norm": 0.1267343496900236,
      "learning_rate": 0.0015562267852762912,
      "loss": 2.5291,
      "loss_layer_12_head": 0.5998532176017761,
      "loss_layer_18_head": 0.47886934876441956,
      "loss_layer_24_head": 0.2903502583503723,
      "loss_layer_30_head": 0.16483134031295776,
      "loss_layer_36_head": 0.10678652673959732,
      "loss_layer_42_head": 0.053588204085826874,
      "loss_layer_6_head": 0.8510700464248657,
      "step": 5155
    },
    {
      "epoch": 66.048,
      "grad_norm": 0.12913380441256822,
      "learning_rate": 0.0015510490721152592,
      "loss": 2.3793,
      "loss_layer_12_head": 0.575817883014679,
      "loss_layer_18_head": 0.46248412132263184,
      "loss_layer_24_head": 0.280609130859375,
      "loss_layer_30_head": 0.1617339849472046,
      "loss_layer_36_head": 0.10355842113494873,
      "loss_layer_42_head": 0.05268820375204086,
      "loss_layer_6_head": 0.8183478116989136,
      "step": 5160
    },
    {
      "epoch": 66.112,
      "grad_norm": 0.174621057286195,
      "learning_rate": 0.0015458761102175827,
      "loss": 2.3499,
      "loss_layer_12_head": 0.5809169411659241,
      "loss_layer_18_head": 0.4661749005317688,
      "loss_layer_24_head": 0.2821301817893982,
      "loss_layer_30_head": 0.1599940061569214,
      "loss_layer_36_head": 0.10141108185052872,
      "loss_layer_42_head": 0.05254720523953438,
      "loss_layer_6_head": 0.8283140063285828,
      "step": 5165
    },
    {
      "epoch": 66.176,
      "grad_norm": 0.11617401526532993,
      "learning_rate": 0.0015407079254835508,
      "loss": 2.3821,
      "loss_layer_12_head": 0.5573533773422241,
      "loss_layer_18_head": 0.4488266408443451,
      "loss_layer_24_head": 0.27403944730758667,
      "loss_layer_30_head": 0.1581585705280304,
      "loss_layer_36_head": 0.0994090884923935,
      "loss_layer_42_head": 0.05210883170366287,
      "loss_layer_6_head": 0.7892599701881409,
      "step": 5170
    },
    {
      "epoch": 66.24,
      "grad_norm": 0.10917977715613968,
      "learning_rate": 0.001535544543789537,
      "loss": 2.3893,
      "loss_layer_12_head": 0.5512558221817017,
      "loss_layer_18_head": 0.4416576325893402,
      "loss_layer_24_head": 0.2716659903526306,
      "loss_layer_30_head": 0.1535770446062088,
      "loss_layer_36_head": 0.09767137467861176,
      "loss_layer_42_head": 0.05139914155006409,
      "loss_layer_6_head": 0.7889996767044067,
      "step": 5175
    },
    {
      "epoch": 66.304,
      "grad_norm": 0.13240434974258353,
      "learning_rate": 0.0015303859909878632,
      "loss": 2.3005,
      "loss_layer_12_head": 0.5477917194366455,
      "loss_layer_18_head": 0.4397219717502594,
      "loss_layer_24_head": 0.27692165970802307,
      "loss_layer_30_head": 0.15778204798698425,
      "loss_layer_36_head": 0.10364608466625214,
      "loss_layer_42_head": 0.05219224840402603,
      "loss_layer_6_head": 0.785728394985199,
      "step": 5180
    },
    {
      "epoch": 66.368,
      "grad_norm": 0.15045650173202188,
      "learning_rate": 0.0015252322929066753,
      "loss": 2.3893,
      "loss_layer_12_head": 0.574607789516449,
      "loss_layer_18_head": 0.46148261427879333,
      "loss_layer_24_head": 0.2828689217567444,
      "loss_layer_30_head": 0.15844754874706268,
      "loss_layer_36_head": 0.09825337678194046,
      "loss_layer_42_head": 0.05193888396024704,
      "loss_layer_6_head": 0.8129286766052246,
      "step": 5185
    },
    {
      "epoch": 66.432,
      "grad_norm": 0.12005140953742055,
      "learning_rate": 0.0015200834753498127,
      "loss": 2.4436,
      "loss_layer_12_head": 0.5836073160171509,
      "loss_layer_18_head": 0.4637044370174408,
      "loss_layer_24_head": 0.28275734186172485,
      "loss_layer_30_head": 0.15940195322036743,
      "loss_layer_36_head": 0.09847284853458405,
      "loss_layer_42_head": 0.05349533632397652,
      "loss_layer_6_head": 0.8298639059066772,
      "step": 5190
    },
    {
      "epoch": 66.496,
      "grad_norm": 0.10336775824738777,
      "learning_rate": 0.0015149395640966779,
      "loss": 2.4178,
      "loss_layer_12_head": 0.5568500757217407,
      "loss_layer_18_head": 0.44904518127441406,
      "loss_layer_24_head": 0.2775748372077942,
      "loss_layer_30_head": 0.15874847769737244,
      "loss_layer_36_head": 0.098392054438591,
      "loss_layer_42_head": 0.05226748064160347,
      "loss_layer_6_head": 0.7940891981124878,
      "step": 5195
    },
    {
      "epoch": 66.56,
      "grad_norm": 0.13241049793383383,
      "learning_rate": 0.001509800584902108,
      "loss": 2.4562,
      "loss_layer_12_head": 0.5863093733787537,
      "loss_layer_18_head": 0.46662744879722595,
      "loss_layer_24_head": 0.28991150856018066,
      "loss_layer_30_head": 0.15963459014892578,
      "loss_layer_36_head": 0.09875790029764175,
      "loss_layer_42_head": 0.05209606885910034,
      "loss_layer_6_head": 0.8258094787597656,
      "step": 5200
    },
    {
      "epoch": 66.56,
      "eval_loss": 5.392152309417725,
      "eval_loss_layer_12_head": 1.2484610080718994,
      "eval_loss_layer_18_head": 1.0776201486587524,
      "eval_loss_layer_24_head": 0.6967743635177612,
      "eval_loss_layer_30_head": 0.44787707924842834,
      "eval_loss_layer_36_head": 0.28706231713294983,
      "eval_loss_layer_42_head": 0.16963715851306915,
      "eval_loss_layer_6_head": 1.591783046722412,
      "eval_runtime": 33.068,
      "eval_samples_per_second": 9.677,
      "eval_steps_per_second": 0.605,
      "step": 5200
    },
    {
      "epoch": 66.624,
      "grad_norm": 0.16191072094282405,
      "learning_rate": 0.0015046665634962475,
      "loss": 2.4147,
      "loss_layer_12_head": 0.589972198009491,
      "loss_layer_18_head": 0.47152742743492126,
      "loss_layer_24_head": 0.2948974668979645,
      "loss_layer_30_head": 0.1623515784740448,
      "loss_layer_36_head": 0.10045258700847626,
      "loss_layer_42_head": 0.05311299487948418,
      "loss_layer_6_head": 0.8378392457962036,
      "step": 5205
    },
    {
      "epoch": 66.688,
      "grad_norm": 0.13706254698484682,
      "learning_rate": 0.001499537525584416,
      "loss": 2.4648,
      "loss_layer_12_head": 0.5931217670440674,
      "loss_layer_18_head": 0.47600188851356506,
      "loss_layer_24_head": 0.308777391910553,
      "loss_layer_30_head": 0.16290122270584106,
      "loss_layer_36_head": 0.09984590113162994,
      "loss_layer_42_head": 0.0524873249232769,
      "loss_layer_6_head": 0.8395147323608398,
      "step": 5210
    },
    {
      "epoch": 66.752,
      "grad_norm": 0.1779223448565014,
      "learning_rate": 0.0014944134968469833,
      "loss": 2.439,
      "loss_layer_12_head": 0.5392745137214661,
      "loss_layer_18_head": 0.42887502908706665,
      "loss_layer_24_head": 0.2948439419269562,
      "loss_layer_30_head": 0.15068620443344116,
      "loss_layer_36_head": 0.09440300613641739,
      "loss_layer_42_head": 0.05145072937011719,
      "loss_layer_6_head": 0.7666341066360474,
      "step": 5215
    },
    {
      "epoch": 66.816,
      "grad_norm": 0.18119280613573233,
      "learning_rate": 0.001489294502939238,
      "loss": 2.4678,
      "loss_layer_12_head": 0.6043515205383301,
      "loss_layer_18_head": 0.4831230640411377,
      "loss_layer_24_head": 0.3170244097709656,
      "loss_layer_30_head": 0.16725808382034302,
      "loss_layer_36_head": 0.10351943969726562,
      "loss_layer_42_head": 0.05549372360110283,
      "loss_layer_6_head": 0.8501769304275513,
      "step": 5220
    },
    {
      "epoch": 66.88,
      "grad_norm": 0.13459033770365028,
      "learning_rate": 0.0014841805694912583,
      "loss": 2.466,
      "loss_layer_12_head": 0.5435455441474915,
      "loss_layer_18_head": 0.4371386170387268,
      "loss_layer_24_head": 0.28321653604507446,
      "loss_layer_30_head": 0.15519121289253235,
      "loss_layer_36_head": 0.09781339764595032,
      "loss_layer_42_head": 0.05196285992860794,
      "loss_layer_6_head": 0.7761009931564331,
      "step": 5225
    },
    {
      "epoch": 66.944,
      "grad_norm": 0.1298061968161023,
      "learning_rate": 0.0014790717221077898,
      "loss": 2.4933,
      "loss_layer_12_head": 0.5894067287445068,
      "loss_layer_18_head": 0.4710465967655182,
      "loss_layer_24_head": 0.29570311307907104,
      "loss_layer_30_head": 0.16225235164165497,
      "loss_layer_36_head": 0.10165181010961533,
      "loss_layer_42_head": 0.054485075175762177,
      "loss_layer_6_head": 0.8337615728378296,
      "step": 5230
    },
    {
      "epoch": 67.008,
      "grad_norm": 0.10849981869882394,
      "learning_rate": 0.0014739679863681086,
      "loss": 2.4588,
      "loss_layer_12_head": 0.5631723999977112,
      "loss_layer_18_head": 0.44824886322021484,
      "loss_layer_24_head": 0.27680662274360657,
      "loss_layer_30_head": 0.15581032633781433,
      "loss_layer_36_head": 0.09697931259870529,
      "loss_layer_42_head": 0.05224229022860527,
      "loss_layer_6_head": 0.7979419231414795,
      "step": 5235
    },
    {
      "epoch": 67.072,
      "grad_norm": 0.102914845301251,
      "learning_rate": 0.001468869387825899,
      "loss": 2.3364,
      "loss_layer_12_head": 0.5490279793739319,
      "loss_layer_18_head": 0.43630725145339966,
      "loss_layer_24_head": 0.2681925892829895,
      "loss_layer_30_head": 0.15238827466964722,
      "loss_layer_36_head": 0.09465701878070831,
      "loss_layer_42_head": 0.05011289194226265,
      "loss_layer_6_head": 0.7817306518554688,
      "step": 5240
    },
    {
      "epoch": 67.136,
      "grad_norm": 0.10321088784920095,
      "learning_rate": 0.001463775952009125,
      "loss": 2.3471,
      "loss_layer_12_head": 0.5957650542259216,
      "loss_layer_18_head": 0.4730437397956848,
      "loss_layer_24_head": 0.2890301048755646,
      "loss_layer_30_head": 0.16381335258483887,
      "loss_layer_36_head": 0.10148737579584122,
      "loss_layer_42_head": 0.054029274731874466,
      "loss_layer_6_head": 0.8454655408859253,
      "step": 5245
    },
    {
      "epoch": 67.2,
      "grad_norm": 0.13111035968958887,
      "learning_rate": 0.0014586877044199015,
      "loss": 2.3535,
      "loss_layer_12_head": 0.5443896055221558,
      "loss_layer_18_head": 0.43127673864364624,
      "loss_layer_24_head": 0.2620411515235901,
      "loss_layer_30_head": 0.14987054467201233,
      "loss_layer_36_head": 0.09360454231500626,
      "loss_layer_42_head": 0.04979507625102997,
      "loss_layer_6_head": 0.7823222875595093,
      "step": 5250
    },
    {
      "epoch": 67.264,
      "grad_norm": 0.1817552393841058,
      "learning_rate": 0.0014536046705343633,
      "loss": 2.3909,
      "loss_layer_12_head": 0.5808327794075012,
      "loss_layer_18_head": 0.46026211977005005,
      "loss_layer_24_head": 0.27789655327796936,
      "loss_layer_30_head": 0.16021640598773956,
      "loss_layer_36_head": 0.09841648489236832,
      "loss_layer_42_head": 0.05202724412083626,
      "loss_layer_6_head": 0.8238946199417114,
      "step": 5255
    },
    {
      "epoch": 67.328,
      "grad_norm": 0.10987006127100375,
      "learning_rate": 0.0014485268758025467,
      "loss": 2.3766,
      "loss_layer_12_head": 0.5754832625389099,
      "loss_layer_18_head": 0.45896443724632263,
      "loss_layer_24_head": 0.27637800574302673,
      "loss_layer_30_head": 0.1592140793800354,
      "loss_layer_36_head": 0.09824047982692719,
      "loss_layer_42_head": 0.05163562297821045,
      "loss_layer_6_head": 0.8226001858711243,
      "step": 5260
    },
    {
      "epoch": 67.392,
      "grad_norm": 0.13149676991956122,
      "learning_rate": 0.0014434543456482518,
      "loss": 2.3644,
      "loss_layer_12_head": 0.5703908801078796,
      "loss_layer_18_head": 0.4563451409339905,
      "loss_layer_24_head": 0.2741951048374176,
      "loss_layer_30_head": 0.15920661389827728,
      "loss_layer_36_head": 0.0973561629652977,
      "loss_layer_42_head": 0.05089563876390457,
      "loss_layer_6_head": 0.8119163513183594,
      "step": 5265
    },
    {
      "epoch": 67.456,
      "grad_norm": 0.2086662334394102,
      "learning_rate": 0.0014383871054689213,
      "loss": 2.3922,
      "loss_layer_12_head": 0.5692805051803589,
      "loss_layer_18_head": 0.4595755636692047,
      "loss_layer_24_head": 0.27245622873306274,
      "loss_layer_30_head": 0.16067852079868317,
      "loss_layer_36_head": 0.09791797399520874,
      "loss_layer_42_head": 0.05187373608350754,
      "loss_layer_6_head": 0.8089947700500488,
      "step": 5270
    },
    {
      "epoch": 67.52,
      "grad_norm": 0.09671951085170219,
      "learning_rate": 0.0014333251806355114,
      "loss": 2.4089,
      "loss_layer_12_head": 0.592002809047699,
      "loss_layer_18_head": 0.47572773694992065,
      "loss_layer_24_head": 0.2839796543121338,
      "loss_layer_30_head": 0.16629961133003235,
      "loss_layer_36_head": 0.10143260657787323,
      "loss_layer_42_head": 0.053486406803131104,
      "loss_layer_6_head": 0.8420193791389465,
      "step": 5275
    },
    {
      "epoch": 67.584,
      "grad_norm": 0.19162864469090896,
      "learning_rate": 0.001428268596492364,
      "loss": 2.4658,
      "loss_layer_12_head": 0.5722814798355103,
      "loss_layer_18_head": 0.4598506987094879,
      "loss_layer_24_head": 0.2729315459728241,
      "loss_layer_30_head": 0.1651681512594223,
      "loss_layer_36_head": 0.09765272587537766,
      "loss_layer_42_head": 0.05150548368692398,
      "loss_layer_6_head": 0.8194282650947571,
      "step": 5280
    },
    {
      "epoch": 67.648,
      "grad_norm": 0.2066364579248886,
      "learning_rate": 0.001423217378357085,
      "loss": 2.4269,
      "loss_layer_12_head": 0.5701189637184143,
      "loss_layer_18_head": 0.46421822905540466,
      "loss_layer_24_head": 0.27403873205184937,
      "loss_layer_30_head": 0.16465747356414795,
      "loss_layer_36_head": 0.09885530173778534,
      "loss_layer_42_head": 0.05258981138467789,
      "loss_layer_6_head": 0.8098844289779663,
      "step": 5285
    },
    {
      "epoch": 67.712,
      "grad_norm": 0.13810853811181112,
      "learning_rate": 0.0014181715515204094,
      "loss": 2.4496,
      "loss_layer_12_head": 0.5842360258102417,
      "loss_layer_18_head": 0.4747219681739807,
      "loss_layer_24_head": 0.2791658937931061,
      "loss_layer_30_head": 0.1640622913837433,
      "loss_layer_36_head": 0.09913156926631927,
      "loss_layer_42_head": 0.05131257325410843,
      "loss_layer_6_head": 0.8373994827270508,
      "step": 5290
    },
    {
      "epoch": 67.776,
      "grad_norm": 0.14499622796269704,
      "learning_rate": 0.0014131311412460795,
      "loss": 2.4342,
      "loss_layer_12_head": 0.5868300795555115,
      "loss_layer_18_head": 0.46979743242263794,
      "loss_layer_24_head": 0.27826061844825745,
      "loss_layer_30_head": 0.1639150083065033,
      "loss_layer_36_head": 0.10132886469364166,
      "loss_layer_42_head": 0.0528276190161705,
      "loss_layer_6_head": 0.8368266224861145,
      "step": 5295
    },
    {
      "epoch": 67.84,
      "grad_norm": 0.09922358477984598,
      "learning_rate": 0.0014080961727707185,
      "loss": 2.4066,
      "loss_layer_12_head": 0.5585824251174927,
      "loss_layer_18_head": 0.4471663534641266,
      "loss_layer_24_head": 0.26423245668411255,
      "loss_layer_30_head": 0.15351445972919464,
      "loss_layer_36_head": 0.09600724279880524,
      "loss_layer_42_head": 0.05036250501871109,
      "loss_layer_6_head": 0.7947167158126831,
      "step": 5300
    },
    {
      "epoch": 67.904,
      "grad_norm": 0.09191329284133154,
      "learning_rate": 0.0014030666713037026,
      "loss": 2.4458,
      "loss_layer_12_head": 0.569415271282196,
      "loss_layer_18_head": 0.4552089273929596,
      "loss_layer_24_head": 0.26977139711380005,
      "loss_layer_30_head": 0.15580052137374878,
      "loss_layer_36_head": 0.09613678604364395,
      "loss_layer_42_head": 0.05078815296292305,
      "loss_layer_6_head": 0.8126457333564758,
      "step": 5305
    },
    {
      "epoch": 67.968,
      "grad_norm": 0.11261108161761188,
      "learning_rate": 0.0013980426620270351,
      "loss": 2.4416,
      "loss_layer_12_head": 0.5624688863754272,
      "loss_layer_18_head": 0.4521670937538147,
      "loss_layer_24_head": 0.2725401520729065,
      "loss_layer_30_head": 0.1606042981147766,
      "loss_layer_36_head": 0.09809018671512604,
      "loss_layer_42_head": 0.055001772940158844,
      "loss_layer_6_head": 0.8002514839172363,
      "step": 5310
    },
    {
      "epoch": 68.032,
      "grad_norm": 0.11297204445583521,
      "learning_rate": 0.001393024170095224,
      "loss": 2.359,
      "loss_layer_12_head": 0.5645781755447388,
      "loss_layer_18_head": 0.4501315951347351,
      "loss_layer_24_head": 0.2686399519443512,
      "loss_layer_30_head": 0.15651008486747742,
      "loss_layer_36_head": 0.09817198663949966,
      "loss_layer_42_head": 0.05202815681695938,
      "loss_layer_6_head": 0.8111818432807922,
      "step": 5315
    },
    {
      "epoch": 68.096,
      "grad_norm": 0.09321882747545121,
      "learning_rate": 0.0013880112206351475,
      "loss": 2.3252,
      "loss_layer_12_head": 0.5628981590270996,
      "loss_layer_18_head": 0.4489511549472809,
      "loss_layer_24_head": 0.2696271538734436,
      "loss_layer_30_head": 0.15692469477653503,
      "loss_layer_36_head": 0.09853383153676987,
      "loss_layer_42_head": 0.05248745158314705,
      "loss_layer_6_head": 0.8054313659667969,
      "step": 5320
    },
    {
      "epoch": 68.16,
      "grad_norm": 0.09578653259782523,
      "learning_rate": 0.0013830038387459354,
      "loss": 2.3347,
      "loss_layer_12_head": 0.5337448120117188,
      "loss_layer_18_head": 0.4261392652988434,
      "loss_layer_24_head": 0.2559625208377838,
      "loss_layer_30_head": 0.15098945796489716,
      "loss_layer_36_head": 0.09315741807222366,
      "loss_layer_42_head": 0.05038934201002121,
      "loss_layer_6_head": 0.7588936686515808,
      "step": 5325
    },
    {
      "epoch": 68.224,
      "grad_norm": 0.08433854287879275,
      "learning_rate": 0.0013780020494988446,
      "loss": 2.3427,
      "loss_layer_12_head": 0.5369664430618286,
      "loss_layer_18_head": 0.42983609437942505,
      "loss_layer_24_head": 0.2558009624481201,
      "loss_layer_30_head": 0.14776499569416046,
      "loss_layer_36_head": 0.09208012372255325,
      "loss_layer_42_head": 0.04844372346997261,
      "loss_layer_6_head": 0.7649826407432556,
      "step": 5330
    },
    {
      "epoch": 68.288,
      "grad_norm": 0.09366103480450942,
      "learning_rate": 0.0013730058779371265,
      "loss": 2.3606,
      "loss_layer_12_head": 0.547406017780304,
      "loss_layer_18_head": 0.43980520963668823,
      "loss_layer_24_head": 0.26384928822517395,
      "loss_layer_30_head": 0.15445473790168762,
      "loss_layer_36_head": 0.09606665372848511,
      "loss_layer_42_head": 0.05105496197938919,
      "loss_layer_6_head": 0.7795709371566772,
      "step": 5335
    },
    {
      "epoch": 68.352,
      "grad_norm": 0.1282052932353282,
      "learning_rate": 0.0013680153490759073,
      "loss": 2.3665,
      "loss_layer_12_head": 0.5479649305343628,
      "loss_layer_18_head": 0.43836578726768494,
      "loss_layer_24_head": 0.26084762811660767,
      "loss_layer_30_head": 0.15122172236442566,
      "loss_layer_36_head": 0.09420381486415863,
      "loss_layer_42_head": 0.05001320317387581,
      "loss_layer_6_head": 0.7851589918136597,
      "step": 5340
    },
    {
      "epoch": 68.416,
      "grad_norm": 0.11168793123944552,
      "learning_rate": 0.00136303048790206,
      "loss": 2.4059,
      "loss_layer_12_head": 0.5697135329246521,
      "loss_layer_18_head": 0.45545822381973267,
      "loss_layer_24_head": 0.27111271023750305,
      "loss_layer_30_head": 0.15657876431941986,
      "loss_layer_36_head": 0.09654238075017929,
      "loss_layer_42_head": 0.05144345015287399,
      "loss_layer_6_head": 0.8124370574951172,
      "step": 5345
    },
    {
      "epoch": 68.48,
      "grad_norm": 0.13134492580226353,
      "learning_rate": 0.0013580513193740817,
      "loss": 2.3748,
      "loss_layer_12_head": 0.5600630640983582,
      "loss_layer_18_head": 0.44794923067092896,
      "loss_layer_24_head": 0.26532214879989624,
      "loss_layer_30_head": 0.153532937169075,
      "loss_layer_36_head": 0.09464345872402191,
      "loss_layer_42_head": 0.05082206800580025,
      "loss_layer_6_head": 0.7952293157577515,
      "step": 5350
    },
    {
      "epoch": 68.544,
      "grad_norm": 0.10035695195129772,
      "learning_rate": 0.0013530778684219646,
      "loss": 2.4004,
      "loss_layer_12_head": 0.5764386057853699,
      "loss_layer_18_head": 0.4595361649990082,
      "loss_layer_24_head": 0.2747218608856201,
      "loss_layer_30_head": 0.16208073496818542,
      "loss_layer_36_head": 0.09960604459047318,
      "loss_layer_42_head": 0.05307096242904663,
      "loss_layer_6_head": 0.8198025822639465,
      "step": 5355
    },
    {
      "epoch": 68.608,
      "grad_norm": 0.1053923043107958,
      "learning_rate": 0.0013481101599470793,
      "loss": 2.4004,
      "loss_layer_12_head": 0.5548727512359619,
      "loss_layer_18_head": 0.4422317445278168,
      "loss_layer_24_head": 0.26513952016830444,
      "loss_layer_30_head": 0.15538263320922852,
      "loss_layer_36_head": 0.09866098314523697,
      "loss_layer_42_head": 0.052181024104356766,
      "loss_layer_6_head": 0.7949376106262207,
      "step": 5360
    },
    {
      "epoch": 68.672,
      "grad_norm": 0.12537444458249866,
      "learning_rate": 0.0013431482188220368,
      "loss": 2.3963,
      "loss_layer_12_head": 0.5562868118286133,
      "loss_layer_18_head": 0.44399842619895935,
      "loss_layer_24_head": 0.2661983370780945,
      "loss_layer_30_head": 0.15576837956905365,
      "loss_layer_36_head": 0.0965532511472702,
      "loss_layer_42_head": 0.05113821476697922,
      "loss_layer_6_head": 0.7879191637039185,
      "step": 5365
    },
    {
      "epoch": 68.736,
      "grad_norm": 0.10974722976495493,
      "learning_rate": 0.0013381920698905786,
      "loss": 2.4099,
      "loss_layer_12_head": 0.5761160850524902,
      "loss_layer_18_head": 0.46047431230545044,
      "loss_layer_24_head": 0.27487877011299133,
      "loss_layer_30_head": 0.15948805212974548,
      "loss_layer_36_head": 0.09914465248584747,
      "loss_layer_42_head": 0.05208210274577141,
      "loss_layer_6_head": 0.8177138566970825,
      "step": 5370
    },
    {
      "epoch": 68.8,
      "grad_norm": 0.09402809504786983,
      "learning_rate": 0.0013332417379674426,
      "loss": 2.4257,
      "loss_layer_12_head": 0.5874722003936768,
      "loss_layer_18_head": 0.47112053632736206,
      "loss_layer_24_head": 0.2806778848171234,
      "loss_layer_30_head": 0.16290700435638428,
      "loss_layer_36_head": 0.10013767331838608,
      "loss_layer_42_head": 0.052746839821338654,
      "loss_layer_6_head": 0.834861159324646,
      "step": 5375
    },
    {
      "epoch": 68.864,
      "grad_norm": 0.12896609255003552,
      "learning_rate": 0.001328297247838241,
      "loss": 2.4523,
      "loss_layer_12_head": 0.5604492425918579,
      "loss_layer_18_head": 0.44999709725379944,
      "loss_layer_24_head": 0.2681766152381897,
      "loss_layer_30_head": 0.15659764409065247,
      "loss_layer_36_head": 0.097244992852211,
      "loss_layer_42_head": 0.050766605883836746,
      "loss_layer_6_head": 0.8050448298454285,
      "step": 5380
    },
    {
      "epoch": 68.928,
      "grad_norm": 0.09423411377688119,
      "learning_rate": 0.0013233586242593386,
      "loss": 2.4514,
      "loss_layer_12_head": 0.5618788003921509,
      "loss_layer_18_head": 0.45106250047683716,
      "loss_layer_24_head": 0.2695927023887634,
      "loss_layer_30_head": 0.15639103949069977,
      "loss_layer_36_head": 0.09700961410999298,
      "loss_layer_42_head": 0.05087515711784363,
      "loss_layer_6_head": 0.7988547086715698,
      "step": 5385
    },
    {
      "epoch": 68.992,
      "grad_norm": 0.09379310864402048,
      "learning_rate": 0.0013184258919577269,
      "loss": 2.4361,
      "loss_layer_12_head": 0.5512022376060486,
      "loss_layer_18_head": 0.44364267587661743,
      "loss_layer_24_head": 0.2627025246620178,
      "loss_layer_30_head": 0.15209001302719116,
      "loss_layer_36_head": 0.09431121498346329,
      "loss_layer_42_head": 0.04996483027935028,
      "loss_layer_6_head": 0.7854719758033752,
      "step": 5390
    },
    {
      "epoch": 69.056,
      "grad_norm": 0.09257588272020106,
      "learning_rate": 0.001313499075630899,
      "loss": 2.3508,
      "loss_layer_12_head": 0.5990757346153259,
      "loss_layer_18_head": 0.47921663522720337,
      "loss_layer_24_head": 0.28636088967323303,
      "loss_layer_30_head": 0.16624771058559418,
      "loss_layer_36_head": 0.10295461118221283,
      "loss_layer_42_head": 0.05412915349006653,
      "loss_layer_6_head": 0.8421087265014648,
      "step": 5395
    },
    {
      "epoch": 69.12,
      "grad_norm": 0.10972167779153522,
      "learning_rate": 0.0013085781999467303,
      "loss": 2.3506,
      "loss_layer_12_head": 0.527679979801178,
      "loss_layer_18_head": 0.41973042488098145,
      "loss_layer_24_head": 0.2514473795890808,
      "loss_layer_30_head": 0.14601513743400574,
      "loss_layer_36_head": 0.09090875089168549,
      "loss_layer_42_head": 0.04853242635726929,
      "loss_layer_6_head": 0.7580686807632446,
      "step": 5400
    },
    {
      "epoch": 69.12,
      "eval_loss": 5.376793384552002,
      "eval_loss_layer_12_head": 1.245397686958313,
      "eval_loss_layer_18_head": 1.079089879989624,
      "eval_loss_layer_24_head": 0.6869164705276489,
      "eval_loss_layer_30_head": 0.44741615653038025,
      "eval_loss_layer_36_head": 0.2866867780685425,
      "eval_loss_layer_42_head": 0.17103120684623718,
      "eval_loss_layer_6_head": 1.5882246494293213,
      "eval_runtime": 33.0893,
      "eval_samples_per_second": 9.671,
      "eval_steps_per_second": 0.604,
      "step": 5400
    },
    {
      "epoch": 69.184,
      "grad_norm": 0.11911550974034457,
      "learning_rate": 0.0013036632895433503,
      "loss": 2.3432,
      "loss_layer_12_head": 0.5572623014450073,
      "loss_layer_18_head": 0.44827336072921753,
      "loss_layer_24_head": 0.2694876790046692,
      "loss_layer_30_head": 0.1543877124786377,
      "loss_layer_36_head": 0.09580288827419281,
      "loss_layer_42_head": 0.050012242048978806,
      "loss_layer_6_head": 0.7944494485855103,
      "step": 5405
    },
    {
      "epoch": 69.248,
      "grad_norm": 0.09231459054911244,
      "learning_rate": 0.001298754369029022,
      "loss": 2.3599,
      "loss_layer_12_head": 0.5819986462593079,
      "loss_layer_18_head": 0.4649704396724701,
      "loss_layer_24_head": 0.2826339602470398,
      "loss_layer_30_head": 0.16461049020290375,
      "loss_layer_36_head": 0.10721878707408905,
      "loss_layer_42_head": 0.0656307190656662,
      "loss_layer_6_head": 0.8252191543579102,
      "step": 5410
    },
    {
      "epoch": 69.312,
      "grad_norm": 0.09733533974241086,
      "learning_rate": 0.001293851462982017,
      "loss": 2.3713,
      "loss_layer_12_head": 0.5516754984855652,
      "loss_layer_18_head": 0.44041284918785095,
      "loss_layer_24_head": 0.26253288984298706,
      "loss_layer_30_head": 0.15283706784248352,
      "loss_layer_36_head": 0.09471763670444489,
      "loss_layer_42_head": 0.049144305288791656,
      "loss_layer_6_head": 0.7913897633552551,
      "step": 5415
    },
    {
      "epoch": 69.376,
      "grad_norm": 0.08601094975841313,
      "learning_rate": 0.001288954595950494,
      "loss": 2.3725,
      "loss_layer_12_head": 0.5871697068214417,
      "loss_layer_18_head": 0.46904411911964417,
      "loss_layer_24_head": 0.28164753317832947,
      "loss_layer_30_head": 0.1631268858909607,
      "loss_layer_36_head": 0.100311778485775,
      "loss_layer_42_head": 0.052251894026994705,
      "loss_layer_6_head": 0.8377426266670227,
      "step": 5420
    },
    {
      "epoch": 69.44,
      "grad_norm": 0.08295933706955866,
      "learning_rate": 0.0012840637924523741,
      "loss": 2.3763,
      "loss_layer_12_head": 0.5390000343322754,
      "loss_layer_18_head": 0.43138718605041504,
      "loss_layer_24_head": 0.2577996850013733,
      "loss_layer_30_head": 0.15147358179092407,
      "loss_layer_36_head": 0.09378086030483246,
      "loss_layer_42_head": 0.050283629447221756,
      "loss_layer_6_head": 0.7683120965957642,
      "step": 5425
    },
    {
      "epoch": 69.504,
      "grad_norm": 0.09945736093189426,
      "learning_rate": 0.0012791790769752232,
      "loss": 2.3158,
      "loss_layer_12_head": 0.5166209936141968,
      "loss_layer_18_head": 0.41152581572532654,
      "loss_layer_24_head": 0.2474578619003296,
      "loss_layer_30_head": 0.14296789467334747,
      "loss_layer_36_head": 0.08760537207126617,
      "loss_layer_42_head": 0.04737364500761032,
      "loss_layer_6_head": 0.7369243502616882,
      "step": 5430
    },
    {
      "epoch": 69.568,
      "grad_norm": 0.10285879997785655,
      "learning_rate": 0.0012743004739761176,
      "loss": 2.4045,
      "loss_layer_12_head": 0.5747213959693909,
      "loss_layer_18_head": 0.4586687684059143,
      "loss_layer_24_head": 0.27384305000305176,
      "loss_layer_30_head": 0.15795546770095825,
      "loss_layer_36_head": 0.09840188920497894,
      "loss_layer_42_head": 0.053668152540922165,
      "loss_layer_6_head": 0.8138332366943359,
      "step": 5435
    },
    {
      "epoch": 69.632,
      "grad_norm": 0.09800223452159455,
      "learning_rate": 0.0012694280078815383,
      "loss": 2.3961,
      "loss_layer_12_head": 0.536048948764801,
      "loss_layer_18_head": 0.42551690340042114,
      "loss_layer_24_head": 0.2547954022884369,
      "loss_layer_30_head": 0.1509898453950882,
      "loss_layer_36_head": 0.09415043145418167,
      "loss_layer_42_head": 0.0507943220436573,
      "loss_layer_6_head": 0.7633163928985596,
      "step": 5440
    },
    {
      "epoch": 69.696,
      "grad_norm": 0.10401117858247784,
      "learning_rate": 0.0012645617030872327,
      "loss": 2.4013,
      "loss_layer_12_head": 0.5601393580436707,
      "loss_layer_18_head": 0.44662365317344666,
      "loss_layer_24_head": 0.2655910849571228,
      "loss_layer_30_head": 0.15691301226615906,
      "loss_layer_36_head": 0.09636274725198746,
      "loss_layer_42_head": 0.05133684352040291,
      "loss_layer_6_head": 0.7935290932655334,
      "step": 5445
    },
    {
      "epoch": 69.76,
      "grad_norm": 0.09608083671249089,
      "learning_rate": 0.0012597015839581032,
      "loss": 2.448,
      "loss_layer_12_head": 0.5647367238998413,
      "loss_layer_18_head": 0.4528251588344574,
      "loss_layer_24_head": 0.26969465613365173,
      "loss_layer_30_head": 0.15698231756687164,
      "loss_layer_36_head": 0.09726875275373459,
      "loss_layer_42_head": 0.051156409084796906,
      "loss_layer_6_head": 0.8049466013908386,
      "step": 5450
    },
    {
      "epoch": 69.824,
      "grad_norm": 0.110879816400583,
      "learning_rate": 0.0012548476748280798,
      "loss": 2.3704,
      "loss_layer_12_head": 0.5830736756324768,
      "loss_layer_18_head": 0.4646722376346588,
      "loss_layer_24_head": 0.2795243561267853,
      "loss_layer_30_head": 0.16013593971729279,
      "loss_layer_36_head": 0.0994807705283165,
      "loss_layer_42_head": 0.05217687040567398,
      "loss_layer_6_head": 0.8249305486679077,
      "step": 5455
    },
    {
      "epoch": 69.888,
      "grad_norm": 0.10073155451332941,
      "learning_rate": 0.0012500000000000007,
      "loss": 2.4186,
      "loss_layer_12_head": 0.5584479570388794,
      "loss_layer_18_head": 0.4453919529914856,
      "loss_layer_24_head": 0.27140891551971436,
      "loss_layer_30_head": 0.15368501842021942,
      "loss_layer_36_head": 0.09483233839273453,
      "loss_layer_42_head": 0.050274115055799484,
      "loss_layer_6_head": 0.8052247166633606,
      "step": 5460
    },
    {
      "epoch": 69.952,
      "grad_norm": 0.09953006724078228,
      "learning_rate": 0.0012451585837454878,
      "loss": 2.3935,
      "loss_layer_12_head": 0.5839226245880127,
      "loss_layer_18_head": 0.4653795659542084,
      "loss_layer_24_head": 0.27991971373558044,
      "loss_layer_30_head": 0.16087405383586884,
      "loss_layer_36_head": 0.09985681623220444,
      "loss_layer_42_head": 0.0529613271355629,
      "loss_layer_6_head": 0.8292669057846069,
      "step": 5465
    },
    {
      "epoch": 70.016,
      "grad_norm": 0.10667955037065256,
      "learning_rate": 0.0012403234503048318,
      "loss": 2.405,
      "loss_layer_12_head": 0.5540944337844849,
      "loss_layer_18_head": 0.443520724773407,
      "loss_layer_24_head": 0.26553580164909363,
      "loss_layer_30_head": 0.15466545522212982,
      "loss_layer_36_head": 0.09650041908025742,
      "loss_layer_42_head": 0.05060475319623947,
      "loss_layer_6_head": 0.7884739637374878,
      "step": 5470
    },
    {
      "epoch": 70.08,
      "grad_norm": 0.10362779385163681,
      "learning_rate": 0.0012354946238868631,
      "loss": 2.2761,
      "loss_layer_12_head": 0.5024293065071106,
      "loss_layer_18_head": 0.40054965019226074,
      "loss_layer_24_head": 0.2386283576488495,
      "loss_layer_30_head": 0.13751547038555145,
      "loss_layer_36_head": 0.08639391511678696,
      "loss_layer_42_head": 0.04544436186552048,
      "loss_layer_6_head": 0.7199009656906128,
      "step": 5475
    },
    {
      "epoch": 70.144,
      "grad_norm": 0.09311694894960537,
      "learning_rate": 0.001230672128668831,
      "loss": 2.3019,
      "loss_layer_12_head": 0.5336896777153015,
      "loss_layer_18_head": 0.4240944981575012,
      "loss_layer_24_head": 0.25293800234794617,
      "loss_layer_30_head": 0.14536747336387634,
      "loss_layer_36_head": 0.0910239890217781,
      "loss_layer_42_head": 0.047807276248931885,
      "loss_layer_6_head": 0.7614033818244934,
      "step": 5480
    },
    {
      "epoch": 70.208,
      "grad_norm": 0.08695394309466753,
      "learning_rate": 0.001225855988796291,
      "loss": 2.3312,
      "loss_layer_12_head": 0.5353509187698364,
      "loss_layer_18_head": 0.4314948618412018,
      "loss_layer_24_head": 0.26072901487350464,
      "loss_layer_30_head": 0.158537358045578,
      "loss_layer_36_head": 0.1061687022447586,
      "loss_layer_42_head": 0.05178200080990791,
      "loss_layer_6_head": 0.7615779042243958,
      "step": 5485
    },
    {
      "epoch": 70.272,
      "grad_norm": 0.08973676864576462,
      "learning_rate": 0.0012210462283829754,
      "loss": 2.3595,
      "loss_layer_12_head": 0.585614025592804,
      "loss_layer_18_head": 0.4697713255882263,
      "loss_layer_24_head": 0.2828715443611145,
      "loss_layer_30_head": 0.1710052192211151,
      "loss_layer_36_head": 0.10145296901464462,
      "loss_layer_42_head": 0.05311361700296402,
      "loss_layer_6_head": 0.8289822340011597,
      "step": 5490
    },
    {
      "epoch": 70.336,
      "grad_norm": 0.09030690920132081,
      "learning_rate": 0.0012162428715106752,
      "loss": 2.3629,
      "loss_layer_12_head": 0.5600045919418335,
      "loss_layer_18_head": 0.44477295875549316,
      "loss_layer_24_head": 0.26514145731925964,
      "loss_layer_30_head": 0.152329683303833,
      "loss_layer_36_head": 0.09397609531879425,
      "loss_layer_42_head": 0.04952815920114517,
      "loss_layer_6_head": 0.7960458397865295,
      "step": 5495
    },
    {
      "epoch": 70.4,
      "grad_norm": 0.11861557356175668,
      "learning_rate": 0.0012114459422291205,
      "loss": 2.3852,
      "loss_layer_12_head": 0.5796315670013428,
      "loss_layer_18_head": 0.45883575081825256,
      "loss_layer_24_head": 0.2736167311668396,
      "loss_layer_30_head": 0.1588888019323349,
      "loss_layer_36_head": 0.097938671708107,
      "loss_layer_42_head": 0.05065115541219711,
      "loss_layer_6_head": 0.8249458074569702,
      "step": 5500
    },
    {
      "epoch": 70.464,
      "grad_norm": 0.08249351284968628,
      "learning_rate": 0.0012066554645558578,
      "loss": 2.3869,
      "loss_layer_12_head": 0.5634979009628296,
      "loss_layer_18_head": 0.45192256569862366,
      "loss_layer_24_head": 0.27416080236434937,
      "loss_layer_30_head": 0.16973450779914856,
      "loss_layer_36_head": 0.10164041817188263,
      "loss_layer_42_head": 0.051729362457990646,
      "loss_layer_6_head": 0.7973347306251526,
      "step": 5505
    },
    {
      "epoch": 70.528,
      "grad_norm": 0.08076197746640386,
      "learning_rate": 0.001201871462476135,
      "loss": 2.3835,
      "loss_layer_12_head": 0.5642200708389282,
      "loss_layer_18_head": 0.4532284140586853,
      "loss_layer_24_head": 0.27182674407958984,
      "loss_layer_30_head": 0.15712055563926697,
      "loss_layer_36_head": 0.09557511657476425,
      "loss_layer_42_head": 0.04929818585515022,
      "loss_layer_6_head": 0.8016234636306763,
      "step": 5510
    },
    {
      "epoch": 70.592,
      "grad_norm": 0.0960779268832577,
      "learning_rate": 0.0011970939599427741,
      "loss": 2.3875,
      "loss_layer_12_head": 0.580566942691803,
      "loss_layer_18_head": 0.46431073546409607,
      "loss_layer_24_head": 0.27464035153388977,
      "loss_layer_30_head": 0.16010700166225433,
      "loss_layer_36_head": 0.09901885688304901,
      "loss_layer_42_head": 0.05125637724995613,
      "loss_layer_6_head": 0.8247208595275879,
      "step": 5515
    },
    {
      "epoch": 70.656,
      "grad_norm": 0.07908593338661954,
      "learning_rate": 0.0011923229808760564,
      "loss": 2.3685,
      "loss_layer_12_head": 0.5624557733535767,
      "loss_layer_18_head": 0.4492347240447998,
      "loss_layer_24_head": 0.2666240930557251,
      "loss_layer_30_head": 0.15440799295902252,
      "loss_layer_36_head": 0.09406361728906631,
      "loss_layer_42_head": 0.04931820556521416,
      "loss_layer_6_head": 0.8018423914909363,
      "step": 5520
    },
    {
      "epoch": 70.72,
      "grad_norm": 0.08694076285771431,
      "learning_rate": 0.0011875585491635998,
      "loss": 2.4013,
      "loss_layer_12_head": 0.5656533241271973,
      "loss_layer_18_head": 0.4502074718475342,
      "loss_layer_24_head": 0.2682296931743622,
      "loss_layer_30_head": 0.1542513370513916,
      "loss_layer_36_head": 0.09618735313415527,
      "loss_layer_42_head": 0.05004975199699402,
      "loss_layer_6_head": 0.8035200238227844,
      "step": 5525
    },
    {
      "epoch": 70.784,
      "grad_norm": 0.12722827494359482,
      "learning_rate": 0.0011828006886602422,
      "loss": 2.4148,
      "loss_layer_12_head": 0.543490469455719,
      "loss_layer_18_head": 0.4382587969303131,
      "loss_layer_24_head": 0.27005454897880554,
      "loss_layer_30_head": 0.15689395368099213,
      "loss_layer_36_head": 0.09862229973077774,
      "loss_layer_42_head": 0.05107790231704712,
      "loss_layer_6_head": 0.7723699808120728,
      "step": 5530
    },
    {
      "epoch": 70.848,
      "grad_norm": 0.0943293553823817,
      "learning_rate": 0.0011780494231879183,
      "loss": 2.4065,
      "loss_layer_12_head": 0.5554178357124329,
      "loss_layer_18_head": 0.44557619094848633,
      "loss_layer_24_head": 0.2729918956756592,
      "loss_layer_30_head": 0.15804430842399597,
      "loss_layer_36_head": 0.09852548688650131,
      "loss_layer_42_head": 0.052139025181531906,
      "loss_layer_6_head": 0.788246750831604,
      "step": 5535
    },
    {
      "epoch": 70.912,
      "grad_norm": 0.08788733260743338,
      "learning_rate": 0.0011733047765355464,
      "loss": 2.4017,
      "loss_layer_12_head": 0.5774174332618713,
      "loss_layer_18_head": 0.4624478816986084,
      "loss_layer_24_head": 0.2757474184036255,
      "loss_layer_30_head": 0.1573120504617691,
      "loss_layer_36_head": 0.09683147072792053,
      "loss_layer_42_head": 0.050533540546894073,
      "loss_layer_6_head": 0.8206308484077454,
      "step": 5540
    },
    {
      "epoch": 70.976,
      "grad_norm": 0.08473741146611646,
      "learning_rate": 0.0011685667724589002,
      "loss": 2.4058,
      "loss_layer_12_head": 0.5566309094429016,
      "loss_layer_18_head": 0.44344544410705566,
      "loss_layer_24_head": 0.26343637704849243,
      "loss_layer_30_head": 0.15235742926597595,
      "loss_layer_36_head": 0.09418001770973206,
      "loss_layer_42_head": 0.04892579838633537,
      "loss_layer_6_head": 0.7918591499328613,
      "step": 5545
    },
    {
      "epoch": 71.04,
      "grad_norm": 0.08350151791846107,
      "learning_rate": 0.0011638354346804971,
      "loss": 2.3494,
      "loss_layer_12_head": 0.5638017654418945,
      "loss_layer_18_head": 0.45039883255958557,
      "loss_layer_24_head": 0.26914528012275696,
      "loss_layer_30_head": 0.15367193520069122,
      "loss_layer_36_head": 0.09482550621032715,
      "loss_layer_42_head": 0.04946111515164375,
      "loss_layer_6_head": 0.8010358810424805,
      "step": 5550
    },
    {
      "epoch": 71.104,
      "grad_norm": 0.0970502551941564,
      "learning_rate": 0.0011591107868894797,
      "loss": 2.2852,
      "loss_layer_12_head": 0.5456808805465698,
      "loss_layer_18_head": 0.4341527819633484,
      "loss_layer_24_head": 0.25974923372268677,
      "loss_layer_30_head": 0.15087559819221497,
      "loss_layer_36_head": 0.09398321807384491,
      "loss_layer_42_head": 0.049287110567092896,
      "loss_layer_6_head": 0.7778557538986206,
      "step": 5555
    },
    {
      "epoch": 71.168,
      "grad_norm": 0.08063662875881149,
      "learning_rate": 0.0011543928527414909,
      "loss": 2.3032,
      "loss_layer_12_head": 0.522553563117981,
      "loss_layer_18_head": 0.41599687933921814,
      "loss_layer_24_head": 0.24982860684394836,
      "loss_layer_30_head": 0.14525657892227173,
      "loss_layer_36_head": 0.09019313752651215,
      "loss_layer_42_head": 0.04812514781951904,
      "loss_layer_6_head": 0.7464012503623962,
      "step": 5560
    },
    {
      "epoch": 71.232,
      "grad_norm": 0.08031354447987653,
      "learning_rate": 0.0011496816558585622,
      "loss": 2.3198,
      "loss_layer_12_head": 0.5591468214988708,
      "loss_layer_18_head": 0.44562435150146484,
      "loss_layer_24_head": 0.26696738600730896,
      "loss_layer_30_head": 0.15862946212291718,
      "loss_layer_36_head": 0.09578440338373184,
      "loss_layer_42_head": 0.050591371953487396,
      "loss_layer_6_head": 0.7922371029853821,
      "step": 5565
    },
    {
      "epoch": 71.296,
      "grad_norm": 0.08147198963823746,
      "learning_rate": 0.0011449772198289904,
      "loss": 2.2907,
      "loss_layer_12_head": 0.5605618357658386,
      "loss_layer_18_head": 0.45021238923072815,
      "loss_layer_24_head": 0.2672578692436218,
      "loss_layer_30_head": 0.16044963896274567,
      "loss_layer_36_head": 0.09605761617422104,
      "loss_layer_42_head": 0.05087246745824814,
      "loss_layer_6_head": 0.7941055297851562,
      "step": 5570
    },
    {
      "epoch": 71.36,
      "grad_norm": 0.09044050335652697,
      "learning_rate": 0.0011402795682072223,
      "loss": 2.3692,
      "loss_layer_12_head": 0.5829329490661621,
      "loss_layer_18_head": 0.46486014127731323,
      "loss_layer_24_head": 0.27643853425979614,
      "loss_layer_30_head": 0.15909801423549652,
      "loss_layer_36_head": 0.0980708971619606,
      "loss_layer_42_head": 0.05081282928586006,
      "loss_layer_6_head": 0.8290421366691589,
      "step": 5575
    },
    {
      "epoch": 71.424,
      "grad_norm": 0.0873283785499165,
      "learning_rate": 0.0011355887245137384,
      "loss": 2.343,
      "loss_layer_12_head": 0.5402742624282837,
      "loss_layer_18_head": 0.42767763137817383,
      "loss_layer_24_head": 0.2544030249118805,
      "loss_layer_30_head": 0.14599737524986267,
      "loss_layer_36_head": 0.09069070965051651,
      "loss_layer_42_head": 0.04733961820602417,
      "loss_layer_6_head": 0.7720564603805542,
      "step": 5580
    },
    {
      "epoch": 71.488,
      "grad_norm": 0.09002095941023724,
      "learning_rate": 0.0011309047122349306,
      "loss": 2.3944,
      "loss_layer_12_head": 0.5590826869010925,
      "loss_layer_18_head": 0.44737300276756287,
      "loss_layer_24_head": 0.26631659269332886,
      "loss_layer_30_head": 0.15419670939445496,
      "loss_layer_36_head": 0.09471851587295532,
      "loss_layer_42_head": 0.04923203960061073,
      "loss_layer_6_head": 0.8030705451965332,
      "step": 5585
    },
    {
      "epoch": 71.552,
      "grad_norm": 0.09825659208925204,
      "learning_rate": 0.001126227554822985,
      "loss": 2.3525,
      "loss_layer_12_head": 0.5430908203125,
      "loss_layer_18_head": 0.4334653913974762,
      "loss_layer_24_head": 0.2601294219493866,
      "loss_layer_30_head": 0.15221253037452698,
      "loss_layer_36_head": 0.10233648121356964,
      "loss_layer_42_head": 0.05902108550071716,
      "loss_layer_6_head": 0.7738138437271118,
      "step": 5590
    },
    {
      "epoch": 71.616,
      "grad_norm": 0.09570170155105043,
      "learning_rate": 0.001121557275695771,
      "loss": 2.3486,
      "loss_layer_12_head": 0.5585882067680359,
      "loss_layer_18_head": 0.4462118148803711,
      "loss_layer_24_head": 0.26422610878944397,
      "loss_layer_30_head": 0.15146932005882263,
      "loss_layer_36_head": 0.0933571383357048,
      "loss_layer_42_head": 0.04860212653875351,
      "loss_layer_6_head": 0.7923351526260376,
      "step": 5595
    },
    {
      "epoch": 71.68,
      "grad_norm": 0.09750077684140962,
      "learning_rate": 0.001116893898236716,
      "loss": 2.4044,
      "loss_layer_12_head": 0.5672653317451477,
      "loss_layer_18_head": 0.45567837357521057,
      "loss_layer_24_head": 0.2723308205604553,
      "loss_layer_30_head": 0.15795525908470154,
      "loss_layer_36_head": 0.09809926897287369,
      "loss_layer_42_head": 0.0511845238506794,
      "loss_layer_6_head": 0.8062906265258789,
      "step": 5600
    },
    {
      "epoch": 71.68,
      "eval_loss": 5.360450744628906,
      "eval_loss_layer_12_head": 1.238508701324463,
      "eval_loss_layer_18_head": 1.073888897895813,
      "eval_loss_layer_24_head": 0.6914258003234863,
      "eval_loss_layer_30_head": 0.44724956154823303,
      "eval_loss_layer_36_head": 0.28561681509017944,
      "eval_loss_layer_42_head": 0.17004218697547913,
      "eval_loss_layer_6_head": 1.5856258869171143,
      "eval_runtime": 33.0434,
      "eval_samples_per_second": 9.684,
      "eval_steps_per_second": 0.605,
      "step": 5600
    },
    {
      "epoch": 71.744,
      "grad_norm": 0.0845064802268845,
      "learning_rate": 0.001112237445794693,
      "loss": 2.3961,
      "loss_layer_12_head": 0.564632773399353,
      "loss_layer_18_head": 0.4508134424686432,
      "loss_layer_24_head": 0.2683842182159424,
      "loss_layer_30_head": 0.15454669296741486,
      "loss_layer_36_head": 0.09512560069561005,
      "loss_layer_42_head": 0.04986995458602905,
      "loss_layer_6_head": 0.8086351156234741,
      "step": 5605
    },
    {
      "epoch": 71.808,
      "grad_norm": 0.08753605677399463,
      "learning_rate": 0.0011075879416839022,
      "loss": 2.4058,
      "loss_layer_12_head": 0.5508392453193665,
      "loss_layer_18_head": 0.44259828329086304,
      "loss_layer_24_head": 0.26593151688575745,
      "loss_layer_30_head": 0.15471498668193817,
      "loss_layer_36_head": 0.09588557481765747,
      "loss_layer_42_head": 0.05148781090974808,
      "loss_layer_6_head": 0.7799230813980103,
      "step": 5610
    },
    {
      "epoch": 71.872,
      "grad_norm": 0.12334067758085328,
      "learning_rate": 0.001102945409183754,
      "loss": 2.4219,
      "loss_layer_12_head": 0.5786576867103577,
      "loss_layer_18_head": 0.46043434739112854,
      "loss_layer_24_head": 0.27294930815696716,
      "loss_layer_30_head": 0.1548728048801422,
      "loss_layer_36_head": 0.09511029720306396,
      "loss_layer_42_head": 0.04979122057557106,
      "loss_layer_6_head": 0.8209393620491028,
      "step": 5615
    },
    {
      "epoch": 71.936,
      "grad_norm": 0.1092165599373487,
      "learning_rate": 0.0010983098715387526,
      "loss": 2.4295,
      "loss_layer_12_head": 0.5678204298019409,
      "loss_layer_18_head": 0.451857328414917,
      "loss_layer_24_head": 0.26938050985336304,
      "loss_layer_30_head": 0.1536611020565033,
      "loss_layer_36_head": 0.0949711948633194,
      "loss_layer_42_head": 0.04942820221185684,
      "loss_layer_6_head": 0.8091564178466797,
      "step": 5620
    },
    {
      "epoch": 72.0,
      "grad_norm": 0.10182397664290023,
      "learning_rate": 0.001093681351958383,
      "loss": 2.4042,
      "loss_layer_12_head": 0.5790759325027466,
      "loss_layer_18_head": 0.46178025007247925,
      "loss_layer_24_head": 0.27603060007095337,
      "loss_layer_30_head": 0.1584838479757309,
      "loss_layer_36_head": 0.0984288901090622,
      "loss_layer_42_head": 0.051493626087903976,
      "loss_layer_6_head": 0.8184674382209778,
      "step": 5625
    },
    {
      "epoch": 72.064,
      "grad_norm": 0.08813631250765232,
      "learning_rate": 0.0010890598736169879,
      "loss": 2.3208,
      "loss_layer_12_head": 0.5813970565795898,
      "loss_layer_18_head": 0.46002936363220215,
      "loss_layer_24_head": 0.2733423113822937,
      "loss_layer_30_head": 0.1578468680381775,
      "loss_layer_36_head": 0.09789346158504486,
      "loss_layer_42_head": 0.05063742399215698,
      "loss_layer_6_head": 0.8225024938583374,
      "step": 5630
    },
    {
      "epoch": 72.128,
      "grad_norm": 0.08594193922748178,
      "learning_rate": 0.0010844454596536574,
      "loss": 2.3008,
      "loss_layer_12_head": 0.5196276903152466,
      "loss_layer_18_head": 0.41505104303359985,
      "loss_layer_24_head": 0.24737605452537537,
      "loss_layer_30_head": 0.14413173496723175,
      "loss_layer_36_head": 0.08993745595216751,
      "loss_layer_42_head": 0.0471578910946846,
      "loss_layer_6_head": 0.7445937991142273,
      "step": 5635
    },
    {
      "epoch": 72.192,
      "grad_norm": 0.09013441798677573,
      "learning_rate": 0.0010798381331721108,
      "loss": 2.2866,
      "loss_layer_12_head": 0.5422458648681641,
      "loss_layer_18_head": 0.4316342771053314,
      "loss_layer_24_head": 0.2581145167350769,
      "loss_layer_30_head": 0.14937207102775574,
      "loss_layer_36_head": 0.09226687252521515,
      "loss_layer_42_head": 0.048264745622873306,
      "loss_layer_6_head": 0.7718225717544556,
      "step": 5640
    },
    {
      "epoch": 72.256,
      "grad_norm": 0.08285513320783991,
      "learning_rate": 0.0010752379172405807,
      "loss": 2.3248,
      "loss_layer_12_head": 0.5627008676528931,
      "loss_layer_18_head": 0.4453171193599701,
      "loss_layer_24_head": 0.26598647236824036,
      "loss_layer_30_head": 0.1536271870136261,
      "loss_layer_36_head": 0.09483180940151215,
      "loss_layer_42_head": 0.05032235383987427,
      "loss_layer_6_head": 0.797799825668335,
      "step": 5645
    },
    {
      "epoch": 72.32,
      "grad_norm": 0.0772074725218934,
      "learning_rate": 0.0010706448348917006,
      "loss": 2.3294,
      "loss_layer_12_head": 0.5741179585456848,
      "loss_layer_18_head": 0.46007728576660156,
      "loss_layer_24_head": 0.27625301480293274,
      "loss_layer_30_head": 0.16059251129627228,
      "loss_layer_36_head": 0.09913943707942963,
      "loss_layer_42_head": 0.05178670212626457,
      "loss_layer_6_head": 0.820048987865448,
      "step": 5650
    },
    {
      "epoch": 72.384,
      "grad_norm": 0.08154067757511477,
      "learning_rate": 0.0010660589091223854,
      "loss": 2.3168,
      "loss_layer_12_head": 0.5247608423233032,
      "loss_layer_18_head": 0.4218398928642273,
      "loss_layer_24_head": 0.25114497542381287,
      "loss_layer_30_head": 0.1448221653699875,
      "loss_layer_36_head": 0.08925621211528778,
      "loss_layer_42_head": 0.046773046255111694,
      "loss_layer_6_head": 0.7557961940765381,
      "step": 5655
    },
    {
      "epoch": 72.448,
      "grad_norm": 0.10389449931495047,
      "learning_rate": 0.001061480162893716,
      "loss": 2.3455,
      "loss_layer_12_head": 0.5390672087669373,
      "loss_layer_18_head": 0.4310081899166107,
      "loss_layer_24_head": 0.25751224160194397,
      "loss_layer_30_head": 0.14852020144462585,
      "loss_layer_36_head": 0.0925002321600914,
      "loss_layer_42_head": 0.048272792249917984,
      "loss_layer_6_head": 0.7708923816680908,
      "step": 5660
    },
    {
      "epoch": 72.512,
      "grad_norm": 0.09174390193317876,
      "learning_rate": 0.0010569086191308304,
      "loss": 2.354,
      "loss_layer_12_head": 0.5473750233650208,
      "loss_layer_18_head": 0.4390328526496887,
      "loss_layer_24_head": 0.26254957914352417,
      "loss_layer_30_head": 0.15117573738098145,
      "loss_layer_36_head": 0.0940229669213295,
      "loss_layer_42_head": 0.04961683228611946,
      "loss_layer_6_head": 0.7779014706611633,
      "step": 5665
    },
    {
      "epoch": 72.576,
      "grad_norm": 0.10144937032633021,
      "learning_rate": 0.0010523443007228028,
      "loss": 2.3767,
      "loss_layer_12_head": 0.5498192310333252,
      "loss_layer_18_head": 0.4389416575431824,
      "loss_layer_24_head": 0.26207101345062256,
      "loss_layer_30_head": 0.1497814953327179,
      "loss_layer_36_head": 0.09247744083404541,
      "loss_layer_42_head": 0.04813087359070778,
      "loss_layer_6_head": 0.7805224657058716,
      "step": 5670
    },
    {
      "epoch": 72.64,
      "grad_norm": 0.11430646748508105,
      "learning_rate": 0.0010477872305225309,
      "loss": 2.3837,
      "loss_layer_12_head": 0.5556079149246216,
      "loss_layer_18_head": 0.4449824392795563,
      "loss_layer_24_head": 0.26506465673446655,
      "loss_layer_30_head": 0.15362292528152466,
      "loss_layer_36_head": 0.09518565237522125,
      "loss_layer_42_head": 0.05001748725771904,
      "loss_layer_6_head": 0.7900441884994507,
      "step": 5675
    },
    {
      "epoch": 72.704,
      "grad_norm": 0.08369693295447468,
      "learning_rate": 0.001043237431346622,
      "loss": 2.3469,
      "loss_layer_12_head": 0.5440775156021118,
      "loss_layer_18_head": 0.4340090751647949,
      "loss_layer_24_head": 0.25926825404167175,
      "loss_layer_30_head": 0.1499083936214447,
      "loss_layer_36_head": 0.09306468069553375,
      "loss_layer_42_head": 0.04953023046255112,
      "loss_layer_6_head": 0.7782944440841675,
      "step": 5680
    },
    {
      "epoch": 72.768,
      "grad_norm": 0.10086870933543773,
      "learning_rate": 0.0010386949259752784,
      "loss": 2.357,
      "loss_layer_12_head": 0.5388556718826294,
      "loss_layer_18_head": 0.4313333034515381,
      "loss_layer_24_head": 0.2581126391887665,
      "loss_layer_30_head": 0.15148594975471497,
      "loss_layer_36_head": 0.09623315185308456,
      "loss_layer_42_head": 0.05643288046121597,
      "loss_layer_6_head": 0.761394202709198,
      "step": 5685
    },
    {
      "epoch": 72.832,
      "grad_norm": 0.09861063585154864,
      "learning_rate": 0.0010341597371521824,
      "loss": 2.3874,
      "loss_layer_12_head": 0.5828830599784851,
      "loss_layer_18_head": 0.46448034048080444,
      "loss_layer_24_head": 0.2794218957424164,
      "loss_layer_30_head": 0.1604129523038864,
      "loss_layer_36_head": 0.0987405925989151,
      "loss_layer_42_head": 0.05158063769340515,
      "loss_layer_6_head": 0.8302946090698242,
      "step": 5690
    },
    {
      "epoch": 72.896,
      "grad_norm": 0.0907966922846734,
      "learning_rate": 0.0010296318875843862,
      "loss": 2.4183,
      "loss_layer_12_head": 0.5751842260360718,
      "loss_layer_18_head": 0.45574504137039185,
      "loss_layer_24_head": 0.2728256583213806,
      "loss_layer_30_head": 0.15629276633262634,
      "loss_layer_36_head": 0.09663442522287369,
      "loss_layer_42_head": 0.050228409469127655,
      "loss_layer_6_head": 0.8145866394042969,
      "step": 5695
    },
    {
      "epoch": 72.96,
      "grad_norm": 0.1338140532047423,
      "learning_rate": 0.0010251113999421935,
      "loss": 2.4236,
      "loss_layer_12_head": 0.5592122673988342,
      "loss_layer_18_head": 0.44163432717323303,
      "loss_layer_24_head": 0.2642163038253784,
      "loss_layer_30_head": 0.1502273827791214,
      "loss_layer_36_head": 0.09271318465471268,
      "loss_layer_42_head": 0.04850583150982857,
      "loss_layer_6_head": 0.788387656211853,
      "step": 5700
    },
    {
      "epoch": 73.024,
      "grad_norm": 0.0964982398420008,
      "learning_rate": 0.001020598296859045,
      "loss": 2.369,
      "loss_layer_12_head": 0.5735142230987549,
      "loss_layer_18_head": 0.45547395944595337,
      "loss_layer_24_head": 0.2714191675186157,
      "loss_layer_30_head": 0.15520919859409332,
      "loss_layer_36_head": 0.09640487283468246,
      "loss_layer_42_head": 0.050520606338977814,
      "loss_layer_6_head": 0.8113002777099609,
      "step": 5705
    },
    {
      "epoch": 73.088,
      "grad_norm": 0.09701760080181906,
      "learning_rate": 0.001016092600931414,
      "loss": 2.2469,
      "loss_layer_12_head": 0.5211244225502014,
      "loss_layer_18_head": 0.41467300057411194,
      "loss_layer_24_head": 0.24639467895030975,
      "loss_layer_30_head": 0.1420271247625351,
      "loss_layer_36_head": 0.08806177973747253,
      "loss_layer_42_head": 0.045950911939144135,
      "loss_layer_6_head": 0.7438262701034546,
      "step": 5710
    },
    {
      "epoch": 73.152,
      "grad_norm": 0.08499132131535464,
      "learning_rate": 0.0010115943347186827,
      "loss": 2.3328,
      "loss_layer_12_head": 0.5282027125358582,
      "loss_layer_18_head": 0.41821223497390747,
      "loss_layer_24_head": 0.24938292801380157,
      "loss_layer_30_head": 0.1448613554239273,
      "loss_layer_36_head": 0.09036881476640701,
      "loss_layer_42_head": 0.048038698732852936,
      "loss_layer_6_head": 0.7565073370933533,
      "step": 5715
    },
    {
      "epoch": 73.216,
      "grad_norm": 0.10816288718573926,
      "learning_rate": 0.0010071035207430351,
      "loss": 2.3136,
      "loss_layer_12_head": 0.5210358500480652,
      "loss_layer_18_head": 0.4153243899345398,
      "loss_layer_24_head": 0.248165562748909,
      "loss_layer_30_head": 0.1437683403491974,
      "loss_layer_36_head": 0.0886407420039177,
      "loss_layer_42_head": 0.04634737968444824,
      "loss_layer_6_head": 0.7442346811294556,
      "step": 5720
    },
    {
      "epoch": 73.28,
      "grad_norm": 0.13712854420258613,
      "learning_rate": 0.001002620181489343,
      "loss": 2.3213,
      "loss_layer_12_head": 0.5355392098426819,
      "loss_layer_18_head": 0.4271247982978821,
      "loss_layer_24_head": 0.25712451338768005,
      "loss_layer_30_head": 0.14758507907390594,
      "loss_layer_36_head": 0.09118280559778214,
      "loss_layer_42_head": 0.047352634370326996,
      "loss_layer_6_head": 0.7637184262275696,
      "step": 5725
    },
    {
      "epoch": 73.344,
      "grad_norm": 0.07905734531868686,
      "learning_rate": 0.0009981443394050523,
      "loss": 2.3115,
      "loss_layer_12_head": 0.5657747983932495,
      "loss_layer_18_head": 0.44924062490463257,
      "loss_layer_24_head": 0.26837044954299927,
      "loss_layer_30_head": 0.1540326029062271,
      "loss_layer_36_head": 0.09442360699176788,
      "loss_layer_42_head": 0.04963044077157974,
      "loss_layer_6_head": 0.808613657951355,
      "step": 5730
    },
    {
      "epoch": 73.408,
      "grad_norm": 0.11104833436741333,
      "learning_rate": 0.000993676016900075,
      "loss": 2.3533,
      "loss_layer_12_head": 0.5398064851760864,
      "loss_layer_18_head": 0.4283141493797302,
      "loss_layer_24_head": 0.2555343210697174,
      "loss_layer_30_head": 0.14528322219848633,
      "loss_layer_36_head": 0.09049415588378906,
      "loss_layer_42_head": 0.04697301238775253,
      "loss_layer_6_head": 0.7719407081604004,
      "step": 5735
    },
    {
      "epoch": 73.472,
      "grad_norm": 0.1255852516764157,
      "learning_rate": 0.0009892152363466691,
      "loss": 2.3594,
      "loss_layer_12_head": 0.5746393799781799,
      "loss_layer_18_head": 0.4589950442314148,
      "loss_layer_24_head": 0.2739291787147522,
      "loss_layer_30_head": 0.15758481621742249,
      "loss_layer_36_head": 0.09784603118896484,
      "loss_layer_42_head": 0.05050859600305557,
      "loss_layer_6_head": 0.8168492317199707,
      "step": 5740
    },
    {
      "epoch": 73.536,
      "grad_norm": 0.0927106769173534,
      "learning_rate": 0.0009847620200793342,
      "loss": 2.4037,
      "loss_layer_12_head": 0.5669339895248413,
      "loss_layer_18_head": 0.4545467793941498,
      "loss_layer_24_head": 0.27306652069091797,
      "loss_layer_30_head": 0.16265633702278137,
      "loss_layer_36_head": 0.09870961308479309,
      "loss_layer_42_head": 0.05505412817001343,
      "loss_layer_6_head": 0.7996861934661865,
      "step": 5745
    },
    {
      "epoch": 73.6,
      "grad_norm": 0.08109584259300906,
      "learning_rate": 0.0009803163903946953,
      "loss": 2.3356,
      "loss_layer_12_head": 0.5556929707527161,
      "loss_layer_18_head": 0.43985113501548767,
      "loss_layer_24_head": 0.26151156425476074,
      "loss_layer_30_head": 0.15073636174201965,
      "loss_layer_36_head": 0.09221173077821732,
      "loss_layer_42_head": 0.048929717391729355,
      "loss_layer_6_head": 0.7858646512031555,
      "step": 5750
    },
    {
      "epoch": 73.664,
      "grad_norm": 0.07399706249470507,
      "learning_rate": 0.0009758783695513925,
      "loss": 2.3933,
      "loss_layer_12_head": 0.5804101228713989,
      "loss_layer_18_head": 0.464486300945282,
      "loss_layer_24_head": 0.27666527032852173,
      "loss_layer_30_head": 0.1598948985338211,
      "loss_layer_36_head": 0.09870950877666473,
      "loss_layer_42_head": 0.051381438970565796,
      "loss_layer_6_head": 0.821672260761261,
      "step": 5755
    },
    {
      "epoch": 73.728,
      "grad_norm": 0.07677822163524993,
      "learning_rate": 0.0009714479797699693,
      "loss": 2.3985,
      "loss_layer_12_head": 0.5683960318565369,
      "loss_layer_18_head": 0.4549197256565094,
      "loss_layer_24_head": 0.2722500264644623,
      "loss_layer_30_head": 0.1622505635023117,
      "loss_layer_36_head": 0.09670663625001907,
      "loss_layer_42_head": 0.05235493183135986,
      "loss_layer_6_head": 0.8082783818244934,
      "step": 5760
    },
    {
      "epoch": 73.792,
      "grad_norm": 0.07196927160204937,
      "learning_rate": 0.0009670252432327645,
      "loss": 2.3859,
      "loss_layer_12_head": 0.5679503679275513,
      "loss_layer_18_head": 0.4542841911315918,
      "loss_layer_24_head": 0.2691924273967743,
      "loss_layer_30_head": 0.15574303269386292,
      "loss_layer_36_head": 0.09499519318342209,
      "loss_layer_42_head": 0.049431540071964264,
      "loss_layer_6_head": 0.8074603080749512,
      "step": 5765
    },
    {
      "epoch": 73.856,
      "grad_norm": 0.08123966076454311,
      "learning_rate": 0.0009626101820837926,
      "loss": 2.4036,
      "loss_layer_12_head": 0.569747805595398,
      "loss_layer_18_head": 0.4534655213356018,
      "loss_layer_24_head": 0.2682664394378662,
      "loss_layer_30_head": 0.15499791502952576,
      "loss_layer_36_head": 0.09543906152248383,
      "loss_layer_42_head": 0.049195174127817154,
      "loss_layer_6_head": 0.8126770257949829,
      "step": 5770
    },
    {
      "epoch": 73.92,
      "grad_norm": 0.08649477384715129,
      "learning_rate": 0.0009582028184286424,
      "loss": 2.3672,
      "loss_layer_12_head": 0.5422612428665161,
      "loss_layer_18_head": 0.4346292018890381,
      "loss_layer_24_head": 0.2566079795360565,
      "loss_layer_30_head": 0.14836807548999786,
      "loss_layer_36_head": 0.09270451217889786,
      "loss_layer_42_head": 0.04835749790072441,
      "loss_layer_6_head": 0.7754207849502563,
      "step": 5775
    },
    {
      "epoch": 73.984,
      "grad_norm": 0.07977814207783128,
      "learning_rate": 0.0009538031743343628,
      "loss": 2.3427,
      "loss_layer_12_head": 0.5376604199409485,
      "loss_layer_18_head": 0.4273601174354553,
      "loss_layer_24_head": 0.2535747289657593,
      "loss_layer_30_head": 0.14691227674484253,
      "loss_layer_36_head": 0.0894998088479042,
      "loss_layer_42_head": 0.04735178500413895,
      "loss_layer_6_head": 0.7670060396194458,
      "step": 5780
    },
    {
      "epoch": 74.048,
      "grad_norm": 0.0877153244750318,
      "learning_rate": 0.0009494112718293502,
      "loss": 2.3518,
      "loss_layer_12_head": 0.5376673936843872,
      "loss_layer_18_head": 0.4289923310279846,
      "loss_layer_24_head": 0.25485482811927795,
      "loss_layer_30_head": 0.14718714356422424,
      "loss_layer_36_head": 0.09086791425943375,
      "loss_layer_42_head": 0.0475921705365181,
      "loss_layer_6_head": 0.7660344243049622,
      "step": 5785
    },
    {
      "epoch": 74.112,
      "grad_norm": 0.0723044946169637,
      "learning_rate": 0.0009450271329032404,
      "loss": 2.3043,
      "loss_layer_12_head": 0.5365365743637085,
      "loss_layer_18_head": 0.42896467447280884,
      "loss_layer_24_head": 0.25632286071777344,
      "loss_layer_30_head": 0.14877080917358398,
      "loss_layer_36_head": 0.091732919216156,
      "loss_layer_42_head": 0.04771579056978226,
      "loss_layer_6_head": 0.7657873034477234,
      "step": 5790
    },
    {
      "epoch": 74.176,
      "grad_norm": 0.07120047865600428,
      "learning_rate": 0.0009406507795067981,
      "loss": 2.3499,
      "loss_layer_12_head": 0.5255545973777771,
      "loss_layer_18_head": 0.4203030467033386,
      "loss_layer_24_head": 0.2520439922809601,
      "loss_layer_30_head": 0.1462918221950531,
      "loss_layer_36_head": 0.08985036611557007,
      "loss_layer_42_head": 0.04731179028749466,
      "loss_layer_6_head": 0.7533494234085083,
      "step": 5795
    },
    {
      "epoch": 74.24,
      "grad_norm": 0.07870038909856213,
      "learning_rate": 0.0009362822335518062,
      "loss": 2.3106,
      "loss_layer_12_head": 0.5549668073654175,
      "loss_layer_18_head": 0.44239091873168945,
      "loss_layer_24_head": 0.26380449533462524,
      "loss_layer_30_head": 0.15157853066921234,
      "loss_layer_36_head": 0.09274730831384659,
      "loss_layer_42_head": 0.04836529865860939,
      "loss_layer_6_head": 0.7861323952674866,
      "step": 5800
    },
    {
      "epoch": 74.24,
      "eval_loss": 5.410980701446533,
      "eval_loss_layer_12_head": 1.2417991161346436,
      "eval_loss_layer_18_head": 1.0776185989379883,
      "eval_loss_layer_24_head": 0.6972260475158691,
      "eval_loss_layer_30_head": 0.4813305735588074,
      "eval_loss_layer_36_head": 0.289094477891922,
      "eval_loss_layer_42_head": 0.19075994193553925,
      "eval_loss_layer_6_head": 1.5955613851547241,
      "eval_runtime": 33.0804,
      "eval_samples_per_second": 9.673,
      "eval_steps_per_second": 0.605,
      "step": 5800
    },
    {
      "epoch": 74.304,
      "grad_norm": 0.07640726911678747,
      "learning_rate": 0.0009319215169109598,
      "loss": 2.3235,
      "loss_layer_12_head": 0.5467821955680847,
      "loss_layer_18_head": 0.4379340708255768,
      "loss_layer_24_head": 0.2626608908176422,
      "loss_layer_30_head": 0.15262499451637268,
      "loss_layer_36_head": 0.09371785819530487,
      "loss_layer_42_head": 0.04951037839055061,
      "loss_layer_6_head": 0.7734649777412415,
      "step": 5805
    },
    {
      "epoch": 74.368,
      "grad_norm": 0.08640678825729853,
      "learning_rate": 0.0009275686514177506,
      "loss": 2.3144,
      "loss_layer_12_head": 0.5429695844650269,
      "loss_layer_18_head": 0.4346829354763031,
      "loss_layer_24_head": 0.2603020668029785,
      "loss_layer_30_head": 0.15211138129234314,
      "loss_layer_36_head": 0.09423617273569107,
      "loss_layer_42_head": 0.049245212227106094,
      "loss_layer_6_head": 0.771332859992981,
      "step": 5810
    },
    {
      "epoch": 74.432,
      "grad_norm": 0.0830603909561277,
      "learning_rate": 0.0009232236588663615,
      "loss": 2.3184,
      "loss_layer_12_head": 0.537375271320343,
      "loss_layer_18_head": 0.428488165140152,
      "loss_layer_24_head": 0.25445568561553955,
      "loss_layer_30_head": 0.14643725752830505,
      "loss_layer_36_head": 0.09021512418985367,
      "loss_layer_42_head": 0.046761803328990936,
      "loss_layer_6_head": 0.769096851348877,
      "step": 5815
    },
    {
      "epoch": 74.496,
      "grad_norm": 0.08799396791034682,
      "learning_rate": 0.0009188865610115571,
      "loss": 2.3577,
      "loss_layer_12_head": 0.5076459646224976,
      "loss_layer_18_head": 0.4066011905670166,
      "loss_layer_24_head": 0.24413685500621796,
      "loss_layer_30_head": 0.14242519438266754,
      "loss_layer_36_head": 0.08912168443202972,
      "loss_layer_42_head": 0.047430310398340225,
      "loss_layer_6_head": 0.7247560024261475,
      "step": 5820
    },
    {
      "epoch": 74.56,
      "grad_norm": 0.0758571475552096,
      "learning_rate": 0.0009145573795685741,
      "loss": 2.3674,
      "loss_layer_12_head": 0.5303391218185425,
      "loss_layer_18_head": 0.42352062463760376,
      "loss_layer_24_head": 0.25247135758399963,
      "loss_layer_30_head": 0.14545771479606628,
      "loss_layer_36_head": 0.08942024409770966,
      "loss_layer_42_head": 0.0472564660012722,
      "loss_layer_6_head": 0.7542840838432312,
      "step": 5825
    },
    {
      "epoch": 74.624,
      "grad_norm": 0.09434657059763038,
      "learning_rate": 0.0009102361362130132,
      "loss": 2.3118,
      "loss_layer_12_head": 0.5547032952308655,
      "loss_layer_18_head": 0.4421428143978119,
      "loss_layer_24_head": 0.2625966966152191,
      "loss_layer_30_head": 0.15126511454582214,
      "loss_layer_36_head": 0.09338952600955963,
      "loss_layer_42_head": 0.048176493495702744,
      "loss_layer_6_head": 0.7920879125595093,
      "step": 5830
    },
    {
      "epoch": 74.688,
      "grad_norm": 0.10061781768003912,
      "learning_rate": 0.0009059228525807295,
      "loss": 2.3255,
      "loss_layer_12_head": 0.5407409071922302,
      "loss_layer_18_head": 0.4317746162414551,
      "loss_layer_24_head": 0.2584421634674072,
      "loss_layer_30_head": 0.14972469210624695,
      "loss_layer_36_head": 0.0923369973897934,
      "loss_layer_42_head": 0.048624224960803986,
      "loss_layer_6_head": 0.7730118036270142,
      "step": 5835
    },
    {
      "epoch": 74.752,
      "grad_norm": 0.08507685629674014,
      "learning_rate": 0.000901617550267726,
      "loss": 2.3759,
      "loss_layer_12_head": 0.5676208138465881,
      "loss_layer_18_head": 0.450258731842041,
      "loss_layer_24_head": 0.2670828402042389,
      "loss_layer_30_head": 0.15453281998634338,
      "loss_layer_36_head": 0.09425376355648041,
      "loss_layer_42_head": 0.04869775474071503,
      "loss_layer_6_head": 0.8086221814155579,
      "step": 5840
    },
    {
      "epoch": 74.816,
      "grad_norm": 0.08302506459888749,
      "learning_rate": 0.0008973202508300421,
      "loss": 2.3098,
      "loss_layer_12_head": 0.5507552027702332,
      "loss_layer_18_head": 0.43936872482299805,
      "loss_layer_24_head": 0.2626217007637024,
      "loss_layer_30_head": 0.15050701797008514,
      "loss_layer_36_head": 0.09249670803546906,
      "loss_layer_42_head": 0.048075176775455475,
      "loss_layer_6_head": 0.781469464302063,
      "step": 5845
    },
    {
      "epoch": 74.88,
      "grad_norm": 0.07968730111200109,
      "learning_rate": 0.0008930309757836516,
      "loss": 2.3717,
      "loss_layer_12_head": 0.6049565076828003,
      "loss_layer_18_head": 0.4836077094078064,
      "loss_layer_24_head": 0.2892746329307556,
      "loss_layer_30_head": 0.16575804352760315,
      "loss_layer_36_head": 0.10114330053329468,
      "loss_layer_42_head": 0.052593618631362915,
      "loss_layer_6_head": 0.8571552038192749,
      "step": 5850
    },
    {
      "epoch": 74.944,
      "grad_norm": 0.08244497734523368,
      "learning_rate": 0.0008887497466043488,
      "loss": 2.368,
      "loss_layer_12_head": 0.5731956362724304,
      "loss_layer_18_head": 0.458118736743927,
      "loss_layer_24_head": 0.2732034921646118,
      "loss_layer_30_head": 0.15698733925819397,
      "loss_layer_36_head": 0.0969591960310936,
      "loss_layer_42_head": 0.04989396035671234,
      "loss_layer_6_head": 0.8165812492370605,
      "step": 5855
    },
    {
      "epoch": 75.008,
      "grad_norm": 0.08998197533436127,
      "learning_rate": 0.0008844765847276432,
      "loss": 2.4246,
      "loss_layer_12_head": 0.5558816194534302,
      "loss_layer_18_head": 0.44401922821998596,
      "loss_layer_24_head": 0.26519352197647095,
      "loss_layer_30_head": 0.1528087556362152,
      "loss_layer_36_head": 0.0938301533460617,
      "loss_layer_42_head": 0.04880371689796448,
      "loss_layer_6_head": 0.7926174402236938,
      "step": 5860
    },
    {
      "epoch": 75.072,
      "grad_norm": 0.09177048333364875,
      "learning_rate": 0.0008802115115486534,
      "loss": 2.2638,
      "loss_layer_12_head": 0.5301346182823181,
      "loss_layer_18_head": 0.4229167401790619,
      "loss_layer_24_head": 0.2529616057872772,
      "loss_layer_30_head": 0.14961561560630798,
      "loss_layer_36_head": 0.09091387689113617,
      "loss_layer_42_head": 0.04767182096838951,
      "loss_layer_6_head": 0.754980206489563,
      "step": 5865
    },
    {
      "epoch": 75.136,
      "grad_norm": 0.08003075490061097,
      "learning_rate": 0.0008759545484219983,
      "loss": 2.2899,
      "loss_layer_12_head": 0.5614721179008484,
      "loss_layer_18_head": 0.4480770230293274,
      "loss_layer_24_head": 0.2680107057094574,
      "loss_layer_30_head": 0.15256722271442413,
      "loss_layer_36_head": 0.09277012199163437,
      "loss_layer_42_head": 0.04840445891022682,
      "loss_layer_6_head": 0.7973222732543945,
      "step": 5870
    },
    {
      "epoch": 75.2,
      "grad_norm": 0.07744101207895769,
      "learning_rate": 0.0008717057166616926,
      "loss": 2.3481,
      "loss_layer_12_head": 0.5772678852081299,
      "loss_layer_18_head": 0.4624158442020416,
      "loss_layer_24_head": 0.27570798993110657,
      "loss_layer_30_head": 0.15773390233516693,
      "loss_layer_36_head": 0.0964926928281784,
      "loss_layer_42_head": 0.050117336213588715,
      "loss_layer_6_head": 0.817699134349823,
      "step": 5875
    },
    {
      "epoch": 75.264,
      "grad_norm": 0.08871887296233746,
      "learning_rate": 0.0008674650375410379,
      "loss": 2.2582,
      "loss_layer_12_head": 0.5132205486297607,
      "loss_layer_18_head": 0.4143695831298828,
      "loss_layer_24_head": 0.24962785840034485,
      "loss_layer_30_head": 0.15069979429244995,
      "loss_layer_36_head": 0.09000755846500397,
      "loss_layer_42_head": 0.048350512981414795,
      "loss_layer_6_head": 0.7261356115341187,
      "step": 5880
    },
    {
      "epoch": 75.328,
      "grad_norm": 0.08364599678607103,
      "learning_rate": 0.0008632325322925127,
      "loss": 2.3305,
      "loss_layer_12_head": 0.5594637393951416,
      "loss_layer_18_head": 0.44504696130752563,
      "loss_layer_24_head": 0.26490721106529236,
      "loss_layer_30_head": 0.15102145075798035,
      "loss_layer_36_head": 0.09253251552581787,
      "loss_layer_42_head": 0.04826132208108902,
      "loss_layer_6_head": 0.7973617315292358,
      "step": 5885
    },
    {
      "epoch": 75.392,
      "grad_norm": 0.08083977866745111,
      "learning_rate": 0.0008590082221076764,
      "loss": 2.3373,
      "loss_layer_12_head": 0.5514628291130066,
      "loss_layer_18_head": 0.438545286655426,
      "loss_layer_24_head": 0.2636452317237854,
      "loss_layer_30_head": 0.14979125559329987,
      "loss_layer_36_head": 0.09228147566318512,
      "loss_layer_42_head": 0.04782130569219589,
      "loss_layer_6_head": 0.7920076251029968,
      "step": 5890
    },
    {
      "epoch": 75.456,
      "grad_norm": 0.07628226717131234,
      "learning_rate": 0.000854792128137053,
      "loss": 2.3552,
      "loss_layer_12_head": 0.5615903735160828,
      "loss_layer_18_head": 0.44698747992515564,
      "loss_layer_24_head": 0.2641564905643463,
      "loss_layer_30_head": 0.15159815549850464,
      "loss_layer_36_head": 0.09314510226249695,
      "loss_layer_42_head": 0.048406489193439484,
      "loss_layer_6_head": 0.8016958236694336,
      "step": 5895
    },
    {
      "epoch": 75.52,
      "grad_norm": 0.08506508226577242,
      "learning_rate": 0.0008505842714900297,
      "loss": 2.3331,
      "loss_layer_12_head": 0.5442160367965698,
      "loss_layer_18_head": 0.4318545460700989,
      "loss_layer_24_head": 0.25502559542655945,
      "loss_layer_30_head": 0.1466924250125885,
      "loss_layer_36_head": 0.09115133434534073,
      "loss_layer_42_head": 0.04674351215362549,
      "loss_layer_6_head": 0.7734619975090027,
      "step": 5900
    },
    {
      "epoch": 75.584,
      "grad_norm": 0.11747130436031974,
      "learning_rate": 0.0008463846732347511,
      "loss": 2.3222,
      "loss_layer_12_head": 0.5450567603111267,
      "loss_layer_18_head": 0.4336596131324768,
      "loss_layer_24_head": 0.2590074837207794,
      "loss_layer_30_head": 0.14965426921844482,
      "loss_layer_36_head": 0.09176523238420486,
      "loss_layer_42_head": 0.04749138280749321,
      "loss_layer_6_head": 0.7715091109275818,
      "step": 5905
    },
    {
      "epoch": 75.648,
      "grad_norm": 0.10263092480315389,
      "learning_rate": 0.0008421933543980125,
      "loss": 2.352,
      "loss_layer_12_head": 0.5787950754165649,
      "loss_layer_18_head": 0.46076980233192444,
      "loss_layer_24_head": 0.27436190843582153,
      "loss_layer_30_head": 0.15607954561710358,
      "loss_layer_36_head": 0.09570559114217758,
      "loss_layer_42_head": 0.04969794303178787,
      "loss_layer_6_head": 0.8161465525627136,
      "step": 5910
    },
    {
      "epoch": 75.712,
      "grad_norm": 0.0836998825702074,
      "learning_rate": 0.0008380103359651553,
      "loss": 2.3352,
      "loss_layer_12_head": 0.5190392732620239,
      "loss_layer_18_head": 0.41486960649490356,
      "loss_layer_24_head": 0.24796955287456512,
      "loss_layer_30_head": 0.1428256779909134,
      "loss_layer_36_head": 0.08782418072223663,
      "loss_layer_42_head": 0.04628778249025345,
      "loss_layer_6_head": 0.7428368330001831,
      "step": 5915
    },
    {
      "epoch": 75.776,
      "grad_norm": 0.08065876999019243,
      "learning_rate": 0.0008338356388799637,
      "loss": 2.3383,
      "loss_layer_12_head": 0.5527087450027466,
      "loss_layer_18_head": 0.44291168451309204,
      "loss_layer_24_head": 0.2645028829574585,
      "loss_layer_30_head": 0.15118227899074554,
      "loss_layer_36_head": 0.09324125200510025,
      "loss_layer_42_head": 0.047955431044101715,
      "loss_layer_6_head": 0.7881574630737305,
      "step": 5920
    },
    {
      "epoch": 75.84,
      "grad_norm": 0.0852546566639995,
      "learning_rate": 0.0008296692840445569,
      "loss": 2.3784,
      "loss_layer_12_head": 0.5670901536941528,
      "loss_layer_18_head": 0.4542098939418793,
      "loss_layer_24_head": 0.27289149165153503,
      "loss_layer_30_head": 0.15741781890392303,
      "loss_layer_36_head": 0.09617812931537628,
      "loss_layer_42_head": 0.049975767731666565,
      "loss_layer_6_head": 0.8003484606742859,
      "step": 5925
    },
    {
      "epoch": 75.904,
      "grad_norm": 0.08000595851321024,
      "learning_rate": 0.0008255112923192834,
      "loss": 2.3355,
      "loss_layer_12_head": 0.542208194732666,
      "loss_layer_18_head": 0.4290239214897156,
      "loss_layer_24_head": 0.25471293926239014,
      "loss_layer_30_head": 0.14445960521697998,
      "loss_layer_36_head": 0.0892053097486496,
      "loss_layer_42_head": 0.04592350870370865,
      "loss_layer_6_head": 0.772663950920105,
      "step": 5930
    },
    {
      "epoch": 75.968,
      "grad_norm": 0.07143538011257189,
      "learning_rate": 0.0008213616845226227,
      "loss": 2.3553,
      "loss_layer_12_head": 0.5429335236549377,
      "loss_layer_18_head": 0.4309907853603363,
      "loss_layer_24_head": 0.25499334931373596,
      "loss_layer_30_head": 0.14583095908164978,
      "loss_layer_36_head": 0.08948921412229538,
      "loss_layer_42_head": 0.04681173712015152,
      "loss_layer_6_head": 0.7709838151931763,
      "step": 5935
    },
    {
      "epoch": 76.032,
      "grad_norm": 0.07789392562157837,
      "learning_rate": 0.0008172204814310741,
      "loss": 2.3298,
      "loss_layer_12_head": 0.5284218788146973,
      "loss_layer_18_head": 0.42416805028915405,
      "loss_layer_24_head": 0.25359517335891724,
      "loss_layer_30_head": 0.14431801438331604,
      "loss_layer_36_head": 0.08890695869922638,
      "loss_layer_42_head": 0.04609914869070053,
      "loss_layer_6_head": 0.7541780471801758,
      "step": 5940
    },
    {
      "epoch": 76.096,
      "grad_norm": 0.07638504665535363,
      "learning_rate": 0.0008130877037790593,
      "loss": 2.2723,
      "loss_layer_12_head": 0.5083048939704895,
      "loss_layer_18_head": 0.403778612613678,
      "loss_layer_24_head": 0.24181225895881653,
      "loss_layer_30_head": 0.1390024721622467,
      "loss_layer_36_head": 0.08571873605251312,
      "loss_layer_42_head": 0.04524841532111168,
      "loss_layer_6_head": 0.7303053140640259,
      "step": 5945
    },
    {
      "epoch": 76.16,
      "grad_norm": 0.07060684176516897,
      "learning_rate": 0.0008089633722588103,
      "loss": 2.2676,
      "loss_layer_12_head": 0.528194785118103,
      "loss_layer_18_head": 0.41894468665122986,
      "loss_layer_24_head": 0.25187110900878906,
      "loss_layer_30_head": 0.1444474309682846,
      "loss_layer_36_head": 0.08937288820743561,
      "loss_layer_42_head": 0.04660377278923988,
      "loss_layer_6_head": 0.7561173439025879,
      "step": 5950
    },
    {
      "epoch": 76.224,
      "grad_norm": 0.0722497571260409,
      "learning_rate": 0.0008048475075202727,
      "loss": 2.2875,
      "loss_layer_12_head": 0.5520007014274597,
      "loss_layer_18_head": 0.4382699131965637,
      "loss_layer_24_head": 0.26337847113609314,
      "loss_layer_30_head": 0.1508668214082718,
      "loss_layer_36_head": 0.09275707602500916,
      "loss_layer_42_head": 0.04816962033510208,
      "loss_layer_6_head": 0.7836023569107056,
      "step": 5955
    },
    {
      "epoch": 76.288,
      "grad_norm": 0.07620900881556503,
      "learning_rate": 0.0008007401301710021,
      "loss": 2.2519,
      "loss_layer_12_head": 0.48900729417800903,
      "loss_layer_18_head": 0.3883330523967743,
      "loss_layer_24_head": 0.23235774040222168,
      "loss_layer_30_head": 0.13416577875614166,
      "loss_layer_36_head": 0.08263109624385834,
      "loss_layer_42_head": 0.04353206604719162,
      "loss_layer_6_head": 0.6973899006843567,
      "step": 5960
    },
    {
      "epoch": 76.352,
      "grad_norm": 0.06950069991233387,
      "learning_rate": 0.0007966412607760565,
      "loss": 2.3058,
      "loss_layer_12_head": 0.5664440393447876,
      "loss_layer_18_head": 0.44890326261520386,
      "loss_layer_24_head": 0.26754623651504517,
      "loss_layer_30_head": 0.15239599347114563,
      "loss_layer_36_head": 0.09309245645999908,
      "loss_layer_42_head": 0.04803399369120598,
      "loss_layer_6_head": 0.8074942827224731,
      "step": 5965
    },
    {
      "epoch": 76.416,
      "grad_norm": 0.07852458190322746,
      "learning_rate": 0.000792550919857896,
      "loss": 2.3584,
      "loss_layer_12_head": 0.546902060508728,
      "loss_layer_18_head": 0.4366208016872406,
      "loss_layer_24_head": 0.25987693667411804,
      "loss_layer_30_head": 0.14859867095947266,
      "loss_layer_36_head": 0.09092505276203156,
      "loss_layer_42_head": 0.04701460897922516,
      "loss_layer_6_head": 0.7801912426948547,
      "step": 5970
    },
    {
      "epoch": 76.48,
      "grad_norm": 0.07433775074938616,
      "learning_rate": 0.0007884691278962805,
      "loss": 2.2953,
      "loss_layer_12_head": 0.529559850692749,
      "loss_layer_18_head": 0.42031389474868774,
      "loss_layer_24_head": 0.24963712692260742,
      "loss_layer_30_head": 0.14181512594223022,
      "loss_layer_36_head": 0.08726682513952255,
      "loss_layer_42_head": 0.044991932809352875,
      "loss_layer_6_head": 0.75506991147995,
      "step": 5975
    },
    {
      "epoch": 76.544,
      "grad_norm": 0.08160483711831604,
      "learning_rate": 0.0007843959053281663,
      "loss": 2.3408,
      "loss_layer_12_head": 0.5742703080177307,
      "loss_layer_18_head": 0.46020227670669556,
      "loss_layer_24_head": 0.2727958559989929,
      "loss_layer_30_head": 0.15473778545856476,
      "loss_layer_36_head": 0.09550261497497559,
      "loss_layer_42_head": 0.048939771950244904,
      "loss_layer_6_head": 0.815965473651886,
      "step": 5980
    },
    {
      "epoch": 76.608,
      "grad_norm": 0.08041696512299222,
      "learning_rate": 0.0007803312725476031,
      "loss": 2.3287,
      "loss_layer_12_head": 0.5485153198242188,
      "loss_layer_18_head": 0.43646734952926636,
      "loss_layer_24_head": 0.26168039441108704,
      "loss_layer_30_head": 0.1496315449476242,
      "loss_layer_36_head": 0.09285569190979004,
      "loss_layer_42_head": 0.0481015145778656,
      "loss_layer_6_head": 0.7799005508422852,
      "step": 5985
    },
    {
      "epoch": 76.672,
      "grad_norm": 0.07443760634024867,
      "learning_rate": 0.0007762752499056358,
      "loss": 2.2966,
      "loss_layer_12_head": 0.5620275735855103,
      "loss_layer_18_head": 0.4502514898777008,
      "loss_layer_24_head": 0.26894864439964294,
      "loss_layer_30_head": 0.1550697386264801,
      "loss_layer_36_head": 0.09669594466686249,
      "loss_layer_42_head": 0.049904488027095795,
      "loss_layer_6_head": 0.7968118190765381,
      "step": 5990
    },
    {
      "epoch": 76.736,
      "grad_norm": 0.07295800673851645,
      "learning_rate": 0.0007722278577101946,
      "loss": 2.3803,
      "loss_layer_12_head": 0.5867892503738403,
      "loss_layer_18_head": 0.47252893447875977,
      "loss_layer_24_head": 0.2827807068824768,
      "loss_layer_30_head": 0.16217133402824402,
      "loss_layer_36_head": 0.09853000938892365,
      "loss_layer_42_head": 0.05130968242883682,
      "loss_layer_6_head": 0.8220421075820923,
      "step": 5995
    },
    {
      "epoch": 76.8,
      "grad_norm": 0.0768098944569757,
      "learning_rate": 0.0007681891162260015,
      "loss": 2.3976,
      "loss_layer_12_head": 0.5647968053817749,
      "loss_layer_18_head": 0.4511360228061676,
      "loss_layer_24_head": 0.2686835527420044,
      "loss_layer_30_head": 0.15460413694381714,
      "loss_layer_36_head": 0.09472499787807465,
      "loss_layer_42_head": 0.04912637546658516,
      "loss_layer_6_head": 0.8005831837654114,
      "step": 6000
    },
    {
      "epoch": 76.8,
      "eval_loss": 5.36863374710083,
      "eval_loss_layer_12_head": 1.2409725189208984,
      "eval_loss_layer_18_head": 1.075352430343628,
      "eval_loss_layer_24_head": 0.6877194046974182,
      "eval_loss_layer_30_head": 0.4455486834049225,
      "eval_loss_layer_36_head": 0.28563594818115234,
      "eval_loss_layer_42_head": 0.16846001148223877,
      "eval_loss_layer_6_head": 1.5894299745559692,
      "eval_runtime": 33.0895,
      "eval_samples_per_second": 9.671,
      "eval_steps_per_second": 0.604,
      "step": 6000
    },
    {
      "epoch": 76.864,
      "grad_norm": 0.0700353897750285,
      "learning_rate": 0.0007641590456744663,
      "loss": 2.3325,
      "loss_layer_12_head": 0.5576691031455994,
      "loss_layer_18_head": 0.44427958130836487,
      "loss_layer_24_head": 0.2633112072944641,
      "loss_layer_30_head": 0.15200018882751465,
      "loss_layer_36_head": 0.09342028200626373,
      "loss_layer_42_head": 0.04862789064645767,
      "loss_layer_6_head": 0.7886427640914917,
      "step": 6005
    },
    {
      "epoch": 76.928,
      "grad_norm": 0.07463759540150496,
      "learning_rate": 0.000760137666233583,
      "loss": 2.3461,
      "loss_layer_12_head": 0.5625598430633545,
      "loss_layer_18_head": 0.44834500551223755,
      "loss_layer_24_head": 0.2669185996055603,
      "loss_layer_30_head": 0.15356555581092834,
      "loss_layer_36_head": 0.09412995725870132,
      "loss_layer_42_head": 0.04839523881673813,
      "loss_layer_6_head": 0.8019194602966309,
      "step": 6010
    },
    {
      "epoch": 76.992,
      "grad_norm": 0.07537197176070112,
      "learning_rate": 0.0007561249980378302,
      "loss": 2.3762,
      "loss_layer_12_head": 0.5665708184242249,
      "loss_layer_18_head": 0.4527831971645355,
      "loss_layer_24_head": 0.26992493867874146,
      "loss_layer_30_head": 0.1549486666917801,
      "loss_layer_36_head": 0.09498012810945511,
      "loss_layer_42_head": 0.04940406605601311,
      "loss_layer_6_head": 0.8068750500679016,
      "step": 6015
    },
    {
      "epoch": 77.056,
      "grad_norm": 0.07213896709210114,
      "learning_rate": 0.0007521210611780715,
      "loss": 2.2803,
      "loss_layer_12_head": 0.5400776267051697,
      "loss_layer_18_head": 0.43246564269065857,
      "loss_layer_24_head": 0.25826412439346313,
      "loss_layer_30_head": 0.1505340039730072,
      "loss_layer_36_head": 0.09266237169504166,
      "loss_layer_42_head": 0.048447322100400925,
      "loss_layer_6_head": 0.7643826603889465,
      "step": 6020
    },
    {
      "epoch": 77.12,
      "grad_norm": 0.07290359881057626,
      "learning_rate": 0.0007481258757014534,
      "loss": 2.242,
      "loss_layer_12_head": 0.525490403175354,
      "loss_layer_18_head": 0.4203408360481262,
      "loss_layer_24_head": 0.2513850927352905,
      "loss_layer_30_head": 0.14504054188728333,
      "loss_layer_36_head": 0.08979020267724991,
      "loss_layer_42_head": 0.04678668454289436,
      "loss_layer_6_head": 0.7552701830863953,
      "step": 6025
    },
    {
      "epoch": 77.184,
      "grad_norm": 0.08331762984027531,
      "learning_rate": 0.0007441394616113061,
      "loss": 2.2859,
      "loss_layer_12_head": 0.5318803787231445,
      "loss_layer_18_head": 0.4248695373535156,
      "loss_layer_24_head": 0.25367626547813416,
      "loss_layer_30_head": 0.14569008350372314,
      "loss_layer_36_head": 0.09005106985569,
      "loss_layer_42_head": 0.04682443290948868,
      "loss_layer_6_head": 0.7523360252380371,
      "step": 6030
    },
    {
      "epoch": 77.248,
      "grad_norm": 0.07405825691401284,
      "learning_rate": 0.0007401618388670428,
      "loss": 2.325,
      "loss_layer_12_head": 0.5415347814559937,
      "loss_layer_18_head": 0.43282350897789,
      "loss_layer_24_head": 0.25690752267837524,
      "loss_layer_30_head": 0.14708757400512695,
      "loss_layer_36_head": 0.09030942618846893,
      "loss_layer_42_head": 0.04674718901515007,
      "loss_layer_6_head": 0.7725521326065063,
      "step": 6035
    },
    {
      "epoch": 77.312,
      "grad_norm": 0.08150616680878357,
      "learning_rate": 0.0007361930273840581,
      "loss": 2.2537,
      "loss_layer_12_head": 0.5255495309829712,
      "loss_layer_18_head": 0.41773897409439087,
      "loss_layer_24_head": 0.24766209721565247,
      "loss_layer_30_head": 0.14088915288448334,
      "loss_layer_36_head": 0.08648693561553955,
      "loss_layer_42_head": 0.04493985325098038,
      "loss_layer_6_head": 0.7537581324577332,
      "step": 6040
    },
    {
      "epoch": 77.376,
      "grad_norm": 0.07678882269710649,
      "learning_rate": 0.0007322330470336313,
      "loss": 2.2973,
      "loss_layer_12_head": 0.5519379377365112,
      "loss_layer_18_head": 0.44270047545433044,
      "loss_layer_24_head": 0.2664932310581207,
      "loss_layer_30_head": 0.15538427233695984,
      "loss_layer_36_head": 0.0957149788737297,
      "loss_layer_42_head": 0.0505402497947216,
      "loss_layer_6_head": 0.7814130783081055,
      "step": 6045
    },
    {
      "epoch": 77.44,
      "grad_norm": 0.06965631515597653,
      "learning_rate": 0.0007282819176428251,
      "loss": 2.3041,
      "loss_layer_12_head": 0.5172895193099976,
      "loss_layer_18_head": 0.4123409390449524,
      "loss_layer_24_head": 0.24518921971321106,
      "loss_layer_30_head": 0.1407289206981659,
      "loss_layer_36_head": 0.08722622692584991,
      "loss_layer_42_head": 0.04574524611234665,
      "loss_layer_6_head": 0.7339498400688171,
      "step": 6050
    },
    {
      "epoch": 77.504,
      "grad_norm": 0.07476401357386483,
      "learning_rate": 0.0007243396589943868,
      "loss": 2.2918,
      "loss_layer_12_head": 0.5593112707138062,
      "loss_layer_18_head": 0.4431684911251068,
      "loss_layer_24_head": 0.26289117336273193,
      "loss_layer_30_head": 0.14969676733016968,
      "loss_layer_36_head": 0.09292374551296234,
      "loss_layer_42_head": 0.04837322235107422,
      "loss_layer_6_head": 0.8032577633857727,
      "step": 6055
    },
    {
      "epoch": 77.568,
      "grad_norm": 0.07483697147784267,
      "learning_rate": 0.0007204062908266489,
      "loss": 2.305,
      "loss_layer_12_head": 0.5456087589263916,
      "loss_layer_18_head": 0.4343617558479309,
      "loss_layer_24_head": 0.25726914405822754,
      "loss_layer_30_head": 0.14798352122306824,
      "loss_layer_36_head": 0.09111333638429642,
      "loss_layer_42_head": 0.04734276980161667,
      "loss_layer_6_head": 0.7719742655754089,
      "step": 6060
    },
    {
      "epoch": 77.632,
      "grad_norm": 0.0771540147777686,
      "learning_rate": 0.0007164818328334307,
      "loss": 2.3759,
      "loss_layer_12_head": 0.5650414228439331,
      "loss_layer_18_head": 0.4529106616973877,
      "loss_layer_24_head": 0.2698430120944977,
      "loss_layer_30_head": 0.154885396361351,
      "loss_layer_36_head": 0.09517432749271393,
      "loss_layer_42_head": 0.04904273897409439,
      "loss_layer_6_head": 0.7974535822868347,
      "step": 6065
    },
    {
      "epoch": 77.696,
      "grad_norm": 0.0741776321230785,
      "learning_rate": 0.0007125663046639386,
      "loss": 2.3597,
      "loss_layer_12_head": 0.5944698452949524,
      "loss_layer_18_head": 0.47451233863830566,
      "loss_layer_24_head": 0.28266897797584534,
      "loss_layer_30_head": 0.16180309653282166,
      "loss_layer_36_head": 0.09842659533023834,
      "loss_layer_42_head": 0.05112394690513611,
      "loss_layer_6_head": 0.8390895128250122,
      "step": 6070
    },
    {
      "epoch": 77.76,
      "grad_norm": 0.07782758039209528,
      "learning_rate": 0.0007086597259226707,
      "loss": 2.3363,
      "loss_layer_12_head": 0.5324329137802124,
      "loss_layer_18_head": 0.4224180281162262,
      "loss_layer_24_head": 0.2507017254829407,
      "loss_layer_30_head": 0.1413750946521759,
      "loss_layer_36_head": 0.08720046281814575,
      "loss_layer_42_head": 0.044940073043107986,
      "loss_layer_6_head": 0.7618409395217896,
      "step": 6075
    },
    {
      "epoch": 77.824,
      "grad_norm": 0.07702557723164258,
      "learning_rate": 0.0007047621161693152,
      "loss": 2.3601,
      "loss_layer_12_head": 0.5417267680168152,
      "loss_layer_18_head": 0.4320138096809387,
      "loss_layer_24_head": 0.2582332491874695,
      "loss_layer_30_head": 0.14634504914283752,
      "loss_layer_36_head": 0.08992363512516022,
      "loss_layer_42_head": 0.04679848626255989,
      "loss_layer_6_head": 0.7670430541038513,
      "step": 6080
    },
    {
      "epoch": 77.888,
      "grad_norm": 0.07677189862971752,
      "learning_rate": 0.0007008734949186538,
      "loss": 2.3904,
      "loss_layer_12_head": 0.5698273181915283,
      "loss_layer_18_head": 0.45534053444862366,
      "loss_layer_24_head": 0.26807886362075806,
      "loss_layer_30_head": 0.15214423835277557,
      "loss_layer_36_head": 0.09307000041007996,
      "loss_layer_42_head": 0.047956693917512894,
      "loss_layer_6_head": 0.8128267526626587,
      "step": 6085
    },
    {
      "epoch": 77.952,
      "grad_norm": 0.07129301455108811,
      "learning_rate": 0.0006969938816404639,
      "loss": 2.329,
      "loss_layer_12_head": 0.5539348721504211,
      "loss_layer_18_head": 0.4454125463962555,
      "loss_layer_24_head": 0.2669183909893036,
      "loss_layer_30_head": 0.1539638340473175,
      "loss_layer_36_head": 0.09538088738918304,
      "loss_layer_42_head": 0.05007969215512276,
      "loss_layer_6_head": 0.7821504473686218,
      "step": 6090
    },
    {
      "epoch": 78.016,
      "grad_norm": 0.07601211179423482,
      "learning_rate": 0.0006931232957594213,
      "loss": 2.3098,
      "loss_layer_12_head": 0.5552355051040649,
      "loss_layer_18_head": 0.4460284113883972,
      "loss_layer_24_head": 0.2666816711425781,
      "loss_layer_30_head": 0.1535816192626953,
      "loss_layer_36_head": 0.09462487697601318,
      "loss_layer_42_head": 0.04932069033384323,
      "loss_layer_6_head": 0.78844153881073,
      "step": 6095
    },
    {
      "epoch": 78.08,
      "grad_norm": 0.073593682894227,
      "learning_rate": 0.0006892617566550044,
      "loss": 2.2952,
      "loss_layer_12_head": 0.5704615712165833,
      "loss_layer_18_head": 0.45477813482284546,
      "loss_layer_24_head": 0.27238214015960693,
      "loss_layer_30_head": 0.15593436360359192,
      "loss_layer_36_head": 0.09579074382781982,
      "loss_layer_42_head": 0.04907495528459549,
      "loss_layer_6_head": 0.8112842440605164,
      "step": 6100
    },
    {
      "epoch": 78.144,
      "grad_norm": 0.06540090822607363,
      "learning_rate": 0.0006854092836613948,
      "loss": 2.2732,
      "loss_layer_12_head": 0.5560177564620972,
      "loss_layer_18_head": 0.4424892067909241,
      "loss_layer_24_head": 0.2625701427459717,
      "loss_layer_30_head": 0.14909929037094116,
      "loss_layer_36_head": 0.09082135558128357,
      "loss_layer_42_head": 0.04694611206650734,
      "loss_layer_6_head": 0.7909688949584961,
      "step": 6105
    },
    {
      "epoch": 78.208,
      "grad_norm": 0.07102544296025644,
      "learning_rate": 0.0006815658960673781,
      "loss": 2.2605,
      "loss_layer_12_head": 0.5462185144424438,
      "loss_layer_18_head": 0.43201595544815063,
      "loss_layer_24_head": 0.2566160559654236,
      "loss_layer_30_head": 0.14615969359874725,
      "loss_layer_36_head": 0.08948183804750443,
      "loss_layer_42_head": 0.04616248607635498,
      "loss_layer_6_head": 0.7792559862136841,
      "step": 6110
    },
    {
      "epoch": 78.272,
      "grad_norm": 0.07568592688418377,
      "learning_rate": 0.0006777316131162564,
      "loss": 2.2901,
      "loss_layer_12_head": 0.5271133780479431,
      "loss_layer_18_head": 0.42078453302383423,
      "loss_layer_24_head": 0.25002604722976685,
      "loss_layer_30_head": 0.14247198402881622,
      "loss_layer_36_head": 0.08743016421794891,
      "loss_layer_42_head": 0.04483914375305176,
      "loss_layer_6_head": 0.7539930939674377,
      "step": 6115
    },
    {
      "epoch": 78.336,
      "grad_norm": 0.0688443690633472,
      "learning_rate": 0.0006739064540057424,
      "loss": 2.2994,
      "loss_layer_12_head": 0.5282705426216125,
      "loss_layer_18_head": 0.42175206542015076,
      "loss_layer_24_head": 0.24892497062683105,
      "loss_layer_30_head": 0.14308860898017883,
      "loss_layer_36_head": 0.08852535486221313,
      "loss_layer_42_head": 0.046022929251194,
      "loss_layer_6_head": 0.7516704797744751,
      "step": 6120
    },
    {
      "epoch": 78.4,
      "grad_norm": 0.06960406744149791,
      "learning_rate": 0.0006700904378878675,
      "loss": 2.2567,
      "loss_layer_12_head": 0.5522502064704895,
      "loss_layer_18_head": 0.43948134779930115,
      "loss_layer_24_head": 0.26127558946609497,
      "loss_layer_30_head": 0.14881590008735657,
      "loss_layer_36_head": 0.0907742828130722,
      "loss_layer_42_head": 0.04659678786993027,
      "loss_layer_6_head": 0.7867797017097473,
      "step": 6125
    },
    {
      "epoch": 78.464,
      "grad_norm": 0.06935085946295635,
      "learning_rate": 0.0006662835838688863,
      "loss": 2.3306,
      "loss_layer_12_head": 0.5351370573043823,
      "loss_layer_18_head": 0.42570438981056213,
      "loss_layer_24_head": 0.25248292088508606,
      "loss_layer_30_head": 0.14480915665626526,
      "loss_layer_36_head": 0.08896969258785248,
      "loss_layer_42_head": 0.04623815789818764,
      "loss_layer_6_head": 0.7642939686775208,
      "step": 6130
    },
    {
      "epoch": 78.528,
      "grad_norm": 0.07272621963668134,
      "learning_rate": 0.0006624859110091791,
      "loss": 2.3069,
      "loss_layer_12_head": 0.5714215040206909,
      "loss_layer_18_head": 0.45500874519348145,
      "loss_layer_24_head": 0.27137404680252075,
      "loss_layer_30_head": 0.1543014943599701,
      "loss_layer_36_head": 0.09447211027145386,
      "loss_layer_42_head": 0.04880537837743759,
      "loss_layer_6_head": 0.8083324432373047,
      "step": 6135
    },
    {
      "epoch": 78.592,
      "grad_norm": 0.07501272944814785,
      "learning_rate": 0.0006586974383231573,
      "loss": 2.2713,
      "loss_layer_12_head": 0.5113379955291748,
      "loss_layer_18_head": 0.4084097445011139,
      "loss_layer_24_head": 0.24390287697315216,
      "loss_layer_30_head": 0.14006741344928741,
      "loss_layer_36_head": 0.0859985426068306,
      "loss_layer_42_head": 0.044904522597789764,
      "loss_layer_6_head": 0.7343742847442627,
      "step": 6140
    },
    {
      "epoch": 78.656,
      "grad_norm": 0.07021775318001447,
      "learning_rate": 0.0006549181847791705,
      "loss": 2.3232,
      "loss_layer_12_head": 0.5557719469070435,
      "loss_layer_18_head": 0.4441661238670349,
      "loss_layer_24_head": 0.26685428619384766,
      "loss_layer_30_head": 0.15363271534442902,
      "loss_layer_36_head": 0.09396065771579742,
      "loss_layer_42_head": 0.04852723702788353,
      "loss_layer_6_head": 0.7895371317863464,
      "step": 6145
    },
    {
      "epoch": 78.72,
      "grad_norm": 0.0717446694607529,
      "learning_rate": 0.0006511481692994075,
      "loss": 2.3894,
      "loss_layer_12_head": 0.5771350264549255,
      "loss_layer_18_head": 0.4620262682437897,
      "loss_layer_24_head": 0.27583789825439453,
      "loss_layer_30_head": 0.15846946835517883,
      "loss_layer_36_head": 0.09704472124576569,
      "loss_layer_42_head": 0.050323374569416046,
      "loss_layer_6_head": 0.8123035430908203,
      "step": 6150
    },
    {
      "epoch": 78.784,
      "grad_norm": 0.07150136337879812,
      "learning_rate": 0.0006473874107598019,
      "loss": 2.343,
      "loss_layer_12_head": 0.5698662996292114,
      "loss_layer_18_head": 0.4549383223056793,
      "loss_layer_24_head": 0.27257323265075684,
      "loss_layer_30_head": 0.1552368849515915,
      "loss_layer_36_head": 0.09548469632863998,
      "loss_layer_42_head": 0.04913216084241867,
      "loss_layer_6_head": 0.8036497831344604,
      "step": 6155
    },
    {
      "epoch": 78.848,
      "grad_norm": 0.06966314339344831,
      "learning_rate": 0.0006436359279899426,
      "loss": 2.3282,
      "loss_layer_12_head": 0.5452821850776672,
      "loss_layer_18_head": 0.4373631477355957,
      "loss_layer_24_head": 0.26167911291122437,
      "loss_layer_30_head": 0.1505098044872284,
      "loss_layer_36_head": 0.0923055037856102,
      "loss_layer_42_head": 0.047717832028865814,
      "loss_layer_6_head": 0.781923770904541,
      "step": 6160
    },
    {
      "epoch": 78.912,
      "grad_norm": 0.07619193600028396,
      "learning_rate": 0.0006398937397729731,
      "loss": 2.3509,
      "loss_layer_12_head": 0.5699392557144165,
      "loss_layer_18_head": 0.45462217926979065,
      "loss_layer_24_head": 0.27176859974861145,
      "loss_layer_30_head": 0.15460430085659027,
      "loss_layer_36_head": 0.09541213512420654,
      "loss_layer_42_head": 0.04937038570642471,
      "loss_layer_6_head": 0.815619170665741,
      "step": 6165
    },
    {
      "epoch": 78.976,
      "grad_norm": 0.07727853555219188,
      "learning_rate": 0.0006361608648455039,
      "loss": 2.3153,
      "loss_layer_12_head": 0.548202395439148,
      "loss_layer_18_head": 0.43713006377220154,
      "loss_layer_24_head": 0.25765901803970337,
      "loss_layer_30_head": 0.1470467746257782,
      "loss_layer_36_head": 0.09045735001564026,
      "loss_layer_42_head": 0.04703389108181,
      "loss_layer_6_head": 0.7754589319229126,
      "step": 6170
    },
    {
      "epoch": 79.04,
      "grad_norm": 0.07133974725046141,
      "learning_rate": 0.0006324373218975105,
      "loss": 2.2832,
      "loss_layer_12_head": 0.5699695348739624,
      "loss_layer_18_head": 0.45536381006240845,
      "loss_layer_24_head": 0.26925721764564514,
      "loss_layer_30_head": 0.15293261408805847,
      "loss_layer_36_head": 0.09357286989688873,
      "loss_layer_42_head": 0.04839203506708145,
      "loss_layer_6_head": 0.8113353848457336,
      "step": 6175
    },
    {
      "epoch": 79.104,
      "grad_norm": 0.0830382532211316,
      "learning_rate": 0.000628723129572247,
      "loss": 2.2725,
      "loss_layer_12_head": 0.5620076060295105,
      "loss_layer_18_head": 0.44494661688804626,
      "loss_layer_24_head": 0.26451364159584045,
      "loss_layer_30_head": 0.1500881016254425,
      "loss_layer_36_head": 0.0916702002286911,
      "loss_layer_42_head": 0.04717849940061569,
      "loss_layer_6_head": 0.801081657409668,
      "step": 6180
    },
    {
      "epoch": 79.168,
      "grad_norm": 0.07912740854210341,
      "learning_rate": 0.0006250183064661519,
      "loss": 2.2672,
      "loss_layer_12_head": 0.5308602452278137,
      "loss_layer_18_head": 0.42266756296157837,
      "loss_layer_24_head": 0.2512940764427185,
      "loss_layer_30_head": 0.1429583728313446,
      "loss_layer_36_head": 0.08749721944332123,
      "loss_layer_42_head": 0.04519215226173401,
      "loss_layer_6_head": 0.7518661618232727,
      "step": 6185
    },
    {
      "epoch": 79.232,
      "grad_norm": 0.08766703032491241,
      "learning_rate": 0.00062132287112875,
      "loss": 2.2984,
      "loss_layer_12_head": 0.5545870065689087,
      "loss_layer_18_head": 0.4403671324253082,
      "loss_layer_24_head": 0.262807160615921,
      "loss_layer_30_head": 0.1491050273180008,
      "loss_layer_36_head": 0.09202302247285843,
      "loss_layer_42_head": 0.047315433621406555,
      "loss_layer_6_head": 0.7927783131599426,
      "step": 6190
    },
    {
      "epoch": 79.296,
      "grad_norm": 0.07301499811182317,
      "learning_rate": 0.0006176368420625653,
      "loss": 2.2718,
      "loss_layer_12_head": 0.505622148513794,
      "loss_layer_18_head": 0.4008455276489258,
      "loss_layer_24_head": 0.237935870885849,
      "loss_layer_30_head": 0.13504889607429504,
      "loss_layer_36_head": 0.08352693170309067,
      "loss_layer_42_head": 0.043241776525974274,
      "loss_layer_6_head": 0.7207675576210022,
      "step": 6195
    },
    {
      "epoch": 79.36,
      "grad_norm": 0.07719862316156349,
      "learning_rate": 0.0006139602377230246,
      "loss": 2.2507,
      "loss_layer_12_head": 0.5226200819015503,
      "loss_layer_18_head": 0.4166424870491028,
      "loss_layer_24_head": 0.24778524041175842,
      "loss_layer_30_head": 0.1412443369626999,
      "loss_layer_36_head": 0.08736841380596161,
      "loss_layer_42_head": 0.04538998007774353,
      "loss_layer_6_head": 0.7467795014381409,
      "step": 6200
    },
    {
      "epoch": 79.36,
      "eval_loss": 5.372689723968506,
      "eval_loss_layer_12_head": 1.2413933277130127,
      "eval_loss_layer_18_head": 1.0759763717651367,
      "eval_loss_layer_24_head": 0.6877433061599731,
      "eval_loss_layer_30_head": 0.44547492265701294,
      "eval_loss_layer_36_head": 0.28520965576171875,
      "eval_loss_layer_42_head": 0.17008763551712036,
      "eval_loss_layer_6_head": 1.592294454574585,
      "eval_runtime": 33.0824,
      "eval_samples_per_second": 9.673,
      "eval_steps_per_second": 0.605,
      "step": 6200
    },
    {
      "epoch": 79.424,
      "grad_norm": 0.06942853728470363,
      "learning_rate": 0.000610293076518367,
      "loss": 2.2676,
      "loss_layer_12_head": 0.5263605713844299,
      "loss_layer_18_head": 0.4182710647583008,
      "loss_layer_24_head": 0.24834895133972168,
      "loss_layer_30_head": 0.14211151003837585,
      "loss_layer_36_head": 0.08701149374246597,
      "loss_layer_42_head": 0.04532521218061447,
      "loss_layer_6_head": 0.7499408721923828,
      "step": 6205
    },
    {
      "epoch": 79.488,
      "grad_norm": 0.08209470305480152,
      "learning_rate": 0.0006066353768095504,
      "loss": 2.3051,
      "loss_layer_12_head": 0.5411631464958191,
      "loss_layer_18_head": 0.4307491183280945,
      "loss_layer_24_head": 0.2584875226020813,
      "loss_layer_30_head": 0.14856305718421936,
      "loss_layer_36_head": 0.09143179655075073,
      "loss_layer_42_head": 0.048204101622104645,
      "loss_layer_6_head": 0.7670475840568542,
      "step": 6210
    },
    {
      "epoch": 79.552,
      "grad_norm": 0.06681950546424564,
      "learning_rate": 0.0006029871569101627,
      "loss": 2.2943,
      "loss_layer_12_head": 0.5545369386672974,
      "loss_layer_18_head": 0.4447581171989441,
      "loss_layer_24_head": 0.2656252384185791,
      "loss_layer_30_head": 0.15259265899658203,
      "loss_layer_36_head": 0.09388178586959839,
      "loss_layer_42_head": 0.04894993454217911,
      "loss_layer_6_head": 0.7847052216529846,
      "step": 6215
    },
    {
      "epoch": 79.616,
      "grad_norm": 0.07183552110659878,
      "learning_rate": 0.0005993484350863246,
      "loss": 2.3345,
      "loss_layer_12_head": 0.5478235483169556,
      "loss_layer_18_head": 0.43756574392318726,
      "loss_layer_24_head": 0.2604452967643738,
      "loss_layer_30_head": 0.14803972840309143,
      "loss_layer_36_head": 0.09055653214454651,
      "loss_layer_42_head": 0.04676216468214989,
      "loss_layer_6_head": 0.7788383364677429,
      "step": 6220
    },
    {
      "epoch": 79.68,
      "grad_norm": 0.06976696677185719,
      "learning_rate": 0.0005957192295566022,
      "loss": 2.3402,
      "loss_layer_12_head": 0.5555456876754761,
      "loss_layer_18_head": 0.4460199773311615,
      "loss_layer_24_head": 0.2667836844921112,
      "loss_layer_30_head": 0.15279927849769592,
      "loss_layer_36_head": 0.09408002346754074,
      "loss_layer_42_head": 0.048665352165699005,
      "loss_layer_6_head": 0.7875241041183472,
      "step": 6225
    },
    {
      "epoch": 79.744,
      "grad_norm": 0.06786542426840408,
      "learning_rate": 0.000592099558491917,
      "loss": 2.3613,
      "loss_layer_12_head": 0.555708646774292,
      "loss_layer_18_head": 0.4483472406864166,
      "loss_layer_24_head": 0.26924410462379456,
      "loss_layer_30_head": 0.15482698380947113,
      "loss_layer_36_head": 0.0958835631608963,
      "loss_layer_42_head": 0.050189755856990814,
      "loss_layer_6_head": 0.7873321771621704,
      "step": 6230
    },
    {
      "epoch": 79.808,
      "grad_norm": 0.07569173825946669,
      "learning_rate": 0.0005884894400154501,
      "loss": 2.2909,
      "loss_layer_12_head": 0.5624122023582458,
      "loss_layer_18_head": 0.45063287019729614,
      "loss_layer_24_head": 0.26879850029945374,
      "loss_layer_30_head": 0.15344233810901642,
      "loss_layer_36_head": 0.0942104309797287,
      "loss_layer_42_head": 0.04880990833044052,
      "loss_layer_6_head": 0.7942590713500977,
      "step": 6235
    },
    {
      "epoch": 79.872,
      "grad_norm": 0.07040836535648257,
      "learning_rate": 0.0005848888922025552,
      "loss": 2.3349,
      "loss_layer_12_head": 0.5462376475334167,
      "loss_layer_18_head": 0.44158822298049927,
      "loss_layer_24_head": 0.26466643810272217,
      "loss_layer_30_head": 0.1535748541355133,
      "loss_layer_36_head": 0.09469742327928543,
      "loss_layer_42_head": 0.04912467300891876,
      "loss_layer_6_head": 0.770872950553894,
      "step": 6240
    },
    {
      "epoch": 79.936,
      "grad_norm": 0.07007981855813088,
      "learning_rate": 0.0005812979330806672,
      "loss": 2.3282,
      "loss_layer_12_head": 0.5634763240814209,
      "loss_layer_18_head": 0.4497946798801422,
      "loss_layer_24_head": 0.2671286463737488,
      "loss_layer_30_head": 0.15154129266738892,
      "loss_layer_36_head": 0.09315936267375946,
      "loss_layer_42_head": 0.04806562140583992,
      "loss_layer_6_head": 0.8012627363204956,
      "step": 6245
    },
    {
      "epoch": 80.0,
      "grad_norm": 0.07277494590574841,
      "learning_rate": 0.0005777165806292109,
      "loss": 2.3125,
      "loss_layer_12_head": 0.580785870552063,
      "loss_layer_18_head": 0.46266111731529236,
      "loss_layer_24_head": 0.2749088406562805,
      "loss_layer_30_head": 0.1563441902399063,
      "loss_layer_36_head": 0.09610249102115631,
      "loss_layer_42_head": 0.04940047860145569,
      "loss_layer_6_head": 0.8234256505966187,
      "step": 6250
    },
    {
      "epoch": 80.064,
      "grad_norm": 0.06749373835869633,
      "learning_rate": 0.0005741448527795137,
      "loss": 2.2764,
      "loss_layer_12_head": 0.5465020537376404,
      "loss_layer_18_head": 0.43376070261001587,
      "loss_layer_24_head": 0.25830215215682983,
      "loss_layer_30_head": 0.14812004566192627,
      "loss_layer_36_head": 0.09112035483121872,
      "loss_layer_42_head": 0.046912290155887604,
      "loss_layer_6_head": 0.7804373502731323,
      "step": 6255
    },
    {
      "epoch": 80.128,
      "grad_norm": 0.0718003781521697,
      "learning_rate": 0.0005705827674147124,
      "loss": 2.2309,
      "loss_layer_12_head": 0.5055789947509766,
      "loss_layer_18_head": 0.40339261293411255,
      "loss_layer_24_head": 0.23991331458091736,
      "loss_layer_30_head": 0.137883260846138,
      "loss_layer_36_head": 0.08469338715076447,
      "loss_layer_42_head": 0.043835632503032684,
      "loss_layer_6_head": 0.717957079410553,
      "step": 6260
    },
    {
      "epoch": 80.192,
      "grad_norm": 0.07137090723643098,
      "learning_rate": 0.0005670303423696654,
      "loss": 2.2773,
      "loss_layer_12_head": 0.5555157661437988,
      "loss_layer_18_head": 0.4459607005119324,
      "loss_layer_24_head": 0.26830536127090454,
      "loss_layer_30_head": 0.15594688057899475,
      "loss_layer_36_head": 0.09624911844730377,
      "loss_layer_42_head": 0.05033060908317566,
      "loss_layer_6_head": 0.784421443939209,
      "step": 6265
    },
    {
      "epoch": 80.256,
      "grad_norm": 0.07766510662566106,
      "learning_rate": 0.0005634875954308638,
      "loss": 2.2853,
      "loss_layer_12_head": 0.5443748831748962,
      "loss_layer_18_head": 0.43465548753738403,
      "loss_layer_24_head": 0.26033395528793335,
      "loss_layer_30_head": 0.14811848104000092,
      "loss_layer_36_head": 0.09065987169742584,
      "loss_layer_42_head": 0.04677652567625046,
      "loss_layer_6_head": 0.7712472677230835,
      "step": 6270
    },
    {
      "epoch": 80.32,
      "grad_norm": 0.07281408820595216,
      "learning_rate": 0.0005599545443363411,
      "loss": 2.2749,
      "loss_layer_12_head": 0.5692509412765503,
      "loss_layer_18_head": 0.45198163390159607,
      "loss_layer_24_head": 0.26871129870414734,
      "loss_layer_30_head": 0.15244905650615692,
      "loss_layer_36_head": 0.0934966504573822,
      "loss_layer_42_head": 0.0480445958673954,
      "loss_layer_6_head": 0.8076051473617554,
      "step": 6275
    },
    {
      "epoch": 80.384,
      "grad_norm": 0.0756009577824708,
      "learning_rate": 0.0005564312067755856,
      "loss": 2.2713,
      "loss_layer_12_head": 0.5362197160720825,
      "loss_layer_18_head": 0.4256981909275055,
      "loss_layer_24_head": 0.2536953091621399,
      "loss_layer_30_head": 0.14580512046813965,
      "loss_layer_36_head": 0.08969803899526596,
      "loss_layer_42_head": 0.04624030739068985,
      "loss_layer_6_head": 0.7607301473617554,
      "step": 6280
    },
    {
      "epoch": 80.448,
      "grad_norm": 0.07138122771954182,
      "learning_rate": 0.0005529176003894509,
      "loss": 2.306,
      "loss_layer_12_head": 0.5549246072769165,
      "loss_layer_18_head": 0.44334182143211365,
      "loss_layer_24_head": 0.26334866881370544,
      "loss_layer_30_head": 0.15173576772212982,
      "loss_layer_36_head": 0.09371159970760345,
      "loss_layer_42_head": 0.04916629567742348,
      "loss_layer_6_head": 0.7843819856643677,
      "step": 6285
    },
    {
      "epoch": 80.512,
      "grad_norm": 0.06457773687495323,
      "learning_rate": 0.0005494137427700688,
      "loss": 2.319,
      "loss_layer_12_head": 0.5695633292198181,
      "loss_layer_18_head": 0.4526449143886566,
      "loss_layer_24_head": 0.2707459330558777,
      "loss_layer_30_head": 0.15521197021007538,
      "loss_layer_36_head": 0.09533359855413437,
      "loss_layer_42_head": 0.048909157514572144,
      "loss_layer_6_head": 0.8061602711677551,
      "step": 6290
    },
    {
      "epoch": 80.576,
      "grad_norm": 0.07720745360710506,
      "learning_rate": 0.000545919651460759,
      "loss": 2.2692,
      "loss_layer_12_head": 0.5586211085319519,
      "loss_layer_18_head": 0.44814997911453247,
      "loss_layer_24_head": 0.2707417905330658,
      "loss_layer_30_head": 0.15661552548408508,
      "loss_layer_36_head": 0.09691490232944489,
      "loss_layer_42_head": 0.05194586515426636,
      "loss_layer_6_head": 0.7834509611129761,
      "step": 6295
    },
    {
      "epoch": 80.64,
      "grad_norm": 0.07268852776116513,
      "learning_rate": 0.0005424353439559445,
      "loss": 2.3147,
      "loss_layer_12_head": 0.5614066123962402,
      "loss_layer_18_head": 0.44719213247299194,
      "loss_layer_24_head": 0.2633689045906067,
      "loss_layer_30_head": 0.1503949612379074,
      "loss_layer_36_head": 0.09221871197223663,
      "loss_layer_42_head": 0.04792993143200874,
      "loss_layer_6_head": 0.7958675622940063,
      "step": 6300
    },
    {
      "epoch": 80.704,
      "grad_norm": 0.06842348714256999,
      "learning_rate": 0.0005389608377010608,
      "loss": 2.2842,
      "loss_layer_12_head": 0.5481469035148621,
      "loss_layer_18_head": 0.4353587031364441,
      "loss_layer_24_head": 0.2595806121826172,
      "loss_layer_30_head": 0.1482030600309372,
      "loss_layer_36_head": 0.09056597948074341,
      "loss_layer_42_head": 0.046818818897008896,
      "loss_layer_6_head": 0.7785199284553528,
      "step": 6305
    },
    {
      "epoch": 80.768,
      "grad_norm": 0.07035052337794474,
      "learning_rate": 0.0005354961500924705,
      "loss": 2.3346,
      "loss_layer_12_head": 0.5526851415634155,
      "loss_layer_18_head": 0.4427970051765442,
      "loss_layer_24_head": 0.26230621337890625,
      "loss_layer_30_head": 0.15085116028785706,
      "loss_layer_36_head": 0.09227116405963898,
      "loss_layer_42_head": 0.04789404943585396,
      "loss_layer_6_head": 0.7811059951782227,
      "step": 6310
    },
    {
      "epoch": 80.832,
      "grad_norm": 0.07109838771052028,
      "learning_rate": 0.0005320412984773748,
      "loss": 2.3175,
      "loss_layer_12_head": 0.537881076335907,
      "loss_layer_18_head": 0.4296366274356842,
      "loss_layer_24_head": 0.2556877136230469,
      "loss_layer_30_head": 0.14663414657115936,
      "loss_layer_36_head": 0.09058143198490143,
      "loss_layer_42_head": 0.047406330704689026,
      "loss_layer_6_head": 0.7622926235198975,
      "step": 6315
    },
    {
      "epoch": 80.896,
      "grad_norm": 0.07009261369784812,
      "learning_rate": 0.000528596300153728,
      "loss": 2.296,
      "loss_layer_12_head": 0.5450689792633057,
      "loss_layer_18_head": 0.4386252760887146,
      "loss_layer_24_head": 0.2617475390434265,
      "loss_layer_30_head": 0.14966312050819397,
      "loss_layer_36_head": 0.09174905717372894,
      "loss_layer_42_head": 0.0471545085310936,
      "loss_layer_6_head": 0.7706741094589233,
      "step": 6320
    },
    {
      "epoch": 80.96,
      "grad_norm": 0.07386060185251932,
      "learning_rate": 0.0005251611723701516,
      "loss": 2.3516,
      "loss_layer_12_head": 0.5226753950119019,
      "loss_layer_18_head": 0.4194074273109436,
      "loss_layer_24_head": 0.24988910555839539,
      "loss_layer_30_head": 0.14227305352687836,
      "loss_layer_36_head": 0.08765731751918793,
      "loss_layer_42_head": 0.04517645388841629,
      "loss_layer_6_head": 0.7406671643257141,
      "step": 6325
    },
    {
      "epoch": 81.024,
      "grad_norm": 0.06743315644669128,
      "learning_rate": 0.0005217359323258458,
      "loss": 2.2556,
      "loss_layer_12_head": 0.5230852365493774,
      "loss_layer_18_head": 0.4174930453300476,
      "loss_layer_24_head": 0.2494308054447174,
      "loss_layer_30_head": 0.14333298802375793,
      "loss_layer_36_head": 0.08857410401105881,
      "loss_layer_42_head": 0.04546217992901802,
      "loss_layer_6_head": 0.7469751238822937,
      "step": 6330
    },
    {
      "epoch": 81.088,
      "grad_norm": 0.07328677975251996,
      "learning_rate": 0.0005183205971705016,
      "loss": 2.2362,
      "loss_layer_12_head": 0.5645562410354614,
      "loss_layer_18_head": 0.44843825697898865,
      "loss_layer_24_head": 0.26677849888801575,
      "loss_layer_30_head": 0.15092957019805908,
      "loss_layer_36_head": 0.09231148660182953,
      "loss_layer_42_head": 0.04707021266222,
      "loss_layer_6_head": 0.7979294657707214,
      "step": 6335
    },
    {
      "epoch": 81.152,
      "grad_norm": 0.06575177258257897,
      "learning_rate": 0.0005149151840042224,
      "loss": 2.3178,
      "loss_layer_12_head": 0.5329887866973877,
      "loss_layer_18_head": 0.4235006868839264,
      "loss_layer_24_head": 0.2517310380935669,
      "loss_layer_30_head": 0.14428047835826874,
      "loss_layer_36_head": 0.08879682421684265,
      "loss_layer_42_head": 0.045687153935432434,
      "loss_layer_6_head": 0.7565253376960754,
      "step": 6340
    },
    {
      "epoch": 81.216,
      "grad_norm": 0.06776676530974438,
      "learning_rate": 0.0005115197098774302,
      "loss": 2.2777,
      "loss_layer_12_head": 0.5474923253059387,
      "loss_layer_18_head": 0.4341053366661072,
      "loss_layer_24_head": 0.25803130865097046,
      "loss_layer_30_head": 0.1468833088874817,
      "loss_layer_36_head": 0.08900482207536697,
      "loss_layer_42_head": 0.04562674090266228,
      "loss_layer_6_head": 0.7767430543899536,
      "step": 6345
    },
    {
      "epoch": 81.28,
      "grad_norm": 0.07772922117212219,
      "learning_rate": 0.0005081341917907853,
      "loss": 2.2714,
      "loss_layer_12_head": 0.529261589050293,
      "loss_layer_18_head": 0.4219036102294922,
      "loss_layer_24_head": 0.2505970001220703,
      "loss_layer_30_head": 0.1441197693347931,
      "loss_layer_36_head": 0.08866091072559357,
      "loss_layer_42_head": 0.045912060886621475,
      "loss_layer_6_head": 0.7599284648895264,
      "step": 6350
    },
    {
      "epoch": 81.344,
      "grad_norm": 0.06576221058490588,
      "learning_rate": 0.0005047586466950984,
      "loss": 2.259,
      "loss_layer_12_head": 0.5315473675727844,
      "loss_layer_18_head": 0.4240263104438782,
      "loss_layer_24_head": 0.2501051127910614,
      "loss_layer_30_head": 0.14392022788524628,
      "loss_layer_36_head": 0.0884777158498764,
      "loss_layer_42_head": 0.045930568128824234,
      "loss_layer_6_head": 0.7590261697769165,
      "step": 6355
    },
    {
      "epoch": 81.408,
      "grad_norm": 0.07015062849385176,
      "learning_rate": 0.0005013930914912477,
      "loss": 2.2606,
      "loss_layer_12_head": 0.5186308026313782,
      "loss_layer_18_head": 0.4117237627506256,
      "loss_layer_24_head": 0.24380025267601013,
      "loss_layer_30_head": 0.138962060213089,
      "loss_layer_36_head": 0.08568651974201202,
      "loss_layer_42_head": 0.044214341789484024,
      "loss_layer_6_head": 0.7397429347038269,
      "step": 6360
    },
    {
      "epoch": 81.472,
      "grad_norm": 0.06719300942145427,
      "learning_rate": 0.000498037543030092,
      "loss": 2.2674,
      "loss_layer_12_head": 0.5418444871902466,
      "loss_layer_18_head": 0.43206149339675903,
      "loss_layer_24_head": 0.25757092237472534,
      "loss_layer_30_head": 0.14704783260822296,
      "loss_layer_36_head": 0.09059353172779083,
      "loss_layer_42_head": 0.04674364998936653,
      "loss_layer_6_head": 0.7711106538772583,
      "step": 6365
    },
    {
      "epoch": 81.536,
      "grad_norm": 0.07020126226526263,
      "learning_rate": 0.0004946920181123904,
      "loss": 2.3038,
      "loss_layer_12_head": 0.5518583059310913,
      "loss_layer_18_head": 0.4385463297367096,
      "loss_layer_24_head": 0.26084479689598083,
      "loss_layer_30_head": 0.14810551702976227,
      "loss_layer_36_head": 0.09128700941801071,
      "loss_layer_42_head": 0.04719919711351395,
      "loss_layer_6_head": 0.7815605401992798,
      "step": 6370
    },
    {
      "epoch": 81.6,
      "grad_norm": 0.0641633627654232,
      "learning_rate": 0.0004913565334887135,
      "loss": 2.3195,
      "loss_layer_12_head": 0.5485092997550964,
      "loss_layer_18_head": 0.4380178451538086,
      "loss_layer_24_head": 0.2613169550895691,
      "loss_layer_30_head": 0.14870287477970123,
      "loss_layer_36_head": 0.09057803452014923,
      "loss_layer_42_head": 0.04691166803240776,
      "loss_layer_6_head": 0.7741998434066772,
      "step": 6375
    },
    {
      "epoch": 81.664,
      "grad_norm": 0.06891139288401883,
      "learning_rate": 0.00048803110585936174,
      "loss": 2.3049,
      "loss_layer_12_head": 0.5449396371841431,
      "loss_layer_18_head": 0.4337214529514313,
      "loss_layer_24_head": 0.25782161951065063,
      "loss_layer_30_head": 0.146535724401474,
      "loss_layer_36_head": 0.08978348225355148,
      "loss_layer_42_head": 0.04637492075562477,
      "loss_layer_6_head": 0.7744215130805969,
      "step": 6380
    },
    {
      "epoch": 81.728,
      "grad_norm": 0.06045665856647066,
      "learning_rate": 0.00048471575187428173,
      "loss": 2.3078,
      "loss_layer_12_head": 0.5691594481468201,
      "loss_layer_18_head": 0.45365285873413086,
      "loss_layer_24_head": 0.2699325680732727,
      "loss_layer_30_head": 0.15428777039051056,
      "loss_layer_36_head": 0.09382623434066772,
      "loss_layer_42_head": 0.04811197146773338,
      "loss_layer_6_head": 0.8115581274032593,
      "step": 6385
    },
    {
      "epoch": 81.792,
      "grad_norm": 0.07606300604000266,
      "learning_rate": 0.0004814104881329828,
      "loss": 2.3104,
      "loss_layer_12_head": 0.5532704591751099,
      "loss_layer_18_head": 0.44358938932418823,
      "loss_layer_24_head": 0.26474320888519287,
      "loss_layer_30_head": 0.15215156972408295,
      "loss_layer_36_head": 0.09333119541406631,
      "loss_layer_42_head": 0.047965191304683685,
      "loss_layer_6_head": 0.7847241759300232,
      "step": 6390
    },
    {
      "epoch": 81.856,
      "grad_norm": 0.06557210400815436,
      "learning_rate": 0.0004781153311844555,
      "loss": 2.317,
      "loss_layer_12_head": 0.5353488326072693,
      "loss_layer_18_head": 0.4284172058105469,
      "loss_layer_24_head": 0.253751277923584,
      "loss_layer_30_head": 0.14492549002170563,
      "loss_layer_36_head": 0.08887295424938202,
      "loss_layer_42_head": 0.045748524367809296,
      "loss_layer_6_head": 0.7599323391914368,
      "step": 6395
    },
    {
      "epoch": 81.92,
      "grad_norm": 0.06565637937187242,
      "learning_rate": 0.0004748302975270838,
      "loss": 2.3297,
      "loss_layer_12_head": 0.5635569095611572,
      "loss_layer_18_head": 0.44990473985671997,
      "loss_layer_24_head": 0.2696107029914856,
      "loss_layer_30_head": 0.15528568625450134,
      "loss_layer_36_head": 0.09590446203947067,
      "loss_layer_42_head": 0.049012474715709686,
      "loss_layer_6_head": 0.7991381883621216,
      "step": 6400
    },
    {
      "epoch": 81.92,
      "eval_loss": 5.362007141113281,
      "eval_loss_layer_12_head": 1.240680456161499,
      "eval_loss_layer_18_head": 1.074781060218811,
      "eval_loss_layer_24_head": 0.6867393255233765,
      "eval_loss_layer_30_head": 0.44434356689453125,
      "eval_loss_layer_36_head": 0.28553837537765503,
      "eval_loss_layer_42_head": 0.16864898800849915,
      "eval_loss_layer_6_head": 1.5870819091796875,
      "eval_runtime": 33.0755,
      "eval_samples_per_second": 9.675,
      "eval_steps_per_second": 0.605,
      "step": 6400
    },
    {
      "epoch": 81.984,
      "grad_norm": 0.06977032815684953,
      "learning_rate": 0.0004715554036085673,
      "loss": 2.2872,
      "loss_layer_12_head": 0.5479716062545776,
      "loss_layer_18_head": 0.4353172183036804,
      "loss_layer_24_head": 0.2597397565841675,
      "loss_layer_30_head": 0.15082158148288727,
      "loss_layer_36_head": 0.09491641819477081,
      "loss_layer_42_head": 0.048453234136104584,
      "loss_layer_6_head": 0.7724564671516418,
      "step": 6405
    },
    {
      "epoch": 82.048,
      "grad_norm": 0.06627349460839743,
      "learning_rate": 0.0004682906658258393,
      "loss": 2.2357,
      "loss_layer_12_head": 0.504104733467102,
      "loss_layer_18_head": 0.40161505341529846,
      "loss_layer_24_head": 0.23876126110553741,
      "loss_layer_30_head": 0.1346750259399414,
      "loss_layer_36_head": 0.08326327055692673,
      "loss_layer_42_head": 0.04267028719186783,
      "loss_layer_6_head": 0.7238720655441284,
      "step": 6410
    },
    {
      "epoch": 82.112,
      "grad_norm": 0.07058537564513181,
      "learning_rate": 0.0004650361005249804,
      "loss": 2.2314,
      "loss_layer_12_head": 0.5315099954605103,
      "loss_layer_18_head": 0.4244975447654724,
      "loss_layer_24_head": 0.25452619791030884,
      "loss_layer_30_head": 0.14542880654335022,
      "loss_layer_36_head": 0.08937983214855194,
      "loss_layer_42_head": 0.04638016223907471,
      "loss_layer_6_head": 0.7556833028793335,
      "step": 6415
    },
    {
      "epoch": 82.176,
      "grad_norm": 0.07170686456792906,
      "learning_rate": 0.00046179172400113934,
      "loss": 2.2238,
      "loss_layer_12_head": 0.5444965958595276,
      "loss_layer_18_head": 0.43281984329223633,
      "loss_layer_24_head": 0.2562093436717987,
      "loss_layer_30_head": 0.14627817273139954,
      "loss_layer_36_head": 0.0895707979798317,
      "loss_layer_42_head": 0.04615187644958496,
      "loss_layer_6_head": 0.7753661870956421,
      "step": 6420
    },
    {
      "epoch": 82.24,
      "grad_norm": 0.0685604127782613,
      "learning_rate": 0.00045855755249845146,
      "loss": 2.2805,
      "loss_layer_12_head": 0.5045840740203857,
      "loss_layer_18_head": 0.40042644739151,
      "loss_layer_24_head": 0.2382729947566986,
      "loss_layer_30_head": 0.13550888001918793,
      "loss_layer_36_head": 0.08343406766653061,
      "loss_layer_42_head": 0.043208636343479156,
      "loss_layer_6_head": 0.7214422225952148,
      "step": 6425
    },
    {
      "epoch": 82.304,
      "grad_norm": 0.06447872758618493,
      "learning_rate": 0.00045533360220995745,
      "loss": 2.2836,
      "loss_layer_12_head": 0.5501700043678284,
      "loss_layer_18_head": 0.43940919637680054,
      "loss_layer_24_head": 0.26322808861732483,
      "loss_layer_30_head": 0.15069186687469482,
      "loss_layer_36_head": 0.09255506098270416,
      "loss_layer_42_head": 0.04768211394548416,
      "loss_layer_6_head": 0.7830749750137329,
      "step": 6430
    },
    {
      "epoch": 82.368,
      "grad_norm": 0.06115917504965647,
      "learning_rate": 0.00045211988927752023,
      "loss": 2.2966,
      "loss_layer_12_head": 0.530430257320404,
      "loss_layer_18_head": 0.4232467710971832,
      "loss_layer_24_head": 0.2519424557685852,
      "loss_layer_30_head": 0.1441952884197235,
      "loss_layer_36_head": 0.08791600912809372,
      "loss_layer_42_head": 0.044928163290023804,
      "loss_layer_6_head": 0.75372713804245,
      "step": 6435
    },
    {
      "epoch": 82.432,
      "grad_norm": 0.06970146002771342,
      "learning_rate": 0.00044891642979174917,
      "loss": 2.2986,
      "loss_layer_12_head": 0.5445534586906433,
      "loss_layer_18_head": 0.43682679533958435,
      "loss_layer_24_head": 0.26217442750930786,
      "loss_layer_30_head": 0.1518653929233551,
      "loss_layer_36_head": 0.09270922094583511,
      "loss_layer_42_head": 0.04765409976243973,
      "loss_layer_6_head": 0.769740641117096,
      "step": 6440
    },
    {
      "epoch": 82.496,
      "grad_norm": 0.06486674380701592,
      "learning_rate": 0.00044572323979191145,
      "loss": 2.2653,
      "loss_layer_12_head": 0.5318580269813538,
      "loss_layer_18_head": 0.42313456535339355,
      "loss_layer_24_head": 0.252377450466156,
      "loss_layer_30_head": 0.14444394409656525,
      "loss_layer_36_head": 0.08880550414323807,
      "loss_layer_42_head": 0.045447833836078644,
      "loss_layer_6_head": 0.757294774055481,
      "step": 6445
    },
    {
      "epoch": 82.56,
      "grad_norm": 0.06312860961367521,
      "learning_rate": 0.00044254033526585913,
      "loss": 2.2994,
      "loss_layer_12_head": 0.5686269402503967,
      "loss_layer_18_head": 0.45154905319213867,
      "loss_layer_24_head": 0.2673887610435486,
      "loss_layer_30_head": 0.15184618532657623,
      "loss_layer_36_head": 0.09234652668237686,
      "loss_layer_42_head": 0.04738082364201546,
      "loss_layer_6_head": 0.8101892471313477,
      "step": 6450
    },
    {
      "epoch": 82.624,
      "grad_norm": 0.06382166634451,
      "learning_rate": 0.00043936773214994676,
      "loss": 2.312,
      "loss_layer_12_head": 0.5555010437965393,
      "loss_layer_18_head": 0.44387292861938477,
      "loss_layer_24_head": 0.2639711797237396,
      "loss_layer_30_head": 0.15046079456806183,
      "loss_layer_36_head": 0.09254945814609528,
      "loss_layer_42_head": 0.047136884182691574,
      "loss_layer_6_head": 0.7950524091720581,
      "step": 6455
    },
    {
      "epoch": 82.688,
      "grad_norm": 0.0617043788046962,
      "learning_rate": 0.00043620544632895,
      "loss": 2.2713,
      "loss_layer_12_head": 0.5621874332427979,
      "loss_layer_18_head": 0.44960299134254456,
      "loss_layer_24_head": 0.2673717141151428,
      "loss_layer_30_head": 0.1528889387845993,
      "loss_layer_36_head": 0.09380461275577545,
      "loss_layer_42_head": 0.04814904183149338,
      "loss_layer_6_head": 0.7954167127609253,
      "step": 6460
    },
    {
      "epoch": 82.752,
      "grad_norm": 0.06666433994117546,
      "learning_rate": 0.00043305349363598726,
      "loss": 2.309,
      "loss_layer_12_head": 0.5346735715866089,
      "loss_layer_18_head": 0.4265333116054535,
      "loss_layer_24_head": 0.254997581243515,
      "loss_layer_30_head": 0.14936663210391998,
      "loss_layer_36_head": 0.09220517426729202,
      "loss_layer_42_head": 0.04807667434215546,
      "loss_layer_6_head": 0.7550350427627563,
      "step": 6465
    },
    {
      "epoch": 82.816,
      "grad_norm": 0.07300515158894817,
      "learning_rate": 0.00042991188985244124,
      "loss": 2.2707,
      "loss_layer_12_head": 0.5408638119697571,
      "loss_layer_18_head": 0.4321215748786926,
      "loss_layer_24_head": 0.2580704689025879,
      "loss_layer_30_head": 0.14822807908058167,
      "loss_layer_36_head": 0.0910295769572258,
      "loss_layer_42_head": 0.04712715744972229,
      "loss_layer_6_head": 0.7694892883300781,
      "step": 6470
    },
    {
      "epoch": 82.88,
      "grad_norm": 0.06201893469432807,
      "learning_rate": 0.0004267806507078778,
      "loss": 2.321,
      "loss_layer_12_head": 0.5474928617477417,
      "loss_layer_18_head": 0.43404173851013184,
      "loss_layer_24_head": 0.2570883631706238,
      "loss_layer_30_head": 0.14552977681159973,
      "loss_layer_36_head": 0.08865568786859512,
      "loss_layer_42_head": 0.04564545676112175,
      "loss_layer_6_head": 0.7799210548400879,
      "step": 6475
    },
    {
      "epoch": 82.944,
      "grad_norm": 0.06954505307985251,
      "learning_rate": 0.0004236597918799709,
      "loss": 2.3204,
      "loss_layer_12_head": 0.534702479839325,
      "loss_layer_18_head": 0.4250416159629822,
      "loss_layer_24_head": 0.2524891495704651,
      "loss_layer_30_head": 0.14472638070583344,
      "loss_layer_36_head": 0.08935188502073288,
      "loss_layer_42_head": 0.046680741012096405,
      "loss_layer_6_head": 0.7567447423934937,
      "step": 6480
    },
    {
      "epoch": 83.008,
      "grad_norm": 0.06730073894215037,
      "learning_rate": 0.000420549328994419,
      "loss": 2.2389,
      "loss_layer_12_head": 0.5236352682113647,
      "loss_layer_18_head": 0.4184463918209076,
      "loss_layer_24_head": 0.24846434593200684,
      "loss_layer_30_head": 0.14225628972053528,
      "loss_layer_36_head": 0.08750338852405548,
      "loss_layer_42_head": 0.04500048607587814,
      "loss_layer_6_head": 0.7453343272209167,
      "step": 6485
    },
    {
      "epoch": 83.072,
      "grad_norm": 0.07086383911783214,
      "learning_rate": 0.0004174492776248712,
      "loss": 2.2484,
      "loss_layer_12_head": 0.5279995799064636,
      "loss_layer_18_head": 0.4181602895259857,
      "loss_layer_24_head": 0.24762959778308868,
      "loss_layer_30_head": 0.1406756192445755,
      "loss_layer_36_head": 0.08583928644657135,
      "loss_layer_42_head": 0.04408843070268631,
      "loss_layer_6_head": 0.7570225596427917,
      "step": 6490
    },
    {
      "epoch": 83.136,
      "grad_norm": 0.060149681116245815,
      "learning_rate": 0.00041435965329284673,
      "loss": 2.2615,
      "loss_layer_12_head": 0.5425292253494263,
      "loss_layer_18_head": 0.4322192072868347,
      "loss_layer_24_head": 0.25730809569358826,
      "loss_layer_30_head": 0.14560768008232117,
      "loss_layer_36_head": 0.08944865316152573,
      "loss_layer_42_head": 0.0458780936896801,
      "loss_layer_6_head": 0.7736630439758301,
      "step": 6495
    },
    {
      "epoch": 83.2,
      "grad_norm": 0.06759692690079547,
      "learning_rate": 0.0004112804714676593,
      "loss": 2.2336,
      "loss_layer_12_head": 0.5209602117538452,
      "loss_layer_18_head": 0.4106729030609131,
      "loss_layer_24_head": 0.2440238893032074,
      "loss_layer_30_head": 0.13832072913646698,
      "loss_layer_36_head": 0.08504743874073029,
      "loss_layer_42_head": 0.04380479454994202,
      "loss_layer_6_head": 0.7434916496276855,
      "step": 6500
    },
    {
      "epoch": 83.264,
      "grad_norm": 0.06581781826269925,
      "learning_rate": 0.0004082117475663363,
      "loss": 2.2483,
      "loss_layer_12_head": 0.5414561033248901,
      "loss_layer_18_head": 0.4311619699001312,
      "loss_layer_24_head": 0.25873950123786926,
      "loss_layer_30_head": 0.1471589058637619,
      "loss_layer_36_head": 0.09013302624225616,
      "loss_layer_42_head": 0.04676305130124092,
      "loss_layer_6_head": 0.7623820304870605,
      "step": 6505
    },
    {
      "epoch": 83.328,
      "grad_norm": 0.0612294165749386,
      "learning_rate": 0.00040515349695354727,
      "loss": 2.2429,
      "loss_layer_12_head": 0.538168728351593,
      "loss_layer_18_head": 0.4268474578857422,
      "loss_layer_24_head": 0.2530152201652527,
      "loss_layer_30_head": 0.14488910138607025,
      "loss_layer_36_head": 0.0887119472026825,
      "loss_layer_42_head": 0.045661356300115585,
      "loss_layer_6_head": 0.7748141288757324,
      "step": 6510
    },
    {
      "epoch": 83.392,
      "grad_norm": 0.06156604366332751,
      "learning_rate": 0.0004021057349415197,
      "loss": 2.2668,
      "loss_layer_12_head": 0.5360924005508423,
      "loss_layer_18_head": 0.4267965257167816,
      "loss_layer_24_head": 0.25417616963386536,
      "loss_layer_30_head": 0.1457333117723465,
      "loss_layer_36_head": 0.08935729414224625,
      "loss_layer_42_head": 0.04546811059117317,
      "loss_layer_6_head": 0.7638968229293823,
      "step": 6515
    },
    {
      "epoch": 83.456,
      "grad_norm": 0.06218092447997475,
      "learning_rate": 0.00039906847678996773,
      "loss": 2.2634,
      "loss_layer_12_head": 0.5339849591255188,
      "loss_layer_18_head": 0.4264319837093353,
      "loss_layer_24_head": 0.2543196976184845,
      "loss_layer_30_head": 0.1458529531955719,
      "loss_layer_36_head": 0.09017837792634964,
      "loss_layer_42_head": 0.04655279964208603,
      "loss_layer_6_head": 0.7653087973594666,
      "step": 6520
    },
    {
      "epoch": 83.52,
      "grad_norm": 0.06605481402449438,
      "learning_rate": 0.0003960417377060152,
      "loss": 2.2813,
      "loss_layer_12_head": 0.530917763710022,
      "loss_layer_18_head": 0.42319709062576294,
      "loss_layer_24_head": 0.2513943612575531,
      "loss_layer_30_head": 0.14326710999011993,
      "loss_layer_36_head": 0.08843793720006943,
      "loss_layer_42_head": 0.04532283544540405,
      "loss_layer_6_head": 0.7629538774490356,
      "step": 6525
    },
    {
      "epoch": 83.584,
      "grad_norm": 0.06547053386772266,
      "learning_rate": 0.00039302553284411705,
      "loss": 2.2816,
      "loss_layer_12_head": 0.5216262340545654,
      "loss_layer_18_head": 0.4169894754886627,
      "loss_layer_24_head": 0.2500878870487213,
      "loss_layer_30_head": 0.14252053201198578,
      "loss_layer_36_head": 0.08740290999412537,
      "loss_layer_42_head": 0.0451023168861866,
      "loss_layer_6_head": 0.7482255697250366,
      "step": 6530
    },
    {
      "epoch": 83.648,
      "grad_norm": 0.06817766738698702,
      "learning_rate": 0.0003900198773059854,
      "loss": 2.2731,
      "loss_layer_12_head": 0.5453729033470154,
      "loss_layer_18_head": 0.4367977976799011,
      "loss_layer_24_head": 0.26214203238487244,
      "loss_layer_30_head": 0.15089255571365356,
      "loss_layer_36_head": 0.09291098266839981,
      "loss_layer_42_head": 0.04768028110265732,
      "loss_layer_6_head": 0.7692655324935913,
      "step": 6535
    },
    {
      "epoch": 83.712,
      "grad_norm": 0.059312272955413116,
      "learning_rate": 0.0003870247861405135,
      "loss": 2.3266,
      "loss_layer_12_head": 0.5646803975105286,
      "loss_layer_18_head": 0.44927436113357544,
      "loss_layer_24_head": 0.2659614384174347,
      "loss_layer_30_head": 0.15040485560894012,
      "loss_layer_36_head": 0.09187926352024078,
      "loss_layer_42_head": 0.047203827649354935,
      "loss_layer_6_head": 0.802202582359314,
      "step": 6540
    },
    {
      "epoch": 83.776,
      "grad_norm": 0.062053908973284035,
      "learning_rate": 0.0003840402743436988,
      "loss": 2.2816,
      "loss_layer_12_head": 0.5499003529548645,
      "loss_layer_18_head": 0.4376044273376465,
      "loss_layer_24_head": 0.2581423819065094,
      "loss_layer_30_head": 0.14690010249614716,
      "loss_layer_36_head": 0.09000765532255173,
      "loss_layer_42_head": 0.04647928848862648,
      "loss_layer_6_head": 0.7825456857681274,
      "step": 6545
    },
    {
      "epoch": 83.84,
      "grad_norm": 0.06190595840992297,
      "learning_rate": 0.0003810663568585723,
      "loss": 2.2787,
      "loss_layer_12_head": 0.5501017570495605,
      "loss_layer_18_head": 0.44183164834976196,
      "loss_layer_24_head": 0.2664022147655487,
      "loss_layer_30_head": 0.15490277111530304,
      "loss_layer_36_head": 0.09500958025455475,
      "loss_layer_42_head": 0.0488528236746788,
      "loss_layer_6_head": 0.7764110565185547,
      "step": 6550
    },
    {
      "epoch": 83.904,
      "grad_norm": 0.06260322852470804,
      "learning_rate": 0.0003781030485751191,
      "loss": 2.2887,
      "loss_layer_12_head": 0.5549980998039246,
      "loss_layer_18_head": 0.44128894805908203,
      "loss_layer_24_head": 0.2626485228538513,
      "loss_layer_30_head": 0.14861316978931427,
      "loss_layer_36_head": 0.0905989408493042,
      "loss_layer_42_head": 0.04678872600197792,
      "loss_layer_6_head": 0.7853071689605713,
      "step": 6555
    },
    {
      "epoch": 83.968,
      "grad_norm": 0.06159788406696576,
      "learning_rate": 0.00037515036433020347,
      "loss": 2.3229,
      "loss_layer_12_head": 0.5497353672981262,
      "loss_layer_18_head": 0.4411308169364929,
      "loss_layer_24_head": 0.26567545533180237,
      "loss_layer_30_head": 0.1524103730916977,
      "loss_layer_36_head": 0.09357419610023499,
      "loss_layer_42_head": 0.048373233526945114,
      "loss_layer_6_head": 0.7767873406410217,
      "step": 6560
    },
    {
      "epoch": 84.032,
      "grad_norm": 0.06262643353151737,
      "learning_rate": 0.0003722083189075007,
      "loss": 2.3189,
      "loss_layer_12_head": 0.557877779006958,
      "loss_layer_18_head": 0.4446055293083191,
      "loss_layer_24_head": 0.26396864652633667,
      "loss_layer_30_head": 0.14928501844406128,
      "loss_layer_36_head": 0.09068578481674194,
      "loss_layer_42_head": 0.04597433656454086,
      "loss_layer_6_head": 0.7921199202537537,
      "step": 6565
    },
    {
      "epoch": 84.096,
      "grad_norm": 0.0621327190321449,
      "learning_rate": 0.0003692769270374163,
      "loss": 2.241,
      "loss_layer_12_head": 0.5335725545883179,
      "loss_layer_18_head": 0.42681631445884705,
      "loss_layer_24_head": 0.2554943263530731,
      "loss_layer_30_head": 0.14634886384010315,
      "loss_layer_36_head": 0.08966200053691864,
      "loss_layer_42_head": 0.04666420817375183,
      "loss_layer_6_head": 0.7569630146026611,
      "step": 6570
    },
    {
      "epoch": 84.16,
      "grad_norm": 0.06115994179262845,
      "learning_rate": 0.0003663562033970155,
      "loss": 2.2413,
      "loss_layer_12_head": 0.5034642815589905,
      "loss_layer_18_head": 0.4020516276359558,
      "loss_layer_24_head": 0.23935607075691223,
      "loss_layer_30_head": 0.1358463317155838,
      "loss_layer_36_head": 0.08337035775184631,
      "loss_layer_42_head": 0.04273126646876335,
      "loss_layer_6_head": 0.713679313659668,
      "step": 6575
    },
    {
      "epoch": 84.224,
      "grad_norm": 0.05714892650979144,
      "learning_rate": 0.00036344616260994945,
      "loss": 2.2134,
      "loss_layer_12_head": 0.5436224937438965,
      "loss_layer_18_head": 0.4318552017211914,
      "loss_layer_24_head": 0.25710752606391907,
      "loss_layer_30_head": 0.1475636512041092,
      "loss_layer_36_head": 0.09105201065540314,
      "loss_layer_42_head": 0.04672311991453171,
      "loss_layer_6_head": 0.7738708257675171,
      "step": 6580
    },
    {
      "epoch": 84.288,
      "grad_norm": 0.06158939960058595,
      "learning_rate": 0.0003605468192463815,
      "loss": 2.2898,
      "loss_layer_12_head": 0.5495832562446594,
      "loss_layer_18_head": 0.4378860890865326,
      "loss_layer_24_head": 0.26036593317985535,
      "loss_layer_30_head": 0.1479702740907669,
      "loss_layer_36_head": 0.09094228595495224,
      "loss_layer_42_head": 0.046692680567502975,
      "loss_layer_6_head": 0.7762225270271301,
      "step": 6585
    },
    {
      "epoch": 84.352,
      "grad_norm": 0.060474585002816654,
      "learning_rate": 0.0003576581878229143,
      "loss": 2.2669,
      "loss_layer_12_head": 0.5425882935523987,
      "loss_layer_18_head": 0.4313373565673828,
      "loss_layer_24_head": 0.25918155908584595,
      "loss_layer_30_head": 0.14946714043617249,
      "loss_layer_36_head": 0.09118975698947906,
      "loss_layer_42_head": 0.04747995734214783,
      "loss_layer_6_head": 0.7675814032554626,
      "step": 6590
    },
    {
      "epoch": 84.416,
      "grad_norm": 0.05852781058427017,
      "learning_rate": 0.00035478028280251845,
      "loss": 2.2896,
      "loss_layer_12_head": 0.5549778938293457,
      "loss_layer_18_head": 0.4408612847328186,
      "loss_layer_24_head": 0.26182639598846436,
      "loss_layer_30_head": 0.14887897670269012,
      "loss_layer_36_head": 0.09157060086727142,
      "loss_layer_42_head": 0.047488223761320114,
      "loss_layer_6_head": 0.7870487570762634,
      "step": 6595
    },
    {
      "epoch": 84.48,
      "grad_norm": 0.06204773002497898,
      "learning_rate": 0.00035191311859445796,
      "loss": 2.2224,
      "loss_layer_12_head": 0.5224236845970154,
      "loss_layer_18_head": 0.4143506586551666,
      "loss_layer_24_head": 0.24681539833545685,
      "loss_layer_30_head": 0.13961660861968994,
      "loss_layer_36_head": 0.0848795622587204,
      "loss_layer_42_head": 0.04384300857782364,
      "loss_layer_6_head": 0.7432200312614441,
      "step": 6600
    },
    {
      "epoch": 84.48,
      "eval_loss": 5.362083435058594,
      "eval_loss_layer_12_head": 1.2408151626586914,
      "eval_loss_layer_18_head": 1.0751256942749023,
      "eval_loss_layer_24_head": 0.6865283250808716,
      "eval_loss_layer_30_head": 0.4443874955177307,
      "eval_loss_layer_36_head": 0.28464800119400024,
      "eval_loss_layer_42_head": 0.16867145895957947,
      "eval_loss_layer_6_head": 1.588051199913025,
      "eval_runtime": 33.0378,
      "eval_samples_per_second": 9.686,
      "eval_steps_per_second": 0.605,
      "step": 6600
    },
    {
      "epoch": 84.544,
      "grad_norm": 0.06033858231558078,
      "learning_rate": 0.00034905670955421935,
      "loss": 2.2765,
      "loss_layer_12_head": 0.5273030996322632,
      "loss_layer_18_head": 0.42019277811050415,
      "loss_layer_24_head": 0.24942722916603088,
      "loss_layer_30_head": 0.14185816049575806,
      "loss_layer_36_head": 0.08681138604879379,
      "loss_layer_42_head": 0.04469098150730133,
      "loss_layer_6_head": 0.7543646693229675,
      "step": 6605
    },
    {
      "epoch": 84.608,
      "grad_norm": 0.06591925312971718,
      "learning_rate": 0.0003462110699834392,
      "loss": 2.2873,
      "loss_layer_12_head": 0.5726321339607239,
      "loss_layer_18_head": 0.4574497640132904,
      "loss_layer_24_head": 0.2747747302055359,
      "loss_layer_30_head": 0.15806260704994202,
      "loss_layer_36_head": 0.09706728160381317,
      "loss_layer_42_head": 0.050202079117298126,
      "loss_layer_6_head": 0.8090144395828247,
      "step": 6610
    },
    {
      "epoch": 84.672,
      "grad_norm": 0.060020636737372436,
      "learning_rate": 0.00034337621412983273,
      "loss": 2.274,
      "loss_layer_12_head": 0.5485014319419861,
      "loss_layer_18_head": 0.43576937913894653,
      "loss_layer_24_head": 0.2587197422981262,
      "loss_layer_30_head": 0.14631116390228271,
      "loss_layer_36_head": 0.0897674560546875,
      "loss_layer_42_head": 0.04597922042012215,
      "loss_layer_6_head": 0.7769433856010437,
      "step": 6615
    },
    {
      "epoch": 84.736,
      "grad_norm": 0.06461502389386152,
      "learning_rate": 0.0003405521561871247,
      "loss": 2.2519,
      "loss_layer_12_head": 0.5419825315475464,
      "loss_layer_18_head": 0.43156710267066956,
      "loss_layer_24_head": 0.25646883249282837,
      "loss_layer_30_head": 0.1445082128047943,
      "loss_layer_36_head": 0.08953981101512909,
      "loss_layer_42_head": 0.045990943908691406,
      "loss_layer_6_head": 0.7720518708229065,
      "step": 6620
    },
    {
      "epoch": 84.8,
      "grad_norm": 0.058681642829528986,
      "learning_rate": 0.0003377389102949732,
      "loss": 2.2472,
      "loss_layer_12_head": 0.5251632928848267,
      "loss_layer_18_head": 0.4179890751838684,
      "loss_layer_24_head": 0.24744024872779846,
      "loss_layer_30_head": 0.1411394625902176,
      "loss_layer_36_head": 0.08616825193166733,
      "loss_layer_42_head": 0.044686369597911835,
      "loss_layer_6_head": 0.7538606524467468,
      "step": 6625
    },
    {
      "epoch": 84.864,
      "grad_norm": 0.05924120920814245,
      "learning_rate": 0.00033493649053890326,
      "loss": 2.3206,
      "loss_layer_12_head": 0.5227854251861572,
      "loss_layer_18_head": 0.4173906743526459,
      "loss_layer_24_head": 0.24872954189777374,
      "loss_layer_30_head": 0.14242471754550934,
      "loss_layer_36_head": 0.0878685936331749,
      "loss_layer_42_head": 0.04566620662808418,
      "loss_layer_6_head": 0.7439418435096741,
      "step": 6630
    },
    {
      "epoch": 84.928,
      "grad_norm": 0.06421061682365015,
      "learning_rate": 0.0003321449109502361,
      "loss": 2.2583,
      "loss_layer_12_head": 0.5415996313095093,
      "loss_layer_18_head": 0.4322631359100342,
      "loss_layer_24_head": 0.2576761245727539,
      "loss_layer_30_head": 0.14644987881183624,
      "loss_layer_36_head": 0.08958979696035385,
      "loss_layer_42_head": 0.0458441786468029,
      "loss_layer_6_head": 0.767863392829895,
      "step": 6635
    },
    {
      "epoch": 84.992,
      "grad_norm": 0.06097636893702272,
      "learning_rate": 0.00032936418550601616,
      "loss": 2.3505,
      "loss_layer_12_head": 0.5716210603713989,
      "loss_layer_18_head": 0.45637911558151245,
      "loss_layer_24_head": 0.27052217721939087,
      "loss_layer_30_head": 0.15382179617881775,
      "loss_layer_36_head": 0.0943298265337944,
      "loss_layer_42_head": 0.047969602048397064,
      "loss_layer_6_head": 0.8107573390007019,
      "step": 6640
    },
    {
      "epoch": 85.056,
      "grad_norm": 0.061905597148464726,
      "learning_rate": 0.00032659432812894295,
      "loss": 2.246,
      "loss_layer_12_head": 0.4954412579536438,
      "loss_layer_18_head": 0.39710670709609985,
      "loss_layer_24_head": 0.23858042061328888,
      "loss_layer_30_head": 0.1375037282705307,
      "loss_layer_36_head": 0.08466792106628418,
      "loss_layer_42_head": 0.043832652270793915,
      "loss_layer_6_head": 0.7040526270866394,
      "step": 6645
    },
    {
      "epoch": 85.12,
      "grad_norm": 0.05957157556880912,
      "learning_rate": 0.00032383535268730196,
      "loss": 2.2429,
      "loss_layer_12_head": 0.51688551902771,
      "loss_layer_18_head": 0.4119006097316742,
      "loss_layer_24_head": 0.24587401747703552,
      "loss_layer_30_head": 0.14143255352973938,
      "loss_layer_36_head": 0.08723609149456024,
      "loss_layer_42_head": 0.04518958181142807,
      "loss_layer_6_head": 0.7356343269348145,
      "step": 6650
    },
    {
      "epoch": 85.184,
      "grad_norm": 0.06023293142104077,
      "learning_rate": 0.0003210872729948935,
      "loss": 2.2046,
      "loss_layer_12_head": 0.5435062050819397,
      "loss_layer_18_head": 0.432736337184906,
      "loss_layer_24_head": 0.25696316361427307,
      "loss_layer_30_head": 0.1453084796667099,
      "loss_layer_36_head": 0.08833971619606018,
      "loss_layer_42_head": 0.04509878158569336,
      "loss_layer_6_head": 0.7746436595916748,
      "step": 6655
    },
    {
      "epoch": 85.248,
      "grad_norm": 0.05595342267777211,
      "learning_rate": 0.00031835010281096424,
      "loss": 2.2449,
      "loss_layer_12_head": 0.5362009406089783,
      "loss_layer_18_head": 0.42863965034484863,
      "loss_layer_24_head": 0.25790536403656006,
      "loss_layer_30_head": 0.14945060014724731,
      "loss_layer_36_head": 0.09196082502603531,
      "loss_layer_42_head": 0.0475839301943779,
      "loss_layer_6_head": 0.7530527710914612,
      "step": 6660
    },
    {
      "epoch": 85.312,
      "grad_norm": 0.06285853123340257,
      "learning_rate": 0.0003156238558401403,
      "loss": 2.2333,
      "loss_layer_12_head": 0.545531153678894,
      "loss_layer_18_head": 0.43402013182640076,
      "loss_layer_24_head": 0.25750356912612915,
      "loss_layer_30_head": 0.1453232318162918,
      "loss_layer_36_head": 0.08777523785829544,
      "loss_layer_42_head": 0.04482442885637283,
      "loss_layer_6_head": 0.7747218608856201,
      "step": 6665
    },
    {
      "epoch": 85.376,
      "grad_norm": 0.06531799275711905,
      "learning_rate": 0.0003129085457323538,
      "loss": 2.2921,
      "loss_layer_12_head": 0.5586044192314148,
      "loss_layer_18_head": 0.4445754885673523,
      "loss_layer_24_head": 0.2646661698818207,
      "loss_layer_30_head": 0.1508399099111557,
      "loss_layer_36_head": 0.0924069732427597,
      "loss_layer_42_head": 0.04766787961125374,
      "loss_layer_6_head": 0.7919691205024719,
      "step": 6670
    },
    {
      "epoch": 85.44,
      "grad_norm": 0.0710516635076859,
      "learning_rate": 0.00031020418608278033,
      "loss": 2.3031,
      "loss_layer_12_head": 0.5366767048835754,
      "loss_layer_18_head": 0.4291301369667053,
      "loss_layer_24_head": 0.2559660077095032,
      "loss_layer_30_head": 0.14629463851451874,
      "loss_layer_36_head": 0.09010110795497894,
      "loss_layer_42_head": 0.04641090705990791,
      "loss_layer_6_head": 0.7620426416397095,
      "step": 6675
    },
    {
      "epoch": 85.504,
      "grad_norm": 0.0620595591083191,
      "learning_rate": 0.00030751079043176674,
      "loss": 2.2544,
      "loss_layer_12_head": 0.51749587059021,
      "loss_layer_18_head": 0.4116649627685547,
      "loss_layer_24_head": 0.2457735240459442,
      "loss_layer_30_head": 0.13989709317684174,
      "loss_layer_36_head": 0.08563225716352463,
      "loss_layer_42_head": 0.044154729694128036,
      "loss_layer_6_head": 0.739150881767273,
      "step": 6680
    },
    {
      "epoch": 85.568,
      "grad_norm": 0.05999016059744548,
      "learning_rate": 0.0003048283722647649,
      "loss": 2.2773,
      "loss_layer_12_head": 0.521777868270874,
      "loss_layer_18_head": 0.41590985655784607,
      "loss_layer_24_head": 0.24620775878429413,
      "loss_layer_30_head": 0.14072659611701965,
      "loss_layer_36_head": 0.08654164522886276,
      "loss_layer_42_head": 0.044640738517045975,
      "loss_layer_6_head": 0.747124195098877,
      "step": 6685
    },
    {
      "epoch": 85.632,
      "grad_norm": 0.06012884286477552,
      "learning_rate": 0.0003021569450122638,
      "loss": 2.2692,
      "loss_layer_12_head": 0.5401060581207275,
      "loss_layer_18_head": 0.430352121591568,
      "loss_layer_24_head": 0.2541758418083191,
      "loss_layer_30_head": 0.1451440155506134,
      "loss_layer_36_head": 0.08909891545772552,
      "loss_layer_42_head": 0.04602315276861191,
      "loss_layer_6_head": 0.7685240507125854,
      "step": 6690
    },
    {
      "epoch": 85.696,
      "grad_norm": 0.05938457157198052,
      "learning_rate": 0.0002994965220497226,
      "loss": 2.2861,
      "loss_layer_12_head": 0.562614381313324,
      "loss_layer_18_head": 0.44822460412979126,
      "loss_layer_24_head": 0.2669324576854706,
      "loss_layer_30_head": 0.15230251848697662,
      "loss_layer_36_head": 0.09329725801944733,
      "loss_layer_42_head": 0.04752293974161148,
      "loss_layer_6_head": 0.7971410751342773,
      "step": 6695
    },
    {
      "epoch": 85.76,
      "grad_norm": 0.06154626122724264,
      "learning_rate": 0.00029684711669750315,
      "loss": 2.2608,
      "loss_layer_12_head": 0.5318783521652222,
      "loss_layer_18_head": 0.42316970229148865,
      "loss_layer_24_head": 0.2522587478160858,
      "loss_layer_30_head": 0.1446952372789383,
      "loss_layer_36_head": 0.08893351256847382,
      "loss_layer_42_head": 0.045634675770998,
      "loss_layer_6_head": 0.7562094926834106,
      "step": 6700
    },
    {
      "epoch": 85.824,
      "grad_norm": 0.06915524103039539,
      "learning_rate": 0.00029420874222080507,
      "loss": 2.2734,
      "loss_layer_12_head": 0.5471838712692261,
      "loss_layer_18_head": 0.4375895857810974,
      "loss_layer_24_head": 0.26202166080474854,
      "loss_layer_30_head": 0.15052315592765808,
      "loss_layer_36_head": 0.09244464337825775,
      "loss_layer_42_head": 0.04793420433998108,
      "loss_layer_6_head": 0.7745028734207153,
      "step": 6705
    },
    {
      "epoch": 85.888,
      "grad_norm": 0.06380793062994586,
      "learning_rate": 0.0002915814118295965,
      "loss": 2.3181,
      "loss_layer_12_head": 0.5380845069885254,
      "loss_layer_18_head": 0.42939385771751404,
      "loss_layer_24_head": 0.2563352882862091,
      "loss_layer_30_head": 0.14676907658576965,
      "loss_layer_36_head": 0.09006841480731964,
      "loss_layer_42_head": 0.04666604846715927,
      "loss_layer_6_head": 0.7669503092765808,
      "step": 6710
    },
    {
      "epoch": 85.952,
      "grad_norm": 0.0617047684809767,
      "learning_rate": 0.0002889651386785497,
      "loss": 2.2438,
      "loss_layer_12_head": 0.5220370292663574,
      "loss_layer_18_head": 0.41733279824256897,
      "loss_layer_24_head": 0.2464752495288849,
      "loss_layer_30_head": 0.14185042679309845,
      "loss_layer_36_head": 0.08683272451162338,
      "loss_layer_42_head": 0.044636450707912445,
      "loss_layer_6_head": 0.7420752644538879,
      "step": 6715
    },
    {
      "epoch": 86.016,
      "grad_norm": 0.06519374921873936,
      "learning_rate": 0.0002863599358669755,
      "loss": 2.2675,
      "loss_layer_12_head": 0.5332145690917969,
      "loss_layer_18_head": 0.4267454743385315,
      "loss_layer_24_head": 0.25271618366241455,
      "loss_layer_30_head": 0.1453295648097992,
      "loss_layer_36_head": 0.0897432193160057,
      "loss_layer_42_head": 0.045865077525377274,
      "loss_layer_6_head": 0.7544695138931274,
      "step": 6720
    },
    {
      "epoch": 86.08,
      "grad_norm": 0.0637043797422425,
      "learning_rate": 0.00028376581643875634,
      "loss": 2.2191,
      "loss_layer_12_head": 0.5068849325180054,
      "loss_layer_18_head": 0.4017639756202698,
      "loss_layer_24_head": 0.23963817954063416,
      "loss_layer_30_head": 0.13685718178749084,
      "loss_layer_36_head": 0.08402357995510101,
      "loss_layer_42_head": 0.043408799916505814,
      "loss_layer_6_head": 0.723760187625885,
      "step": 6725
    },
    {
      "epoch": 86.144,
      "grad_norm": 0.0599592431437669,
      "learning_rate": 0.00028118279338228136,
      "loss": 2.2499,
      "loss_layer_12_head": 0.5042485594749451,
      "loss_layer_18_head": 0.40544748306274414,
      "loss_layer_24_head": 0.2426948994398117,
      "loss_layer_30_head": 0.14131483435630798,
      "loss_layer_36_head": 0.08776389807462692,
      "loss_layer_42_head": 0.04573874920606613,
      "loss_layer_6_head": 0.7128604054450989,
      "step": 6730
    },
    {
      "epoch": 86.208,
      "grad_norm": 0.0683713435763905,
      "learning_rate": 0.00027861087963038436,
      "loss": 2.2254,
      "loss_layer_12_head": 0.5064147114753723,
      "loss_layer_18_head": 0.40105462074279785,
      "loss_layer_24_head": 0.23610742390155792,
      "loss_layer_30_head": 0.1336607038974762,
      "loss_layer_36_head": 0.08188009262084961,
      "loss_layer_42_head": 0.04243554547429085,
      "loss_layer_6_head": 0.7252146601676941,
      "step": 6735
    },
    {
      "epoch": 86.272,
      "grad_norm": 0.05959543612501831,
      "learning_rate": 0.00027605008806027203,
      "loss": 2.2606,
      "loss_layer_12_head": 0.532181441783905,
      "loss_layer_18_head": 0.42300447821617126,
      "loss_layer_24_head": 0.24985985457897186,
      "loss_layer_30_head": 0.14088596403598785,
      "loss_layer_36_head": 0.08655289560556412,
      "loss_layer_42_head": 0.04447270184755325,
      "loss_layer_6_head": 0.7592952847480774,
      "step": 6740
    },
    {
      "epoch": 86.336,
      "grad_norm": 0.06279553816648359,
      "learning_rate": 0.0002735004314934683,
      "loss": 2.274,
      "loss_layer_12_head": 0.5378656387329102,
      "loss_layer_18_head": 0.4282541275024414,
      "loss_layer_24_head": 0.25439590215682983,
      "loss_layer_30_head": 0.14513885974884033,
      "loss_layer_36_head": 0.08873539417982101,
      "loss_layer_42_head": 0.0454745814204216,
      "loss_layer_6_head": 0.7676151990890503,
      "step": 6745
    },
    {
      "epoch": 86.4,
      "grad_norm": 0.05983909905047214,
      "learning_rate": 0.000270961922695743,
      "loss": 2.2761,
      "loss_layer_12_head": 0.5446697473526001,
      "loss_layer_18_head": 0.4331987500190735,
      "loss_layer_24_head": 0.25742092728614807,
      "loss_layer_30_head": 0.14603550732135773,
      "loss_layer_36_head": 0.08961508423089981,
      "loss_layer_42_head": 0.04593000188469887,
      "loss_layer_6_head": 0.776239275932312,
      "step": 6750
    },
    {
      "epoch": 86.464,
      "grad_norm": 0.06350789397930341,
      "learning_rate": 0.00026843457437705136,
      "loss": 2.2668,
      "loss_layer_12_head": 0.5450259447097778,
      "loss_layer_18_head": 0.43384331464767456,
      "loss_layer_24_head": 0.25872355699539185,
      "loss_layer_30_head": 0.14706823229789734,
      "loss_layer_36_head": 0.08968444168567657,
      "loss_layer_42_head": 0.04611571133136749,
      "loss_layer_6_head": 0.7736265659332275,
      "step": 6755
    },
    {
      "epoch": 86.528,
      "grad_norm": 0.06348262502069905,
      "learning_rate": 0.0002659183991914696,
      "loss": 2.2224,
      "loss_layer_12_head": 0.5226296186447144,
      "loss_layer_18_head": 0.41735514998435974,
      "loss_layer_24_head": 0.24800880253314972,
      "loss_layer_30_head": 0.14158210158348083,
      "loss_layer_36_head": 0.0863649994134903,
      "loss_layer_42_head": 0.04474876821041107,
      "loss_layer_6_head": 0.7437916994094849,
      "step": 6760
    },
    {
      "epoch": 86.592,
      "grad_norm": 0.06111192832007558,
      "learning_rate": 0.00026341340973713186,
      "loss": 2.2828,
      "loss_layer_12_head": 0.5310556888580322,
      "loss_layer_18_head": 0.42655569314956665,
      "loss_layer_24_head": 0.25755709409713745,
      "loss_layer_30_head": 0.1482301652431488,
      "loss_layer_36_head": 0.09033164381980896,
      "loss_layer_42_head": 0.04687374457716942,
      "loss_layer_6_head": 0.757186233997345,
      "step": 6765
    },
    {
      "epoch": 86.656,
      "grad_norm": 0.05840809756963051,
      "learning_rate": 0.00026091961855616633,
      "loss": 2.2855,
      "loss_layer_12_head": 0.5339376926422119,
      "loss_layer_18_head": 0.4295603632926941,
      "loss_layer_24_head": 0.25718119740486145,
      "loss_layer_30_head": 0.15076155960559845,
      "loss_layer_36_head": 0.09343454986810684,
      "loss_layer_42_head": 0.04899568110704422,
      "loss_layer_6_head": 0.7517622113227844,
      "step": 6770
    },
    {
      "epoch": 86.72,
      "grad_norm": 0.059029202708597586,
      "learning_rate": 0.00025843703813463405,
      "loss": 2.2758,
      "loss_layer_12_head": 0.5185956358909607,
      "loss_layer_18_head": 0.41415756940841675,
      "loss_layer_24_head": 0.24835947155952454,
      "loss_layer_30_head": 0.14127469062805176,
      "loss_layer_36_head": 0.0868893712759018,
      "loss_layer_42_head": 0.04456452280282974,
      "loss_layer_6_head": 0.7378668785095215,
      "step": 6775
    },
    {
      "epoch": 86.784,
      "grad_norm": 0.0660265868053307,
      "learning_rate": 0.00025596568090246547,
      "loss": 2.2175,
      "loss_layer_12_head": 0.5526667833328247,
      "loss_layer_18_head": 0.43963804841041565,
      "loss_layer_24_head": 0.2611958980560303,
      "loss_layer_30_head": 0.14853700995445251,
      "loss_layer_36_head": 0.0905013307929039,
      "loss_layer_42_head": 0.04630681499838829,
      "loss_layer_6_head": 0.7816737294197083,
      "step": 6780
    },
    {
      "epoch": 86.848,
      "grad_norm": 0.05995533214622456,
      "learning_rate": 0.0002535055592333954,
      "loss": 2.264,
      "loss_layer_12_head": 0.5486246347427368,
      "loss_layer_18_head": 0.4378492832183838,
      "loss_layer_24_head": 0.26261380314826965,
      "loss_layer_30_head": 0.14971014857292175,
      "loss_layer_36_head": 0.09175258129835129,
      "loss_layer_42_head": 0.04702647775411606,
      "loss_layer_6_head": 0.7823960781097412,
      "step": 6785
    },
    {
      "epoch": 86.912,
      "grad_norm": 0.060886909239631144,
      "learning_rate": 0.00025105668544490726,
      "loss": 2.333,
      "loss_layer_12_head": 0.549963653087616,
      "loss_layer_18_head": 0.44106364250183105,
      "loss_layer_24_head": 0.25955909490585327,
      "loss_layer_30_head": 0.1487460434436798,
      "loss_layer_36_head": 0.09084698557853699,
      "loss_layer_42_head": 0.0466119647026062,
      "loss_layer_6_head": 0.7820623517036438,
      "step": 6790
    },
    {
      "epoch": 86.976,
      "grad_norm": 0.05687449131984122,
      "learning_rate": 0.00024861907179816653,
      "loss": 2.2157,
      "loss_layer_12_head": 0.5280774235725403,
      "loss_layer_18_head": 0.4222298562526703,
      "loss_layer_24_head": 0.25104424357414246,
      "loss_layer_30_head": 0.14182265102863312,
      "loss_layer_36_head": 0.08656584471464157,
      "loss_layer_42_head": 0.04452679306268692,
      "loss_layer_6_head": 0.7580618858337402,
      "step": 6795
    },
    {
      "epoch": 87.04,
      "grad_norm": 0.057432079056039295,
      "learning_rate": 0.00024619273049796,
      "loss": 2.2312,
      "loss_layer_12_head": 0.5078566074371338,
      "loss_layer_18_head": 0.4024267792701721,
      "loss_layer_24_head": 0.23993070423603058,
      "loss_layer_30_head": 0.13754835724830627,
      "loss_layer_36_head": 0.08506311476230621,
      "loss_layer_42_head": 0.0439240038394928,
      "loss_layer_6_head": 0.722137987613678,
      "step": 6800
    },
    {
      "epoch": 87.04,
      "eval_loss": 5.359401226043701,
      "eval_loss_layer_12_head": 1.240039348602295,
      "eval_loss_layer_18_head": 1.073506236076355,
      "eval_loss_layer_24_head": 0.6862438321113586,
      "eval_loss_layer_30_head": 0.44464802742004395,
      "eval_loss_layer_36_head": 0.2846316695213318,
      "eval_loss_layer_42_head": 0.16886478662490845,
      "eval_loss_layer_6_head": 1.586295485496521,
      "eval_runtime": 33.0712,
      "eval_samples_per_second": 9.676,
      "eval_steps_per_second": 0.605,
      "step": 6800
    },
    {
      "epoch": 87.104,
      "grad_norm": 0.057668425164134965,
      "learning_rate": 0.0002437776736926367,
      "loss": 2.2443,
      "loss_layer_12_head": 0.5179181694984436,
      "loss_layer_18_head": 0.41214412450790405,
      "loss_layer_24_head": 0.24569153785705566,
      "loss_layer_30_head": 0.14100295305252075,
      "loss_layer_36_head": 0.08766494691371918,
      "loss_layer_42_head": 0.04484416916966438,
      "loss_layer_6_head": 0.735958456993103,
      "step": 6805
    },
    {
      "epoch": 87.168,
      "grad_norm": 0.055601108767389586,
      "learning_rate": 0.00024137391347404475,
      "loss": 2.2341,
      "loss_layer_12_head": 0.5409920811653137,
      "loss_layer_18_head": 0.43041855096817017,
      "loss_layer_24_head": 0.2557937204837799,
      "loss_layer_30_head": 0.1468038707971573,
      "loss_layer_36_head": 0.08919794112443924,
      "loss_layer_42_head": 0.04548775777220726,
      "loss_layer_6_head": 0.7735974192619324,
      "step": 6810
    },
    {
      "epoch": 87.232,
      "grad_norm": 0.055797941296718445,
      "learning_rate": 0.00023898146187747188,
      "loss": 2.2492,
      "loss_layer_12_head": 0.5641626119613647,
      "loss_layer_18_head": 0.44954776763916016,
      "loss_layer_24_head": 0.2671263515949249,
      "loss_layer_30_head": 0.15155717730522156,
      "loss_layer_36_head": 0.0925733670592308,
      "loss_layer_42_head": 0.04697907716035843,
      "loss_layer_6_head": 0.8005262613296509,
      "step": 6815
    },
    {
      "epoch": 87.296,
      "grad_norm": 0.05380591415132192,
      "learning_rate": 0.00023660033088158646,
      "loss": 2.2431,
      "loss_layer_12_head": 0.522892951965332,
      "loss_layer_18_head": 0.41645392775535583,
      "loss_layer_24_head": 0.24754996597766876,
      "loss_layer_30_head": 0.14025893807411194,
      "loss_layer_36_head": 0.08579500019550323,
      "loss_layer_42_head": 0.044320616871118546,
      "loss_layer_6_head": 0.7401856780052185,
      "step": 6820
    },
    {
      "epoch": 87.36,
      "grad_norm": 0.05496634694868247,
      "learning_rate": 0.00023423053240837516,
      "loss": 2.2392,
      "loss_layer_12_head": 0.5458707213401794,
      "loss_layer_18_head": 0.43412309885025024,
      "loss_layer_24_head": 0.2563820779323578,
      "loss_layer_30_head": 0.14608648419380188,
      "loss_layer_36_head": 0.08924529701471329,
      "loss_layer_42_head": 0.04588618874549866,
      "loss_layer_6_head": 0.776205837726593,
      "step": 6825
    },
    {
      "epoch": 87.424,
      "grad_norm": 0.05370215732616587,
      "learning_rate": 0.00023187207832308404,
      "loss": 2.2603,
      "loss_layer_12_head": 0.535464346408844,
      "loss_layer_18_head": 0.4258701205253601,
      "loss_layer_24_head": 0.2525565028190613,
      "loss_layer_30_head": 0.14417919516563416,
      "loss_layer_36_head": 0.08880215883255005,
      "loss_layer_42_head": 0.046122726052999496,
      "loss_layer_6_head": 0.7620994448661804,
      "step": 6830
    },
    {
      "epoch": 87.488,
      "grad_norm": 0.05354916140020669,
      "learning_rate": 0.0002295249804341601,
      "loss": 2.2173,
      "loss_layer_12_head": 0.5268098711967468,
      "loss_layer_18_head": 0.42047590017318726,
      "loss_layer_24_head": 0.25297805666923523,
      "loss_layer_30_head": 0.14366549253463745,
      "loss_layer_36_head": 0.08779659867286682,
      "loss_layer_42_head": 0.04524371773004532,
      "loss_layer_6_head": 0.7475795149803162,
      "step": 6835
    },
    {
      "epoch": 87.552,
      "grad_norm": 0.055513965940126454,
      "learning_rate": 0.0002271892504931905,
      "loss": 2.287,
      "loss_layer_12_head": 0.563186526298523,
      "loss_layer_18_head": 0.44733983278274536,
      "loss_layer_24_head": 0.2670723795890808,
      "loss_layer_30_head": 0.15191218256950378,
      "loss_layer_36_head": 0.09287315607070923,
      "loss_layer_42_head": 0.047421906143426895,
      "loss_layer_6_head": 0.8000202178955078,
      "step": 6840
    },
    {
      "epoch": 87.616,
      "grad_norm": 0.05661486185742164,
      "learning_rate": 0.00022486490019484718,
      "loss": 2.2915,
      "loss_layer_12_head": 0.5238741636276245,
      "loss_layer_18_head": 0.4150371551513672,
      "loss_layer_24_head": 0.2451181858778,
      "loss_layer_30_head": 0.13959337770938873,
      "loss_layer_36_head": 0.08498172461986542,
      "loss_layer_42_head": 0.04361828416585922,
      "loss_layer_6_head": 0.7494237422943115,
      "step": 6845
    },
    {
      "epoch": 87.68,
      "grad_norm": 0.05509126866526624,
      "learning_rate": 0.00022255194117682203,
      "loss": 2.2476,
      "loss_layer_12_head": 0.5273737907409668,
      "loss_layer_18_head": 0.42145460844039917,
      "loss_layer_24_head": 0.24913454055786133,
      "loss_layer_30_head": 0.14290118217468262,
      "loss_layer_36_head": 0.08784132450819016,
      "loss_layer_42_head": 0.04539437219500542,
      "loss_layer_6_head": 0.749653160572052,
      "step": 6850
    },
    {
      "epoch": 87.744,
      "grad_norm": 0.054302771462407036,
      "learning_rate": 0.00022025038501977484,
      "loss": 2.2595,
      "loss_layer_12_head": 0.5621454119682312,
      "loss_layer_18_head": 0.44645094871520996,
      "loss_layer_24_head": 0.2639508843421936,
      "loss_layer_30_head": 0.14940956234931946,
      "loss_layer_36_head": 0.09156061708927155,
      "loss_layer_42_head": 0.04670485109090805,
      "loss_layer_6_head": 0.794995903968811,
      "step": 6855
    },
    {
      "epoch": 87.808,
      "grad_norm": 0.05671559722481603,
      "learning_rate": 0.00021796024324727297,
      "loss": 2.2469,
      "loss_layer_12_head": 0.5277398228645325,
      "loss_layer_18_head": 0.4210580885410309,
      "loss_layer_24_head": 0.2503761947154999,
      "loss_layer_30_head": 0.14303870499134064,
      "loss_layer_36_head": 0.0876300111413002,
      "loss_layer_42_head": 0.04539332538843155,
      "loss_layer_6_head": 0.7459090948104858,
      "step": 6860
    },
    {
      "epoch": 87.872,
      "grad_norm": 0.06172608443484001,
      "learning_rate": 0.00021568152732573314,
      "loss": 2.2175,
      "loss_layer_12_head": 0.5356486439704895,
      "loss_layer_18_head": 0.42519432306289673,
      "loss_layer_24_head": 0.25486522912979126,
      "loss_layer_30_head": 0.14442728459835052,
      "loss_layer_36_head": 0.08863189816474915,
      "loss_layer_42_head": 0.04539735987782478,
      "loss_layer_6_head": 0.7622808218002319,
      "step": 6865
    },
    {
      "epoch": 87.936,
      "grad_norm": 0.05977832574258168,
      "learning_rate": 0.00021341424866436366,
      "loss": 2.302,
      "loss_layer_12_head": 0.546026885509491,
      "loss_layer_18_head": 0.4350968301296234,
      "loss_layer_24_head": 0.2592531740665436,
      "loss_layer_30_head": 0.14747214317321777,
      "loss_layer_36_head": 0.09019935876131058,
      "loss_layer_42_head": 0.046083513647317886,
      "loss_layer_6_head": 0.7748081088066101,
      "step": 6870
    },
    {
      "epoch": 88.0,
      "grad_norm": 0.06112626017181989,
      "learning_rate": 0.00021115841861510943,
      "loss": 2.2577,
      "loss_layer_12_head": 0.5233558416366577,
      "loss_layer_18_head": 0.4167335629463196,
      "loss_layer_24_head": 0.2473728209733963,
      "loss_layer_30_head": 0.14080706238746643,
      "loss_layer_36_head": 0.08631843328475952,
      "loss_layer_42_head": 0.044219836592674255,
      "loss_layer_6_head": 0.7431975603103638,
      "step": 6875
    },
    {
      "epoch": 88.064,
      "grad_norm": 0.05565674271519428,
      "learning_rate": 0.00020891404847259265,
      "loss": 2.2304,
      "loss_layer_12_head": 0.5208278894424438,
      "loss_layer_18_head": 0.4133831560611725,
      "loss_layer_24_head": 0.24610228836536407,
      "loss_layer_30_head": 0.1377626359462738,
      "loss_layer_36_head": 0.08389551937580109,
      "loss_layer_42_head": 0.04309685155749321,
      "loss_layer_6_head": 0.740589439868927,
      "step": 6880
    },
    {
      "epoch": 88.128,
      "grad_norm": 0.06211798936498134,
      "learning_rate": 0.00020668114947405725,
      "loss": 2.1846,
      "loss_layer_12_head": 0.5547683238983154,
      "loss_layer_18_head": 0.44111737608909607,
      "loss_layer_24_head": 0.2623556852340698,
      "loss_layer_30_head": 0.1498388648033142,
      "loss_layer_36_head": 0.09049813449382782,
      "loss_layer_42_head": 0.046591419726610184,
      "loss_layer_6_head": 0.7885844707489014,
      "step": 6885
    },
    {
      "epoch": 88.192,
      "grad_norm": 0.05708511605934873,
      "learning_rate": 0.0002044597327993153,
      "loss": 2.2825,
      "loss_layer_12_head": 0.5313240885734558,
      "loss_layer_18_head": 0.4232202470302582,
      "loss_layer_24_head": 0.2503286600112915,
      "loss_layer_30_head": 0.1423531025648117,
      "loss_layer_36_head": 0.08761018514633179,
      "loss_layer_42_head": 0.04478836804628372,
      "loss_layer_6_head": 0.752066969871521,
      "step": 6890
    },
    {
      "epoch": 88.256,
      "grad_norm": 0.05729639101037143,
      "learning_rate": 0.00020224980957068427,
      "loss": 2.2478,
      "loss_layer_12_head": 0.557756245136261,
      "loss_layer_18_head": 0.44733673334121704,
      "loss_layer_24_head": 0.26696380972862244,
      "loss_layer_30_head": 0.15278266370296478,
      "loss_layer_36_head": 0.09366295486688614,
      "loss_layer_42_head": 0.048374392092227936,
      "loss_layer_6_head": 0.7856209874153137,
      "step": 6895
    },
    {
      "epoch": 88.32,
      "grad_norm": 0.05928991632043489,
      "learning_rate": 0.00020005139085293945,
      "loss": 2.2404,
      "loss_layer_12_head": 0.5373083353042603,
      "loss_layer_18_head": 0.42907482385635376,
      "loss_layer_24_head": 0.25586724281311035,
      "loss_layer_30_head": 0.14655670523643494,
      "loss_layer_36_head": 0.08940807729959488,
      "loss_layer_42_head": 0.046046338975429535,
      "loss_layer_6_head": 0.7625064849853516,
      "step": 6900
    },
    {
      "epoch": 88.384,
      "grad_norm": 0.05657752828247937,
      "learning_rate": 0.0001978644876532526,
      "loss": 2.2303,
      "loss_layer_12_head": 0.5374948978424072,
      "loss_layer_18_head": 0.4280579686164856,
      "loss_layer_24_head": 0.25543004274368286,
      "loss_layer_30_head": 0.1461663395166397,
      "loss_layer_36_head": 0.0898989886045456,
      "loss_layer_42_head": 0.046274684369564056,
      "loss_layer_6_head": 0.7684916257858276,
      "step": 6905
    },
    {
      "epoch": 88.448,
      "grad_norm": 0.055499281613891456,
      "learning_rate": 0.00019568911092113922,
      "loss": 2.2252,
      "loss_layer_12_head": 0.5335908532142639,
      "loss_layer_18_head": 0.42451420426368713,
      "loss_layer_24_head": 0.2514415383338928,
      "loss_layer_30_head": 0.1435246616601944,
      "loss_layer_36_head": 0.0879477709531784,
      "loss_layer_42_head": 0.04525122418999672,
      "loss_layer_6_head": 0.7554299235343933,
      "step": 6910
    },
    {
      "epoch": 88.512,
      "grad_norm": 0.05842542357650877,
      "learning_rate": 0.00019352527154840345,
      "loss": 2.2459,
      "loss_layer_12_head": 0.5515720248222351,
      "loss_layer_18_head": 0.4390309453010559,
      "loss_layer_24_head": 0.2610723376274109,
      "loss_layer_30_head": 0.14874950051307678,
      "loss_layer_36_head": 0.09074465185403824,
      "loss_layer_42_head": 0.04662574455142021,
      "loss_layer_6_head": 0.7810153365135193,
      "step": 6915
    },
    {
      "epoch": 88.576,
      "grad_norm": 0.05288849185021993,
      "learning_rate": 0.0001913729803690839,
      "loss": 2.2381,
      "loss_layer_12_head": 0.5216792225837708,
      "loss_layer_18_head": 0.4162190556526184,
      "loss_layer_24_head": 0.24946236610412598,
      "loss_layer_30_head": 0.14313170313835144,
      "loss_layer_36_head": 0.08819614350795746,
      "loss_layer_42_head": 0.04579892009496689,
      "loss_layer_6_head": 0.7439811825752258,
      "step": 6920
    },
    {
      "epoch": 88.64,
      "grad_norm": 0.06400786200185156,
      "learning_rate": 0.0001892322481593983,
      "loss": 2.2779,
      "loss_layer_12_head": 0.5402895212173462,
      "loss_layer_18_head": 0.4305475354194641,
      "loss_layer_24_head": 0.25562530755996704,
      "loss_layer_30_head": 0.1464603841304779,
      "loss_layer_36_head": 0.08978608250617981,
      "loss_layer_42_head": 0.04587053880095482,
      "loss_layer_6_head": 0.7692114114761353,
      "step": 6925
    },
    {
      "epoch": 88.704,
      "grad_norm": 0.06425514129390522,
      "learning_rate": 0.00018710308563769123,
      "loss": 2.2946,
      "loss_layer_12_head": 0.5353783369064331,
      "loss_layer_18_head": 0.4257078170776367,
      "loss_layer_24_head": 0.25251516699790955,
      "loss_layer_30_head": 0.1435382068157196,
      "loss_layer_36_head": 0.0878438800573349,
      "loss_layer_42_head": 0.04511797055602074,
      "loss_layer_6_head": 0.7619491815567017,
      "step": 6930
    },
    {
      "epoch": 88.768,
      "grad_norm": 0.0720459870382579,
      "learning_rate": 0.00018498550346437854,
      "loss": 2.2971,
      "loss_layer_12_head": 0.5336654782295227,
      "loss_layer_18_head": 0.4263373017311096,
      "loss_layer_24_head": 0.25272268056869507,
      "loss_layer_30_head": 0.1425168812274933,
      "loss_layer_36_head": 0.0866183191537857,
      "loss_layer_42_head": 0.04439380019903183,
      "loss_layer_6_head": 0.7645434141159058,
      "step": 6935
    },
    {
      "epoch": 88.832,
      "grad_norm": 0.06076648269293442,
      "learning_rate": 0.00018287951224189554,
      "loss": 2.2381,
      "loss_layer_12_head": 0.5284239053726196,
      "loss_layer_18_head": 0.42110395431518555,
      "loss_layer_24_head": 0.2507324814796448,
      "loss_layer_30_head": 0.14135773479938507,
      "loss_layer_36_head": 0.08661328256130219,
      "loss_layer_42_head": 0.044463902711868286,
      "loss_layer_6_head": 0.7559477090835571,
      "step": 6940
    },
    {
      "epoch": 88.896,
      "grad_norm": 0.06008951296478203,
      "learning_rate": 0.00018078512251464286,
      "loss": 2.2525,
      "loss_layer_12_head": 0.5182813405990601,
      "loss_layer_18_head": 0.41305485367774963,
      "loss_layer_24_head": 0.24512962996959686,
      "loss_layer_30_head": 0.1399398297071457,
      "loss_layer_36_head": 0.0865577906370163,
      "loss_layer_42_head": 0.04445686191320419,
      "loss_layer_6_head": 0.7347121834754944,
      "step": 6945
    },
    {
      "epoch": 88.96,
      "grad_norm": 0.05511452353343232,
      "learning_rate": 0.00017870234476893428,
      "loss": 2.2563,
      "loss_layer_12_head": 0.5374377965927124,
      "loss_layer_18_head": 0.42947903275489807,
      "loss_layer_24_head": 0.2561786472797394,
      "loss_layer_30_head": 0.14682166278362274,
      "loss_layer_36_head": 0.08966167271137238,
      "loss_layer_42_head": 0.04568709060549736,
      "loss_layer_6_head": 0.7646173238754272,
      "step": 6950
    },
    {
      "epoch": 89.024,
      "grad_norm": 0.0543313063799632,
      "learning_rate": 0.00017663118943294365,
      "loss": 2.2045,
      "loss_layer_12_head": 0.5137767791748047,
      "loss_layer_18_head": 0.40694957971572876,
      "loss_layer_24_head": 0.2400384396314621,
      "loss_layer_30_head": 0.13627514243125916,
      "loss_layer_36_head": 0.0835515484213829,
      "loss_layer_42_head": 0.04323747009038925,
      "loss_layer_6_head": 0.7323139905929565,
      "step": 6955
    },
    {
      "epoch": 89.088,
      "grad_norm": 0.05385283001059491,
      "learning_rate": 0.00017457166687665448,
      "loss": 2.2475,
      "loss_layer_12_head": 0.5543397068977356,
      "loss_layer_18_head": 0.4451007843017578,
      "loss_layer_24_head": 0.26727569103240967,
      "loss_layer_30_head": 0.15160156786441803,
      "loss_layer_36_head": 0.09284184128046036,
      "loss_layer_42_head": 0.04765158146619797,
      "loss_layer_6_head": 0.7852809429168701,
      "step": 6960
    },
    {
      "epoch": 89.152,
      "grad_norm": 0.054020627934849054,
      "learning_rate": 0.00017252378741180408,
      "loss": 2.2312,
      "loss_layer_12_head": 0.5454975366592407,
      "loss_layer_18_head": 0.4385412633419037,
      "loss_layer_24_head": 0.2639528512954712,
      "loss_layer_30_head": 0.15284475684165955,
      "loss_layer_36_head": 0.09400571137666702,
      "loss_layer_42_head": 0.048600614070892334,
      "loss_layer_6_head": 0.766593337059021,
      "step": 6965
    },
    {
      "epoch": 89.216,
      "grad_norm": 0.053797734562927656,
      "learning_rate": 0.0001704875612918369,
      "loss": 2.241,
      "loss_layer_12_head": 0.5434983372688293,
      "loss_layer_18_head": 0.43423691391944885,
      "loss_layer_24_head": 0.2576582133769989,
      "loss_layer_30_head": 0.14642329514026642,
      "loss_layer_36_head": 0.08926916122436523,
      "loss_layer_42_head": 0.04577638581395149,
      "loss_layer_6_head": 0.7712544202804565,
      "step": 6970
    },
    {
      "epoch": 89.28,
      "grad_norm": 0.05471156921546921,
      "learning_rate": 0.0001684629987118494,
      "loss": 2.2262,
      "loss_layer_12_head": 0.5330283641815186,
      "loss_layer_18_head": 0.42552512884140015,
      "loss_layer_24_head": 0.25373929738998413,
      "loss_layer_30_head": 0.14421412348747253,
      "loss_layer_36_head": 0.08785871416330338,
      "loss_layer_42_head": 0.045140478760004044,
      "loss_layer_6_head": 0.7583318948745728,
      "step": 6975
    },
    {
      "epoch": 89.344,
      "grad_norm": 0.05196551936408948,
      "learning_rate": 0.0001664501098085408,
      "loss": 2.239,
      "loss_layer_12_head": 0.5213258862495422,
      "loss_layer_18_head": 0.41287779808044434,
      "loss_layer_24_head": 0.24584424495697021,
      "loss_layer_30_head": 0.13976594805717468,
      "loss_layer_36_head": 0.08548443019390106,
      "loss_layer_42_head": 0.043877650052309036,
      "loss_layer_6_head": 0.7464960813522339,
      "step": 6980
    },
    {
      "epoch": 89.408,
      "grad_norm": 0.053958279459563274,
      "learning_rate": 0.00016444890466016133,
      "loss": 2.257,
      "loss_layer_12_head": 0.5283223986625671,
      "loss_layer_18_head": 0.4184536337852478,
      "loss_layer_24_head": 0.24705660343170166,
      "loss_layer_30_head": 0.14021897315979004,
      "loss_layer_36_head": 0.08639784902334213,
      "loss_layer_42_head": 0.044169407337903976,
      "loss_layer_6_head": 0.7544612288475037,
      "step": 6985
    },
    {
      "epoch": 89.472,
      "grad_norm": 0.0580812512555204,
      "learning_rate": 0.0001624593932864632,
      "loss": 2.2017,
      "loss_layer_12_head": 0.49516016244888306,
      "loss_layer_18_head": 0.3944857120513916,
      "loss_layer_24_head": 0.2356228530406952,
      "loss_layer_30_head": 0.13511595129966736,
      "loss_layer_36_head": 0.08372960984706879,
      "loss_layer_42_head": 0.043034572154283524,
      "loss_layer_6_head": 0.6996387243270874,
      "step": 6990
    },
    {
      "epoch": 89.536,
      "grad_norm": 0.05397967086884604,
      "learning_rate": 0.00016048158564864828,
      "loss": 2.2022,
      "loss_layer_12_head": 0.4938785433769226,
      "loss_layer_18_head": 0.3918052315711975,
      "loss_layer_24_head": 0.23229572176933289,
      "loss_layer_30_head": 0.13207563757896423,
      "loss_layer_36_head": 0.08138305693864822,
      "loss_layer_42_head": 0.04173192381858826,
      "loss_layer_6_head": 0.707516610622406,
      "step": 6995
    },
    {
      "epoch": 89.6,
      "grad_norm": 0.05339917755245831,
      "learning_rate": 0.00015851549164932116,
      "loss": 2.2597,
      "loss_layer_12_head": 0.5440739393234253,
      "loss_layer_18_head": 0.43574923276901245,
      "loss_layer_24_head": 0.2604158818721771,
      "loss_layer_30_head": 0.14823810756206512,
      "loss_layer_36_head": 0.09078150242567062,
      "loss_layer_42_head": 0.04635284096002579,
      "loss_layer_6_head": 0.7722476124763489,
      "step": 7000
    },
    {
      "epoch": 89.6,
      "eval_loss": 5.356217384338379,
      "eval_loss_layer_12_head": 1.2387166023254395,
      "eval_loss_layer_18_head": 1.0732319355010986,
      "eval_loss_layer_24_head": 0.6859959363937378,
      "eval_loss_layer_30_head": 0.44397395849227905,
      "eval_loss_layer_36_head": 0.284365713596344,
      "eval_loss_layer_42_head": 0.16835597157478333,
      "eval_loss_layer_6_head": 1.5858103036880493,
      "eval_runtime": 33.0616,
      "eval_samples_per_second": 9.679,
      "eval_steps_per_second": 0.605,
      "step": 7000
    },
    {
      "epoch": 89.664,
      "grad_norm": 0.056757846186042134,
      "learning_rate": 0.0001565611211324372,
      "loss": 2.267,
      "loss_layer_12_head": 0.5450161695480347,
      "loss_layer_18_head": 0.43357953429222107,
      "loss_layer_24_head": 0.25895190238952637,
      "loss_layer_30_head": 0.14739055931568146,
      "loss_layer_36_head": 0.09050838649272919,
      "loss_layer_42_head": 0.04657844081521034,
      "loss_layer_6_head": 0.775080144405365,
      "step": 7005
    },
    {
      "epoch": 89.728,
      "grad_norm": 0.05479429072628373,
      "learning_rate": 0.00015461848388325205,
      "loss": 2.2571,
      "loss_layer_12_head": 0.5233315229415894,
      "loss_layer_18_head": 0.4173172414302826,
      "loss_layer_24_head": 0.25018396973609924,
      "loss_layer_30_head": 0.1441095471382141,
      "loss_layer_36_head": 0.08883531391620636,
      "loss_layer_42_head": 0.045959316194057465,
      "loss_layer_6_head": 0.7458498477935791,
      "step": 7010
    },
    {
      "epoch": 89.792,
      "grad_norm": 0.05779473294047172,
      "learning_rate": 0.00015268758962827783,
      "loss": 2.2245,
      "loss_layer_12_head": 0.532579779624939,
      "loss_layer_18_head": 0.4211394190788269,
      "loss_layer_24_head": 0.2488752156496048,
      "loss_layer_30_head": 0.14008697867393494,
      "loss_layer_36_head": 0.0858522281050682,
      "loss_layer_42_head": 0.043602656573057175,
      "loss_layer_6_head": 0.76105135679245,
      "step": 7015
    },
    {
      "epoch": 89.856,
      "grad_norm": 0.052816642913364326,
      "learning_rate": 0.00015076844803522921,
      "loss": 2.2507,
      "loss_layer_12_head": 0.532747209072113,
      "loss_layer_18_head": 0.4282136559486389,
      "loss_layer_24_head": 0.2568611204624176,
      "loss_layer_30_head": 0.14915773272514343,
      "loss_layer_36_head": 0.09260449558496475,
      "loss_layer_42_head": 0.04812869802117348,
      "loss_layer_6_head": 0.7531678676605225,
      "step": 7020
    },
    {
      "epoch": 89.92,
      "grad_norm": 0.05407066850215884,
      "learning_rate": 0.00014886106871297682,
      "loss": 2.217,
      "loss_layer_12_head": 0.5448362231254578,
      "loss_layer_18_head": 0.4344650208950043,
      "loss_layer_24_head": 0.25980645418167114,
      "loss_layer_30_head": 0.14796803891658783,
      "loss_layer_36_head": 0.09004895389080048,
      "loss_layer_42_head": 0.04576081037521362,
      "loss_layer_6_head": 0.7710133790969849,
      "step": 7025
    },
    {
      "epoch": 89.984,
      "grad_norm": 0.05256007176476212,
      "learning_rate": 0.00014696546121150013,
      "loss": 2.3266,
      "loss_layer_12_head": 0.5551749467849731,
      "loss_layer_18_head": 0.4439207911491394,
      "loss_layer_24_head": 0.26401352882385254,
      "loss_layer_30_head": 0.1511339694261551,
      "loss_layer_36_head": 0.09269972890615463,
      "loss_layer_42_head": 0.04723666235804558,
      "loss_layer_6_head": 0.7888709902763367,
      "step": 7030
    },
    {
      "epoch": 90.048,
      "grad_norm": 0.05232497038443383,
      "learning_rate": 0.00014508163502183785,
      "loss": 2.2511,
      "loss_layer_12_head": 0.5232555866241455,
      "loss_layer_18_head": 0.4163399636745453,
      "loss_layer_24_head": 0.2463575303554535,
      "loss_layer_30_head": 0.13943186402320862,
      "loss_layer_36_head": 0.08543013036251068,
      "loss_layer_42_head": 0.043880216777324677,
      "loss_layer_6_head": 0.7437131404876709,
      "step": 7035
    },
    {
      "epoch": 90.112,
      "grad_norm": 0.05230170949426621,
      "learning_rate": 0.0001432095995760424,
      "loss": 2.1983,
      "loss_layer_12_head": 0.5029712915420532,
      "loss_layer_18_head": 0.39944034814834595,
      "loss_layer_24_head": 0.2383076250553131,
      "loss_layer_30_head": 0.13563981652259827,
      "loss_layer_36_head": 0.08292778581380844,
      "loss_layer_42_head": 0.042596571147441864,
      "loss_layer_6_head": 0.7157915830612183,
      "step": 7040
    },
    {
      "epoch": 90.176,
      "grad_norm": 0.05115482528137981,
      "learning_rate": 0.00014134936424713056,
      "loss": 2.2197,
      "loss_layer_12_head": 0.5415345430374146,
      "loss_layer_18_head": 0.43149566650390625,
      "loss_layer_24_head": 0.256852388381958,
      "loss_layer_30_head": 0.14676420390605927,
      "loss_layer_36_head": 0.0903097614645958,
      "loss_layer_42_head": 0.046439941972494125,
      "loss_layer_6_head": 0.7679991126060486,
      "step": 7045
    },
    {
      "epoch": 90.24,
      "grad_norm": 0.05279165755475762,
      "learning_rate": 0.00013950093834903866,
      "loss": 2.2406,
      "loss_layer_12_head": 0.4899584650993347,
      "loss_layer_18_head": 0.39022940397262573,
      "loss_layer_24_head": 0.23052187263965607,
      "loss_layer_30_head": 0.13181224465370178,
      "loss_layer_36_head": 0.08062335103750229,
      "loss_layer_42_head": 0.041549816727638245,
      "loss_layer_6_head": 0.6977458000183105,
      "step": 7050
    },
    {
      "epoch": 90.304,
      "grad_norm": 0.052359262523776484,
      "learning_rate": 0.00013766433113657355,
      "loss": 2.2102,
      "loss_layer_12_head": 0.5232660174369812,
      "loss_layer_18_head": 0.41841092705726624,
      "loss_layer_24_head": 0.25099676847457886,
      "loss_layer_30_head": 0.14364229142665863,
      "loss_layer_36_head": 0.0883738249540329,
      "loss_layer_42_head": 0.04589400440454483,
      "loss_layer_6_head": 0.7435967326164246,
      "step": 7055
    },
    {
      "epoch": 90.368,
      "grad_norm": 0.054273313405259815,
      "learning_rate": 0.000135839551805369,
      "loss": 2.2632,
      "loss_layer_12_head": 0.5553951263427734,
      "loss_layer_18_head": 0.44093966484069824,
      "loss_layer_24_head": 0.2635316252708435,
      "loss_layer_30_head": 0.1490272581577301,
      "loss_layer_36_head": 0.09143238514661789,
      "loss_layer_42_head": 0.04693298414349556,
      "loss_layer_6_head": 0.78724205493927,
      "step": 7060
    },
    {
      "epoch": 90.432,
      "grad_norm": 0.051655121391378435,
      "learning_rate": 0.0001340266094918366,
      "loss": 2.2529,
      "loss_layer_12_head": 0.5009587407112122,
      "loss_layer_18_head": 0.3983080983161926,
      "loss_layer_24_head": 0.23814216256141663,
      "loss_layer_30_head": 0.13751927018165588,
      "loss_layer_36_head": 0.08530883491039276,
      "loss_layer_42_head": 0.044005222618579865,
      "loss_layer_6_head": 0.7147679924964905,
      "step": 7065
    },
    {
      "epoch": 90.496,
      "grad_norm": 0.05295106909876316,
      "learning_rate": 0.00013222551327312398,
      "loss": 2.2393,
      "loss_layer_12_head": 0.5481197237968445,
      "loss_layer_18_head": 0.4376665949821472,
      "loss_layer_24_head": 0.26115334033966064,
      "loss_layer_30_head": 0.14749102294445038,
      "loss_layer_36_head": 0.09066436439752579,
      "loss_layer_42_head": 0.04606344550848007,
      "loss_layer_6_head": 0.7730547189712524,
      "step": 7070
    },
    {
      "epoch": 90.56,
      "grad_norm": 0.05346201898384375,
      "learning_rate": 0.0001304362721670646,
      "loss": 2.2587,
      "loss_layer_12_head": 0.5447693467140198,
      "loss_layer_18_head": 0.43302807211875916,
      "loss_layer_24_head": 0.25627434253692627,
      "loss_layer_30_head": 0.14500948786735535,
      "loss_layer_36_head": 0.0883561223745346,
      "loss_layer_42_head": 0.04541439563035965,
      "loss_layer_6_head": 0.7753423452377319,
      "step": 7075
    },
    {
      "epoch": 90.624,
      "grad_norm": 0.05564158555402166,
      "learning_rate": 0.0001286588951321363,
      "loss": 2.2225,
      "loss_layer_12_head": 0.5445320010185242,
      "loss_layer_18_head": 0.43277063965797424,
      "loss_layer_24_head": 0.25837618112564087,
      "loss_layer_30_head": 0.1462927758693695,
      "loss_layer_36_head": 0.08983197808265686,
      "loss_layer_42_head": 0.04611320048570633,
      "loss_layer_6_head": 0.771212100982666,
      "step": 7080
    },
    {
      "epoch": 90.688,
      "grad_norm": 0.05875885052393764,
      "learning_rate": 0.00012689339106741526,
      "loss": 2.237,
      "loss_layer_12_head": 0.5212829113006592,
      "loss_layer_18_head": 0.41382455825805664,
      "loss_layer_24_head": 0.24462971091270447,
      "loss_layer_30_head": 0.13878513872623444,
      "loss_layer_36_head": 0.08534792810678482,
      "loss_layer_42_head": 0.04349168762564659,
      "loss_layer_6_head": 0.7430446743965149,
      "step": 7085
    },
    {
      "epoch": 90.752,
      "grad_norm": 0.052407157955111444,
      "learning_rate": 0.00012513976881253108,
      "loss": 2.2687,
      "loss_layer_12_head": 0.5124794244766235,
      "loss_layer_18_head": 0.4076639711856842,
      "loss_layer_24_head": 0.24201726913452148,
      "loss_layer_30_head": 0.13842031359672546,
      "loss_layer_36_head": 0.08473876118659973,
      "loss_layer_42_head": 0.04324420914053917,
      "loss_layer_6_head": 0.7278995513916016,
      "step": 7090
    },
    {
      "epoch": 90.816,
      "grad_norm": 0.0560715561473946,
      "learning_rate": 0.00012339803714762315,
      "loss": 2.2431,
      "loss_layer_12_head": 0.537960410118103,
      "loss_layer_18_head": 0.42895182967185974,
      "loss_layer_24_head": 0.2542749345302582,
      "loss_layer_30_head": 0.1445624977350235,
      "loss_layer_36_head": 0.08842632174491882,
      "loss_layer_42_head": 0.04524507373571396,
      "loss_layer_6_head": 0.7651423215866089,
      "step": 7095
    },
    {
      "epoch": 90.88,
      "grad_norm": 0.05935642061086336,
      "learning_rate": 0.00012166820479329571,
      "loss": 2.241,
      "loss_layer_12_head": 0.519959568977356,
      "loss_layer_18_head": 0.4147302508354187,
      "loss_layer_24_head": 0.24627681076526642,
      "loss_layer_30_head": 0.14044925570487976,
      "loss_layer_36_head": 0.08659595996141434,
      "loss_layer_42_head": 0.04420977085828781,
      "loss_layer_6_head": 0.7431796789169312,
      "step": 7100
    },
    {
      "epoch": 90.944,
      "grad_norm": 0.053833951346479565,
      "learning_rate": 0.00011995028041057509,
      "loss": 2.2267,
      "loss_layer_12_head": 0.526918888092041,
      "loss_layer_18_head": 0.4195947051048279,
      "loss_layer_24_head": 0.24853253364562988,
      "loss_layer_30_head": 0.1408359557390213,
      "loss_layer_36_head": 0.0856551080942154,
      "loss_layer_42_head": 0.044560983777046204,
      "loss_layer_6_head": 0.7463191747665405,
      "step": 7105
    },
    {
      "epoch": 91.008,
      "grad_norm": 0.05024880577221044,
      "learning_rate": 0.00011824427260086617,
      "loss": 2.2592,
      "loss_layer_12_head": 0.5469123125076294,
      "loss_layer_18_head": 0.4341784119606018,
      "loss_layer_24_head": 0.2584002912044525,
      "loss_layer_30_head": 0.14665910601615906,
      "loss_layer_36_head": 0.08993585407733917,
      "loss_layer_42_head": 0.04578367620706558,
      "loss_layer_6_head": 0.7763140797615051,
      "step": 7110
    },
    {
      "epoch": 91.072,
      "grad_norm": 0.05228004821490313,
      "learning_rate": 0.00011655018990591043,
      "loss": 2.2153,
      "loss_layer_12_head": 0.5050893425941467,
      "loss_layer_18_head": 0.40541234612464905,
      "loss_layer_24_head": 0.24303162097930908,
      "loss_layer_30_head": 0.13979493081569672,
      "loss_layer_36_head": 0.08577661961317062,
      "loss_layer_42_head": 0.04424048215150833,
      "loss_layer_6_head": 0.7186012864112854,
      "step": 7115
    },
    {
      "epoch": 91.136,
      "grad_norm": 0.0516772585010992,
      "learning_rate": 0.00011486804080773877,
      "loss": 2.1943,
      "loss_layer_12_head": 0.5272228121757507,
      "loss_layer_18_head": 0.42181238532066345,
      "loss_layer_24_head": 0.2502351403236389,
      "loss_layer_30_head": 0.14350543916225433,
      "loss_layer_36_head": 0.08874441683292389,
      "loss_layer_42_head": 0.04519970342516899,
      "loss_layer_6_head": 0.7477651834487915,
      "step": 7120
    },
    {
      "epoch": 91.2,
      "grad_norm": 0.05219456359747025,
      "learning_rate": 0.00011319783372863601,
      "loss": 2.2296,
      "loss_layer_12_head": 0.5228275060653687,
      "loss_layer_18_head": 0.4174315333366394,
      "loss_layer_24_head": 0.24911698698997498,
      "loss_layer_30_head": 0.14199914038181305,
      "loss_layer_36_head": 0.0865330696105957,
      "loss_layer_42_head": 0.04422705993056297,
      "loss_layer_6_head": 0.7472935914993286,
      "step": 7125
    },
    {
      "epoch": 91.264,
      "grad_norm": 0.053257077800307806,
      "learning_rate": 0.00011153957703109258,
      "loss": 2.2091,
      "loss_layer_12_head": 0.5168344378471375,
      "loss_layer_18_head": 0.4109462797641754,
      "loss_layer_24_head": 0.24363939464092255,
      "loss_layer_30_head": 0.13902731239795685,
      "loss_layer_36_head": 0.08524765074253082,
      "loss_layer_42_head": 0.043548136949539185,
      "loss_layer_6_head": 0.7371495962142944,
      "step": 7130
    },
    {
      "epoch": 91.328,
      "grad_norm": 0.051001719900327834,
      "learning_rate": 0.00010989327901776564,
      "loss": 2.2152,
      "loss_layer_12_head": 0.5445336103439331,
      "loss_layer_18_head": 0.433986097574234,
      "loss_layer_24_head": 0.25799334049224854,
      "loss_layer_30_head": 0.14658167958259583,
      "loss_layer_36_head": 0.0897030383348465,
      "loss_layer_42_head": 0.04599509388208389,
      "loss_layer_6_head": 0.7733681201934814,
      "step": 7135
    },
    {
      "epoch": 91.392,
      "grad_norm": 0.051047127586380095,
      "learning_rate": 0.0001082589479314372,
      "loss": 2.1876,
      "loss_layer_12_head": 0.5103386640548706,
      "loss_layer_18_head": 0.4076921343803406,
      "loss_layer_24_head": 0.2430182248353958,
      "loss_layer_30_head": 0.13802394270896912,
      "loss_layer_36_head": 0.08485790342092514,
      "loss_layer_42_head": 0.043551623821258545,
      "loss_layer_6_head": 0.7313261032104492,
      "step": 7140
    },
    {
      "epoch": 91.456,
      "grad_norm": 0.0517400528684229,
      "learning_rate": 0.00010663659195497222,
      "loss": 2.2705,
      "loss_layer_12_head": 0.5300264954566956,
      "loss_layer_18_head": 0.42338743805885315,
      "loss_layer_24_head": 0.25127196311950684,
      "loss_layer_30_head": 0.14342385530471802,
      "loss_layer_36_head": 0.08791377395391464,
      "loss_layer_42_head": 0.04500436410307884,
      "loss_layer_6_head": 0.7585389018058777,
      "step": 7145
    },
    {
      "epoch": 91.52,
      "grad_norm": 0.05293301307767997,
      "learning_rate": 0.00010502621921127774,
      "loss": 2.2504,
      "loss_layer_12_head": 0.5272853374481201,
      "loss_layer_18_head": 0.4218936562538147,
      "loss_layer_24_head": 0.2516951262950897,
      "loss_layer_30_head": 0.14523956179618835,
      "loss_layer_36_head": 0.0888117328286171,
      "loss_layer_42_head": 0.04577495530247688,
      "loss_layer_6_head": 0.7468043565750122,
      "step": 7150
    },
    {
      "epoch": 91.584,
      "grad_norm": 0.05352571384379411,
      "learning_rate": 0.00010342783776326358,
      "loss": 2.2753,
      "loss_layer_12_head": 0.5392197370529175,
      "loss_layer_18_head": 0.4318494200706482,
      "loss_layer_24_head": 0.2587434947490692,
      "loss_layer_30_head": 0.14932116866111755,
      "loss_layer_36_head": 0.09221414476633072,
      "loss_layer_42_head": 0.04802995175123215,
      "loss_layer_6_head": 0.7588967084884644,
      "step": 7155
    },
    {
      "epoch": 91.648,
      "grad_norm": 0.051668973648516595,
      "learning_rate": 0.00010184145561379949,
      "loss": 2.2482,
      "loss_layer_12_head": 0.5554657578468323,
      "loss_layer_18_head": 0.4415279030799866,
      "loss_layer_24_head": 0.2619887888431549,
      "loss_layer_30_head": 0.1498332917690277,
      "loss_layer_36_head": 0.09149160236120224,
      "loss_layer_42_head": 0.04692346230149269,
      "loss_layer_6_head": 0.7842797040939331,
      "step": 7160
    },
    {
      "epoch": 91.712,
      "grad_norm": 0.05609084958190971,
      "learning_rate": 0.00010026708070567713,
      "loss": 2.2522,
      "loss_layer_12_head": 0.5384258031845093,
      "loss_layer_18_head": 0.42761173844337463,
      "loss_layer_24_head": 0.25419914722442627,
      "loss_layer_30_head": 0.14380492269992828,
      "loss_layer_36_head": 0.08846693485975266,
      "loss_layer_42_head": 0.044943876564502716,
      "loss_layer_6_head": 0.7655342817306519,
      "step": 7165
    },
    {
      "epoch": 91.776,
      "grad_norm": 0.0538982706021387,
      "learning_rate": 9.87047209215694e-05,
      "loss": 2.2316,
      "loss_layer_12_head": 0.5316731929779053,
      "loss_layer_18_head": 0.4236294627189636,
      "loss_layer_24_head": 0.252806693315506,
      "loss_layer_30_head": 0.14398811757564545,
      "loss_layer_36_head": 0.0882612019777298,
      "loss_layer_42_head": 0.04526513069868088,
      "loss_layer_6_head": 0.7585301399230957,
      "step": 7170
    },
    {
      "epoch": 91.84,
      "grad_norm": 0.05482486327715114,
      "learning_rate": 9.715438408399113e-05,
      "loss": 2.2266,
      "loss_layer_12_head": 0.5273081064224243,
      "loss_layer_18_head": 0.4192798137664795,
      "loss_layer_24_head": 0.2512471377849579,
      "loss_layer_30_head": 0.14328232407569885,
      "loss_layer_36_head": 0.08787624537944794,
      "loss_layer_42_head": 0.04507755860686302,
      "loss_layer_6_head": 0.7503596544265747,
      "step": 7175
    },
    {
      "epoch": 91.904,
      "grad_norm": 0.055650135223957745,
      "learning_rate": 9.561607795526006e-05,
      "loss": 2.21,
      "loss_layer_12_head": 0.4912711977958679,
      "loss_layer_18_head": 0.39225998520851135,
      "loss_layer_24_head": 0.23463661968708038,
      "loss_layer_30_head": 0.1365848332643509,
      "loss_layer_36_head": 0.08497779071331024,
      "loss_layer_42_head": 0.04408429190516472,
      "loss_layer_6_head": 0.6957547664642334,
      "step": 7180
    },
    {
      "epoch": 91.968,
      "grad_norm": 0.055063778251260034,
      "learning_rate": 9.408981023745849e-05,
      "loss": 2.2893,
      "loss_layer_12_head": 0.5326167941093445,
      "loss_layer_18_head": 0.4278637766838074,
      "loss_layer_24_head": 0.2566971778869629,
      "loss_layer_30_head": 0.14653955399990082,
      "loss_layer_36_head": 0.08990754187107086,
      "loss_layer_42_head": 0.04602175951004028,
      "loss_layer_6_head": 0.7549342513084412,
      "step": 7185
    },
    {
      "epoch": 92.032,
      "grad_norm": 0.05226087633040168,
      "learning_rate": 9.257558857239223e-05,
      "loss": 2.234,
      "loss_layer_12_head": 0.532545268535614,
      "loss_layer_18_head": 0.4255821108818054,
      "loss_layer_24_head": 0.25367921590805054,
      "loss_layer_30_head": 0.14476756751537323,
      "loss_layer_36_head": 0.08872775733470917,
      "loss_layer_42_head": 0.045499287545681,
      "loss_layer_6_head": 0.7518646121025085,
      "step": 7190
    },
    {
      "epoch": 92.096,
      "grad_norm": 0.05258435079031712,
      "learning_rate": 9.107342054155643e-05,
      "loss": 2.2179,
      "loss_layer_12_head": 0.5430327653884888,
      "loss_layer_18_head": 0.43686050176620483,
      "loss_layer_24_head": 0.2619762718677521,
      "loss_layer_30_head": 0.1505759060382843,
      "loss_layer_36_head": 0.09137316048145294,
      "loss_layer_42_head": 0.047124043107032776,
      "loss_layer_6_head": 0.7688589692115784,
      "step": 7195
    },
    {
      "epoch": 92.16,
      "grad_norm": 0.054048828141898604,
      "learning_rate": 8.958331366609424e-05,
      "loss": 2.201,
      "loss_layer_12_head": 0.5316187739372253,
      "loss_layer_18_head": 0.42059358954429626,
      "loss_layer_24_head": 0.24950429797172546,
      "loss_layer_30_head": 0.14203578233718872,
      "loss_layer_36_head": 0.0868087187409401,
      "loss_layer_42_head": 0.044327955693006516,
      "loss_layer_6_head": 0.7623559832572937,
      "step": 7200
    },
    {
      "epoch": 92.16,
      "eval_loss": 5.356188774108887,
      "eval_loss_layer_12_head": 1.2387239933013916,
      "eval_loss_layer_18_head": 1.0733051300048828,
      "eval_loss_layer_24_head": 0.6861399412155151,
      "eval_loss_layer_30_head": 0.44375720620155334,
      "eval_loss_layer_36_head": 0.2841845154762268,
      "eval_loss_layer_42_head": 0.16843418776988983,
      "eval_loss_layer_6_head": 1.586723804473877,
      "eval_runtime": 33.0699,
      "eval_samples_per_second": 9.676,
      "eval_steps_per_second": 0.605,
      "step": 7200
    },
    {
      "epoch": 92.224,
      "grad_norm": 0.05103847773477053,
      "learning_rate": 8.810527540675988e-05,
      "loss": 2.2455,
      "loss_layer_12_head": 0.565186083316803,
      "loss_layer_18_head": 0.447012335062027,
      "loss_layer_24_head": 0.2656610906124115,
      "loss_layer_30_head": 0.1493075042963028,
      "loss_layer_36_head": 0.09073054790496826,
      "loss_layer_42_head": 0.046321772038936615,
      "loss_layer_6_head": 0.8007001876831055,
      "step": 7205
    },
    {
      "epoch": 92.288,
      "grad_norm": 0.05170466097073324,
      "learning_rate": 8.663931316388318e-05,
      "loss": 2.2548,
      "loss_layer_12_head": 0.542030930519104,
      "loss_layer_18_head": 0.43296176195144653,
      "loss_layer_24_head": 0.2587263882160187,
      "loss_layer_30_head": 0.14840267598628998,
      "loss_layer_36_head": 0.09066106379032135,
      "loss_layer_42_head": 0.04623229429125786,
      "loss_layer_6_head": 0.7678894996643066,
      "step": 7210
    },
    {
      "epoch": 92.352,
      "grad_norm": 0.052410979680652066,
      "learning_rate": 8.51854342773295e-05,
      "loss": 2.2119,
      "loss_layer_12_head": 0.49559807777404785,
      "loss_layer_18_head": 0.39600977301597595,
      "loss_layer_24_head": 0.23653773963451385,
      "loss_layer_30_head": 0.13561375439167023,
      "loss_layer_36_head": 0.08348972350358963,
      "loss_layer_42_head": 0.043414484709501266,
      "loss_layer_6_head": 0.7075377702713013,
      "step": 7215
    },
    {
      "epoch": 92.416,
      "grad_norm": 0.04894524600843059,
      "learning_rate": 8.374364602646511e-05,
      "loss": 2.2137,
      "loss_layer_12_head": 0.5398342609405518,
      "loss_layer_18_head": 0.4310402274131775,
      "loss_layer_24_head": 0.25469911098480225,
      "loss_layer_30_head": 0.1464356780052185,
      "loss_layer_36_head": 0.08978019654750824,
      "loss_layer_42_head": 0.0455387607216835,
      "loss_layer_6_head": 0.7650227546691895,
      "step": 7220
    },
    {
      "epoch": 92.48,
      "grad_norm": 0.052318511515194954,
      "learning_rate": 8.231395563012084e-05,
      "loss": 2.1969,
      "loss_layer_12_head": 0.5160000324249268,
      "loss_layer_18_head": 0.41011542081832886,
      "loss_layer_24_head": 0.24446430802345276,
      "loss_layer_30_head": 0.1394282728433609,
      "loss_layer_36_head": 0.0853966698050499,
      "loss_layer_42_head": 0.043984513729810715,
      "loss_layer_6_head": 0.7360585927963257,
      "step": 7225
    },
    {
      "epoch": 92.544,
      "grad_norm": 0.050041703066820035,
      "learning_rate": 8.089637024655483e-05,
      "loss": 2.2212,
      "loss_layer_12_head": 0.5125681161880493,
      "loss_layer_18_head": 0.40681010484695435,
      "loss_layer_24_head": 0.24092896282672882,
      "loss_layer_30_head": 0.1357462853193283,
      "loss_layer_36_head": 0.08315271884202957,
      "loss_layer_42_head": 0.042459458112716675,
      "loss_layer_6_head": 0.7290599942207336,
      "step": 7230
    },
    {
      "epoch": 92.608,
      "grad_norm": 0.0527755047708869,
      "learning_rate": 7.949089697341621e-05,
      "loss": 2.2334,
      "loss_layer_12_head": 0.511350154876709,
      "loss_layer_18_head": 0.4103212356567383,
      "loss_layer_24_head": 0.24294844269752502,
      "loss_layer_30_head": 0.1397019326686859,
      "loss_layer_36_head": 0.08599673211574554,
      "loss_layer_42_head": 0.04426560923457146,
      "loss_layer_6_head": 0.7258923649787903,
      "step": 7235
    },
    {
      "epoch": 92.672,
      "grad_norm": 0.052342204610697864,
      "learning_rate": 7.809754284771182e-05,
      "loss": 2.2623,
      "loss_layer_12_head": 0.5162094831466675,
      "loss_layer_18_head": 0.4096457362174988,
      "loss_layer_24_head": 0.24343343079090118,
      "loss_layer_30_head": 0.1391303837299347,
      "loss_layer_36_head": 0.08560486137866974,
      "loss_layer_42_head": 0.043631959706544876,
      "loss_layer_6_head": 0.7349289059638977,
      "step": 7240
    },
    {
      "epoch": 92.736,
      "grad_norm": 0.052127554264489506,
      "learning_rate": 7.671631484576891e-05,
      "loss": 2.2272,
      "loss_layer_12_head": 0.5449023246765137,
      "loss_layer_18_head": 0.43368539214134216,
      "loss_layer_24_head": 0.25790348649024963,
      "loss_layer_30_head": 0.14632590115070343,
      "loss_layer_36_head": 0.08959884941577911,
      "loss_layer_42_head": 0.04569728299975395,
      "loss_layer_6_head": 0.7756956815719604,
      "step": 7245
    },
    {
      "epoch": 92.8,
      "grad_norm": 0.05253343091669382,
      "learning_rate": 7.534721988320142e-05,
      "loss": 2.2571,
      "loss_layer_12_head": 0.5414641499519348,
      "loss_layer_18_head": 0.43481332063674927,
      "loss_layer_24_head": 0.25917351245880127,
      "loss_layer_30_head": 0.14865942299365997,
      "loss_layer_36_head": 0.09172725677490234,
      "loss_layer_42_head": 0.046875715255737305,
      "loss_layer_6_head": 0.7621031403541565,
      "step": 7250
    },
    {
      "epoch": 92.864,
      "grad_norm": 0.051493694728967344,
      "learning_rate": 7.399026481487436e-05,
      "loss": 2.2278,
      "loss_layer_12_head": 0.5259193778038025,
      "loss_layer_18_head": 0.421492338180542,
      "loss_layer_24_head": 0.25207868218421936,
      "loss_layer_30_head": 0.14472441375255585,
      "loss_layer_36_head": 0.08978960663080215,
      "loss_layer_42_head": 0.0465787872672081,
      "loss_layer_6_head": 0.7445868253707886,
      "step": 7255
    },
    {
      "epoch": 92.928,
      "grad_norm": 0.04999359291981968,
      "learning_rate": 7.264545643486997e-05,
      "loss": 2.2371,
      "loss_layer_12_head": 0.5205427408218384,
      "loss_layer_18_head": 0.413536012172699,
      "loss_layer_24_head": 0.24666428565979004,
      "loss_layer_30_head": 0.14049525558948517,
      "loss_layer_36_head": 0.08600541949272156,
      "loss_layer_42_head": 0.044004689902067184,
      "loss_layer_6_head": 0.7443909049034119,
      "step": 7260
    },
    {
      "epoch": 92.992,
      "grad_norm": 0.04956552392348256,
      "learning_rate": 7.131280147645442e-05,
      "loss": 2.272,
      "loss_layer_12_head": 0.5328021049499512,
      "loss_layer_18_head": 0.4235454201698303,
      "loss_layer_24_head": 0.25194114446640015,
      "loss_layer_30_head": 0.14524827897548676,
      "loss_layer_36_head": 0.088971808552742,
      "loss_layer_42_head": 0.045267991721630096,
      "loss_layer_6_head": 0.7589775323867798,
      "step": 7265
    },
    {
      "epoch": 93.056,
      "grad_norm": 0.050166220022907666,
      "learning_rate": 6.99923066120428e-05,
      "loss": 2.1962,
      "loss_layer_12_head": 0.5065149068832397,
      "loss_layer_18_head": 0.4041587710380554,
      "loss_layer_24_head": 0.24152569472789764,
      "loss_layer_30_head": 0.13750280439853668,
      "loss_layer_36_head": 0.08431414514780045,
      "loss_layer_42_head": 0.04314825311303139,
      "loss_layer_6_head": 0.7218984365463257,
      "step": 7270
    },
    {
      "epoch": 93.12,
      "grad_norm": 0.05147247506032885,
      "learning_rate": 6.868397845316676e-05,
      "loss": 2.2352,
      "loss_layer_12_head": 0.5464877486228943,
      "loss_layer_18_head": 0.4354175925254822,
      "loss_layer_24_head": 0.25659820437431335,
      "loss_layer_30_head": 0.1472482532262802,
      "loss_layer_36_head": 0.09018800407648087,
      "loss_layer_42_head": 0.04608428478240967,
      "loss_layer_6_head": 0.77470463514328,
      "step": 7275
    },
    {
      "epoch": 93.184,
      "grad_norm": 0.050346043752337366,
      "learning_rate": 6.738782355044049e-05,
      "loss": 2.1969,
      "loss_layer_12_head": 0.5066360831260681,
      "loss_layer_18_head": 0.4020136892795563,
      "loss_layer_24_head": 0.23853126168251038,
      "loss_layer_30_head": 0.1347806751728058,
      "loss_layer_36_head": 0.08263157308101654,
      "loss_layer_42_head": 0.04262102022767067,
      "loss_layer_6_head": 0.7231611013412476,
      "step": 7280
    },
    {
      "epoch": 93.248,
      "grad_norm": 0.05106095299856557,
      "learning_rate": 6.610384839352862e-05,
      "loss": 2.2241,
      "loss_layer_12_head": 0.533324122428894,
      "loss_layer_18_head": 0.42486920952796936,
      "loss_layer_24_head": 0.25439217686653137,
      "loss_layer_30_head": 0.14638930559158325,
      "loss_layer_36_head": 0.08925886452198029,
      "loss_layer_42_head": 0.04579823464155197,
      "loss_layer_6_head": 0.7566064596176147,
      "step": 7285
    },
    {
      "epoch": 93.312,
      "grad_norm": 0.04879215989924989,
      "learning_rate": 6.483205941111347e-05,
      "loss": 2.2191,
      "loss_layer_12_head": 0.5148011445999146,
      "loss_layer_18_head": 0.40864452719688416,
      "loss_layer_24_head": 0.24332115054130554,
      "loss_layer_30_head": 0.1389361321926117,
      "loss_layer_36_head": 0.08549489080905914,
      "loss_layer_42_head": 0.043775562196969986,
      "loss_layer_6_head": 0.7318793535232544,
      "step": 7290
    },
    {
      "epoch": 93.376,
      "grad_norm": 0.050810247342218555,
      "learning_rate": 6.357246297086338e-05,
      "loss": 2.2094,
      "loss_layer_12_head": 0.5326784253120422,
      "loss_layer_18_head": 0.42445212602615356,
      "loss_layer_24_head": 0.25009673833847046,
      "loss_layer_30_head": 0.14350628852844238,
      "loss_layer_36_head": 0.08813217282295227,
      "loss_layer_42_head": 0.04496673867106438,
      "loss_layer_6_head": 0.7576306462287903,
      "step": 7295
    },
    {
      "epoch": 93.44,
      "grad_norm": 0.050097290735194484,
      "learning_rate": 6.232506537939941e-05,
      "loss": 2.2674,
      "loss_layer_12_head": 0.5610400438308716,
      "loss_layer_18_head": 0.44743967056274414,
      "loss_layer_24_head": 0.26677197217941284,
      "loss_layer_30_head": 0.15176048874855042,
      "loss_layer_36_head": 0.091761514544487,
      "loss_layer_42_head": 0.046686917543411255,
      "loss_layer_6_head": 0.7930856943130493,
      "step": 7300
    },
    {
      "epoch": 93.504,
      "grad_norm": 0.048752732298387745,
      "learning_rate": 6.108987288226537e-05,
      "loss": 2.2596,
      "loss_layer_12_head": 0.525027871131897,
      "loss_layer_18_head": 0.41549062728881836,
      "loss_layer_24_head": 0.246365025639534,
      "loss_layer_30_head": 0.13926716148853302,
      "loss_layer_36_head": 0.08504535257816315,
      "loss_layer_42_head": 0.043700579553842545,
      "loss_layer_6_head": 0.7411078214645386,
      "step": 7305
    },
    {
      "epoch": 93.568,
      "grad_norm": 0.04928155011876491,
      "learning_rate": 5.986689166389614e-05,
      "loss": 2.1944,
      "loss_layer_12_head": 0.5285251140594482,
      "loss_layer_18_head": 0.42091408371925354,
      "loss_layer_24_head": 0.2500697672367096,
      "loss_layer_30_head": 0.14209720492362976,
      "loss_layer_36_head": 0.08712536096572876,
      "loss_layer_42_head": 0.04481091350317001,
      "loss_layer_6_head": 0.7546754479408264,
      "step": 7310
    },
    {
      "epoch": 93.632,
      "grad_norm": 0.049364162406600035,
      "learning_rate": 5.865612784758556e-05,
      "loss": 2.2559,
      "loss_layer_12_head": 0.5348905920982361,
      "loss_layer_18_head": 0.4246949553489685,
      "loss_layer_24_head": 0.2513922452926636,
      "loss_layer_30_head": 0.1409299671649933,
      "loss_layer_36_head": 0.08640655875205994,
      "loss_layer_42_head": 0.044360335916280746,
      "loss_layer_6_head": 0.763494074344635,
      "step": 7315
    },
    {
      "epoch": 93.696,
      "grad_norm": 0.05279835010732758,
      "learning_rate": 5.7457587495457484e-05,
      "loss": 2.2535,
      "loss_layer_12_head": 0.5274874567985535,
      "loss_layer_18_head": 0.42034950852394104,
      "loss_layer_24_head": 0.25086283683776855,
      "loss_layer_30_head": 0.14311878383159637,
      "loss_layer_36_head": 0.08816252648830414,
      "loss_layer_42_head": 0.04531439393758774,
      "loss_layer_6_head": 0.7484890222549438,
      "step": 7320
    },
    {
      "epoch": 93.76,
      "grad_norm": 0.05406292276131632,
      "learning_rate": 5.627127660843417e-05,
      "loss": 2.2039,
      "loss_layer_12_head": 0.5300931930541992,
      "loss_layer_18_head": 0.4204525947570801,
      "loss_layer_24_head": 0.25002992153167725,
      "loss_layer_30_head": 0.1424928605556488,
      "loss_layer_36_head": 0.08704705536365509,
      "loss_layer_42_head": 0.044434987008571625,
      "loss_layer_6_head": 0.7560887336730957,
      "step": 7325
    },
    {
      "epoch": 93.824,
      "grad_norm": 0.05090595362978343,
      "learning_rate": 5.509720112620659e-05,
      "loss": 2.2375,
      "loss_layer_12_head": 0.5284169912338257,
      "loss_layer_18_head": 0.4184652864933014,
      "loss_layer_24_head": 0.25025755167007446,
      "loss_layer_30_head": 0.14371904730796814,
      "loss_layer_36_head": 0.08829937875270844,
      "loss_layer_42_head": 0.045556675642728806,
      "loss_layer_6_head": 0.7494155764579773,
      "step": 7330
    },
    {
      "epoch": 93.888,
      "grad_norm": 0.05284830549556568,
      "learning_rate": 5.3935366927205245e-05,
      "loss": 2.2705,
      "loss_layer_12_head": 0.5500022768974304,
      "loss_layer_18_head": 0.43837037682533264,
      "loss_layer_24_head": 0.25907784700393677,
      "loss_layer_30_head": 0.14718593657016754,
      "loss_layer_36_head": 0.09035056829452515,
      "loss_layer_42_head": 0.046200357377529144,
      "loss_layer_6_head": 0.7770909070968628,
      "step": 7335
    },
    {
      "epoch": 93.952,
      "grad_norm": 0.049788600593223965,
      "learning_rate": 5.2785779828570246e-05,
      "loss": 2.1854,
      "loss_layer_12_head": 0.5368602275848389,
      "loss_layer_18_head": 0.42683857679367065,
      "loss_layer_24_head": 0.25303053855895996,
      "loss_layer_30_head": 0.14452925324440002,
      "loss_layer_36_head": 0.0886090099811554,
      "loss_layer_42_head": 0.04508986324071884,
      "loss_layer_6_head": 0.7626599073410034,
      "step": 7340
    },
    {
      "epoch": 94.016,
      "grad_norm": 0.0507229700185525,
      "learning_rate": 5.164844558612131e-05,
      "loss": 2.2125,
      "loss_layer_12_head": 0.5297929644584656,
      "loss_layer_18_head": 0.42104682326316833,
      "loss_layer_24_head": 0.25093841552734375,
      "loss_layer_30_head": 0.14330288767814636,
      "loss_layer_36_head": 0.0873197540640831,
      "loss_layer_42_head": 0.044650983065366745,
      "loss_layer_6_head": 0.7543781995773315,
      "step": 7345
    },
    {
      "epoch": 94.08,
      "grad_norm": 0.05203447134884375,
      "learning_rate": 5.052336989433082e-05,
      "loss": 2.2023,
      "loss_layer_12_head": 0.5156313180923462,
      "loss_layer_18_head": 0.41236376762390137,
      "loss_layer_24_head": 0.24575185775756836,
      "loss_layer_30_head": 0.14062859117984772,
      "loss_layer_36_head": 0.08662770688533783,
      "loss_layer_42_head": 0.044545166194438934,
      "loss_layer_6_head": 0.7321845889091492,
      "step": 7350
    },
    {
      "epoch": 94.144,
      "grad_norm": 0.048918773031445914,
      "learning_rate": 4.941055838629388e-05,
      "loss": 2.1859,
      "loss_layer_12_head": 0.5353293418884277,
      "loss_layer_18_head": 0.42604732513427734,
      "loss_layer_24_head": 0.25410860776901245,
      "loss_layer_30_head": 0.14441102743148804,
      "loss_layer_36_head": 0.08867961168289185,
      "loss_layer_42_head": 0.04561227560043335,
      "loss_layer_6_head": 0.7580705881118774,
      "step": 7355
    },
    {
      "epoch": 94.208,
      "grad_norm": 0.05109968668988158,
      "learning_rate": 4.831001663370083e-05,
      "loss": 2.247,
      "loss_layer_12_head": 0.516042947769165,
      "loss_layer_18_head": 0.4104418158531189,
      "loss_layer_24_head": 0.2416507452726364,
      "loss_layer_30_head": 0.13749200105667114,
      "loss_layer_36_head": 0.08431776612997055,
      "loss_layer_42_head": 0.043385643512010574,
      "loss_layer_6_head": 0.7369158864021301,
      "step": 7360
    },
    {
      "epoch": 94.272,
      "grad_norm": 0.04914731028130122,
      "learning_rate": 4.722175014680835e-05,
      "loss": 2.1827,
      "loss_layer_12_head": 0.504733145236969,
      "loss_layer_18_head": 0.4021202027797699,
      "loss_layer_24_head": 0.23865585029125214,
      "loss_layer_30_head": 0.13682100176811218,
      "loss_layer_36_head": 0.08383283019065857,
      "loss_layer_42_head": 0.043353013694286346,
      "loss_layer_6_head": 0.717356264591217,
      "step": 7365
    },
    {
      "epoch": 94.336,
      "grad_norm": 0.05080142682415078,
      "learning_rate": 4.61457643744137e-05,
      "loss": 2.2182,
      "loss_layer_12_head": 0.5734231472015381,
      "loss_layer_18_head": 0.45476651191711426,
      "loss_layer_24_head": 0.2701670527458191,
      "loss_layer_30_head": 0.15404611825942993,
      "loss_layer_36_head": 0.09426678717136383,
      "loss_layer_42_head": 0.04813188686966896,
      "loss_layer_6_head": 0.813595175743103,
      "step": 7370
    },
    {
      "epoch": 94.4,
      "grad_norm": 0.05080144189778148,
      "learning_rate": 4.508206470382553e-05,
      "loss": 2.2041,
      "loss_layer_12_head": 0.4918692111968994,
      "loss_layer_18_head": 0.3931012749671936,
      "loss_layer_24_head": 0.23291301727294922,
      "loss_layer_30_head": 0.1339966505765915,
      "loss_layer_36_head": 0.08290792256593704,
      "loss_layer_42_head": 0.043026313185691833,
      "loss_layer_6_head": 0.7007776498794556,
      "step": 7375
    },
    {
      "epoch": 94.464,
      "grad_norm": 0.047940588820475694,
      "learning_rate": 4.403065646083809e-05,
      "loss": 2.2478,
      "loss_layer_12_head": 0.5472118258476257,
      "loss_layer_18_head": 0.4348723292350769,
      "loss_layer_24_head": 0.260368287563324,
      "loss_layer_30_head": 0.1491580307483673,
      "loss_layer_36_head": 0.09191758185625076,
      "loss_layer_42_head": 0.04700625687837601,
      "loss_layer_6_head": 0.7731216549873352,
      "step": 7380
    },
    {
      "epoch": 94.528,
      "grad_norm": 0.047981334370751486,
      "learning_rate": 4.299154490970375e-05,
      "loss": 2.1987,
      "loss_layer_12_head": 0.5142067670822144,
      "loss_layer_18_head": 0.41130128502845764,
      "loss_layer_24_head": 0.24497921764850616,
      "loss_layer_30_head": 0.13921776413917542,
      "loss_layer_36_head": 0.0852941945195198,
      "loss_layer_42_head": 0.04337165877223015,
      "loss_layer_6_head": 0.733370304107666,
      "step": 7385
    },
    {
      "epoch": 94.592,
      "grad_norm": 0.04725192489413316,
      "learning_rate": 4.196473525310801e-05,
      "loss": 2.2027,
      "loss_layer_12_head": 0.5072149038314819,
      "loss_layer_18_head": 0.40363913774490356,
      "loss_layer_24_head": 0.23829057812690735,
      "loss_layer_30_head": 0.13629335165023804,
      "loss_layer_36_head": 0.08362124115228653,
      "loss_layer_42_head": 0.04264799878001213,
      "loss_layer_6_head": 0.718589186668396,
      "step": 7390
    },
    {
      "epoch": 94.656,
      "grad_norm": 0.04928739682318855,
      "learning_rate": 4.0950232632141204e-05,
      "loss": 2.2276,
      "loss_layer_12_head": 0.509835422039032,
      "loss_layer_18_head": 0.4073352813720703,
      "loss_layer_24_head": 0.24195675551891327,
      "loss_layer_30_head": 0.1374775469303131,
      "loss_layer_36_head": 0.08443424850702286,
      "loss_layer_42_head": 0.04362732917070389,
      "loss_layer_6_head": 0.7279062271118164,
      "step": 7395
    },
    {
      "epoch": 94.72,
      "grad_norm": 0.04964680499542269,
      "learning_rate": 3.994804212627462e-05,
      "loss": 2.2423,
      "loss_layer_12_head": 0.5464580059051514,
      "loss_layer_18_head": 0.43364572525024414,
      "loss_layer_24_head": 0.2566262185573578,
      "loss_layer_30_head": 0.14529064297676086,
      "loss_layer_36_head": 0.08917434513568878,
      "loss_layer_42_head": 0.045540325343608856,
      "loss_layer_6_head": 0.7763732075691223,
      "step": 7400
    },
    {
      "epoch": 94.72,
      "eval_loss": 5.353898048400879,
      "eval_loss_layer_12_head": 1.2380386590957642,
      "eval_loss_layer_18_head": 1.072614312171936,
      "eval_loss_layer_24_head": 0.6856150031089783,
      "eval_loss_layer_30_head": 0.4437575340270996,
      "eval_loss_layer_36_head": 0.28418707847595215,
      "eval_loss_layer_42_head": 0.1685718297958374,
      "eval_loss_layer_6_head": 1.5861798524856567,
      "eval_runtime": 33.0556,
      "eval_samples_per_second": 9.681,
      "eval_steps_per_second": 0.605,
      "step": 7400
    },
    {
      "epoch": 94.784,
      "grad_norm": 0.050418089182272946,
      "learning_rate": 3.895816875333552e-05,
      "loss": 2.2268,
      "loss_layer_12_head": 0.5392026901245117,
      "loss_layer_18_head": 0.42967090010643005,
      "loss_layer_24_head": 0.2553606331348419,
      "loss_layer_30_head": 0.14527627825737,
      "loss_layer_36_head": 0.08924500644207001,
      "loss_layer_42_head": 0.045985106378793716,
      "loss_layer_6_head": 0.7683435678482056,
      "step": 7405
    },
    {
      "epoch": 94.848,
      "grad_norm": 0.048702080785594536,
      "learning_rate": 3.798061746947995e-05,
      "loss": 2.2638,
      "loss_layer_12_head": 0.5638929009437561,
      "loss_layer_18_head": 0.4466710686683655,
      "loss_layer_24_head": 0.2648269534111023,
      "loss_layer_30_head": 0.14923232793807983,
      "loss_layer_36_head": 0.09156233817338943,
      "loss_layer_42_head": 0.046887580305337906,
      "loss_layer_6_head": 0.8014505505561829,
      "step": 7410
    },
    {
      "epoch": 94.912,
      "grad_norm": 0.04859433698829976,
      "learning_rate": 3.701539316916858e-05,
      "loss": 2.2371,
      "loss_layer_12_head": 0.5219055414199829,
      "loss_layer_18_head": 0.4143024981021881,
      "loss_layer_24_head": 0.2461954653263092,
      "loss_layer_30_head": 0.1389298439025879,
      "loss_layer_36_head": 0.08494042605161667,
      "loss_layer_42_head": 0.04341433569788933,
      "loss_layer_6_head": 0.7418003082275391,
      "step": 7415
    },
    {
      "epoch": 94.976,
      "grad_norm": 0.05095103624265276,
      "learning_rate": 3.606250068514394e-05,
      "loss": 2.2631,
      "loss_layer_12_head": 0.5199529528617859,
      "loss_layer_18_head": 0.41605672240257263,
      "loss_layer_24_head": 0.24828433990478516,
      "loss_layer_30_head": 0.14387363195419312,
      "loss_layer_36_head": 0.08890549838542938,
      "loss_layer_42_head": 0.046081941574811935,
      "loss_layer_6_head": 0.7367304563522339,
      "step": 7420
    },
    {
      "epoch": 95.04,
      "grad_norm": 0.049385982759421604,
      "learning_rate": 3.512194478840353e-05,
      "loss": 2.2314,
      "loss_layer_12_head": 0.5097154974937439,
      "loss_layer_18_head": 0.40520229935646057,
      "loss_layer_24_head": 0.242036372423172,
      "loss_layer_30_head": 0.13781997561454773,
      "loss_layer_36_head": 0.08447255194187164,
      "loss_layer_42_head": 0.043419186025857925,
      "loss_layer_6_head": 0.7233582139015198,
      "step": 7425
    },
    {
      "epoch": 95.104,
      "grad_norm": 0.047832975221119914,
      "learning_rate": 3.4193730188178676e-05,
      "loss": 2.2354,
      "loss_layer_12_head": 0.5347647666931152,
      "loss_layer_18_head": 0.4270428717136383,
      "loss_layer_24_head": 0.25317683815956116,
      "loss_layer_30_head": 0.14424407482147217,
      "loss_layer_36_head": 0.08843990415334702,
      "loss_layer_42_head": 0.04523122310638428,
      "loss_layer_6_head": 0.752372145652771,
      "step": 7430
    },
    {
      "epoch": 95.168,
      "grad_norm": 0.04856714667085708,
      "learning_rate": 3.327786153190848e-05,
      "loss": 2.2621,
      "loss_layer_12_head": 0.5376235246658325,
      "loss_layer_18_head": 0.4283548891544342,
      "loss_layer_24_head": 0.25535741448402405,
      "loss_layer_30_head": 0.14493116736412048,
      "loss_layer_36_head": 0.0885387510061264,
      "loss_layer_42_head": 0.04543321207165718,
      "loss_layer_6_head": 0.7634849548339844,
      "step": 7435
    },
    {
      "epoch": 95.232,
      "grad_norm": 0.04858117459115268,
      "learning_rate": 3.237434340521789e-05,
      "loss": 2.2195,
      "loss_layer_12_head": 0.5300498604774475,
      "loss_layer_18_head": 0.42406362295150757,
      "loss_layer_24_head": 0.2541801929473877,
      "loss_layer_30_head": 0.14546415209770203,
      "loss_layer_36_head": 0.0897911936044693,
      "loss_layer_42_head": 0.04636722803115845,
      "loss_layer_6_head": 0.7514089345932007,
      "step": 7440
    },
    {
      "epoch": 95.296,
      "grad_norm": 0.04675542375185867,
      "learning_rate": 3.14831803318949e-05,
      "loss": 2.2473,
      "loss_layer_12_head": 0.5196978449821472,
      "loss_layer_18_head": 0.41242051124572754,
      "loss_layer_24_head": 0.24465903639793396,
      "loss_layer_30_head": 0.13920676708221436,
      "loss_layer_36_head": 0.08576396852731705,
      "loss_layer_42_head": 0.04367454722523689,
      "loss_layer_6_head": 0.7376381754875183,
      "step": 7445
    },
    {
      "epoch": 95.36,
      "grad_norm": 0.049443459315767734,
      "learning_rate": 3.0604376773867294e-05,
      "loss": 2.2264,
      "loss_layer_12_head": 0.5278940200805664,
      "loss_layer_18_head": 0.41694194078445435,
      "loss_layer_24_head": 0.24713020026683807,
      "loss_layer_30_head": 0.13908903300762177,
      "loss_layer_36_head": 0.08538015186786652,
      "loss_layer_42_head": 0.04371030628681183,
      "loss_layer_6_head": 0.7500482797622681,
      "step": 7450
    },
    {
      "epoch": 95.424,
      "grad_norm": 0.049280039419981436,
      "learning_rate": 2.9737937131180392e-05,
      "loss": 2.1906,
      "loss_layer_12_head": 0.5249096155166626,
      "loss_layer_18_head": 0.4184521734714508,
      "loss_layer_24_head": 0.24954047799110413,
      "loss_layer_30_head": 0.1416356861591339,
      "loss_layer_36_head": 0.0867888480424881,
      "loss_layer_42_head": 0.04490268602967262,
      "loss_layer_6_head": 0.7440937757492065,
      "step": 7455
    },
    {
      "epoch": 95.488,
      "grad_norm": 0.048935572313689835,
      "learning_rate": 2.888386574197488e-05,
      "loss": 2.2121,
      "loss_layer_12_head": 0.5077144503593445,
      "loss_layer_18_head": 0.40426570177078247,
      "loss_layer_24_head": 0.24082589149475098,
      "loss_layer_30_head": 0.13659973442554474,
      "loss_layer_36_head": 0.08365523815155029,
      "loss_layer_42_head": 0.04272734001278877,
      "loss_layer_6_head": 0.7243356704711914,
      "step": 7460
    },
    {
      "epoch": 95.552,
      "grad_norm": 0.04910678249062025,
      "learning_rate": 2.804216688246597e-05,
      "loss": 2.2145,
      "loss_layer_12_head": 0.5439128875732422,
      "loss_layer_18_head": 0.43094402551651,
      "loss_layer_24_head": 0.2566753327846527,
      "loss_layer_30_head": 0.14742517471313477,
      "loss_layer_36_head": 0.09031836688518524,
      "loss_layer_42_head": 0.046097155660390854,
      "loss_layer_6_head": 0.7745534181594849,
      "step": 7465
    },
    {
      "epoch": 95.616,
      "grad_norm": 0.04918938765474974,
      "learning_rate": 2.721284476692093e-05,
      "loss": 2.197,
      "loss_layer_12_head": 0.5497468709945679,
      "loss_layer_18_head": 0.4372737407684326,
      "loss_layer_24_head": 0.2613700330257416,
      "loss_layer_30_head": 0.14881230890750885,
      "loss_layer_36_head": 0.09168092906475067,
      "loss_layer_42_head": 0.046921879053115845,
      "loss_layer_6_head": 0.7791920304298401,
      "step": 7470
    },
    {
      "epoch": 95.68,
      "grad_norm": 0.04847810288813092,
      "learning_rate": 2.6395903547638822e-05,
      "loss": 2.1494,
      "loss_layer_12_head": 0.550081193447113,
      "loss_layer_18_head": 0.4368104338645935,
      "loss_layer_24_head": 0.2596988081932068,
      "loss_layer_30_head": 0.14674346148967743,
      "loss_layer_36_head": 0.09004618227481842,
      "loss_layer_42_head": 0.04627431184053421,
      "loss_layer_6_head": 0.7812727689743042,
      "step": 7475
    },
    {
      "epoch": 95.744,
      "grad_norm": 0.048792321232979566,
      "learning_rate": 2.559134731492857e-05,
      "loss": 2.2421,
      "loss_layer_12_head": 0.5070135593414307,
      "loss_layer_18_head": 0.4030696749687195,
      "loss_layer_24_head": 0.23976829648017883,
      "loss_layer_30_head": 0.1376529186964035,
      "loss_layer_36_head": 0.08414429426193237,
      "loss_layer_42_head": 0.04315520450472832,
      "loss_layer_6_head": 0.7209748029708862,
      "step": 7480
    },
    {
      "epoch": 95.808,
      "grad_norm": 0.0489028116534384,
      "learning_rate": 2.4799180097089813e-05,
      "loss": 2.2458,
      "loss_layer_12_head": 0.5411000847816467,
      "loss_layer_18_head": 0.4324566423892975,
      "loss_layer_24_head": 0.2568822205066681,
      "loss_layer_30_head": 0.14679209887981415,
      "loss_layer_36_head": 0.08964550495147705,
      "loss_layer_42_head": 0.0458163246512413,
      "loss_layer_6_head": 0.7727065682411194,
      "step": 7485
    },
    {
      "epoch": 95.872,
      "grad_norm": 0.05041444749725471,
      "learning_rate": 2.4019405860392364e-05,
      "loss": 2.2487,
      "loss_layer_12_head": 0.5318107604980469,
      "loss_layer_18_head": 0.4218567907810211,
      "loss_layer_24_head": 0.2520443797111511,
      "loss_layer_30_head": 0.1430506557226181,
      "loss_layer_36_head": 0.08765053004026413,
      "loss_layer_42_head": 0.045168377459049225,
      "loss_layer_6_head": 0.7588457465171814,
      "step": 7490
    },
    {
      "epoch": 95.936,
      "grad_norm": 0.04916930771907434,
      "learning_rate": 2.325202850905539e-05,
      "loss": 2.2363,
      "loss_layer_12_head": 0.5383012890815735,
      "loss_layer_18_head": 0.43017101287841797,
      "loss_layer_24_head": 0.2569471001625061,
      "loss_layer_30_head": 0.14574947953224182,
      "loss_layer_36_head": 0.0888228565454483,
      "loss_layer_42_head": 0.04543430358171463,
      "loss_layer_6_head": 0.7668617367744446,
      "step": 7495
    },
    {
      "epoch": 96.0,
      "grad_norm": 0.04717860715105254,
      "learning_rate": 2.2497051885228825e-05,
      "loss": 2.217,
      "loss_layer_12_head": 0.5398637056350708,
      "loss_layer_18_head": 0.42982035875320435,
      "loss_layer_24_head": 0.25465312600135803,
      "loss_layer_30_head": 0.14380770921707153,
      "loss_layer_36_head": 0.08749948441982269,
      "loss_layer_42_head": 0.04460085183382034,
      "loss_layer_6_head": 0.7679110169410706,
      "step": 7500
    },
    {
      "epoch": 96.064,
      "grad_norm": 0.04621407380404632,
      "learning_rate": 2.175447976897449e-05,
      "loss": 2.2596,
      "loss_layer_12_head": 0.5604386925697327,
      "loss_layer_18_head": 0.4454452097415924,
      "loss_layer_24_head": 0.2648676633834839,
      "loss_layer_30_head": 0.1510244905948639,
      "loss_layer_36_head": 0.09220774471759796,
      "loss_layer_42_head": 0.047483719885349274,
      "loss_layer_6_head": 0.7912670373916626,
      "step": 7505
    },
    {
      "epoch": 96.128,
      "grad_norm": 0.048410865420270774,
      "learning_rate": 2.1024315878246104e-05,
      "loss": 2.2041,
      "loss_layer_12_head": 0.5194157361984253,
      "loss_layer_18_head": 0.41100817918777466,
      "loss_layer_24_head": 0.2442316710948944,
      "loss_layer_30_head": 0.13990136981010437,
      "loss_layer_36_head": 0.08676956593990326,
      "loss_layer_42_head": 0.04494382068514824,
      "loss_layer_6_head": 0.7396286129951477,
      "step": 7510
    },
    {
      "epoch": 96.192,
      "grad_norm": 0.048144309852747064,
      "learning_rate": 2.03065638688707e-05,
      "loss": 2.2199,
      "loss_layer_12_head": 0.49283018708229065,
      "loss_layer_18_head": 0.389244019985199,
      "loss_layer_24_head": 0.23159679770469666,
      "loss_layer_30_head": 0.13149218261241913,
      "loss_layer_36_head": 0.08133236318826675,
      "loss_layer_42_head": 0.04185012727975845,
      "loss_layer_6_head": 0.7014617919921875,
      "step": 7515
    },
    {
      "epoch": 96.256,
      "grad_norm": 0.04818866934875542,
      "learning_rate": 1.9601227334531958e-05,
      "loss": 2.1867,
      "loss_layer_12_head": 0.5312197804450989,
      "loss_layer_18_head": 0.42192500829696655,
      "loss_layer_24_head": 0.2499711960554123,
      "loss_layer_30_head": 0.14260806143283844,
      "loss_layer_36_head": 0.08787859231233597,
      "loss_layer_42_head": 0.044671546667814255,
      "loss_layer_6_head": 0.7515575289726257,
      "step": 7520
    },
    {
      "epoch": 96.32,
      "grad_norm": 0.048200245295201026,
      "learning_rate": 1.8908309806749955e-05,
      "loss": 2.2128,
      "loss_layer_12_head": 0.5266733169555664,
      "loss_layer_18_head": 0.4163888096809387,
      "loss_layer_24_head": 0.24804219603538513,
      "loss_layer_30_head": 0.1405041217803955,
      "loss_layer_36_head": 0.08625784516334534,
      "loss_layer_42_head": 0.04447668045759201,
      "loss_layer_6_head": 0.7465261816978455,
      "step": 7525
    },
    {
      "epoch": 96.384,
      "grad_norm": 0.0479743449696711,
      "learning_rate": 1.822781475486507e-05,
      "loss": 2.2037,
      "loss_layer_12_head": 0.5531439781188965,
      "loss_layer_18_head": 0.4405452609062195,
      "loss_layer_24_head": 0.2628982663154602,
      "loss_layer_30_head": 0.1499449610710144,
      "loss_layer_36_head": 0.09184703230857849,
      "loss_layer_42_head": 0.04686347767710686,
      "loss_layer_6_head": 0.7889763712882996,
      "step": 7530
    },
    {
      "epoch": 96.448,
      "grad_norm": 0.047024501084185905,
      "learning_rate": 1.7559745586019914e-05,
      "loss": 2.2109,
      "loss_layer_12_head": 0.5116602182388306,
      "loss_layer_18_head": 0.4074671268463135,
      "loss_layer_24_head": 0.24322326481342316,
      "loss_layer_30_head": 0.13966545462608337,
      "loss_layer_36_head": 0.08560404926538467,
      "loss_layer_42_head": 0.043648380786180496,
      "loss_layer_6_head": 0.7276858687400818,
      "step": 7535
    },
    {
      "epoch": 96.512,
      "grad_norm": 0.04651458546208778,
      "learning_rate": 1.690410564514244e-05,
      "loss": 2.2096,
      "loss_layer_12_head": 0.5429218411445618,
      "loss_layer_18_head": 0.4348849356174469,
      "loss_layer_24_head": 0.2609568238258362,
      "loss_layer_30_head": 0.15119777619838715,
      "loss_layer_36_head": 0.09241846948862076,
      "loss_layer_42_head": 0.04805991053581238,
      "loss_layer_6_head": 0.7628147602081299,
      "step": 7540
    },
    {
      "epoch": 96.576,
      "grad_norm": 0.04816926693380165,
      "learning_rate": 1.6260898214929542e-05,
      "loss": 2.2039,
      "loss_layer_12_head": 0.519668459892273,
      "loss_layer_18_head": 0.4145006239414215,
      "loss_layer_24_head": 0.24598869681358337,
      "loss_layer_30_head": 0.14048966765403748,
      "loss_layer_36_head": 0.08606062829494476,
      "loss_layer_42_head": 0.04434812441468239,
      "loss_layer_6_head": 0.7396847009658813,
      "step": 7545
    },
    {
      "epoch": 96.64,
      "grad_norm": 0.04899532090607689,
      "learning_rate": 1.5630126515830125e-05,
      "loss": 2.2507,
      "loss_layer_12_head": 0.5394229292869568,
      "loss_layer_18_head": 0.42892614006996155,
      "loss_layer_24_head": 0.2544345259666443,
      "loss_layer_30_head": 0.1450084000825882,
      "loss_layer_36_head": 0.088633693754673,
      "loss_layer_42_head": 0.045573003590106964,
      "loss_layer_6_head": 0.769891619682312,
      "step": 7550
    },
    {
      "epoch": 96.704,
      "grad_norm": 0.04781536616564642,
      "learning_rate": 1.5011793706028453e-05,
      "loss": 2.2169,
      "loss_layer_12_head": 0.5172067880630493,
      "loss_layer_18_head": 0.4120805263519287,
      "loss_layer_24_head": 0.24483008682727814,
      "loss_layer_30_head": 0.13928450644016266,
      "loss_layer_36_head": 0.08547414839267731,
      "loss_layer_42_head": 0.04459630697965622,
      "loss_layer_6_head": 0.7354665994644165,
      "step": 7555
    },
    {
      "epoch": 96.768,
      "grad_norm": 0.0484696674805805,
      "learning_rate": 1.4405902881430289e-05,
      "loss": 2.2522,
      "loss_layer_12_head": 0.5152945518493652,
      "loss_layer_18_head": 0.408928245306015,
      "loss_layer_24_head": 0.24452869594097137,
      "loss_layer_30_head": 0.1394452154636383,
      "loss_layer_36_head": 0.08555806428194046,
      "loss_layer_42_head": 0.04428505524992943,
      "loss_layer_6_head": 0.7351473569869995,
      "step": 7560
    },
    {
      "epoch": 96.832,
      "grad_norm": 0.04881579782504701,
      "learning_rate": 1.3812457075644824e-05,
      "loss": 2.2008,
      "loss_layer_12_head": 0.5114595293998718,
      "loss_layer_18_head": 0.4080333709716797,
      "loss_layer_24_head": 0.24258136749267578,
      "loss_layer_30_head": 0.13857226073741913,
      "loss_layer_36_head": 0.08439826965332031,
      "loss_layer_42_head": 0.04351218417286873,
      "loss_layer_6_head": 0.7224711179733276,
      "step": 7565
    },
    {
      "epoch": 96.896,
      "grad_norm": 0.04866263817200507,
      "learning_rate": 1.3231459259972211e-05,
      "loss": 2.2346,
      "loss_layer_12_head": 0.5320409536361694,
      "loss_layer_18_head": 0.42174381017684937,
      "loss_layer_24_head": 0.25058722496032715,
      "loss_layer_30_head": 0.14165911078453064,
      "loss_layer_36_head": 0.0870743989944458,
      "loss_layer_42_head": 0.04468743875622749,
      "loss_layer_6_head": 0.7575365304946899,
      "step": 7570
    },
    {
      "epoch": 96.96,
      "grad_norm": 0.048059327732427756,
      "learning_rate": 1.2662912343386069e-05,
      "loss": 2.2227,
      "loss_layer_12_head": 0.5340146422386169,
      "loss_layer_18_head": 0.42689189314842224,
      "loss_layer_24_head": 0.25484317541122437,
      "loss_layer_30_head": 0.14448025822639465,
      "loss_layer_36_head": 0.08819583803415298,
      "loss_layer_42_head": 0.04537888616323471,
      "loss_layer_6_head": 0.7602660655975342,
      "step": 7575
    },
    {
      "epoch": 97.024,
      "grad_norm": 0.04696560838340125,
      "learning_rate": 1.2106819172520434e-05,
      "loss": 2.2436,
      "loss_layer_12_head": 0.5374619960784912,
      "loss_layer_18_head": 0.4260658323764801,
      "loss_layer_24_head": 0.2526416778564453,
      "loss_layer_30_head": 0.14351734519004822,
      "loss_layer_36_head": 0.08812464773654938,
      "loss_layer_42_head": 0.04518343508243561,
      "loss_layer_6_head": 0.7621882557868958,
      "step": 7580
    },
    {
      "epoch": 97.088,
      "grad_norm": 0.0477365275612823,
      "learning_rate": 1.1563182531655614e-05,
      "loss": 2.2374,
      "loss_layer_12_head": 0.5113983154296875,
      "loss_layer_18_head": 0.4050496220588684,
      "loss_layer_24_head": 0.2420288771390915,
      "loss_layer_30_head": 0.1380770057439804,
      "loss_layer_36_head": 0.08462353050708771,
      "loss_layer_42_head": 0.043446313589811325,
      "loss_layer_6_head": 0.7215984463691711,
      "step": 7585
    },
    {
      "epoch": 97.152,
      "grad_norm": 0.048459658135919646,
      "learning_rate": 1.1032005142703194e-05,
      "loss": 2.2255,
      "loss_layer_12_head": 0.528217613697052,
      "loss_layer_18_head": 0.42197078466415405,
      "loss_layer_24_head": 0.25294801592826843,
      "loss_layer_30_head": 0.14464429020881653,
      "loss_layer_36_head": 0.08905737102031708,
      "loss_layer_42_head": 0.045864541083574295,
      "loss_layer_6_head": 0.7489959001541138,
      "step": 7590
    },
    {
      "epoch": 97.216,
      "grad_norm": 0.05058032474489488,
      "learning_rate": 1.0513289665193826e-05,
      "loss": 2.1984,
      "loss_layer_12_head": 0.5118033289909363,
      "loss_layer_18_head": 0.4073547422885895,
      "loss_layer_24_head": 0.2408057004213333,
      "loss_layer_30_head": 0.13786427676677704,
      "loss_layer_36_head": 0.08442191779613495,
      "loss_layer_42_head": 0.04357881098985672,
      "loss_layer_6_head": 0.7233367562294006,
      "step": 7595
    },
    {
      "epoch": 97.28,
      "grad_norm": 0.04827540415213903,
      "learning_rate": 1.0007038696262516e-05,
      "loss": 2.2145,
      "loss_layer_12_head": 0.5295868515968323,
      "loss_layer_18_head": 0.41884851455688477,
      "loss_layer_24_head": 0.24884319305419922,
      "loss_layer_30_head": 0.14117702841758728,
      "loss_layer_36_head": 0.08712510019540787,
      "loss_layer_42_head": 0.0444101020693779,
      "loss_layer_6_head": 0.751301646232605,
      "step": 7600
    },
    {
      "epoch": 97.28,
      "eval_loss": 5.354556560516357,
      "eval_loss_layer_12_head": 1.2383822202682495,
      "eval_loss_layer_18_head": 1.0728117227554321,
      "eval_loss_layer_24_head": 0.6856847405433655,
      "eval_loss_layer_30_head": 0.44373393058776855,
      "eval_loss_layer_36_head": 0.28420042991638184,
      "eval_loss_layer_42_head": 0.16856889426708221,
      "eval_loss_layer_6_head": 1.5862722396850586,
      "eval_runtime": 33.0292,
      "eval_samples_per_second": 9.688,
      "eval_steps_per_second": 0.606,
      "step": 7600
    },
    {
      "epoch": 97.344,
      "grad_norm": 0.04772203254960449,
      "learning_rate": 9.513254770636137e-06,
      "loss": 2.2251,
      "loss_layer_12_head": 0.5485472679138184,
      "loss_layer_18_head": 0.43624386191368103,
      "loss_layer_24_head": 0.260084331035614,
      "loss_layer_30_head": 0.14740106463432312,
      "loss_layer_36_head": 0.0896199494600296,
      "loss_layer_42_head": 0.045660972595214844,
      "loss_layer_6_head": 0.7847936749458313,
      "step": 7605
    },
    {
      "epoch": 97.408,
      "grad_norm": 0.04702900341423248,
      "learning_rate": 9.031940360621494e-06,
      "loss": 2.2078,
      "loss_layer_12_head": 0.5219594240188599,
      "loss_layer_18_head": 0.4148778021335602,
      "loss_layer_24_head": 0.24383816123008728,
      "loss_layer_30_head": 0.13887521624565125,
      "loss_layer_36_head": 0.0850883349776268,
      "loss_layer_42_head": 0.043440092355012894,
      "loss_layer_6_head": 0.7452921271324158,
      "step": 7610
    },
    {
      "epoch": 97.472,
      "grad_norm": 0.04711996175330007,
      "learning_rate": 8.563097876091718e-06,
      "loss": 2.2384,
      "loss_layer_12_head": 0.5424237847328186,
      "loss_layer_18_head": 0.43333131074905396,
      "loss_layer_24_head": 0.25862258672714233,
      "loss_layer_30_head": 0.14720368385314941,
      "loss_layer_36_head": 0.08983995020389557,
      "loss_layer_42_head": 0.04592455178499222,
      "loss_layer_6_head": 0.7739061713218689,
      "step": 7615
    },
    {
      "epoch": 97.536,
      "grad_norm": 0.04604682525780393,
      "learning_rate": 8.106729664475176e-06,
      "loss": 2.2321,
      "loss_layer_12_head": 0.54424649477005,
      "loss_layer_18_head": 0.4309788644313812,
      "loss_layer_24_head": 0.2568711042404175,
      "loss_layer_30_head": 0.14490000903606415,
      "loss_layer_36_head": 0.08911702781915665,
      "loss_layer_42_head": 0.04543914645910263,
      "loss_layer_6_head": 0.7736631631851196,
      "step": 7620
    },
    {
      "epoch": 97.6,
      "grad_norm": 0.04869374728137759,
      "learning_rate": 7.662838010742412e-06,
      "loss": 2.258,
      "loss_layer_12_head": 0.5330637693405151,
      "loss_layer_18_head": 0.4232029318809509,
      "loss_layer_24_head": 0.25266382098197937,
      "loss_layer_30_head": 0.1439393013715744,
      "loss_layer_36_head": 0.08781413733959198,
      "loss_layer_42_head": 0.044964972883462906,
      "loss_layer_6_head": 0.7609207034111023,
      "step": 7625
    },
    {
      "epoch": 97.664,
      "grad_norm": 0.0469918036380085,
      "learning_rate": 7.231425137397274e-06,
      "loss": 2.2135,
      "loss_layer_12_head": 0.4967819154262543,
      "loss_layer_18_head": 0.3944806158542633,
      "loss_layer_24_head": 0.23440369963645935,
      "loss_layer_30_head": 0.13338634371757507,
      "loss_layer_36_head": 0.0824437290430069,
      "loss_layer_42_head": 0.042510829865932465,
      "loss_layer_6_head": 0.7083306312561035,
      "step": 7630
    },
    {
      "epoch": 97.728,
      "grad_norm": 0.047147570026965874,
      "learning_rate": 6.812493204462477e-06,
      "loss": 2.1929,
      "loss_layer_12_head": 0.5360531210899353,
      "loss_layer_18_head": 0.42734161019325256,
      "loss_layer_24_head": 0.2545594274997711,
      "loss_layer_30_head": 0.14487186074256897,
      "loss_layer_36_head": 0.08874933421611786,
      "loss_layer_42_head": 0.0455973818898201,
      "loss_layer_6_head": 0.7604452967643738,
      "step": 7635
    },
    {
      "epoch": 97.792,
      "grad_norm": 0.0483458600285487,
      "learning_rate": 6.406044309471004e-06,
      "loss": 2.2135,
      "loss_layer_12_head": 0.5361990928649902,
      "loss_layer_18_head": 0.42738810181617737,
      "loss_layer_24_head": 0.25182539224624634,
      "loss_layer_30_head": 0.14171230792999268,
      "loss_layer_36_head": 0.08634297549724579,
      "loss_layer_42_head": 0.04413245618343353,
      "loss_layer_6_head": 0.7619434595108032,
      "step": 7640
    },
    {
      "epoch": 97.856,
      "grad_norm": 0.0472915246101266,
      "learning_rate": 6.012080487455551e-06,
      "loss": 2.2043,
      "loss_layer_12_head": 0.510196328163147,
      "loss_layer_18_head": 0.40766462683677673,
      "loss_layer_24_head": 0.24087238311767578,
      "loss_layer_30_head": 0.1365537792444229,
      "loss_layer_36_head": 0.08366300165653229,
      "loss_layer_42_head": 0.042944855988025665,
      "loss_layer_6_head": 0.7242112159729004,
      "step": 7645
    },
    {
      "epoch": 97.92,
      "grad_norm": 0.04838475435257961,
      "learning_rate": 5.630603710937155e-06,
      "loss": 2.2122,
      "loss_layer_12_head": 0.5254450440406799,
      "loss_layer_18_head": 0.4168366491794586,
      "loss_layer_24_head": 0.2457796037197113,
      "loss_layer_30_head": 0.13978086411952972,
      "loss_layer_36_head": 0.08535812050104141,
      "loss_layer_42_head": 0.0436202771961689,
      "loss_layer_6_head": 0.7524908781051636,
      "step": 7650
    },
    {
      "epoch": 97.984,
      "grad_norm": 0.04833121767333433,
      "learning_rate": 5.261615889916027e-06,
      "loss": 2.1866,
      "loss_layer_12_head": 0.5278292894363403,
      "loss_layer_18_head": 0.4197697639465332,
      "loss_layer_24_head": 0.24924680590629578,
      "loss_layer_30_head": 0.14110907912254333,
      "loss_layer_36_head": 0.08638477325439453,
      "loss_layer_42_head": 0.04410397633910179,
      "loss_layer_6_head": 0.7487640976905823,
      "step": 7655
    },
    {
      "epoch": 98.048,
      "grad_norm": 0.04800733976525724,
      "learning_rate": 4.905118871862402e-06,
      "loss": 2.1801,
      "loss_layer_12_head": 0.5165697336196899,
      "loss_layer_18_head": 0.40829309821128845,
      "loss_layer_24_head": 0.24308562278747559,
      "loss_layer_30_head": 0.13844510912895203,
      "loss_layer_36_head": 0.0843529924750328,
      "loss_layer_42_head": 0.04326789826154709,
      "loss_layer_6_head": 0.7336050868034363,
      "step": 7660
    },
    {
      "epoch": 98.112,
      "grad_norm": 0.04713406654762435,
      "learning_rate": 4.56111444170626e-06,
      "loss": 2.1728,
      "loss_layer_12_head": 0.4959138333797455,
      "loss_layer_18_head": 0.39130300283432007,
      "loss_layer_24_head": 0.23362024128437042,
      "loss_layer_30_head": 0.1321408450603485,
      "loss_layer_36_head": 0.08182616531848907,
      "loss_layer_42_head": 0.04181751608848572,
      "loss_layer_6_head": 0.7084263563156128,
      "step": 7665
    },
    {
      "epoch": 98.176,
      "grad_norm": 0.047828412834017814,
      "learning_rate": 4.229604321829561e-06,
      "loss": 2.2416,
      "loss_layer_12_head": 0.5260987281799316,
      "loss_layer_18_head": 0.41889506578445435,
      "loss_layer_24_head": 0.24845996499061584,
      "loss_layer_30_head": 0.14211705327033997,
      "loss_layer_36_head": 0.0868179127573967,
      "loss_layer_42_head": 0.044242698699235916,
      "loss_layer_6_head": 0.7435183525085449,
      "step": 7670
    },
    {
      "epoch": 98.24,
      "grad_norm": 0.04634018385683477,
      "learning_rate": 3.9105901720562496e-06,
      "loss": 2.2254,
      "loss_layer_12_head": 0.5336052179336548,
      "loss_layer_18_head": 0.4249156415462494,
      "loss_layer_24_head": 0.25199320912361145,
      "loss_layer_30_head": 0.14434120059013367,
      "loss_layer_36_head": 0.08857591450214386,
      "loss_layer_42_head": 0.045480139553546906,
      "loss_layer_6_head": 0.7603551745414734,
      "step": 7675
    },
    {
      "epoch": 98.304,
      "grad_norm": 0.04705198506349752,
      "learning_rate": 3.6040735896455955e-06,
      "loss": 2.2344,
      "loss_layer_12_head": 0.5340797305107117,
      "loss_layer_18_head": 0.4260595440864563,
      "loss_layer_24_head": 0.2532065808773041,
      "loss_layer_30_head": 0.14468078315258026,
      "loss_layer_36_head": 0.08894868195056915,
      "loss_layer_42_head": 0.04583946615457535,
      "loss_layer_6_head": 0.7546078562736511,
      "step": 7680
    },
    {
      "epoch": 98.368,
      "grad_norm": 0.047741277662948216,
      "learning_rate": 3.3100561092824777e-06,
      "loss": 2.2525,
      "loss_layer_12_head": 0.5400086045265198,
      "loss_layer_18_head": 0.43066662549972534,
      "loss_layer_24_head": 0.2576448321342468,
      "loss_layer_30_head": 0.14679929614067078,
      "loss_layer_36_head": 0.09037864953279495,
      "loss_layer_42_head": 0.04661364108324051,
      "loss_layer_6_head": 0.7654889822006226,
      "step": 7685
    },
    {
      "epoch": 98.432,
      "grad_norm": 0.04712116309494177,
      "learning_rate": 3.028539203071001e-06,
      "loss": 2.2234,
      "loss_layer_12_head": 0.5268769860267639,
      "loss_layer_18_head": 0.4161738455295563,
      "loss_layer_24_head": 0.2471807301044464,
      "loss_layer_30_head": 0.1406070739030838,
      "loss_layer_36_head": 0.08657457679510117,
      "loss_layer_42_head": 0.04418870061635971,
      "loss_layer_6_head": 0.7565459609031677,
      "step": 7690
    },
    {
      "epoch": 98.496,
      "grad_norm": 0.04733319929416073,
      "learning_rate": 2.759524280526726e-06,
      "loss": 2.2229,
      "loss_layer_12_head": 0.5219739675521851,
      "loss_layer_18_head": 0.4164242148399353,
      "loss_layer_24_head": 0.24668073654174805,
      "loss_layer_30_head": 0.14108192920684814,
      "loss_layer_36_head": 0.08618952333927155,
      "loss_layer_42_head": 0.04458010196685791,
      "loss_layer_6_head": 0.7416320443153381,
      "step": 7695
    },
    {
      "epoch": 98.56,
      "grad_norm": 0.04646205694507778,
      "learning_rate": 2.5030126885694504e-06,
      "loss": 2.2445,
      "loss_layer_12_head": 0.5464568734169006,
      "loss_layer_18_head": 0.43410173058509827,
      "loss_layer_24_head": 0.25835663080215454,
      "loss_layer_30_head": 0.1468503773212433,
      "loss_layer_36_head": 0.09022296965122223,
      "loss_layer_42_head": 0.046655647456645966,
      "loss_layer_6_head": 0.7785172462463379,
      "step": 7700
    },
    {
      "epoch": 98.624,
      "grad_norm": 0.04713119066710506,
      "learning_rate": 2.259005711516271e-06,
      "loss": 2.1875,
      "loss_layer_12_head": 0.5380223989486694,
      "loss_layer_18_head": 0.4276723861694336,
      "loss_layer_24_head": 0.2534812092781067,
      "loss_layer_30_head": 0.14326933026313782,
      "loss_layer_36_head": 0.08811579644680023,
      "loss_layer_42_head": 0.044826384633779526,
      "loss_layer_6_head": 0.7623842358589172,
      "step": 7705
    },
    {
      "epoch": 98.688,
      "grad_norm": 0.048217735810142305,
      "learning_rate": 2.0275045710760333e-06,
      "loss": 2.2064,
      "loss_layer_12_head": 0.5266226530075073,
      "loss_layer_18_head": 0.4228242039680481,
      "loss_layer_24_head": 0.2509838938713074,
      "loss_layer_30_head": 0.14340190589427948,
      "loss_layer_36_head": 0.08799199759960175,
      "loss_layer_42_head": 0.045455075800418854,
      "loss_layer_6_head": 0.7426528334617615,
      "step": 7710
    },
    {
      "epoch": 98.752,
      "grad_norm": 0.04733111837314087,
      "learning_rate": 1.808510426341836e-06,
      "loss": 2.1954,
      "loss_layer_12_head": 0.51893550157547,
      "loss_layer_18_head": 0.4116063714027405,
      "loss_layer_24_head": 0.24513521790504456,
      "loss_layer_30_head": 0.14006394147872925,
      "loss_layer_36_head": 0.08600065857172012,
      "loss_layer_42_head": 0.04439018666744232,
      "loss_layer_6_head": 0.7368661165237427,
      "step": 7715
    },
    {
      "epoch": 98.816,
      "grad_norm": 0.04605944071168619,
      "learning_rate": 1.6020243737865926e-06,
      "loss": 2.2176,
      "loss_layer_12_head": 0.5198768973350525,
      "loss_layer_18_head": 0.41486772894859314,
      "loss_layer_24_head": 0.2452196180820465,
      "loss_layer_30_head": 0.1408570557832718,
      "loss_layer_36_head": 0.08577336370944977,
      "loss_layer_42_head": 0.04393228515982628,
      "loss_layer_6_head": 0.7388896942138672,
      "step": 7720
    },
    {
      "epoch": 98.88,
      "grad_norm": 0.04848549877959241,
      "learning_rate": 1.4080474472569216e-06,
      "loss": 2.2434,
      "loss_layer_12_head": 0.5415701270103455,
      "loss_layer_18_head": 0.43013888597488403,
      "loss_layer_24_head": 0.2537843585014343,
      "loss_layer_30_head": 0.14369845390319824,
      "loss_layer_36_head": 0.08760565519332886,
      "loss_layer_42_head": 0.04481610655784607,
      "loss_layer_6_head": 0.7697018980979919,
      "step": 7725
    },
    {
      "epoch": 98.944,
      "grad_norm": 0.046740999578059717,
      "learning_rate": 1.2265806179681537e-06,
      "loss": 2.2172,
      "loss_layer_12_head": 0.5173385143280029,
      "loss_layer_18_head": 0.4116523861885071,
      "loss_layer_24_head": 0.2450505495071411,
      "loss_layer_30_head": 0.14002016186714172,
      "loss_layer_36_head": 0.08660206198692322,
      "loss_layer_42_head": 0.044548191130161285,
      "loss_layer_6_head": 0.7309035062789917,
      "step": 7730
    },
    {
      "epoch": 99.008,
      "grad_norm": 0.05120198903695742,
      "learning_rate": 1.0576247944985017e-06,
      "loss": 2.1922,
      "loss_layer_12_head": 0.525602400302887,
      "loss_layer_18_head": 0.41835451126098633,
      "loss_layer_24_head": 0.2486189603805542,
      "loss_layer_30_head": 0.14194296300411224,
      "loss_layer_36_head": 0.08719538152217865,
      "loss_layer_42_head": 0.044893983751535416,
      "loss_layer_6_head": 0.7493778467178345,
      "step": 7735
    },
    {
      "epoch": 99.072,
      "grad_norm": 0.0469194613080998,
      "learning_rate": 9.011808227865626e-07,
      "loss": 2.2079,
      "loss_layer_12_head": 0.5350391268730164,
      "loss_layer_18_head": 0.4255501627922058,
      "loss_layer_24_head": 0.25074857473373413,
      "loss_layer_30_head": 0.1427963525056839,
      "loss_layer_36_head": 0.08708472549915314,
      "loss_layer_42_head": 0.0445537194609642,
      "loss_layer_6_head": 0.7682226300239563,
      "step": 7740
    },
    {
      "epoch": 99.136,
      "grad_norm": 0.04741679518552047,
      "learning_rate": 7.572494861246559e-07,
      "loss": 2.2187,
      "loss_layer_12_head": 0.5227916240692139,
      "loss_layer_18_head": 0.41743701696395874,
      "loss_layer_24_head": 0.24873284995555878,
      "loss_layer_30_head": 0.14130835235118866,
      "loss_layer_36_head": 0.08685535192489624,
      "loss_layer_42_head": 0.04465156048536301,
      "loss_layer_6_head": 0.7465317845344543,
      "step": 7745
    },
    {
      "epoch": 99.2,
      "grad_norm": 0.04736798531159028,
      "learning_rate": 6.258315051568819e-07,
      "loss": 2.2141,
      "loss_layer_12_head": 0.49780216813087463,
      "loss_layer_18_head": 0.3960054814815521,
      "loss_layer_24_head": 0.2347451150417328,
      "loss_layer_30_head": 0.13402287662029266,
      "loss_layer_36_head": 0.08227376639842987,
      "loss_layer_42_head": 0.042138420045375824,
      "loss_layer_6_head": 0.7112741470336914,
      "step": 7750
    },
    {
      "epoch": 99.264,
      "grad_norm": 0.04743328796673216,
      "learning_rate": 5.069275378746796e-07,
      "loss": 2.2036,
      "loss_layer_12_head": 0.5282648801803589,
      "loss_layer_18_head": 0.4216739535331726,
      "loss_layer_24_head": 0.2518410086631775,
      "loss_layer_30_head": 0.14420276880264282,
      "loss_layer_36_head": 0.08859424293041229,
      "loss_layer_42_head": 0.0452834852039814,
      "loss_layer_6_head": 0.7505770325660706,
      "step": 7755
    },
    {
      "epoch": 99.328,
      "grad_norm": 0.04670242920251239,
      "learning_rate": 4.0053817961321905e-07,
      "loss": 2.1748,
      "loss_layer_12_head": 0.5282557010650635,
      "loss_layer_18_head": 0.4217909276485443,
      "loss_layer_24_head": 0.2519395351409912,
      "loss_layer_30_head": 0.14398393034934998,
      "loss_layer_36_head": 0.08867120742797852,
      "loss_layer_42_head": 0.0454082116484642,
      "loss_layer_6_head": 0.7452816367149353,
      "step": 7760
    },
    {
      "epoch": 99.392,
      "grad_norm": 0.047022582264906146,
      "learning_rate": 3.0666396304918074e-07,
      "loss": 2.2344,
      "loss_layer_12_head": 0.5282045006752014,
      "loss_layer_18_head": 0.42062681913375854,
      "loss_layer_24_head": 0.25122708082199097,
      "loss_layer_30_head": 0.14215140044689178,
      "loss_layer_36_head": 0.08697571605443954,
      "loss_layer_42_head": 0.04401630163192749,
      "loss_layer_6_head": 0.7532252073287964,
      "step": 7765
    },
    {
      "epoch": 99.456,
      "grad_norm": 0.0463821563930693,
      "learning_rate": 2.2530535819742514e-07,
      "loss": 2.2675,
      "loss_layer_12_head": 0.5320900678634644,
      "loss_layer_18_head": 0.42441660165786743,
      "loss_layer_24_head": 0.2505679726600647,
      "loss_layer_30_head": 0.14190511405467987,
      "loss_layer_36_head": 0.08675922453403473,
      "loss_layer_42_head": 0.04490956664085388,
      "loss_layer_6_head": 0.7558645606040955,
      "step": 7770
    },
    {
      "epoch": 99.52,
      "grad_norm": 0.04719644615712964,
      "learning_rate": 1.564627724090495e-07,
      "loss": 2.1904,
      "loss_layer_12_head": 0.49446120858192444,
      "loss_layer_18_head": 0.39401769638061523,
      "loss_layer_24_head": 0.23548324406147003,
      "loss_layer_30_head": 0.13529255986213684,
      "loss_layer_36_head": 0.08313772082328796,
      "loss_layer_42_head": 0.04276018217206001,
      "loss_layer_6_head": 0.7003766298294067,
      "step": 7775
    },
    {
      "epoch": 99.584,
      "grad_norm": 0.0467219630012499,
      "learning_rate": 1.0013655036916758e-07,
      "loss": 2.1931,
      "loss_layer_12_head": 0.5233424305915833,
      "loss_layer_18_head": 0.4162876009941101,
      "loss_layer_24_head": 0.25019821524620056,
      "loss_layer_30_head": 0.14369754493236542,
      "loss_layer_36_head": 0.08864432573318481,
      "loss_layer_42_head": 0.04584623873233795,
      "loss_layer_6_head": 0.7413180470466614,
      "step": 7780
    },
    {
      "epoch": 99.648,
      "grad_norm": 0.04741216968578843,
      "learning_rate": 5.632697409496679e-08,
      "loss": 2.2301,
      "loss_layer_12_head": 0.5492891073226929,
      "loss_layer_18_head": 0.43764814734458923,
      "loss_layer_24_head": 0.261425256729126,
      "loss_layer_30_head": 0.1507972925901413,
      "loss_layer_36_head": 0.09241719543933868,
      "loss_layer_42_head": 0.04733694717288017,
      "loss_layer_6_head": 0.7782749533653259,
      "step": 7785
    },
    {
      "epoch": 99.712,
      "grad_norm": 0.046454260301209735,
      "learning_rate": 2.5034262935152986e-08,
      "loss": 2.1888,
      "loss_layer_12_head": 0.5254442095756531,
      "loss_layer_18_head": 0.41656598448753357,
      "loss_layer_24_head": 0.24667198956012726,
      "loss_layer_30_head": 0.14003470540046692,
      "loss_layer_36_head": 0.08578041195869446,
      "loss_layer_42_head": 0.04399532824754715,
      "loss_layer_6_head": 0.7477318048477173,
      "step": 7790
    },
    {
      "epoch": 99.776,
      "grad_norm": 0.04581963301706076,
      "learning_rate": 6.258573567730075e-09,
      "loss": 2.2447,
      "loss_layer_12_head": 0.5252737402915955,
      "loss_layer_18_head": 0.4174478054046631,
      "loss_layer_24_head": 0.24900317192077637,
      "loss_layer_30_head": 0.14163519442081451,
      "loss_layer_36_head": 0.08707299828529358,
      "loss_layer_42_head": 0.044729284942150116,
      "loss_layer_6_head": 0.7501606345176697,
      "step": 7795
    },
    {
      "epoch": 99.84,
      "grad_norm": 0.04605891665453617,
      "learning_rate": 0.0,
      "loss": 2.2007,
      "loss_layer_12_head": 0.5443611145019531,
      "loss_layer_18_head": 0.4316760003566742,
      "loss_layer_24_head": 0.2571260333061218,
      "loss_layer_30_head": 0.14590995013713837,
      "loss_layer_36_head": 0.08907999098300934,
      "loss_layer_42_head": 0.04544537514448166,
      "loss_layer_6_head": 0.7722325921058655,
      "step": 7800
    },
    {
      "epoch": 99.84,
      "eval_loss": 5.354715347290039,
      "eval_loss_layer_12_head": 1.2383829355239868,
      "eval_loss_layer_18_head": 1.0728706121444702,
      "eval_loss_layer_24_head": 0.6857050657272339,
      "eval_loss_layer_30_head": 0.44380584359169006,
      "eval_loss_layer_36_head": 0.28424447774887085,
      "eval_loss_layer_42_head": 0.16854825615882874,
      "eval_loss_layer_6_head": 1.5862910747528076,
      "eval_runtime": 33.0834,
      "eval_samples_per_second": 9.673,
      "eval_steps_per_second": 0.605,
      "step": 7800
    },
    {
      "epoch": 99.84,
      "step": 7800,
      "total_flos": 2.7045300749205504e+18,
      "train_loss": 2.9396800227042954,
      "train_runtime": 87094.8521,
      "train_samples_per_second": 11.482,
      "train_steps_per_second": 0.09
    }
  ],
  "logging_steps": 5,
  "max_steps": 7800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.7045300749205504e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
