MMR-DAPO / trainer_state.json
kangdawei's picture
Model save
d1ab61c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.22857142857142856,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_fraction": 0.0,
"completion_length": 2571.2083587646484,
"epoch": 0.001142857142857143,
"grad_norm": 0.22715197503566742,
"kl": 0.0,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 0.0,
"loss": 0.0941,
"reward": -0.05164351873099804,
"reward_after_mean": -0.05164351873099804,
"reward_after_std": 0.5470927599817514,
"reward_before_mean": 0.21363236638717353,
"reward_before_std": 0.541789973154664,
"reward_change_max": 0.0,
"reward_change_mean": -0.2652758788317442,
"reward_change_min": -0.4747793525457382,
"reward_change_std": 0.18003974109888077,
"reward_std": 0.5470927748829126,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": -0.015534311532974243,
"step": 1
},
{
"clip_fraction": 0.0,
"completion_length": 2804.395881652832,
"epoch": 0.002285714285714286,
"grad_norm": 0.21158379316329956,
"kl": 0.0,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5e-08,
"loss": 0.0288,
"reward": -0.10030801966786385,
"reward_after_mean": -0.10030801966786385,
"reward_after_std": 0.2960502114146948,
"reward_before_mean": 0.179365461692214,
"reward_before_std": 0.2432677550241351,
"reward_change_max": 0.0,
"reward_change_mean": -0.27967348881065845,
"reward_change_min": -0.40170327201485634,
"reward_change_std": 0.154046350158751,
"reward_std": 0.29605022072792053,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.04980122856795788,
"step": 2
},
{
"clip_fraction": 0.0,
"completion_length": 3309.8541717529297,
"epoch": 0.0034285714285714284,
"grad_norm": 0.16393998265266418,
"kl": 4.11495566368103e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1e-07,
"loss": -0.0221,
"reward": -0.3395198956131935,
"reward_after_mean": -0.3395198956131935,
"reward_after_std": 0.30841588601469994,
"reward_before_mean": -0.14859933033585548,
"reward_before_std": 0.2812267681583762,
"reward_change_max": 0.0,
"reward_change_mean": -0.19092058949172497,
"reward_change_min": -0.3236966449767351,
"reward_change_std": 0.11970376130193472,
"reward_std": 0.30841588601469994,
"rewards/accuracy_reward": 0.0625,
"rewards/cosine_scaled_reward": -0.2110993228852749,
"step": 3
},
{
"clip_fraction": 0.0,
"completion_length": 2138.9791717529297,
"epoch": 0.004571428571428572,
"grad_norm": 0.3178713321685791,
"kl": 3.808736801147461e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.5e-07,
"loss": 0.0416,
"reward": -0.010286737233400345,
"reward_after_mean": -0.010286737233400345,
"reward_after_std": 0.6836549900472164,
"reward_before_mean": 0.25457675755023956,
"reward_before_std": 0.7051120875403285,
"reward_change_max": 0.0,
"reward_change_mean": -0.26486349664628506,
"reward_change_min": -0.5560955684632063,
"reward_change_std": 0.2087760465219617,
"reward_std": 0.6836549993604422,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": 0.004576747305691242,
"step": 4
},
{
"clip_fraction": 0.0,
"completion_length": 3392.854217529297,
"epoch": 0.005714285714285714,
"grad_norm": 0.16369687020778656,
"kl": 4.531443119049072e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2e-07,
"loss": -0.0361,
"reward": -0.23667924478650093,
"reward_after_mean": -0.23667924478650093,
"reward_after_std": 0.3961083684116602,
"reward_before_mean": -0.015432212501764297,
"reward_before_std": 0.3830429194495082,
"reward_change_max": 0.0,
"reward_change_mean": -0.22124702110886574,
"reward_change_min": -0.3868277929723263,
"reward_change_std": 0.1524029728025198,
"reward_std": 0.39610837027430534,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.1404322199523449,
"step": 5
},
{
"clip_fraction": 0.0,
"completion_length": 2958.354217529297,
"epoch": 0.006857142857142857,
"grad_norm": 0.1846713125705719,
"kl": 4.254281520843506e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.5e-07,
"loss": 0.0131,
"reward": -0.188864734955132,
"reward_after_mean": -0.188864734955132,
"reward_after_std": 0.4999152459204197,
"reward_before_mean": 0.03220596443861723,
"reward_before_std": 0.4855938693508506,
"reward_change_max": 0.0,
"reward_change_mean": -0.2210706938058138,
"reward_change_min": -0.401586489751935,
"reward_change_std": 0.15394792892038822,
"reward_std": 0.4999152459204197,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.09279404580593109,
"step": 6
},
{
"clip_fraction": 0.0,
"completion_length": 3168.375030517578,
"epoch": 0.008,
"grad_norm": 0.14882907271385193,
"kl": 2.9325485229492188e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3e-07,
"loss": 0.0437,
"reward": -0.12195562478154898,
"reward_after_mean": -0.12195562478154898,
"reward_after_std": 0.5116294100880623,
"reward_before_mean": 0.11521910736337304,
"reward_before_std": 0.42447544634342194,
"reward_change_max": 0.0,
"reward_change_mean": -0.23717475309967995,
"reward_change_min": -0.3341871611773968,
"reward_change_std": 0.12419951800256968,
"reward_std": 0.5116294212639332,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.07228088192641735,
"step": 7
},
{
"clip_fraction": 0.0,
"completion_length": 2867.7708740234375,
"epoch": 0.009142857142857144,
"grad_norm": 0.18323290348052979,
"kl": 1.5871599316596985e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.5e-07,
"loss": -0.0588,
"reward": 0.05553785338997841,
"reward_after_mean": 0.05553785338997841,
"reward_after_std": 0.6334318313747644,
"reward_before_mean": 0.3454107344150543,
"reward_before_std": 0.5995671562850475,
"reward_change_max": 0.0,
"reward_change_mean": -0.28987287171185017,
"reward_change_min": -0.49663791060447693,
"reward_change_std": 0.19086131732910872,
"reward_std": 0.633431838825345,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.05374404788017273,
"step": 8
},
{
"clip_fraction": 0.0,
"completion_length": 3102.6250915527344,
"epoch": 0.010285714285714285,
"grad_norm": 0.19616039097309113,
"kl": 3.674440085887909e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4e-07,
"loss": -0.0234,
"reward": -0.16837336937896907,
"reward_after_mean": -0.16837336937896907,
"reward_after_std": 0.4944228585809469,
"reward_before_mean": 0.06263772025704384,
"reward_before_std": 0.4829963054507971,
"reward_change_max": 0.0,
"reward_change_mean": -0.23101108148694038,
"reward_change_min": -0.42128635197877884,
"reward_change_std": 0.15471239853650331,
"reward_std": 0.4944228623062372,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.08319561230018735,
"step": 9
},
{
"clip_fraction": 0.0,
"completion_length": 2681.9791717529297,
"epoch": 0.011428571428571429,
"grad_norm": 0.18401016294956207,
"kl": 3.248453140258789e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.5e-07,
"loss": -0.0059,
"reward": -0.11956031061708927,
"reward_after_mean": -0.11956031061708927,
"reward_after_std": 0.5619859658181667,
"reward_before_mean": 0.11641145590692759,
"reward_before_std": 0.5482429880648851,
"reward_change_max": 0.0,
"reward_change_mean": -0.23597176373004913,
"reward_change_min": -0.4013804756104946,
"reward_change_std": 0.15929853450506926,
"reward_std": 0.5619859807193279,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.0710885627195239,
"step": 10
},
{
"clip_fraction": 0.0,
"completion_length": 3318.6041870117188,
"epoch": 0.012571428571428572,
"grad_norm": 0.1448400914669037,
"kl": 3.1188130378723145e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5e-07,
"loss": 0.0127,
"reward": -0.2986925896257162,
"reward_after_mean": -0.2986925896257162,
"reward_after_std": 0.47236273624002934,
"reward_before_mean": -0.11717952135950327,
"reward_before_std": 0.43173919059336185,
"reward_change_max": 0.0,
"reward_change_mean": -0.18151307478547096,
"reward_change_min": -0.3272790275514126,
"reward_change_std": 0.11354650370776653,
"reward_std": 0.472362769767642,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.20051286462694407,
"step": 11
},
{
"clip_fraction": 0.0,
"completion_length": 2220.812545776367,
"epoch": 0.013714285714285714,
"grad_norm": 0.24356995522975922,
"kl": 4.044920206069946e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.5e-07,
"loss": -0.0058,
"reward": -0.1285993792116642,
"reward_after_mean": -0.1285993792116642,
"reward_after_std": 0.4768393710255623,
"reward_before_mean": 0.12047314643859863,
"reward_before_std": 0.4788372376933694,
"reward_change_max": 0.0,
"reward_change_mean": -0.24907256104052067,
"reward_change_min": -0.4614252410829067,
"reward_change_std": 0.1735817501321435,
"reward_std": 0.47683937288820744,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.08786016795784235,
"step": 12
},
{
"clip_fraction": 0.0,
"completion_length": 3018.354217529297,
"epoch": 0.014857142857142857,
"grad_norm": 0.2400161623954773,
"kl": 3.460049629211426e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6e-07,
"loss": 0.0728,
"reward": -0.10721326060593128,
"reward_after_mean": -0.10721326060593128,
"reward_after_std": 0.5508640371263027,
"reward_before_mean": 0.13581005320884287,
"reward_before_std": 0.5371037218719721,
"reward_change_max": 0.0,
"reward_change_mean": -0.2430233098566532,
"reward_change_min": -0.4148747492581606,
"reward_change_std": 0.16182870231568813,
"reward_std": 0.5508640389889479,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.05168995447456837,
"step": 13
},
{
"clip_fraction": 0.0,
"completion_length": 2948.187545776367,
"epoch": 0.016,
"grad_norm": 0.20983624458312988,
"kl": 2.816319465637207e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.5e-07,
"loss": -0.0293,
"reward": -0.1505213938653469,
"reward_after_mean": -0.1505213938653469,
"reward_after_std": 0.48780051805078983,
"reward_before_mean": 0.08621821040287614,
"reward_before_std": 0.4805648783221841,
"reward_change_max": 0.0,
"reward_change_mean": -0.2367396280169487,
"reward_change_min": -0.401450265198946,
"reward_change_std": 0.15510203130543232,
"reward_std": 0.48780052177608013,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.08044843003153801,
"step": 14
},
{
"clip_fraction": 0.0,
"completion_length": 2626.2291679382324,
"epoch": 0.017142857142857144,
"grad_norm": 0.2066204994916916,
"kl": 1.849886029958725e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7e-07,
"loss": -0.038,
"reward": -0.07881241291761398,
"reward_after_mean": -0.07881241291761398,
"reward_after_std": 0.3058329503983259,
"reward_before_mean": 0.20712529122829437,
"reward_before_std": 0.24552945792675018,
"reward_change_max": 0.0,
"reward_change_mean": -0.28593770414590836,
"reward_change_min": -0.4088184628635645,
"reward_change_std": 0.15766743291169405,
"reward_std": 0.3058329652994871,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.022041399031877518,
"step": 15
},
{
"clip_fraction": 0.0,
"completion_length": 3519.6458435058594,
"epoch": 0.018285714285714287,
"grad_norm": 0.16457392275333405,
"kl": 4.182755947113037e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.5e-07,
"loss": 0.002,
"reward": -0.3867794554680586,
"reward_after_mean": -0.3867794554680586,
"reward_after_std": 0.27169811353087425,
"reward_before_mean": -0.21112521784380078,
"reward_before_std": 0.21979969623498619,
"reward_change_max": 0.0,
"reward_change_mean": -0.17565423250198364,
"reward_change_min": -0.2535594701766968,
"reward_change_std": 0.0948435440659523,
"reward_std": 0.271698116324842,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2319585494697094,
"step": 16
},
{
"clip_fraction": 0.0,
"completion_length": 2438.541702270508,
"epoch": 0.019428571428571427,
"grad_norm": 0.2491314560174942,
"kl": 3.923475742340088e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8e-07,
"loss": 0.0261,
"reward": -0.09273135662078857,
"reward_after_mean": -0.09273135662078857,
"reward_after_std": 0.533635126426816,
"reward_before_mean": 0.16233503818511963,
"reward_before_std": 0.5344762653112411,
"reward_change_max": 0.0,
"reward_change_mean": -0.25506639294326305,
"reward_change_min": -0.48770347610116005,
"reward_change_std": 0.18604809325188398,
"reward_std": 0.5336351562291384,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.02516496740281582,
"step": 17
},
{
"clip_fraction": 0.0,
"completion_length": 2808.416732788086,
"epoch": 0.02057142857142857,
"grad_norm": 0.18383286893367767,
"kl": 2.2854655981063843e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0188,
"reward": -0.1808744166046381,
"reward_after_mean": -0.1808744166046381,
"reward_after_std": 0.4772877935320139,
"reward_before_mean": 0.04505238076671958,
"reward_before_std": 0.45604391396045685,
"reward_change_max": 0.0,
"reward_change_mean": -0.22592679969966412,
"reward_change_min": -0.3993493113666773,
"reward_change_std": 0.1486664516851306,
"reward_std": 0.4772877972573042,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.1216142950579524,
"step": 18
},
{
"clip_fraction": 0.0,
"completion_length": 2772.4166717529297,
"epoch": 0.021714285714285714,
"grad_norm": 0.18437886238098145,
"kl": 2.4475157260894775e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9e-07,
"loss": 0.0301,
"reward": 0.15320486947894096,
"reward_after_mean": 0.15320486947894096,
"reward_after_std": 0.6356023158878088,
"reward_before_mean": 0.4862675927579403,
"reward_before_std": 0.6554644731804729,
"reward_change_max": 0.0,
"reward_change_mean": -0.3330627214163542,
"reward_change_min": -0.6119302771985531,
"reward_change_std": 0.23800678364932537,
"reward_std": 0.6356023401021957,
"rewards/accuracy_reward": 0.3750000149011612,
"rewards/cosine_scaled_reward": 0.1112675853073597,
"step": 19
},
{
"clip_fraction": 0.0,
"completion_length": 2494.0209197998047,
"epoch": 0.022857142857142857,
"grad_norm": 0.2170894891023636,
"kl": 1.3802200555801392e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.499999999999999e-07,
"loss": -0.0054,
"reward": 0.16672552144154906,
"reward_after_mean": 0.16672552144154906,
"reward_after_std": 0.5955907795578241,
"reward_before_mean": 0.5104347411543131,
"reward_before_std": 0.6022562235593796,
"reward_change_max": 0.0,
"reward_change_mean": -0.3437092248350382,
"reward_change_min": -0.6036390513181686,
"reward_change_std": 0.24417879804968834,
"reward_std": 0.5955907888710499,
"rewards/accuracy_reward": 0.3958333395421505,
"rewards/cosine_scaled_reward": 0.11460138857364655,
"step": 20
},
{
"clip_fraction": 0.0,
"completion_length": 2875.125015258789,
"epoch": 0.024,
"grad_norm": 0.18273918330669403,
"kl": 4.053860902786255e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1e-06,
"loss": -0.0532,
"reward": -0.10901273542549461,
"reward_after_mean": -0.10901273542549461,
"reward_after_std": 0.47225300781428814,
"reward_before_mean": 0.14461494609713554,
"reward_before_std": 0.4534954270347953,
"reward_change_max": 0.0,
"reward_change_mean": -0.2536276988685131,
"reward_change_min": -0.41233821772038937,
"reward_change_std": 0.1598598938435316,
"reward_std": 0.47225301899015903,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.042885048780590296,
"step": 21
},
{
"clip_fraction": 0.0,
"completion_length": 1771.5000381469727,
"epoch": 0.025142857142857144,
"grad_norm": 0.31784588098526,
"kl": 2.108141779899597e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.99931462820376e-07,
"loss": 0.0008,
"reward": -0.08861424401402473,
"reward_after_mean": -0.08861424401402473,
"reward_after_std": 0.3277482558041811,
"reward_before_mean": 0.19050541147589684,
"reward_before_std": 0.27969441190361977,
"reward_change_max": 0.0,
"reward_change_mean": -0.2791196908801794,
"reward_change_min": -0.4190730433911085,
"reward_change_std": 0.15838530845940113,
"reward_std": 0.3277482632547617,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.03866123594343662,
"step": 22
},
{
"clip_fraction": 0.0,
"completion_length": 2396.791702270508,
"epoch": 0.026285714285714287,
"grad_norm": 0.18681152164936066,
"kl": 2.3262575268745422e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0211,
"reward": -0.23647188395261765,
"reward_after_mean": -0.23647188395261765,
"reward_after_std": 0.399081664159894,
"reward_before_mean": -0.021006003953516483,
"reward_before_std": 0.3748807581141591,
"reward_change_max": 0.0,
"reward_change_mean": -0.21546588093042374,
"reward_change_min": -0.38881424628198147,
"reward_change_std": 0.14373441133648157,
"reward_std": 0.39908168464899063,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.12517266999930143,
"step": 23
},
{
"clip_fraction": 0.0,
"completion_length": 2746.104202270508,
"epoch": 0.027428571428571427,
"grad_norm": 0.2844712734222412,
"kl": 1.5251338481903076e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.993832906395582e-07,
"loss": 0.0208,
"reward": 0.0761333703994751,
"reward_after_mean": 0.0761333703994751,
"reward_after_std": 0.6143560092896223,
"reward_before_mean": 0.3813359132036567,
"reward_before_std": 0.6187122687697411,
"reward_change_max": 0.0,
"reward_change_mean": -0.30520253628492355,
"reward_change_min": -0.5352295599877834,
"reward_change_std": 0.2117024352774024,
"reward_std": 0.6143560204654932,
"rewards/accuracy_reward": 0.31250000931322575,
"rewards/cosine_scaled_reward": 0.06883592065423727,
"step": 24
},
{
"clip_fraction": 0.0,
"completion_length": 2599.2708435058594,
"epoch": 0.02857142857142857,
"grad_norm": 0.2062840610742569,
"kl": 3.396719694137573e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0158,
"reward": -0.17240291833877563,
"reward_after_mean": -0.17240291833877563,
"reward_after_std": 0.5134213641285896,
"reward_before_mean": 0.05743051879107952,
"reward_before_std": 0.5324400179088116,
"reward_change_max": 0.0,
"reward_change_mean": -0.22983343712985516,
"reward_change_min": -0.439667459577322,
"reward_change_std": 0.1796932201832533,
"reward_std": 0.5134213827550411,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.1300694877281785,
"step": 25
},
{
"clip_fraction": 0.0,
"completion_length": 2904.1458740234375,
"epoch": 0.029714285714285714,
"grad_norm": 0.16000084578990936,
"kl": 2.2212974727153778e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.982876141412855e-07,
"loss": -0.0014,
"reward": -0.05851639807224274,
"reward_after_mean": -0.05851639807224274,
"reward_after_std": 0.44963656924664974,
"reward_before_mean": 0.2166665024124086,
"reward_before_std": 0.4241899009793997,
"reward_change_max": 0.0,
"reward_change_mean": -0.2751829121261835,
"reward_change_min": -0.41147186793386936,
"reward_change_std": 0.16059752739965916,
"reward_std": 0.44963658042252064,
"rewards/accuracy_reward": 0.22916667722165585,
"rewards/cosine_scaled_reward": -0.012500176206231117,
"step": 26
},
{
"clip_fraction": 0.0,
"completion_length": 2987.2083740234375,
"epoch": 0.030857142857142857,
"grad_norm": 0.17558637261390686,
"kl": 1.9339844584465027e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.975348529157229e-07,
"loss": -0.0073,
"reward": -0.2276617707684636,
"reward_after_mean": -0.2276617707684636,
"reward_after_std": 0.49568176455795765,
"reward_before_mean": -0.021468112245202065,
"reward_before_std": 0.4707259628921747,
"reward_change_max": 0.0,
"reward_change_mean": -0.20619365386664867,
"reward_change_min": -0.3916598744690418,
"reward_change_std": 0.1389320297166705,
"reward_std": 0.49568178318440914,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.12563478108495474,
"step": 27
},
{
"clip_fraction": 0.0,
"completion_length": 2722.8958740234375,
"epoch": 0.032,
"grad_norm": 0.2014327049255371,
"kl": 2.8021633625030518e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.96645768238595e-07,
"loss": 0.0061,
"reward": 0.040278345346450806,
"reward_after_mean": 0.040278345346450806,
"reward_after_std": 0.41374246776103973,
"reward_before_mean": 0.35379676637239754,
"reward_before_std": 0.34872716292738914,
"reward_change_max": 0.0,
"reward_change_mean": -0.3135183919221163,
"reward_change_min": -0.45590174198150635,
"reward_change_std": 0.1750109540298581,
"reward_std": 0.4137424696236849,
"rewards/accuracy_reward": 0.35416666977107525,
"rewards/cosine_scaled_reward": -0.00036993250250816345,
"step": 28
},
{
"clip_fraction": 0.0,
"completion_length": 3334.1458740234375,
"epoch": 0.03314285714285714,
"grad_norm": 0.1652597337961197,
"kl": 4.1700899600982666e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0112,
"reward": -0.39186385460197926,
"reward_after_mean": -0.39186385460197926,
"reward_after_std": 0.31510886177420616,
"reward_before_mean": -0.2214842550456524,
"reward_before_std": 0.2841310743242502,
"reward_change_max": 0.0,
"reward_change_mean": -0.17037959955632687,
"reward_change_min": -0.3085148986428976,
"reward_change_std": 0.10925670620054007,
"reward_std": 0.3151088785380125,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2631509155035019,
"step": 29
},
{
"clip_fraction": 0.0,
"completion_length": 2979.2084045410156,
"epoch": 0.03428571428571429,
"grad_norm": 0.23167316615581512,
"kl": 2.894178032875061e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.944597532678119e-07,
"loss": 0.0743,
"reward": 0.06503391482692678,
"reward_after_mean": 0.06503391482692678,
"reward_after_std": 0.6253597214818001,
"reward_before_mean": 0.36115007381886244,
"reward_before_std": 0.6058835834264755,
"reward_change_max": 0.0,
"reward_change_mean": -0.2961161620914936,
"reward_change_min": -0.4820784516632557,
"reward_change_std": 0.18900468945503235,
"reward_std": 0.6253597438335419,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/cosine_scaled_reward": 0.06948341536917724,
"step": 30
},
{
"clip_fraction": 0.0,
"completion_length": 2974.250015258789,
"epoch": 0.03542857142857143,
"grad_norm": 0.259880930185318,
"kl": 2.1339859813451767e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.931634888554935e-07,
"loss": -0.1239,
"reward": -0.17607227340340614,
"reward_after_mean": -0.17607227340340614,
"reward_after_std": 0.49117584340274334,
"reward_before_mean": 0.05198503099381924,
"reward_before_std": 0.4788073133677244,
"reward_change_max": 0.0,
"reward_change_mean": -0.22805731743574142,
"reward_change_min": -0.39622381143271923,
"reward_change_std": 0.1568627143278718,
"reward_std": 0.4911758713424206,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.09384830202907324,
"step": 31
},
{
"clip_fraction": 0.0,
"completion_length": 3248.7500610351562,
"epoch": 0.036571428571428574,
"grad_norm": 0.17636683583259583,
"kl": 4.393048584461212e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.917322325514487e-07,
"loss": -0.005,
"reward": -0.042430607602000237,
"reward_after_mean": -0.042430607602000237,
"reward_after_std": 0.40656069852411747,
"reward_before_mean": 0.24141232948750257,
"reward_before_std": 0.3450923506170511,
"reward_change_max": 0.0,
"reward_change_mean": -0.28384293988347054,
"reward_change_min": -0.4027772378176451,
"reward_change_std": 0.15636454056948423,
"reward_std": 0.40656072460114956,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": -0.008587680757045746,
"step": 32
},
{
"clip_fraction": 0.0,
"completion_length": 3381.8333740234375,
"epoch": 0.037714285714285714,
"grad_norm": 0.14724509418010712,
"kl": 4.9639493227005005e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.901664203302124e-07,
"loss": -0.0222,
"reward": -0.16744333039969206,
"reward_after_mean": -0.16744333039969206,
"reward_after_std": 0.5987622756510973,
"reward_before_mean": 0.04793749377131462,
"reward_before_std": 0.5893303733319044,
"reward_change_max": 0.0,
"reward_change_mean": -0.21538082137703896,
"reward_change_min": -0.4106915630400181,
"reward_change_std": 0.1567707946524024,
"reward_std": 0.5987622793763876,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.0978958396390226,
"step": 33
},
{
"clip_fraction": 0.0,
"completion_length": 2492.5208740234375,
"epoch": 0.038857142857142854,
"grad_norm": 0.2158302515745163,
"kl": 0.00010519847273826599,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.88466529153356e-07,
"loss": 0.0103,
"reward": 0.2191654071211815,
"reward_after_mean": 0.2191654071211815,
"reward_after_std": 0.7464860528707504,
"reward_before_mean": 0.5621518101543188,
"reward_before_std": 0.7706933673471212,
"reward_change_max": 0.0,
"reward_change_mean": -0.342986399307847,
"reward_change_min": -0.6137207373976707,
"reward_change_std": 0.249726596288383,
"reward_std": 0.7464860621839762,
"rewards/accuracy_reward": 0.39583334140479565,
"rewards/cosine_scaled_reward": 0.16631846246309578,
"step": 34
},
{
"clip_fraction": 0.0,
"completion_length": 3080.25004196167,
"epoch": 0.04,
"grad_norm": 0.19586136937141418,
"kl": 4.8315152525901794e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0329,
"reward": -0.11165531259030104,
"reward_after_mean": -0.11165531259030104,
"reward_after_std": 0.6846011225134134,
"reward_before_mean": 0.11056250985711813,
"reward_before_std": 0.662305686622858,
"reward_change_max": 0.0,
"reward_change_mean": -0.22221781872212887,
"reward_change_min": -0.41959609277546406,
"reward_change_std": 0.15651744045317173,
"reward_std": 0.6846011485904455,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.05610416317358613,
"step": 35
},
{
"clip_fraction": 0.0,
"completion_length": 3382.4583435058594,
"epoch": 0.04114285714285714,
"grad_norm": 0.1657496988773346,
"kl": 6.525218486785889e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.846666218300807e-07,
"loss": 0.0225,
"reward": -0.3704317416995764,
"reward_after_mean": -0.3704317416995764,
"reward_after_std": 0.31464754045009613,
"reward_before_mean": -0.19473244110122323,
"reward_before_std": 0.2709705028682947,
"reward_change_max": 0.0,
"reward_change_mean": -0.17569930106401443,
"reward_change_min": -0.2673074584454298,
"reward_change_std": 0.09599933121353388,
"reward_std": 0.3146475479006767,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.215565774589777,
"step": 36
},
{
"clip_fraction": 0.0,
"completion_length": 3370.0833740234375,
"epoch": 0.04228571428571429,
"grad_norm": 0.15995879471302032,
"kl": 2.010539174079895e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0199,
"reward": -0.4515352062880993,
"reward_after_mean": -0.4515352062880993,
"reward_after_std": 0.21112178452312946,
"reward_before_mean": -0.28895667381584644,
"reward_before_std": 0.17021457478404045,
"reward_change_max": 0.0,
"reward_change_mean": -0.162578534334898,
"reward_change_min": -0.2480195052921772,
"reward_change_std": 0.08829918596893549,
"reward_std": 0.2111217901110649,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.28895667754113674,
"step": 37
},
{
"clip_fraction": 0.0,
"completion_length": 3305.812530517578,
"epoch": 0.04342857142857143,
"grad_norm": 0.18195205926895142,
"kl": 4.641711711883545e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.80337140183366e-07,
"loss": 0.0274,
"reward": -0.2661134898662567,
"reward_after_mean": -0.2661134898662567,
"reward_after_std": 0.26150001399219036,
"reward_before_mean": -0.042573800310492516,
"reward_before_std": 0.2240951219573617,
"reward_change_max": 0.0,
"reward_change_mean": -0.22353969514369965,
"reward_change_min": -0.33214454911649227,
"reward_change_std": 0.12396880332380533,
"reward_std": 0.26150002889335155,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.14674047753214836,
"step": 38
},
{
"clip_fraction": 0.0,
"completion_length": 2746.2500381469727,
"epoch": 0.044571428571428574,
"grad_norm": 0.20554743707180023,
"kl": 5.049258470535278e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.779754323328192e-07,
"loss": -0.0103,
"reward": -0.08482712507247925,
"reward_after_mean": -0.08482712507247925,
"reward_after_std": 0.2759235240519047,
"reward_before_mean": 0.1990803610533476,
"reward_before_std": 0.17641383409500122,
"reward_change_max": 0.0,
"reward_change_mean": -0.2839074842631817,
"reward_change_min": -0.394864222034812,
"reward_change_std": 0.14654383901506662,
"reward_std": 0.2759235333651304,
"rewards/accuracy_reward": 0.25,
"rewards/cosine_scaled_reward": -0.0509196612983942,
"step": 39
},
{
"clip_fraction": 0.0,
"completion_length": 2545.0833740234375,
"epoch": 0.045714285714285714,
"grad_norm": 0.177109956741333,
"kl": 0.0001279488205909729,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.754833590196926e-07,
"loss": 0.0214,
"reward": -0.12675099074840546,
"reward_after_mean": -0.12675099074840546,
"reward_after_std": 0.42964968644082546,
"reward_before_mean": 0.1206003911793232,
"reward_before_std": 0.3606878248974681,
"reward_change_max": 0.0,
"reward_change_mean": -0.24735137075185776,
"reward_change_min": -0.39622214064002037,
"reward_change_std": 0.1445015063509345,
"reward_std": 0.4296496883034706,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.0668996311724186,
"step": 40
},
{
"clip_fraction": 0.0,
"completion_length": 3163.7709045410156,
"epoch": 0.046857142857142854,
"grad_norm": 0.16450874507427216,
"kl": 7.349532097578049e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0068,
"reward": -0.27329863607883453,
"reward_after_mean": -0.27329863607883453,
"reward_after_std": 0.382920210249722,
"reward_before_mean": -0.06760533340275288,
"reward_before_std": 0.36998474691063166,
"reward_change_max": 0.0,
"reward_change_mean": -0.2056933008134365,
"reward_change_min": -0.3910033367574215,
"reward_change_std": 0.13763669785112143,
"reward_std": 0.38292021211236715,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.15093865152448416,
"step": 41
},
{
"clip_fraction": 0.0,
"completion_length": 2923.2708492279053,
"epoch": 0.048,
"grad_norm": 0.22062529623508453,
"kl": 3.405660390853882e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.701111919237408e-07,
"loss": 0.0386,
"reward": -0.43281109537929296,
"reward_after_mean": -0.43281109537929296,
"reward_after_std": 0.29586860351264477,
"reward_before_mean": -0.2786172227934003,
"reward_before_std": 0.23877727705985308,
"reward_change_max": 0.0,
"reward_change_mean": -0.15419389307498932,
"reward_change_min": -0.2224500197917223,
"reward_change_std": 0.0808606967329979,
"reward_std": 0.2958686240017414,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2994505534879863,
"step": 42
},
{
"clip_fraction": 0.0,
"completion_length": 3085.625015258789,
"epoch": 0.04914285714285714,
"grad_norm": 0.18678560853004456,
"kl": 3.5848468542099e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0173,
"reward": -0.19783002510666847,
"reward_after_mean": -0.19783002510666847,
"reward_after_std": 0.37531200610101223,
"reward_before_mean": 0.03791181556880474,
"reward_before_std": 0.3623821344226599,
"reward_change_max": 0.0,
"reward_change_mean": -0.23574184253811836,
"reward_change_min": -0.40054079331457615,
"reward_change_std": 0.15123076178133488,
"reward_std": 0.375312015414238,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.1079215258359909,
"step": 43
},
{
"clip_fraction": 0.0,
"completion_length": 2848.187530517578,
"epoch": 0.05028571428571429,
"grad_norm": 0.23331570625305176,
"kl": 0.00020481832325458527,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.64227184053598e-07,
"loss": 0.0656,
"reward": -0.09341166913509369,
"reward_after_mean": -0.09341166913509369,
"reward_after_std": 0.504422040656209,
"reward_before_mean": 0.16584731824696064,
"reward_before_std": 0.5199198350310326,
"reward_change_max": 0.0,
"reward_change_mean": -0.2592589948326349,
"reward_change_min": -0.47888655215501785,
"reward_change_std": 0.18998684082180262,
"reward_std": 0.5044220443814993,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.021652692928910255,
"step": 44
},
{
"clip_fraction": 0.0,
"completion_length": 3421.7083740234375,
"epoch": 0.05142857142857143,
"grad_norm": 0.1464652270078659,
"kl": 3.8933008909225464e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.610954559391704e-07,
"loss": -0.0099,
"reward": -0.18143506534397602,
"reward_after_mean": -0.18143506534397602,
"reward_after_std": 0.48556733317673206,
"reward_before_mean": 0.0428056214004755,
"reward_before_std": 0.4591532591730356,
"reward_change_max": 0.0,
"reward_change_mean": -0.22424068115651608,
"reward_change_min": -0.3489771634340286,
"reward_change_std": 0.1350772501900792,
"reward_std": 0.4855673424899578,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.10302771534770727,
"step": 45
},
{
"clip_fraction": 0.0,
"completion_length": 3407.2083435058594,
"epoch": 0.052571428571428575,
"grad_norm": 0.1929873824119568,
"kl": 0.0002593100070953369,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.578385041664925e-07,
"loss": 0.0301,
"reward": -0.3940184600651264,
"reward_after_mean": -0.3940184600651264,
"reward_after_std": 0.27828204818069935,
"reward_before_mean": -0.22154150530695915,
"reward_before_std": 0.22950039338320494,
"reward_change_max": 0.0,
"reward_change_mean": -0.17247696220874786,
"reward_change_min": -0.2641389053314924,
"reward_change_std": 0.09335798770189285,
"reward_std": 0.27828205190598965,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.24237483832985163,
"step": 46
},
{
"clip_fraction": 0.0,
"completion_length": 2974.5000762939453,
"epoch": 0.053714285714285714,
"grad_norm": 0.26433587074279785,
"kl": 0.00010039284825325012,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.54457320834625e-07,
"loss": 0.1271,
"reward": 0.045076385140419006,
"reward_after_mean": 0.045076385140419006,
"reward_after_std": 0.562898326665163,
"reward_before_mean": 0.3511660899966955,
"reward_before_std": 0.5953826615586877,
"reward_change_max": 0.0,
"reward_change_mean": -0.3060897272080183,
"reward_change_min": -0.5509906392544508,
"reward_change_std": 0.22344755101948977,
"reward_std": 0.5628983462229371,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": 0.05949941836297512,
"step": 47
},
{
"clip_fraction": 0.0,
"completion_length": 2807.687530517578,
"epoch": 0.054857142857142854,
"grad_norm": 0.25756871700286865,
"kl": 0.000245068222284317,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.509529358847654e-07,
"loss": 0.0432,
"reward": -0.17351538315415382,
"reward_after_mean": -0.17351538315415382,
"reward_after_std": 0.39601418375968933,
"reward_before_mean": 0.06836471986025572,
"reward_before_std": 0.3816844457760453,
"reward_change_max": 0.0,
"reward_change_mean": -0.24188010767102242,
"reward_change_min": -0.41022371128201485,
"reward_change_std": 0.15736299566924572,
"reward_std": 0.3960142061114311,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.09830194525420666,
"step": 48
},
{
"clip_fraction": 0.0,
"completion_length": 2470.6875762939453,
"epoch": 0.056,
"grad_norm": 0.2077207863330841,
"kl": 0.00011547654867172241,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0475,
"reward": -0.004357508383691311,
"reward_after_mean": -0.004357508383691311,
"reward_after_std": 0.660737868398428,
"reward_before_mean": 0.2607364095747471,
"reward_before_std": 0.6347355041652918,
"reward_change_max": 0.0,
"reward_change_mean": -0.2650939039885998,
"reward_change_min": -0.4125436320900917,
"reward_change_std": 0.16354067996144295,
"reward_std": 0.6607378907501698,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": 0.01073638815432787,
"step": 49
},
{
"clip_fraction": 0.0,
"completion_length": 3005.458366394043,
"epoch": 0.05714285714285714,
"grad_norm": 0.14878875017166138,
"kl": 0.0001731663942337036,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.43578868212728e-07,
"loss": 0.0165,
"reward": -0.014921462163329124,
"reward_after_mean": -0.014921462163329124,
"reward_after_std": 0.35436189733445644,
"reward_before_mean": 0.2836841717362404,
"reward_before_std": 0.25718711968511343,
"reward_change_max": 0.0,
"reward_change_mean": -0.2986056488007307,
"reward_change_min": -0.41532920859754086,
"reward_change_std": 0.15457267686724663,
"reward_std": 0.35436189733445644,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": 0.01285084243863821,
"step": 50
},
{
"clip_fraction": 0.0,
"completion_length": 2303.520851135254,
"epoch": 0.05828571428571429,
"grad_norm": 0.23037534952163696,
"kl": 0.0005000531673431396,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0274,
"reward": -0.23243643390014768,
"reward_after_mean": -0.23243643390014768,
"reward_after_std": 0.4816542100161314,
"reward_before_mean": -0.028105121105909348,
"reward_before_std": 0.4414081573486328,
"reward_change_max": 0.0,
"reward_change_mean": -0.2043313141912222,
"reward_change_min": -0.3245617300271988,
"reward_change_std": 0.1267829779535532,
"reward_std": 0.4816542211920023,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.13227178994566202,
"step": 51
},
{
"clip_fraction": 0.0,
"completion_length": 2936.5417289733887,
"epoch": 0.05942857142857143,
"grad_norm": 0.21160194277763367,
"kl": 0.0004085935652256012,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.357252853159505e-07,
"loss": 0.0235,
"reward": -0.22832631319761276,
"reward_after_mean": -0.22832631319761276,
"reward_after_std": 0.596066826954484,
"reward_before_mean": -0.03995312686311081,
"reward_before_std": 0.555262666195631,
"reward_change_max": 0.0,
"reward_change_mean": -0.18837318196892738,
"reward_change_min": -0.3354088496416807,
"reward_change_std": 0.12037093937397003,
"reward_std": 0.5960668455809355,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.14411979634314775,
"step": 52
},
{
"clip_fraction": 0.0,
"completion_length": 2943.395866394043,
"epoch": 0.060571428571428575,
"grad_norm": 0.19955258071422577,
"kl": 0.0003105923533439636,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0486,
"reward": 0.046732327435165644,
"reward_after_mean": 0.046732327435165644,
"reward_after_std": 0.572412297129631,
"reward_before_mean": 0.3493400067090988,
"reward_before_std": 0.581750338897109,
"reward_change_max": 0.0,
"reward_change_mean": -0.30260770581662655,
"reward_change_min": -0.48020620830357075,
"reward_change_std": 0.20280153769999743,
"reward_std": 0.5724122989922762,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/cosine_scaled_reward": 0.05767334741540253,
"step": 53
},
{
"clip_fraction": 0.0,
"completion_length": 2875.0208740234375,
"epoch": 0.061714285714285715,
"grad_norm": 0.16428104043006897,
"kl": 8.340924978256226e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.274017555754407e-07,
"loss": 0.0482,
"reward": 0.17428431287407875,
"reward_after_mean": 0.17428431287407875,
"reward_after_std": 0.5909074582159519,
"reward_before_mean": 0.5155755765736103,
"reward_before_std": 0.5545622715726495,
"reward_change_max": 0.0,
"reward_change_mean": -0.34129126369953156,
"reward_change_min": -0.6125206407159567,
"reward_change_std": 0.22563873324543238,
"reward_std": 0.590907484292984,
"rewards/accuracy_reward": 0.3750000074505806,
"rewards/cosine_scaled_reward": 0.1405755653977394,
"step": 54
},
{
"clip_fraction": 0.0,
"completion_length": 3093.875030517578,
"epoch": 0.06285714285714286,
"grad_norm": 0.17089791595935822,
"kl": 0.0005162432789802551,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0094,
"reward": -0.11113542690873146,
"reward_after_mean": -0.11113542690873146,
"reward_after_std": 0.3386560436338186,
"reward_before_mean": 0.15820150449872017,
"reward_before_std": 0.29461055248975754,
"reward_change_max": 0.0,
"reward_change_mean": -0.26933695189654827,
"reward_change_min": -0.4236486293375492,
"reward_change_std": 0.15788063406944275,
"reward_std": 0.3386560510843992,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.029298486188054085,
"step": 55
},
{
"clip_fraction": 0.0,
"completion_length": 2877.291717529297,
"epoch": 0.064,
"grad_norm": 0.20696526765823364,
"kl": 0.00024941563606262207,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.186184199300463e-07,
"loss": 0.0124,
"reward": -0.11615562066435814,
"reward_after_mean": -0.11615562066435814,
"reward_after_std": 0.3605691157281399,
"reward_before_mean": 0.15260330587625504,
"reward_before_std": 0.3486750479787588,
"reward_change_max": 0.0,
"reward_change_mean": -0.2687589228153229,
"reward_change_min": -0.3955491781234741,
"reward_change_std": 0.16233802400529385,
"reward_std": 0.36056913062930107,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/cosine_scaled_reward": -0.055730052292346954,
"step": 56
},
{
"clip_fraction": 0.0,
"completion_length": 3307.875,
"epoch": 0.06514285714285714,
"grad_norm": 0.13051925599575043,
"kl": 0.00013627856969833374,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0235,
"reward": -0.2993467375636101,
"reward_after_mean": -0.2993467375636101,
"reward_after_std": 0.4061383567750454,
"reward_before_mean": -0.10633787885308266,
"reward_before_std": 0.3765461528673768,
"reward_change_max": 0.0,
"reward_change_mean": -0.19300885125994682,
"reward_change_min": -0.3217271789908409,
"reward_change_std": 0.12175194267183542,
"reward_std": 0.40613835863769054,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.1896712128072977,
"step": 57
},
{
"clip_fraction": 0.0,
"completion_length": 2274.1667251586914,
"epoch": 0.06628571428571428,
"grad_norm": 0.3047843277454376,
"kl": 0.0021878480911254883,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.093859795212817e-07,
"loss": 0.147,
"reward": 0.014255084563046694,
"reward_after_mean": 0.014255084563046694,
"reward_after_std": 0.49078281223773956,
"reward_before_mean": 0.30834553577005863,
"reward_before_std": 0.4538161437958479,
"reward_change_max": 0.0,
"reward_change_mean": -0.2940904349088669,
"reward_change_min": -0.4698449205607176,
"reward_change_std": 0.18156961910426617,
"reward_std": 0.4907828290015459,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": 0.016678830608725548,
"step": 58
},
{
"clip_fraction": 0.0,
"completion_length": 2838.6041870117188,
"epoch": 0.06742857142857143,
"grad_norm": 0.15764220058918,
"kl": 0.0009508654475212097,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.046048391230247e-07,
"loss": 0.05,
"reward": -0.15274246037006378,
"reward_after_mean": -0.15274246037006378,
"reward_after_std": 0.37365361116826534,
"reward_before_mean": 0.1003800667822361,
"reward_before_std": 0.3644466046243906,
"reward_change_max": 0.0,
"reward_change_mean": -0.2531225271522999,
"reward_change_min": -0.41390063986182213,
"reward_change_std": 0.16076032724231482,
"reward_std": 0.3736536279320717,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.0662866085767746,
"step": 59
},
{
"clip_fraction": 0.0,
"completion_length": 3020.4375534057617,
"epoch": 0.06857142857142857,
"grad_norm": 0.17373518645763397,
"kl": 0.00023894011974334717,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.997156826556369e-07,
"loss": -0.0036,
"reward": -0.28059580083936453,
"reward_after_mean": -0.28059580083936453,
"reward_after_std": 0.38212408497929573,
"reward_before_mean": -0.07820725813508034,
"reward_before_std": 0.35481286235153675,
"reward_change_max": 0.0,
"reward_change_mean": -0.20238853991031647,
"reward_change_min": -0.3260616082698107,
"reward_change_std": 0.12590447254478931,
"reward_std": 0.3821241036057472,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.16154058929532766,
"step": 60
},
{
"clip_fraction": 0.0,
"completion_length": 3286.6250610351562,
"epoch": 0.06971428571428571,
"grad_norm": 0.1478388011455536,
"kl": 0.00047288229689002037,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.9471999940354e-07,
"loss": -0.0065,
"reward": -0.1639209073036909,
"reward_after_mean": -0.1639209073036909,
"reward_after_std": 0.5073207020759583,
"reward_before_mean": 0.07017608173191547,
"reward_before_std": 0.5215496774762869,
"reward_change_max": 0.0,
"reward_change_mean": -0.23409699089825153,
"reward_change_min": -0.45022542029619217,
"reward_change_std": 0.17784226965159178,
"reward_std": 0.5073207188397646,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.13815725967288017,
"step": 61
},
{
"clip_fraction": 0.0,
"completion_length": 2659.3959045410156,
"epoch": 0.07085714285714285,
"grad_norm": 0.15675584971904755,
"kl": 0.002586759626865387,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.896193111002475e-07,
"loss": -0.0003,
"reward": 0.0647954298183322,
"reward_after_mean": 0.0647954298183322,
"reward_after_std": 0.7668796423822641,
"reward_before_mean": 0.3486335091292858,
"reward_before_std": 0.797910291235894,
"reward_change_max": 0.0,
"reward_change_mean": -0.28383809328079224,
"reward_change_min": -0.6051791943609715,
"reward_change_std": 0.23171952739357948,
"reward_std": 0.7668796591460705,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/cosine_scaled_reward": 0.03613350400701165,
"step": 62
},
{
"clip_fraction": 0.0,
"completion_length": 2394.916717529297,
"epoch": 0.072,
"grad_norm": 0.2152957320213318,
"kl": 0.0014880448579788208,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0886,
"reward": 0.14790401607751846,
"reward_after_mean": 0.14790401607751846,
"reward_after_std": 0.5827827583998442,
"reward_before_mean": 0.4811950605362654,
"reward_before_std": 0.5497365463525057,
"reward_change_max": 0.0,
"reward_change_mean": -0.3332910742610693,
"reward_change_min": -0.560102928429842,
"reward_change_std": 0.21325735840946436,
"reward_std": 0.5827827733010054,
"rewards/accuracy_reward": 0.35416667349636555,
"rewards/cosine_scaled_reward": 0.12702841963618994,
"step": 63
},
{
"clip_fraction": 0.0,
"completion_length": 3006.4791870117188,
"epoch": 0.07314285714285715,
"grad_norm": 0.1582987755537033,
"kl": 0.001294851303100586,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.791091657286267e-07,
"loss": 0.048,
"reward": -0.125318787060678,
"reward_after_mean": -0.125318787060678,
"reward_after_std": 0.5649810526520014,
"reward_before_mean": 0.10997231677174568,
"reward_before_std": 0.557425320148468,
"reward_change_max": 0.0,
"reward_change_mean": -0.23529109917581081,
"reward_change_min": -0.4151240587234497,
"reward_change_std": 0.16171532310545444,
"reward_std": 0.5649810750037432,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.05669435299932957,
"step": 64
},
{
"clip_fraction": 0.0,
"completion_length": 2798.354202270508,
"epoch": 0.07428571428571429,
"grad_norm": 0.2439664602279663,
"kl": 0.002039670944213867,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.737029101523929e-07,
"loss": -0.0404,
"reward": -0.1746473447419703,
"reward_after_mean": -0.1746473447419703,
"reward_after_std": 0.42232158221304417,
"reward_before_mean": 0.05733271440840326,
"reward_before_std": 0.3576160566881299,
"reward_change_max": 0.0,
"reward_change_mean": -0.2319800667464733,
"reward_change_min": -0.3824646957218647,
"reward_change_std": 0.13665939681231976,
"reward_std": 0.4223215878009796,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.13016729429364204,
"step": 65
},
{
"clip_fraction": 0.0,
"completion_length": 2135.3541679382324,
"epoch": 0.07542857142857143,
"grad_norm": 0.23610518872737885,
"kl": 0.0015348196029663086,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.681980515339463e-07,
"loss": -0.052,
"reward": -0.01672324660466984,
"reward_after_mean": -0.01672324660466984,
"reward_after_std": 0.5297550391405821,
"reward_before_mean": 0.26884749345481396,
"reward_before_std": 0.5452183573506773,
"reward_change_max": 0.0,
"reward_change_mean": -0.285570727661252,
"reward_change_min": -0.5179183334112167,
"reward_change_std": 0.2046450274065137,
"reward_std": 0.5297550512477756,
"rewards/accuracy_reward": 0.2916666753590107,
"rewards/cosine_scaled_reward": -0.02281918376684189,
"step": 66
},
{
"clip_fraction": 0.0,
"completion_length": 3525.2291870117188,
"epoch": 0.07657142857142857,
"grad_norm": 0.13342344760894775,
"kl": 0.0009595267474651337,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.625962667065487e-07,
"loss": -0.0164,
"reward": -0.4475428834557533,
"reward_after_mean": -0.4475428834557533,
"reward_after_std": 0.19988763332366943,
"reward_before_mean": -0.28256511874496937,
"reward_before_std": 0.15135450195521116,
"reward_change_max": 0.0,
"reward_change_mean": -0.16497775353491306,
"reward_change_min": -0.24302313476800919,
"reward_change_std": 0.0873506860807538,
"reward_std": 0.19988763891160488,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2825651131570339,
"step": 67
},
{
"clip_fraction": 0.0,
"completion_length": 2284.6042098999023,
"epoch": 0.07771428571428571,
"grad_norm": 0.33302831649780273,
"kl": 0.005011081695556641,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.568992620281243e-07,
"loss": 0.095,
"reward": 0.04241333995014429,
"reward_after_mean": 0.04241333995014429,
"reward_after_std": 0.7362328171730042,
"reward_before_mean": 0.3158540027216077,
"reward_before_std": 0.7369763031601906,
"reward_change_max": 0.0,
"reward_change_mean": -0.2734406590461731,
"reward_change_min": -0.4872976951301098,
"reward_change_std": 0.19121473841369152,
"reward_std": 0.7362328246235847,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/cosine_scaled_reward": 0.02418732549995184,
"step": 68
},
{
"clip_fraction": 0.0,
"completion_length": 2774.187545776367,
"epoch": 0.07885714285714286,
"grad_norm": 0.21714705228805542,
"kl": 0.003237783908843994,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.511087728614862e-07,
"loss": -0.0021,
"reward": -0.3341971240006387,
"reward_after_mean": -0.3341971240006387,
"reward_after_std": 0.476218955591321,
"reward_before_mean": -0.17050418560393155,
"reward_before_std": 0.4226016802713275,
"reward_change_max": 0.0,
"reward_change_mean": -0.16369293443858624,
"reward_change_min": -0.24435340613126755,
"reward_change_std": 0.08748195692896843,
"reward_std": 0.4762189742177725,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.2330041821114719,
"step": 69
},
{
"clip_fraction": 0.0,
"completion_length": 3101.041702270508,
"epoch": 0.08,
"grad_norm": 0.17803345620632172,
"kl": 0.0014088600873947144,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.452265630457282e-07,
"loss": 0.0245,
"reward": -0.23275760933756828,
"reward_after_mean": -0.23275760933756828,
"reward_after_std": 0.4238963555544615,
"reward_before_mean": -0.015425082296133041,
"reward_before_std": 0.4194338507950306,
"reward_change_max": 0.0,
"reward_change_mean": -0.2173325251787901,
"reward_change_min": -0.4142337366938591,
"reward_change_std": 0.1531017581000924,
"reward_std": 0.4238963592797518,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.11959175206720829,
"step": 70
},
{
"clip_fraction": 0.0,
"completion_length": 2711.166679382324,
"epoch": 0.08114285714285714,
"grad_norm": 0.2796599268913269,
"kl": 0.001299545168876648,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.392544243589427e-07,
"loss": -0.0292,
"reward": -0.10017471015453339,
"reward_after_mean": -0.10017471015453339,
"reward_after_std": 0.4897888842970133,
"reward_before_mean": 0.15967663563787937,
"reward_before_std": 0.4954986907541752,
"reward_change_max": 0.0,
"reward_change_mean": -0.25985134579241276,
"reward_change_min": -0.4646740537136793,
"reward_change_std": 0.18658488430082798,
"reward_std": 0.48978889361023903,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.02782335691154003,
"step": 71
},
{
"clip_fraction": 0.0,
"completion_length": 3315.937530517578,
"epoch": 0.08228571428571428,
"grad_norm": 0.17189651727676392,
"kl": 0.0025037527084350586,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.331941759724268e-07,
"loss": 0.0035,
"reward": -0.321637025102973,
"reward_after_mean": -0.321637025102973,
"reward_after_std": 0.36918958090245724,
"reward_before_mean": -0.13817780697718263,
"reward_before_std": 0.3153585446998477,
"reward_change_max": 0.0,
"reward_change_mean": -0.18345921859145164,
"reward_change_min": -0.26327282935380936,
"reward_change_std": 0.0971462931483984,
"reward_std": 0.369189590215683,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.1798444762825966,
"step": 72
},
{
"clip_fraction": 0.0,
"completion_length": 3520.187530517578,
"epoch": 0.08342857142857144,
"grad_norm": 0.14897483587265015,
"kl": 0.0005079209804534912,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0132,
"reward": -0.16560733504593372,
"reward_after_mean": -0.16560733504593372,
"reward_after_std": 0.5065639726817608,
"reward_before_mean": 0.06735767424106598,
"reward_before_std": 0.5171879883855581,
"reward_change_max": 0.0,
"reward_change_mean": -0.23296500742435455,
"reward_change_min": -0.46446412801742554,
"reward_change_std": 0.17644464783370495,
"reward_std": 0.5065639745444059,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.09930899925529957,
"step": 73
},
{
"clip_fraction": 0.0,
"completion_length": 3304.5625,
"epoch": 0.08457142857142858,
"grad_norm": 0.1551615595817566,
"kl": 0.0015359818935394287,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.208167604184217e-07,
"loss": 0.0443,
"reward": -0.07261835690587759,
"reward_after_mean": -0.07261835690587759,
"reward_after_std": 0.4282707963138819,
"reward_before_mean": 0.1986630754545331,
"reward_before_std": 0.3714675856754184,
"reward_change_max": 0.0,
"reward_change_mean": -0.27128143049776554,
"reward_change_min": -0.4079938605427742,
"reward_change_std": 0.1546249119564891,
"reward_std": 0.4282708205282688,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.030503608286380768,
"step": 74
},
{
"clip_fraction": 0.0,
"completion_length": 2993.7084045410156,
"epoch": 0.08571428571428572,
"grad_norm": 0.16756100952625275,
"kl": 0.004540964961051941,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0082,
"reward": -0.19706540927290916,
"reward_after_mean": -0.19706540927290916,
"reward_after_std": 0.2995363511145115,
"reward_before_mean": 0.04089047887828201,
"reward_before_std": 0.20250952150672674,
"reward_change_max": 0.0,
"reward_change_mean": -0.23795590549707413,
"reward_change_min": -0.3240286745131016,
"reward_change_std": 0.12177529092878103,
"reward_std": 0.2995363622903824,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.10494285449385643,
"step": 75
},
{
"clip_fraction": 0.0,
"completion_length": 3052.187530517578,
"epoch": 0.08685714285714285,
"grad_norm": 0.15915194153785706,
"kl": 0.00043383240699768066,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.081093963579707e-07,
"loss": 0.0524,
"reward": -0.3360134717077017,
"reward_after_mean": -0.3360134717077017,
"reward_after_std": 0.3105210345238447,
"reward_before_mean": -0.14231654070317745,
"reward_before_std": 0.29032292775809765,
"reward_change_max": 0.0,
"reward_change_mean": -0.19369693472981453,
"reward_change_min": -0.33717145025730133,
"reward_change_std": 0.12174026388674974,
"reward_std": 0.31052104756236076,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.1839832067489624,
"step": 76
},
{
"clip_fraction": 0.0,
"completion_length": 3304.375030517578,
"epoch": 0.088,
"grad_norm": 0.14390531182289124,
"kl": 0.0005064904689788818,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0526,
"reward": -0.2539611868560314,
"reward_after_mean": -0.2539611868560314,
"reward_after_std": 0.39100789465010166,
"reward_before_mean": -0.043096862733364105,
"reward_before_std": 0.35905097983777523,
"reward_change_max": 0.0,
"reward_change_mean": -0.21086432039737701,
"reward_change_min": -0.35267689265310764,
"reward_change_std": 0.12909309566020966,
"reward_std": 0.3910079039633274,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.1472635231912136,
"step": 77
},
{
"clip_fraction": 0.0,
"completion_length": 3308.541717529297,
"epoch": 0.08914285714285715,
"grad_norm": 0.15253493189811707,
"kl": 0.0008769482374191284,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.950875657567621e-07,
"loss": 0.0074,
"reward": -0.0791936544701457,
"reward_after_mean": -0.0791936544701457,
"reward_after_std": 0.5702229253947735,
"reward_before_mean": 0.17483498714864254,
"reward_before_std": 0.5539351883344352,
"reward_change_max": 0.0,
"reward_change_mean": -0.25402865186333656,
"reward_change_min": -0.41291412711143494,
"reward_change_std": 0.16963480412960052,
"reward_std": 0.5702229347079992,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.03349835175322369,
"step": 78
},
{
"clip_fraction": 0.0,
"completion_length": 2278.75004196167,
"epoch": 0.09028571428571429,
"grad_norm": 0.29192036390304565,
"kl": 0.0021656155586242676,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.884636689049422e-07,
"loss": -0.0445,
"reward": -0.05769808869808912,
"reward_after_mean": -0.05769808869808912,
"reward_after_std": 0.5233242809772491,
"reward_before_mean": 0.20565658761188388,
"reward_before_std": 0.5024187322705984,
"reward_change_max": 0.0,
"reward_change_mean": -0.26335467025637627,
"reward_change_min": -0.40742968022823334,
"reward_change_std": 0.1620514988899231,
"reward_std": 0.52332429215312,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/cosine_scaled_reward": -0.023510104045271873,
"step": 79
},
{
"clip_fraction": 0.0,
"completion_length": 3350.5833587646484,
"epoch": 0.09142857142857143,
"grad_norm": 0.150605708360672,
"kl": 0.001107722520828247,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.817671337095244e-07,
"loss": -0.0069,
"reward": -0.25659373961389065,
"reward_after_mean": -0.25659373961389065,
"reward_after_std": 0.37472186982631683,
"reward_before_mean": -0.04372098250314593,
"reward_before_std": 0.3439793400466442,
"reward_change_max": 0.0,
"reward_change_mean": -0.21287276968359947,
"reward_change_min": -0.33237724378705025,
"reward_change_std": 0.1219419464468956,
"reward_std": 0.37472187355160713,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.16872098669409752,
"step": 80
},
{
"clip_fraction": 0.0,
"completion_length": 3116.270866394043,
"epoch": 0.09257142857142857,
"grad_norm": 0.23280547559261322,
"kl": 0.0033311843872070312,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.75e-07,
"loss": 0.0373,
"reward": -0.26977309957146645,
"reward_after_mean": -0.26977309957146645,
"reward_after_std": 0.28102972730994225,
"reward_before_mean": -0.04979459196329117,
"reward_before_std": 0.25032276660203934,
"reward_change_max": 0.0,
"reward_change_mean": -0.21997853554785252,
"reward_change_min": -0.33210898377001286,
"reward_change_std": 0.12461962644010782,
"reward_std": 0.2810297291725874,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.15396124683320522,
"step": 81
},
{
"clip_fraction": 0.0,
"completion_length": 2953.9166717529297,
"epoch": 0.09371428571428571,
"grad_norm": 0.17104879021644592,
"kl": 0.002446308732032776,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.681643291108517e-07,
"loss": -0.0279,
"reward": -0.029579367488622665,
"reward_after_mean": -0.029579367488622665,
"reward_after_std": 0.5071045402437449,
"reward_before_mean": 0.24649053468601778,
"reward_before_std": 0.4566717045381665,
"reward_change_max": 0.0,
"reward_change_mean": -0.27606993727386,
"reward_change_min": -0.46944032795727253,
"reward_change_std": 0.17568331584334373,
"reward_std": 0.5071045681834221,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": -0.003509456291794777,
"step": 82
},
{
"clip_fraction": 0.0,
"completion_length": 2885.208366394043,
"epoch": 0.09485714285714286,
"grad_norm": 0.21742479503154755,
"kl": 0.0008513182401657104,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0297,
"reward": -0.30521316826343536,
"reward_after_mean": -0.30521316826343536,
"reward_after_std": 0.3845277652144432,
"reward_before_mean": -0.11277107335627079,
"reward_before_std": 0.354814812541008,
"reward_change_max": 0.0,
"reward_change_mean": -0.19244208745658398,
"reward_change_min": -0.30643659457564354,
"reward_change_std": 0.1196863753721118,
"reward_std": 0.3845277763903141,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.19610440777614713,
"step": 83
},
{
"clip_fraction": 0.0,
"completion_length": 3133.1666870117188,
"epoch": 0.096,
"grad_norm": 0.16427645087242126,
"kl": 0.00032585859298706055,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.54295724882796e-07,
"loss": 0.0304,
"reward": -0.038457803428173065,
"reward_after_mean": -0.038457803428173065,
"reward_after_std": 0.4437688495963812,
"reward_before_mean": 0.24993115104734898,
"reward_before_std": 0.44302100967615843,
"reward_change_max": 0.0,
"reward_change_mean": -0.28838893957436085,
"reward_change_min": -0.47144375927746296,
"reward_change_std": 0.18405816424638033,
"reward_std": 0.4437688793987036,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/cosine_scaled_reward": -6.886757910251617e-05,
"step": 84
},
{
"clip_fraction": 0.0,
"completion_length": 3301.979217529297,
"epoch": 0.09714285714285714,
"grad_norm": 0.1837218999862671,
"kl": 0.00045623257756233215,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0153,
"reward": -0.0646021788707003,
"reward_after_mean": -0.0646021788707003,
"reward_after_std": 0.5770941227674484,
"reward_before_mean": 0.1956373080611229,
"reward_before_std": 0.5833419263362885,
"reward_change_max": 0.0,
"reward_change_mean": -0.260239502415061,
"reward_change_min": -0.4684751071035862,
"reward_change_std": 0.18584690615534782,
"reward_std": 0.5770941376686096,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": -0.03352935239672661,
"step": 85
},
{
"clip_fraction": 0.0,
"completion_length": 3119.625030517578,
"epoch": 0.09828571428571428,
"grad_norm": 0.1819085329771042,
"kl": 0.0014518499374389648,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.401782177833147e-07,
"loss": -0.0187,
"reward": -0.30323445051908493,
"reward_after_mean": -0.30323445051908493,
"reward_after_std": 0.3614001404494047,
"reward_before_mean": -0.10566247068345547,
"reward_before_std": 0.33295972365885973,
"reward_change_max": 0.0,
"reward_change_mean": -0.19757197797298431,
"reward_change_min": -0.3210353907197714,
"reward_change_std": 0.12041187938302755,
"reward_std": 0.36140014231204987,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.20982913300395012,
"step": 86
},
{
"clip_fraction": 0.0,
"completion_length": 3044.104217529297,
"epoch": 0.09942857142857142,
"grad_norm": 0.17637749016284943,
"kl": 0.002590000629425049,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0096,
"reward": -0.14765978045761585,
"reward_after_mean": -0.14765978045761585,
"reward_after_std": 0.4970177672803402,
"reward_before_mean": 0.08903376385569572,
"reward_before_std": 0.48575887829065323,
"reward_change_max": 0.0,
"reward_change_mean": -0.23669353872537613,
"reward_change_min": -0.40174689143896103,
"reward_change_std": 0.15881517250090837,
"reward_std": 0.49701779522001743,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.07763291290029883,
"step": 87
},
{
"clip_fraction": 0.0,
"completion_length": 2976.3125610351562,
"epoch": 0.10057142857142858,
"grad_norm": 0.2443528175354004,
"kl": 0.0038170814514160156,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.258290078201731e-07,
"loss": 0.1182,
"reward": -0.06454395316541195,
"reward_after_mean": -0.06454395316541195,
"reward_after_std": 0.7364202737808228,
"reward_before_mean": 0.16822397490614094,
"reward_before_std": 0.7222829144448042,
"reward_change_max": 0.0,
"reward_change_mean": -0.23276793025434017,
"reward_change_min": -0.48013338819146156,
"reward_change_std": 0.17427105363458395,
"reward_std": 0.7364202737808228,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.04010936478152871,
"step": 88
},
{
"clip_fraction": 0.0,
"completion_length": 3386.9583435058594,
"epoch": 0.10171428571428572,
"grad_norm": 0.16150832176208496,
"kl": 0.0024889707565307617,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0032,
"reward": -0.31388526782393456,
"reward_after_mean": -0.31388526782393456,
"reward_after_std": 0.37776545993983746,
"reward_before_mean": -0.12356989830732346,
"reward_before_std": 0.3436039094813168,
"reward_change_max": 0.0,
"reward_change_mean": -0.1903153732419014,
"reward_change_min": -0.3229862004518509,
"reward_change_std": 0.11718899849802256,
"reward_std": 0.37776547484099865,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.18606988992542028,
"step": 89
},
{
"clip_fraction": 0.0,
"completion_length": 2772.937530517578,
"epoch": 0.10285714285714286,
"grad_norm": 0.2691808044910431,
"kl": 0.004205763339996338,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.11265577295385e-07,
"loss": 0.0159,
"reward": -0.3370555378496647,
"reward_after_mean": -0.3370555378496647,
"reward_after_std": 0.3809457756578922,
"reward_before_mean": -0.15994180366396904,
"reward_before_std": 0.3365004351362586,
"reward_change_max": 0.0,
"reward_change_mean": -0.17711373046040535,
"reward_change_min": -0.26847982592880726,
"reward_change_std": 0.09736619610339403,
"reward_std": 0.3809457868337631,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.20160847809165716,
"step": 90
},
{
"clip_fraction": 0.0,
"completion_length": 3308.937530517578,
"epoch": 0.104,
"grad_norm": 0.20113322138786316,
"kl": 0.001620747148990631,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.039090644965509e-07,
"loss": 0.054,
"reward": -0.1898544393479824,
"reward_after_mean": -0.1898544393479824,
"reward_after_std": 0.4037452917546034,
"reward_before_mean": 0.04781687818467617,
"reward_before_std": 0.40877903811633587,
"reward_change_max": 0.0,
"reward_change_mean": -0.23767131194472313,
"reward_change_min": -0.40195638686418533,
"reward_change_std": 0.16346925124526024,
"reward_std": 0.4037453029304743,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.0980164622887969,
"step": 91
},
{
"clip_fraction": 0.0,
"completion_length": 3016.812530517578,
"epoch": 0.10514285714285715,
"grad_norm": 0.22104471921920776,
"kl": 0.09951108694076538,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.965056695057204e-07,
"loss": 0.0342,
"reward": -0.19481445383280516,
"reward_after_mean": -0.19481445383280516,
"reward_after_std": 0.4913271814584732,
"reward_before_mean": 0.025329099036753178,
"reward_before_std": 0.468422326259315,
"reward_change_max": 0.0,
"reward_change_mean": -0.22014355659484863,
"reward_change_min": -0.38161665946245193,
"reward_change_std": 0.14695494808256626,
"reward_std": 0.4913271926343441,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.1205042446963489,
"step": 92
},
{
"clip_fraction": 0.0,
"completion_length": 3383.375,
"epoch": 0.10628571428571429,
"grad_norm": 0.15710371732711792,
"kl": 0.0020842552185058594,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0015,
"reward": -0.4476584121584892,
"reward_after_mean": -0.4476584121584892,
"reward_after_std": 0.2138187699019909,
"reward_before_mean": -0.28413364104926586,
"reward_before_std": 0.17518659960478544,
"reward_change_max": 0.0,
"reward_change_mean": -0.16352477483451366,
"reward_change_min": -0.25845394283533096,
"reward_change_std": 0.08956557791680098,
"reward_std": 0.21381877548992634,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2841336391866207,
"step": 93
},
{
"clip_fraction": 0.0,
"completion_length": 3182.1458587646484,
"epoch": 0.10742857142857143,
"grad_norm": 0.17469479143619537,
"kl": 0.028481706976890564,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.815672671252315e-07,
"loss": 0.0183,
"reward": -0.1544804833829403,
"reward_after_mean": -0.1544804833829403,
"reward_after_std": 0.4560176860541105,
"reward_before_mean": 0.08422049786895514,
"reward_before_std": 0.4216147158294916,
"reward_change_max": 0.0,
"reward_change_mean": -0.23870097286999226,
"reward_change_min": -0.40013051964342594,
"reward_change_std": 0.1476370794698596,
"reward_std": 0.4560176897794008,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.08244619239121675,
"step": 94
},
{
"clip_fraction": 0.0,
"completion_length": 3389.1875,
"epoch": 0.10857142857142857,
"grad_norm": 0.1376781314611435,
"kl": 0.0006309226155281067,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.740368101176495e-07,
"loss": -0.0017,
"reward": -0.35236234590411186,
"reward_after_mean": -0.35236234590411186,
"reward_after_std": 0.4059419594705105,
"reward_before_mean": -0.18372074887156487,
"reward_before_std": 0.34809670504182577,
"reward_change_max": 0.0,
"reward_change_mean": -0.1686416082084179,
"reward_change_min": -0.2610710132867098,
"reward_change_std": 0.0913059962913394,
"reward_std": 0.4059419725090265,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2253874186426401,
"step": 95
},
{
"clip_fraction": 0.0,
"completion_length": 2967.5208587646484,
"epoch": 0.10971428571428571,
"grad_norm": 0.18132349848747253,
"kl": 0.008769378066062927,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.664685702961344e-07,
"loss": 0.0782,
"reward": -0.009053988382220268,
"reward_after_mean": -0.009053988382220268,
"reward_after_std": 0.5677688978612423,
"reward_before_mean": 0.2716766329249367,
"reward_before_std": 0.5742009859532118,
"reward_change_max": 0.0,
"reward_change_mean": -0.28073061630129814,
"reward_change_min": -0.4879560321569443,
"reward_change_std": 0.19168910942971706,
"reward_std": 0.5677689034491777,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": 0.021676627919077873,
"step": 96
},
{
"clip_fraction": 0.0,
"completion_length": 3335.9166870117188,
"epoch": 0.11085714285714286,
"grad_norm": 0.15675108134746552,
"kl": 0.0016013942658901215,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0705,
"reward": -0.20349452830851078,
"reward_after_mean": -0.20349452830851078,
"reward_after_std": 0.4865495152771473,
"reward_before_mean": 0.01759020984172821,
"reward_before_std": 0.480413525365293,
"reward_change_max": 0.0,
"reward_change_mean": -0.2210847306996584,
"reward_change_min": -0.38277435675263405,
"reward_change_std": 0.15406140219420195,
"reward_std": 0.48654952459037304,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.14907646551728249,
"step": 97
},
{
"clip_fraction": 0.0,
"completion_length": 3094.6666870117188,
"epoch": 0.112,
"grad_norm": 0.20178250968456268,
"kl": 0.000708162784576416,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.512279744547392e-07,
"loss": 0.0154,
"reward": -0.2618050053715706,
"reward_after_mean": -0.2618050053715706,
"reward_after_std": 0.22535480838268995,
"reward_before_mean": -0.03537908382713795,
"reward_before_std": 0.14867154462262988,
"reward_change_max": 0.0,
"reward_change_mean": -0.2264259122312069,
"reward_change_min": -0.31195950135588646,
"reward_change_std": 0.11560725700110197,
"reward_std": 0.22535481490194798,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.16037909872829914,
"step": 98
},
{
"clip_fraction": 0.0,
"completion_length": 2847.812515258789,
"epoch": 0.11314285714285714,
"grad_norm": 0.19928348064422607,
"kl": 0.001722574234008789,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.435602608679916e-07,
"loss": -0.0071,
"reward": -0.23697317391633987,
"reward_after_mean": -0.23697317391633987,
"reward_after_std": 0.22447119280695915,
"reward_before_mean": -0.0008997507393360138,
"reward_before_std": 0.14804014191031456,
"reward_change_max": 0.0,
"reward_change_mean": -0.23607343435287476,
"reward_change_min": -0.3225790597498417,
"reward_change_std": 0.12129600439220667,
"reward_std": 0.2244712058454752,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.12589975725859404,
"step": 99
},
{
"clip_fraction": 0.0,
"completion_length": 2983.666702270508,
"epoch": 0.11428571428571428,
"grad_norm": 0.14032700657844543,
"kl": 0.025766372680664062,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.358640479194451e-07,
"loss": 0.0376,
"reward": -0.2477741176262498,
"reward_after_mean": -0.2477741176262498,
"reward_after_std": 0.4157449547201395,
"reward_before_mean": -0.03449610248208046,
"reward_before_std": 0.4026447180658579,
"reward_change_max": 0.0,
"reward_change_mean": -0.2132780011743307,
"reward_change_min": -0.3858350533992052,
"reward_change_std": 0.1465577408671379,
"reward_std": 0.41574496403336525,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.15949611051473767,
"step": 100
},
{
"clip_fraction": 0.0,
"completion_length": 3004.4166717529297,
"epoch": 0.11542857142857142,
"grad_norm": 0.17880059778690338,
"kl": 0.0014008283615112305,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0046,
"reward": -0.20459227031096816,
"reward_after_mean": -0.20459227031096816,
"reward_after_std": 0.33806266263127327,
"reward_before_mean": 0.02772543951869011,
"reward_before_std": 0.2651213016360998,
"reward_change_max": 0.0,
"reward_change_mean": -0.23231769353151321,
"reward_change_min": -0.32845479622483253,
"reward_change_std": 0.12321093957871199,
"reward_std": 0.33806267008185387,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.11810789071023464,
"step": 101
},
{
"clip_fraction": 0.0,
"completion_length": 2936.8959045410156,
"epoch": 0.11657142857142858,
"grad_norm": 0.26506590843200684,
"kl": 0.005680441856384277,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.203955092681039e-07,
"loss": 0.0551,
"reward": -0.15760041342582554,
"reward_after_mean": -0.15760041342582554,
"reward_after_std": 0.5418837126344442,
"reward_before_mean": 0.0681868263927754,
"reward_before_std": 0.5268151368945837,
"reward_change_max": 0.0,
"reward_change_mean": -0.22578725591301918,
"reward_change_min": -0.3727243058383465,
"reward_change_std": 0.14929345156997442,
"reward_std": 0.5418837182223797,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.05681317043490708,
"step": 102
},
{
"clip_fraction": 0.0,
"completion_length": 3200.9375610351562,
"epoch": 0.11771428571428572,
"grad_norm": 0.16663618385791779,
"kl": 0.006103813648223877,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0462,
"reward": -0.21803143620491028,
"reward_after_mean": -0.21803143620491028,
"reward_after_std": 0.5001228470355272,
"reward_before_mean": -0.009146178141236305,
"reward_before_std": 0.4853008156642318,
"reward_change_max": 0.0,
"reward_change_mean": -0.20888524316251278,
"reward_change_min": -0.39318533055484295,
"reward_change_std": 0.14420427940785885,
"reward_std": 0.5001228544861078,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.1341461860574782,
"step": 103
},
{
"clip_fraction": 0.0,
"completion_length": 2683.6250381469727,
"epoch": 0.11885714285714286,
"grad_norm": 0.19750936329364777,
"kl": 0.0069179534912109375,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.048412045323164e-07,
"loss": -0.0106,
"reward": -0.20132402144372463,
"reward_after_mean": -0.20132402144372463,
"reward_after_std": 0.3386107739061117,
"reward_before_mean": 0.03512360900640488,
"reward_before_std": 0.28090421110391617,
"reward_change_max": 0.0,
"reward_change_mean": -0.2364476341754198,
"reward_change_min": -0.3883332423865795,
"reward_change_std": 0.13935024105012417,
"reward_std": 0.33861077949404716,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.13154308311641216,
"step": 104
},
{
"clip_fraction": 0.0,
"completion_length": 3068.687530517578,
"epoch": 0.12,
"grad_norm": 0.18446093797683716,
"kl": 0.004405617713928223,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0333,
"reward": -0.049622944090515375,
"reward_after_mean": -0.049622944090515375,
"reward_after_std": 0.6895017940551043,
"reward_before_mean": 0.2017551939934492,
"reward_before_std": 0.7053272109478712,
"reward_change_max": 0.0,
"reward_change_mean": -0.25137814693152905,
"reward_change_min": -0.5445310994982719,
"reward_change_std": 0.2017376320436597,
"reward_std": 0.6895018108189106,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": -0.027411479502916336,
"step": 105
},
{
"clip_fraction": 0.0,
"completion_length": 2494.3334007263184,
"epoch": 0.12114285714285715,
"grad_norm": 0.16192932426929474,
"kl": 0.01092296838760376,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.892200842364462e-07,
"loss": 0.0417,
"reward": 0.27581100910902023,
"reward_after_mean": 0.27581100910902023,
"reward_after_std": 0.5071319099515676,
"reward_before_mean": 0.6641694903373718,
"reward_before_std": 0.4399477792903781,
"reward_change_max": 0.0,
"reward_change_mean": -0.3883584924042225,
"reward_change_min": -0.5921457093209028,
"reward_change_std": 0.2340643797069788,
"reward_std": 0.5071319285780191,
"rewards/accuracy_reward": 0.4791666716337204,
"rewards/cosine_scaled_reward": 0.18500283360481262,
"step": 106
},
{
"clip_fraction": 0.0,
"completion_length": 3063.854217529297,
"epoch": 0.12228571428571429,
"grad_norm": 0.2792545557022095,
"kl": 0.0023031234741210938,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.813904131848564e-07,
"loss": -0.0245,
"reward": -0.181765828281641,
"reward_after_mean": -0.181765828281641,
"reward_after_std": 0.338009238243103,
"reward_before_mean": 0.06094588339328766,
"reward_before_std": 0.2771811536513269,
"reward_change_max": 0.0,
"reward_change_mean": -0.24271169770509005,
"reward_change_min": -0.40003350004553795,
"reward_change_std": 0.14317026268690825,
"reward_std": 0.33800926245748997,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.10572081245481968,
"step": 107
},
{
"clip_fraction": 0.0,
"completion_length": 3032.0000610351562,
"epoch": 0.12342857142857143,
"grad_norm": 0.22073347866535187,
"kl": 0.0028527379035949707,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.735511803093248e-07,
"loss": 0.0812,
"reward": -0.06596326269209385,
"reward_after_mean": -0.06596326269209385,
"reward_after_std": 0.6328165102750063,
"reward_before_mean": 0.18634681031107903,
"reward_before_std": 0.6565964054316282,
"reward_change_max": 0.0,
"reward_change_mean": -0.2523100730031729,
"reward_change_min": -0.5472348593175411,
"reward_change_std": 0.2049607066437602,
"reward_std": 0.632816543802619,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.021986512932926416,
"step": 108
},
{
"clip_fraction": 0.0,
"completion_length": 3071.3958435058594,
"epoch": 0.12457142857142857,
"grad_norm": 0.1601523756980896,
"kl": 0.0012826323509216309,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0382,
"reward": -0.2346851173788309,
"reward_after_mean": -0.2346851173788309,
"reward_after_std": 0.3800167478621006,
"reward_before_mean": -0.01405545324087143,
"reward_before_std": 0.35714889597147703,
"reward_change_max": 0.0,
"reward_change_mean": -0.22062966413795948,
"reward_change_min": -0.34099637903273106,
"reward_change_std": 0.13036640547215939,
"reward_std": 0.38001675345003605,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.13905547931790352,
"step": 109
},
{
"clip_fraction": 0.0,
"completion_length": 3079.645866394043,
"epoch": 0.12571428571428572,
"grad_norm": 0.20363496243953705,
"kl": 0.0030457042157649994,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.578535828967777e-07,
"loss": 0.0427,
"reward": -0.24088630080223083,
"reward_after_mean": -0.24088630080223083,
"reward_after_std": 0.578212320804596,
"reward_before_mean": -0.056965045630931854,
"reward_before_std": 0.5253217183053493,
"reward_change_max": 0.0,
"reward_change_mean": -0.18392126075923443,
"reward_change_min": -0.3033385146409273,
"reward_change_std": 0.10721975099295378,
"reward_std": 0.5782123357057571,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.18196504982188344,
"step": 110
},
{
"clip_fraction": 0.0,
"completion_length": 3483.5625610351562,
"epoch": 0.12685714285714286,
"grad_norm": 0.1459619253873825,
"kl": 0.002910614013671875,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.5e-07,
"loss": 0.0093,
"reward": -0.20540679898113012,
"reward_after_mean": -0.20540679898113012,
"reward_after_std": 0.5167342368513346,
"reward_before_mean": 0.008995672687888145,
"reward_before_std": 0.5046728178858757,
"reward_change_max": 0.0,
"reward_change_mean": -0.21440248005092144,
"reward_change_min": -0.4584047421813011,
"reward_change_std": 0.15795970242470503,
"reward_std": 0.5167342405766249,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.11600433615967631,
"step": 111
},
{
"clip_fraction": 0.0,
"completion_length": 3405.0833740234375,
"epoch": 0.128,
"grad_norm": 0.16170577704906464,
"kl": 0.0009909868240356445,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.421464171032224e-07,
"loss": 0.0451,
"reward": -0.09846091084182262,
"reward_after_mean": -0.09846091084182262,
"reward_after_std": 0.5994726214557886,
"reward_before_mean": 0.14584718085825443,
"reward_before_std": 0.6081715226173401,
"reward_change_max": 0.0,
"reward_change_mean": -0.24430808424949646,
"reward_change_min": -0.46415193751454353,
"reward_change_std": 0.18282546661794186,
"reward_std": 0.5994726475328207,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.06248616240918636,
"step": 112
},
{
"clip_fraction": 0.0,
"completion_length": 3235.9166870117188,
"epoch": 0.12914285714285714,
"grad_norm": 0.21097467839717865,
"kl": 0.004243135452270508,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0744,
"reward": -0.23292838409543037,
"reward_after_mean": -0.23292838409543037,
"reward_after_std": 0.41003835014998913,
"reward_before_mean": -0.012828025966882706,
"reward_before_std": 0.4060424007475376,
"reward_change_max": 0.0,
"reward_change_mean": -0.22010035812854767,
"reward_change_min": -0.3953362423926592,
"reward_change_std": 0.15193824656307697,
"reward_std": 0.41003837063908577,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.13782802782952785,
"step": 113
},
{
"clip_fraction": 0.0,
"completion_length": 2685.479217529297,
"epoch": 0.13028571428571428,
"grad_norm": 0.18926027417182922,
"kl": 0.0038733482360839844,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.264488196906752e-07,
"loss": -0.0345,
"reward": -0.3609940633177757,
"reward_after_mean": -0.3609940633177757,
"reward_after_std": 0.31448194198310375,
"reward_before_mean": -0.17894649133086205,
"reward_before_std": 0.2785217398777604,
"reward_change_max": 0.0,
"reward_change_mean": -0.18204755894839764,
"reward_change_min": -0.3147698640823364,
"reward_change_std": 0.1119089126586914,
"reward_std": 0.31448194943368435,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.22061316156759858,
"step": 114
},
{
"clip_fraction": 0.0,
"completion_length": 3032.2916870117188,
"epoch": 0.13142857142857142,
"grad_norm": 0.20660527050495148,
"kl": 0.004292488098144531,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0133,
"reward": -0.25222317641600966,
"reward_after_mean": -0.25222317641600966,
"reward_after_std": 0.3675101902335882,
"reward_before_mean": -0.038438186049461365,
"reward_before_std": 0.3253793818876147,
"reward_change_max": 0.0,
"reward_change_mean": -0.21378498524427414,
"reward_change_min": -0.32515949942171574,
"reward_change_std": 0.12277536746114492,
"reward_std": 0.36751021072268486,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.1634381867479533,
"step": 115
},
{
"clip_fraction": 0.0,
"completion_length": 3507.2291870117188,
"epoch": 0.13257142857142856,
"grad_norm": 0.14657403528690338,
"kl": 0.0013550519943237305,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.107799157635538e-07,
"loss": 0.0071,
"reward": -0.3162567066028714,
"reward_after_mean": -0.3162567066028714,
"reward_after_std": 0.37543779239058495,
"reward_before_mean": -0.12292648106813431,
"reward_before_std": 0.3578766481950879,
"reward_change_max": 0.0,
"reward_change_mean": -0.19333022460341454,
"reward_change_min": -0.37804919853806496,
"reward_change_std": 0.1362152397632599,
"reward_std": 0.37543781008571386,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.2062598317861557,
"step": 116
},
{
"clip_fraction": 0.0,
"completion_length": 3425.5,
"epoch": 0.1337142857142857,
"grad_norm": 0.17479689419269562,
"kl": 0.0026268959045410156,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.02962191529556e-07,
"loss": -0.0087,
"reward": -0.39809813536703587,
"reward_after_mean": -0.39809813536703587,
"reward_after_std": 0.29817865043878555,
"reward_before_mean": -0.22966473922133446,
"reward_before_std": 0.25032039918005466,
"reward_change_max": 0.0,
"reward_change_mean": -0.168433403596282,
"reward_change_min": -0.254726717248559,
"reward_change_std": 0.09154631663113832,
"reward_std": 0.298178656026721,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2504980657249689,
"step": 117
},
{
"clip_fraction": 0.0,
"completion_length": 3336.7709045410156,
"epoch": 0.13485714285714287,
"grad_norm": 0.1754436492919922,
"kl": 0.0014584064483642578,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.951587954676837e-07,
"loss": 0.0358,
"reward": 0.049660567194223404,
"reward_after_mean": 0.049660567194223404,
"reward_after_std": 0.6818990353494883,
"reward_before_mean": 0.33523961436003447,
"reward_before_std": 0.6836029682308435,
"reward_change_max": 0.0,
"reward_change_mean": -0.2855790425091982,
"reward_change_min": -0.5425571762025356,
"reward_change_std": 0.20386025682091713,
"reward_std": 0.6818990539759398,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/cosine_scaled_reward": 0.06440626340918243,
"step": 118
},
{
"clip_fraction": 0.0,
"completion_length": 2581.8333892822266,
"epoch": 0.136,
"grad_norm": 0.19370344281196594,
"kl": 0.008479833602905273,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0367,
"reward": 0.004929829388856888,
"reward_after_mean": 0.004929829388856888,
"reward_after_std": 0.4302307050675154,
"reward_before_mean": 0.30591132678091526,
"reward_before_std": 0.3950154595077038,
"reward_change_max": 0.0,
"reward_change_mean": -0.3009814973920584,
"reward_change_min": -0.4940522387623787,
"reward_change_std": 0.18756380956619978,
"reward_std": 0.43023071624338627,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.05591131933033466,
"step": 119
},
{
"clip_fraction": 0.0,
"completion_length": 2806.541717529297,
"epoch": 0.13714285714285715,
"grad_norm": 0.31839409470558167,
"kl": 0.005864620208740234,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.79604490731896e-07,
"loss": 0.0281,
"reward": -0.14376608515158296,
"reward_after_mean": -0.14376608515158296,
"reward_after_std": 0.4351091645658016,
"reward_before_mean": 0.10138015681877732,
"reward_before_std": 0.41372954845428467,
"reward_change_max": 0.0,
"reward_change_mean": -0.2451462484896183,
"reward_change_min": -0.4056401252746582,
"reward_change_std": 0.1532444702461362,
"reward_std": 0.43510917387902737,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.06528650567634031,
"step": 120
},
{
"clip_fraction": 0.0,
"completion_length": 2467.1042251586914,
"epoch": 0.1382857142857143,
"grad_norm": 0.2131020575761795,
"kl": 0.01170969009399414,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.7185832004988133e-07,
"loss": -0.0323,
"reward": -0.13892671652138233,
"reward_after_mean": -0.13892671652138233,
"reward_after_std": 0.42193593084812164,
"reward_before_mean": 0.10583998123183846,
"reward_before_std": 0.3574620336294174,
"reward_change_max": 0.0,
"reward_change_mean": -0.24476669542491436,
"reward_change_min": -0.39448612928390503,
"reward_change_std": 0.14441118016839027,
"reward_std": 0.42193594202399254,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.08166004437953234,
"step": 121
},
{
"clip_fraction": 0.0,
"completion_length": 3173.8333740234375,
"epoch": 0.13942857142857143,
"grad_norm": 0.19904720783233643,
"kl": 0.0028328895568847656,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.641359520805548e-07,
"loss": 0.0668,
"reward": -0.009627103805541992,
"reward_after_mean": -0.009627103805541992,
"reward_after_std": 0.4583488889038563,
"reward_before_mean": 0.2879569726064801,
"reward_before_std": 0.4659885112196207,
"reward_change_max": 0.0,
"reward_change_mean": -0.29758409410715103,
"reward_change_min": -0.47819996625185013,
"reward_change_std": 0.19638753589242697,
"reward_std": 0.45834890380501747,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/cosine_scaled_reward": 0.03795698191970587,
"step": 122
},
{
"clip_fraction": 0.0,
"completion_length": 3264.0208435058594,
"epoch": 0.14057142857142857,
"grad_norm": 0.152817502617836,
"kl": 0.002018570899963379,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0387,
"reward": -0.19304709881544113,
"reward_after_mean": -0.19304709881544113,
"reward_after_std": 0.4907000754028559,
"reward_before_mean": 0.028430650010704994,
"reward_before_std": 0.4807597752660513,
"reward_change_max": 0.0,
"reward_change_mean": -0.22147774696350098,
"reward_change_min": -0.4099309202283621,
"reward_change_std": 0.1521046319976449,
"reward_std": 0.490700077265501,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.11740268766880035,
"step": 123
},
{
"clip_fraction": 0.0,
"completion_length": 2913.812530517578,
"epoch": 0.1417142857142857,
"grad_norm": 0.18517668545246124,
"kl": 0.03206205368041992,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.4877202554526084e-07,
"loss": -0.0244,
"reward": 0.025332925841212273,
"reward_after_mean": 0.025332925841212273,
"reward_after_std": 0.5710857696831226,
"reward_before_mean": 0.30887394258752465,
"reward_before_std": 0.5023922696709633,
"reward_change_max": 0.0,
"reward_change_mean": -0.28354101814329624,
"reward_change_min": -0.4145784545689821,
"reward_change_std": 0.1590277198702097,
"reward_std": 0.5710857976227999,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": 0.017207262571901083,
"step": 124
},
{
"clip_fraction": 0.0,
"completion_length": 2971.520866394043,
"epoch": 0.14285714285714285,
"grad_norm": 0.18670791387557983,
"kl": 0.005125522613525391,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0218,
"reward": -0.09776409342885017,
"reward_after_mean": -0.09776409342885017,
"reward_after_std": 0.3319449555128813,
"reward_before_mean": 0.17860279604792595,
"reward_before_std": 0.2927938736975193,
"reward_change_max": 0.0,
"reward_change_mean": -0.276366900652647,
"reward_change_min": -0.4111520666629076,
"reward_change_std": 0.1613099630922079,
"reward_std": 0.3319449629634619,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.029730526730418205,
"step": 125
},
{
"clip_fraction": 0.0,
"completion_length": 3012.0209045410156,
"epoch": 0.144,
"grad_norm": 0.1566249281167984,
"kl": 0.0010142326354980469,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.3353142970386557e-07,
"loss": 0.0173,
"reward": -0.037185917142778635,
"reward_after_mean": -0.037185917142778635,
"reward_after_std": 0.4496050179004669,
"reward_before_mean": 0.24768859706819057,
"reward_before_std": 0.43565093353390694,
"reward_change_max": 0.0,
"reward_change_mean": -0.28487453050911427,
"reward_change_min": -0.47056199237704277,
"reward_change_std": 0.1813699882477522,
"reward_std": 0.4496050253510475,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/cosine_scaled_reward": -0.0023114103823900223,
"step": 126
},
{
"clip_fraction": 0.0,
"completion_length": 3442.541748046875,
"epoch": 0.14514285714285713,
"grad_norm": 0.1765584796667099,
"kl": 0.0017638206481933594,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.2596318988235037e-07,
"loss": -0.0108,
"reward": -0.41788332164287567,
"reward_after_mean": -0.41788332164287567,
"reward_after_std": 0.21475844830274582,
"reward_before_mean": -0.2430284023284912,
"reward_before_std": 0.1823626570403576,
"reward_change_max": 0.0,
"reward_change_mean": -0.17485490441322327,
"reward_change_min": -0.2704111132770777,
"reward_change_std": 0.09725910797715187,
"reward_std": 0.21475845575332642,
"rewards/accuracy_reward": 0.0,
"rewards/cosine_scaled_reward": -0.2430284097790718,
"step": 127
},
{
"clip_fraction": 0.0,
"completion_length": 2990.0208587646484,
"epoch": 0.1462857142857143,
"grad_norm": 0.16409148275852203,
"kl": 0.0043749213218688965,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.1843273287476854e-07,
"loss": 0.0516,
"reward": 0.06294518522918224,
"reward_after_mean": 0.06294518522918224,
"reward_after_std": 0.5372496526688337,
"reward_before_mean": 0.37420000694692135,
"reward_before_std": 0.5208405908197165,
"reward_change_max": 0.0,
"reward_change_mean": -0.31125483103096485,
"reward_change_min": -0.5556082502007484,
"reward_change_std": 0.2155110565945506,
"reward_std": 0.5372496694326401,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.08253331389278173,
"step": 128
},
{
"clip_fraction": 0.0,
"completion_length": 3551.75,
"epoch": 0.14742857142857144,
"grad_norm": 0.13626347482204437,
"kl": 0.0023354291915893555,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.1094235253127374e-07,
"loss": -0.0121,
"reward": -0.3392734844237566,
"reward_after_mean": -0.3392734844237566,
"reward_after_std": 0.30535995587706566,
"reward_before_mean": -0.14702198840677738,
"reward_before_std": 0.277429336681962,
"reward_change_max": 0.0,
"reward_change_mean": -0.19225149974226952,
"reward_change_min": -0.3303426429629326,
"reward_change_std": 0.11987168062478304,
"reward_std": 0.30535995960235596,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.18868865631520748,
"step": 129
},
{
"clip_fraction": 0.0,
"completion_length": 3362.312530517578,
"epoch": 0.14857142857142858,
"grad_norm": 0.16789855062961578,
"kl": 0.006592273712158203,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.034943304942796e-07,
"loss": 0.0295,
"reward": -0.2279136423021555,
"reward_after_mean": -0.2279136423021555,
"reward_after_std": 0.35067533142864704,
"reward_before_mean": -0.0015783179551362991,
"reward_before_std": 0.3160779979079962,
"reward_change_max": 0.0,
"reward_change_mean": -0.22633531503379345,
"reward_change_min": -0.33433668687939644,
"reward_change_std": 0.13016977813094854,
"reward_std": 0.35067535378038883,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.1265783249400556,
"step": 130
},
{
"clip_fraction": 0.0,
"completion_length": 2975.583396911621,
"epoch": 0.14971428571428572,
"grad_norm": 0.23556658625602722,
"kl": 0.008499383926391602,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0364,
"reward": 0.07813497632741928,
"reward_after_mean": 0.07813497632741928,
"reward_after_std": 0.5219608303159475,
"reward_before_mean": 0.39421502873301506,
"reward_before_std": 0.48203556798398495,
"reward_change_max": 0.0,
"reward_change_mean": -0.3160800375044346,
"reward_change_min": -0.5246662721037865,
"reward_change_std": 0.20450247451663017,
"reward_std": 0.521960835903883,
"rewards/accuracy_reward": 0.3125000037252903,
"rewards/cosine_scaled_reward": 0.08171500638127327,
"step": 131
},
{
"clip_fraction": 0.0,
"completion_length": 3205.5416870117188,
"epoch": 0.15085714285714286,
"grad_norm": 0.1620175540447235,
"kl": 0.00755995512008667,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.8873442270461485e-07,
"loss": 0.0057,
"reward": -0.14543947018682957,
"reward_after_mean": -0.14543947018682957,
"reward_after_std": 0.47973890602588654,
"reward_before_mean": 0.09642863646149635,
"reward_before_std": 0.4733261279761791,
"reward_change_max": 0.0,
"reward_change_mean": -0.24186810292303562,
"reward_change_min": -0.4151979051530361,
"reward_change_std": 0.16245489288121462,
"reward_std": 0.47973891720175743,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.09107136679813266,
"step": 132
},
{
"clip_fraction": 0.0,
"completion_length": 3379.6666870117188,
"epoch": 0.152,
"grad_norm": 0.16600805521011353,
"kl": 0.0018353462219238281,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.8142703296283953e-07,
"loss": -0.0087,
"reward": -0.2287772335112095,
"reward_after_mean": -0.2287772335112095,
"reward_after_std": 0.3634595964103937,
"reward_before_mean": -0.006547695025801659,
"reward_before_std": 0.3184118759818375,
"reward_change_max": 0.0,
"reward_change_mean": -0.22222953848540783,
"reward_change_min": -0.3860268648713827,
"reward_change_std": 0.13445519004017115,
"reward_std": 0.363459600135684,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.17321436238125898,
"step": 133
},
{
"clip_fraction": 0.0,
"completion_length": 2977.5416717529297,
"epoch": 0.15314285714285714,
"grad_norm": 0.18056733906269073,
"kl": 0.007370948791503906,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.7417099217982686e-07,
"loss": -0.0006,
"reward": -0.13877355121076107,
"reward_after_mean": -0.13877355121076107,
"reward_after_std": 0.41921201907098293,
"reward_before_mean": 0.10657497371721547,
"reward_before_std": 0.34318217262625694,
"reward_change_max": 0.0,
"reward_change_mean": -0.24534854479134083,
"reward_change_min": -0.37725237011909485,
"reward_change_std": 0.13643485959619284,
"reward_std": 0.4192120339721441,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.08092502504587173,
"step": 134
},
{
"clip_fraction": 0.0,
"completion_length": 2650.5833740234375,
"epoch": 0.15428571428571428,
"grad_norm": 0.24054281413555145,
"kl": 0.00955343246459961,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.1139,
"reward": 0.22294577676802874,
"reward_after_mean": 0.22294577676802874,
"reward_after_std": 0.6538244057446718,
"reward_before_mean": 0.5760083859786391,
"reward_before_std": 0.6345205157995224,
"reward_change_max": 0.0,
"reward_change_mean": -0.3530625980347395,
"reward_change_min": -0.5656783059239388,
"reward_change_std": 0.22699419222772121,
"reward_std": 0.653824420645833,
"rewards/accuracy_reward": 0.4166666753590107,
"rewards/cosine_scaled_reward": 0.1593416929244995,
"step": 135
},
{
"clip_fraction": 0.0,
"completion_length": 3299.041717529297,
"epoch": 0.15542857142857142,
"grad_norm": 0.22330515086650848,
"kl": 0.0055866241455078125,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.5982178221668533e-07,
"loss": 0.051,
"reward": -0.04042801447212696,
"reward_after_mean": -0.04042801447212696,
"reward_after_std": 0.6660582087934017,
"reward_before_mean": 0.213487334898673,
"reward_before_std": 0.6636907681822777,
"reward_change_max": 0.0,
"reward_change_mean": -0.253915349021554,
"reward_change_min": -0.48380811884999275,
"reward_change_std": 0.18504780530929565,
"reward_std": 0.6660582162439823,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.01567934500053525,
"step": 136
},
{
"clip_fraction": 0.0,
"completion_length": 3480.1875,
"epoch": 0.15657142857142858,
"grad_norm": 0.14716432988643646,
"kl": 0.0042552947998046875,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0185,
"reward": -0.3442451786249876,
"reward_after_mean": -0.3442451786249876,
"reward_after_std": 0.2829555720090866,
"reward_before_mean": -0.15177570283412933,
"reward_before_std": 0.2524709850549698,
"reward_change_max": 0.0,
"reward_change_mean": -0.19246947765350342,
"reward_change_min": -0.3142265174537897,
"reward_change_std": 0.1153265843167901,
"reward_std": 0.28295557759702206,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.1934423731872812,
"step": 137
},
{
"clip_fraction": 0.0,
"completion_length": 2989.8125610351562,
"epoch": 0.15771428571428572,
"grad_norm": 0.17452307045459747,
"kl": 0.006173610687255859,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.45704275117204e-07,
"loss": 0.0592,
"reward": -0.17894627060741186,
"reward_after_mean": -0.17894627060741186,
"reward_after_std": 0.4771917462348938,
"reward_before_mean": 0.04556870646774769,
"reward_before_std": 0.4428609721362591,
"reward_change_max": 0.0,
"reward_change_mean": -0.22451498359441757,
"reward_change_min": -0.34395742416381836,
"reward_change_std": 0.13081647735089064,
"reward_std": 0.4771917574107647,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.10026463610120118,
"step": 138
},
{
"clip_fraction": 0.0,
"completion_length": 3251.3333435058594,
"epoch": 0.15885714285714286,
"grad_norm": 0.1768011748790741,
"kl": 0.0027399063110351562,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.387377967463493e-07,
"loss": -0.0126,
"reward": -0.3432940714992583,
"reward_after_mean": -0.3432940714992583,
"reward_after_std": 0.29503406770527363,
"reward_before_mean": -0.1541894283145666,
"reward_before_std": 0.24837601464241743,
"reward_change_max": 0.0,
"reward_change_mean": -0.18910463713109493,
"reward_change_min": -0.27739391289651394,
"reward_change_std": 0.102206707932055,
"reward_std": 0.2950340714305639,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.17502276599407196,
"step": 139
},
{
"clip_fraction": 0.0,
"completion_length": 3437.5833435058594,
"epoch": 0.16,
"grad_norm": 0.17263667285442352,
"kl": 0.00865936279296875,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.3183567088914833e-07,
"loss": 0.023,
"reward": -0.13906344398856163,
"reward_after_mean": -0.13906344398856163,
"reward_after_std": 0.39415651746094227,
"reward_before_mean": 0.10775475949048996,
"reward_before_std": 0.3045333120971918,
"reward_change_max": 0.0,
"reward_change_mean": -0.24681820906698704,
"reward_change_min": -0.3400336131453514,
"reward_change_std": 0.1301204552873969,
"reward_std": 0.39415652118623257,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.05891190283000469,
"step": 140
},
{
"clip_fraction": 0.0,
"completion_length": 3324.5416870117188,
"epoch": 0.16114285714285714,
"grad_norm": 0.15513010323047638,
"kl": 0.004058837890625,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0228,
"reward": -0.2757847439497709,
"reward_after_mean": -0.2757847439497709,
"reward_after_std": 0.5616568475961685,
"reward_before_mean": -0.10039510112255812,
"reward_before_std": 0.5163733661174774,
"reward_change_max": 0.0,
"reward_change_mean": -0.17538963817059994,
"reward_change_min": -0.3024909198284149,
"reward_change_std": 0.1056766239926219,
"reward_std": 0.5616568718105555,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.20456177182495594,
"step": 141
},
{
"clip_fraction": 0.0,
"completion_length": 3058.2708587646484,
"epoch": 0.16228571428571428,
"grad_norm": 0.17221488058567047,
"kl": 0.004315614700317383,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.182328662904756e-07,
"loss": 0.0362,
"reward": -0.15579389221966267,
"reward_after_mean": -0.15579389221966267,
"reward_after_std": 0.4785591959953308,
"reward_before_mean": 0.07918158546090126,
"reward_before_std": 0.46206824481487274,
"reward_change_max": 0.0,
"reward_change_mean": -0.23497546650469303,
"reward_change_min": -0.3963175192475319,
"reward_change_std": 0.15385002456605434,
"reward_std": 0.4785592146217823,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.08748509921133518,
"step": 142
},
{
"clip_fraction": 0.0,
"completion_length": 3342.750030517578,
"epoch": 0.16342857142857142,
"grad_norm": 0.2854948937892914,
"kl": 0.006926536560058594,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0212,
"reward": -0.37999577447772026,
"reward_after_mean": -0.37999577447772026,
"reward_after_std": 0.3082189168781042,
"reward_before_mean": -0.2052184585481882,
"reward_before_std": 0.2710018716752529,
"reward_change_max": 0.0,
"reward_change_mean": -0.1747773140668869,
"reward_change_min": -0.27538760751485825,
"reward_change_std": 0.0982753811404109,
"reward_std": 0.3082189206033945,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.22605179494712502,
"step": 143
},
{
"clip_fraction": 0.0,
"completion_length": 3177.875,
"epoch": 0.16457142857142856,
"grad_norm": 0.19952303171157837,
"kl": 0.010839700698852539,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.0491243424323783e-07,
"loss": -0.0267,
"reward": -0.07587263640016317,
"reward_after_mean": -0.07587263640016317,
"reward_after_std": 0.5351244080811739,
"reward_before_mean": 0.18004657654091716,
"reward_before_std": 0.5149005856364965,
"reward_change_max": 0.0,
"reward_change_mean": -0.25591920129954815,
"reward_change_min": -0.3980993516743183,
"reward_change_std": 0.15681752562522888,
"reward_std": 0.5351244378834963,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/cosine_scaled_reward": -0.0491200964897871,
"step": 144
},
{
"clip_fraction": 0.0,
"completion_length": 2733.541717529297,
"epoch": 0.1657142857142857,
"grad_norm": 0.2626601457595825,
"kl": 0.0073996782302856445,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.9836319343816397e-07,
"loss": -0.0461,
"reward": 0.024279942736029625,
"reward_after_mean": 0.024279942736029625,
"reward_after_std": 0.4396217279136181,
"reward_before_mean": 0.3221998196095228,
"reward_before_std": 0.3181461291387677,
"reward_change_max": 0.0,
"reward_change_mean": -0.29791986010968685,
"reward_change_min": -0.40559068880975246,
"reward_change_std": 0.15221791248768568,
"reward_std": 0.4396217316389084,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.030533142387866974,
"step": 145
},
{
"clip_fraction": 0.0,
"completion_length": 3020.479232788086,
"epoch": 0.16685714285714287,
"grad_norm": 0.16485413908958435,
"kl": 0.0025103092193603516,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.918906036420294e-07,
"loss": 0.0027,
"reward": -0.39893655199557543,
"reward_after_mean": -0.39893655199557543,
"reward_after_std": 0.3212998090311885,
"reward_before_mean": -0.23035122826695442,
"reward_before_std": 0.293486341368407,
"reward_change_max": 0.0,
"reward_change_mean": -0.16858533024787903,
"reward_change_min": -0.318555012345314,
"reward_change_std": 0.11151506658643484,
"reward_std": 0.3212998118251562,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.2720178929157555,
"step": 146
},
{
"clip_fraction": 0.0,
"completion_length": 3294.8333740234375,
"epoch": 0.168,
"grad_norm": 0.1880612075328827,
"kl": 0.0035572052001953125,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0363,
"reward": -0.20948936231434345,
"reward_after_mean": -0.20948936231434345,
"reward_after_std": 0.5007001124322414,
"reward_before_mean": 0.00404274370521307,
"reward_before_std": 0.48459853883832693,
"reward_change_max": 0.0,
"reward_change_mean": -0.21353211253881454,
"reward_change_min": -0.40709931403398514,
"reward_change_std": 0.15078270249068737,
"reward_std": 0.5007001329213381,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/cosine_scaled_reward": -0.12095727026462555,
"step": 147
},
{
"clip_fraction": 0.0,
"completion_length": 3102.437530517578,
"epoch": 0.16914285714285715,
"grad_norm": 0.17371371388435364,
"kl": 0.003039836883544922,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.791832395815782e-07,
"loss": 0.039,
"reward": -0.17386498861014843,
"reward_after_mean": -0.17386498861014843,
"reward_after_std": 0.33598186261951923,
"reward_before_mean": 0.07372774556279182,
"reward_before_std": 0.2765905484557152,
"reward_change_max": 0.0,
"reward_change_mean": -0.2475927509367466,
"reward_change_min": -0.4050207640975714,
"reward_change_std": 0.14486555475741625,
"reward_std": 0.3359818644821644,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.09293892048299313,
"step": 148
},
{
"clip_fraction": 0.0,
"completion_length": 3192.541717529297,
"epoch": 0.1702857142857143,
"grad_norm": 0.14574302732944489,
"kl": 0.002967357635498047,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.729523361034538e-07,
"loss": 0.052,
"reward": -0.1852257801219821,
"reward_after_mean": -0.1852257801219821,
"reward_after_std": 0.49382951483130455,
"reward_before_mean": 0.04067722149193287,
"reward_before_std": 0.48999650962650776,
"reward_change_max": 0.0,
"reward_change_mean": -0.225903008133173,
"reward_change_min": -0.4073325898498297,
"reward_change_std": 0.15895004384219646,
"reward_std": 0.4938295166939497,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.10515611711889505,
"step": 149
},
{
"clip_fraction": 0.0,
"completion_length": 3230.937530517578,
"epoch": 0.17142857142857143,
"grad_norm": 0.17936724424362183,
"kl": 0.006089210510253906,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.6680582402757324e-07,
"loss": 0.073,
"reward": -0.1742450948804617,
"reward_after_mean": -0.1742450948804617,
"reward_after_std": 0.46301714703440666,
"reward_before_mean": 0.058416103944182396,
"reward_before_std": 0.4464929485693574,
"reward_change_max": 0.0,
"reward_change_mean": -0.23266121558845043,
"reward_change_min": -0.4085942395031452,
"reward_change_std": 0.15767131559550762,
"reward_std": 0.4630171600729227,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.08741722581908107,
"step": 150
},
{
"clip_fraction": 0.0,
"completion_length": 3226.8333740234375,
"epoch": 0.17257142857142857,
"grad_norm": 0.1740923374891281,
"kl": 0.004552721977233887,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0624,
"reward": -0.014221547171473503,
"reward_after_mean": -0.014221547171473503,
"reward_after_std": 0.5023653339594603,
"reward_before_mean": 0.26862012315541506,
"reward_before_std": 0.44354164972901344,
"reward_change_max": 0.0,
"reward_change_mean": -0.2828416656702757,
"reward_change_min": -0.4648453965783119,
"reward_change_std": 0.1752566946670413,
"reward_std": 0.5023653507232666,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": 0.018620114773511887,
"step": 151
},
{
"clip_fraction": 0.0,
"completion_length": 2961.770854949951,
"epoch": 0.1737142857142857,
"grad_norm": 0.23375259339809418,
"kl": 0.0030716657638549805,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.547734369542718e-07,
"loss": -0.0393,
"reward": -0.3499955367296934,
"reward_after_mean": -0.3499955367296934,
"reward_after_std": 0.39264952577650547,
"reward_before_mean": -0.17821490950882435,
"reward_before_std": 0.34836819861084223,
"reward_change_max": 0.0,
"reward_change_mean": -0.17178061790764332,
"reward_change_min": -0.26522634364664555,
"reward_change_std": 0.09525439888238907,
"reward_std": 0.3926495313644409,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.21988157741725445,
"step": 152
},
{
"clip_fraction": 0.0,
"completion_length": 3206.6458740234375,
"epoch": 0.17485714285714285,
"grad_norm": 0.21046775579452515,
"kl": 0.07404422760009766,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0096,
"reward": -0.26564084365963936,
"reward_after_mean": -0.26564084365963936,
"reward_after_std": 0.37918250635266304,
"reward_before_mean": -0.05798601661808789,
"reward_before_std": 0.3497645128518343,
"reward_change_max": 0.0,
"reward_change_mean": -0.20765481144189835,
"reward_change_min": -0.3374109137803316,
"reward_change_std": 0.12641454488039017,
"reward_std": 0.3791825193911791,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.16215269826352596,
"step": 153
},
{
"clip_fraction": 0.0,
"completion_length": 3517.7708740234375,
"epoch": 0.176,
"grad_norm": 0.14556263387203217,
"kl": 0.002063751220703125,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.4310073797187573e-07,
"loss": 0.007,
"reward": -0.034340658225119114,
"reward_after_mean": -0.034340658225119114,
"reward_after_std": 0.5944497399032116,
"reward_before_mean": 0.23581288009881973,
"reward_before_std": 0.6102161034941673,
"reward_change_max": 0.0,
"reward_change_mean": -0.27015353739261627,
"reward_change_min": -0.5426580011844635,
"reward_change_std": 0.20289112720638514,
"reward_std": 0.594449769705534,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": 0.006646204274147749,
"step": 154
},
{
"clip_fraction": 0.0,
"completion_length": 2816.2916717529297,
"epoch": 0.17714285714285713,
"grad_norm": 0.2057914435863495,
"kl": 0.04213452339172363,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0263,
"reward": 0.010001560673117638,
"reward_after_mean": 0.010001560673117638,
"reward_after_std": 0.3674746323376894,
"reward_before_mean": 0.31472931057214737,
"reward_before_std": 0.2635038308799267,
"reward_change_max": 0.0,
"reward_change_mean": -0.30472773127257824,
"reward_change_min": -0.4399359282106161,
"reward_change_std": 0.16246692463755608,
"reward_std": 0.36747463420033455,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.023062625899910927,
"step": 155
},
{
"clip_fraction": 0.0,
"completion_length": 3464.187530517578,
"epoch": 0.1782857142857143,
"grad_norm": 0.14835189282894135,
"kl": 0.0031020641326904297,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.3180194846605364e-07,
"loss": 0.0117,
"reward": -0.21181728318333626,
"reward_after_mean": -0.21181728318333626,
"reward_after_std": 0.47288844734430313,
"reward_before_mean": 0.003191556199453771,
"reward_before_std": 0.44311563204973936,
"reward_change_max": 0.0,
"reward_change_mean": -0.21500884927809238,
"reward_change_min": -0.3760814815759659,
"reward_change_std": 0.14059338811784983,
"reward_std": 0.472888458520174,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.14264179207384586,
"step": 156
},
{
"clip_fraction": 0.0,
"completion_length": 3396.875030517578,
"epoch": 0.17942857142857144,
"grad_norm": 0.16735728085041046,
"kl": 0.004496216773986816,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0078,
"reward": -0.26899631321430206,
"reward_after_mean": -0.26899631321430206,
"reward_after_std": 0.28598711267113686,
"reward_before_mean": -0.04634012281894684,
"reward_before_std": 0.2616874389350414,
"reward_change_max": 0.0,
"reward_change_mean": -0.22265619039535522,
"reward_change_min": -0.32796306163072586,
"reward_change_std": 0.1285459529608488,
"reward_std": 0.2859871182590723,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.15050679631531239,
"step": 157
},
{
"clip_fraction": 0.0,
"completion_length": 3461.1250610351562,
"epoch": 0.18057142857142858,
"grad_norm": 0.1638031005859375,
"kl": 0.002267122268676758,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.2089083427137329e-07,
"loss": 0.0207,
"reward": -0.06732317310525104,
"reward_after_mean": -0.06732317310525104,
"reward_after_std": 0.4877306241542101,
"reward_before_mean": 0.19353781951213023,
"reward_before_std": 0.40885435976088047,
"reward_change_max": 0.0,
"reward_change_mean": -0.2608609963208437,
"reward_change_min": -0.40801957063376904,
"reward_change_std": 0.15057573933154345,
"reward_std": 0.48773064091801643,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.014795510563999414,
"step": 158
},
{
"clip_fraction": 0.0,
"completion_length": 3426.4166870117188,
"epoch": 0.18171428571428572,
"grad_norm": 0.14988620579242706,
"kl": 0.015199661254882812,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0272,
"reward": -0.375995397567749,
"reward_after_mean": -0.375995397567749,
"reward_after_std": 0.30748917162418365,
"reward_before_mean": -0.19884050451219082,
"reward_before_std": 0.27203916758298874,
"reward_change_max": 0.0,
"reward_change_mean": -0.1771549005061388,
"reward_change_min": -0.3032604958862066,
"reward_change_std": 0.108861212618649,
"reward_std": 0.3074891772121191,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.24050716683268547,
"step": 159
},
{
"clip_fraction": 0.0,
"completion_length": 3183.500045776367,
"epoch": 0.18285714285714286,
"grad_norm": 0.1655987948179245,
"kl": 0.009595870971679688,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.1038068889975259e-07,
"loss": -0.0121,
"reward": -0.09110978245735168,
"reward_after_mean": -0.09110978245735168,
"reward_after_std": 0.6802807692438364,
"reward_before_mean": 0.14124558059847914,
"reward_before_std": 0.6676155887544155,
"reward_change_max": 0.0,
"reward_change_mean": -0.23235537484288216,
"reward_change_min": -0.43460565991699696,
"reward_change_std": 0.16776727978140116,
"reward_std": 0.6802807692438364,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": -0.08792108716443181,
"step": 160
},
{
"clip_fraction": 0.0,
"completion_length": 3223.041717529297,
"epoch": 0.184,
"grad_norm": 0.16486133635044098,
"kl": 0.032108306884765625,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.065,
"reward": -0.03917538747191429,
"reward_after_mean": -0.03917538747191429,
"reward_after_std": 0.4343126844614744,
"reward_before_mean": 0.24449945986270905,
"reward_before_std": 0.38117840187624097,
"reward_change_max": 0.0,
"reward_change_mean": -0.2836748603731394,
"reward_change_min": -0.4806545842438936,
"reward_change_std": 0.17813984956592321,
"reward_std": 0.43431270122528076,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/cosine_scaled_reward": 0.015332793816924095,
"step": 161
},
{
"clip_fraction": 0.0,
"completion_length": 3437.6458740234375,
"epoch": 0.18514285714285714,
"grad_norm": 0.2203022539615631,
"kl": 0.00640869140625,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.0028431734436308e-07,
"loss": 0.0205,
"reward": -0.13686673156917095,
"reward_after_mean": -0.13686673156917095,
"reward_after_std": 0.4702302608639002,
"reward_before_mean": 0.10987010970711708,
"reward_before_std": 0.4584337314590812,
"reward_change_max": 0.0,
"reward_change_mean": -0.24673686362802982,
"reward_change_min": -0.39340290054678917,
"reward_change_std": 0.1597390165552497,
"reward_std": 0.4702302720397711,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.09846324659883976,
"step": 162
},
{
"clip_fraction": 0.0,
"completion_length": 2954.437545776367,
"epoch": 0.18628571428571428,
"grad_norm": 0.1848427951335907,
"kl": 0.021541118621826172,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0242,
"reward": 0.15330182015895844,
"reward_after_mean": 0.15330182015895844,
"reward_after_std": 0.32201647013425827,
"reward_before_mean": 0.5199118303135037,
"reward_before_std": 0.22552276588976383,
"reward_change_max": 0.0,
"reward_change_mean": -0.3666100241243839,
"reward_change_min": -0.5059312395751476,
"reward_change_std": 0.1948932707309723,
"reward_std": 0.3220164868980646,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.16574516613036394,
"step": 163
},
{
"clip_fraction": 0.0,
"completion_length": 3017.562530517578,
"epoch": 0.18742857142857142,
"grad_norm": 0.34415239095687866,
"kl": 0.005047798156738281,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.9061402047871833e-07,
"loss": 0.0798,
"reward": -0.09205959737300873,
"reward_after_mean": -0.09205959737300873,
"reward_after_std": 0.3648469839245081,
"reward_before_mean": 0.18029571324586868,
"reward_before_std": 0.3099568961188197,
"reward_change_max": 0.0,
"reward_change_mean": -0.27235531620681286,
"reward_change_min": -0.41520175337791443,
"reward_change_std": 0.1586802341043949,
"reward_std": 0.36484698951244354,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.048870958387851715,
"step": 164
},
{
"clip_fraction": 0.0,
"completion_length": 3212.062530517578,
"epoch": 0.18857142857142858,
"grad_norm": 0.2587808668613434,
"kl": 0.005465984344482422,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0868,
"reward": -0.16799346357584,
"reward_after_mean": -0.16799346357584,
"reward_after_std": 0.5451687276363373,
"reward_before_mean": 0.05025888653472066,
"reward_before_std": 0.5034487005323172,
"reward_change_max": 0.0,
"reward_change_mean": -0.21825237199664116,
"reward_change_min": -0.3237415961921215,
"reward_change_std": 0.1249654246494174,
"reward_std": 0.545168736949563,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.11640777194406837,
"step": 165
},
{
"clip_fraction": 0.0,
"completion_length": 3372.4791870117188,
"epoch": 0.18971428571428572,
"grad_norm": 0.15933357179164886,
"kl": 0.0029315948486328125,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.8138158006995363e-07,
"loss": 0.0526,
"reward": -0.01668019499629736,
"reward_after_mean": -0.01668019499629736,
"reward_after_std": 0.5762727987021208,
"reward_before_mean": 0.2592314127832651,
"reward_before_std": 0.5716875530779362,
"reward_change_max": 0.0,
"reward_change_mean": -0.27591159753501415,
"reward_change_min": -0.48107208497822285,
"reward_change_std": 0.1874433197081089,
"reward_std": 0.5762728247791529,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": 0.009231406031176448,
"step": 166
},
{
"clip_fraction": 0.0,
"completion_length": 2937.5833892822266,
"epoch": 0.19085714285714286,
"grad_norm": 0.15646594762802124,
"kl": 0.022654056549072266,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0779,
"reward": -0.2482462339103222,
"reward_after_mean": -0.2482462339103222,
"reward_after_std": 0.40840402990579605,
"reward_before_mean": -0.035066988319158554,
"reward_before_std": 0.3997558169066906,
"reward_change_max": 0.0,
"reward_change_mean": -0.21317926235496998,
"reward_change_min": -0.3921571187674999,
"reward_change_std": 0.1473480286076665,
"reward_std": 0.40840404108166695,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.13923365552909672,
"step": 167
},
{
"clip_fraction": 0.0,
"completion_length": 3454.979217529297,
"epoch": 0.192,
"grad_norm": 0.15156163275241852,
"kl": 0.0032837390899658203,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.7259824442455923e-07,
"loss": 0.0294,
"reward": 0.038655126467347145,
"reward_after_mean": 0.038655126467347145,
"reward_after_std": 0.5489779710769653,
"reward_before_mean": 0.3417097292840481,
"reward_before_std": 0.5534890927374363,
"reward_change_max": 0.0,
"reward_change_mean": -0.3030546009540558,
"reward_change_min": -0.5080073494464159,
"reward_change_std": 0.2078774282708764,
"reward_std": 0.5489780027419329,
"rewards/accuracy_reward": 0.2916666753590107,
"rewards/cosine_scaled_reward": 0.05004306013870519,
"step": 168
},
{
"clip_fraction": 0.0,
"completion_length": 2757.8333587646484,
"epoch": 0.19314285714285714,
"grad_norm": 0.20235563814640045,
"kl": 0.0031278133392333984,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.016,
"reward": 0.299284853041172,
"reward_after_mean": 0.299284853041172,
"reward_after_std": 0.3482187706977129,
"reward_before_mean": 0.7110624257475138,
"reward_before_std": 0.21563976723700762,
"reward_change_max": 0.0,
"reward_change_mean": -0.41177756898105145,
"reward_change_min": -0.5598046462982893,
"reward_change_std": 0.2105923229828477,
"reward_std": 0.3482187818735838,
"rewards/accuracy_reward": 0.5,
"rewards/cosine_scaled_reward": 0.21106241270899773,
"step": 169
},
{
"clip_fraction": 0.0,
"completion_length": 2691.5000381469727,
"epoch": 0.19428571428571428,
"grad_norm": 0.19220662117004395,
"kl": 0.012215614318847656,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.6427471468404952e-07,
"loss": 0.001,
"reward": -0.0186636159196496,
"reward_after_mean": -0.0186636159196496,
"reward_after_std": 0.37129569984972477,
"reward_before_mean": 0.2765026893466711,
"reward_before_std": 0.2835942036472261,
"reward_change_max": 0.0,
"reward_change_mean": -0.2951663248240948,
"reward_change_min": -0.4515752512961626,
"reward_change_std": 0.16774821933358908,
"reward_std": 0.37129571102559566,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": -0.015163990668952465,
"step": 170
},
{
"clip_fraction": 0.0,
"completion_length": 3110.604217529297,
"epoch": 0.19542857142857142,
"grad_norm": 0.16165022552013397,
"kl": 0.002307415008544922,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0289,
"reward": 0.09521710127592087,
"reward_after_mean": 0.09521710127592087,
"reward_after_std": 0.39058924093842506,
"reward_before_mean": 0.4361223494634032,
"reward_before_std": 0.35083947516977787,
"reward_change_max": 0.0,
"reward_change_mean": -0.34090524166822433,
"reward_change_min": -0.5021266676485538,
"reward_change_std": 0.19604680873453617,
"reward_std": 0.3905892614275217,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/cosine_scaled_reward": 0.10278898943215609,
"step": 171
},
{
"clip_fraction": 0.0,
"completion_length": 3107.166702270508,
"epoch": 0.19657142857142856,
"grad_norm": 0.18883849680423737,
"kl": 0.0051097869873046875,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.5642113178727193e-07,
"loss": 0.0285,
"reward": -0.044252441730350256,
"reward_after_mean": -0.044252441730350256,
"reward_after_std": 0.4415896963328123,
"reward_before_mean": 0.2377479849383235,
"reward_before_std": 0.41535679809749126,
"reward_change_max": 0.0,
"reward_change_mean": -0.28200042992830276,
"reward_change_min": -0.4121879041194916,
"reward_change_std": 0.1653971141204238,
"reward_std": 0.4415897000581026,
"rewards/accuracy_reward": 0.22916667722165585,
"rewards/cosine_scaled_reward": 0.00858130888082087,
"step": 172
},
{
"clip_fraction": 0.0,
"completion_length": 2682.958351135254,
"epoch": 0.1977142857142857,
"grad_norm": 0.20074620842933655,
"kl": 0.010714054107666016,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.058,
"reward": -0.23622408602386713,
"reward_after_mean": -0.23622408602386713,
"reward_after_std": 0.5661862269043922,
"reward_before_mean": -0.04538543475791812,
"reward_before_std": 0.539402324706316,
"reward_change_max": 0.0,
"reward_change_mean": -0.19083864893764257,
"reward_change_min": -0.3895431775599718,
"reward_change_std": 0.1356267612427473,
"reward_std": 0.5661862548440695,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.1703854314982891,
"step": 173
},
{
"clip_fraction": 0.0,
"completion_length": 3129.750030517578,
"epoch": 0.19885714285714284,
"grad_norm": 0.17718930542469025,
"kl": 0.013014793395996094,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.4904706411523448e-07,
"loss": 0.0326,
"reward": -0.2630236418917775,
"reward_after_mean": -0.2630236418917775,
"reward_after_std": 0.5134491641074419,
"reward_before_mean": -0.07400929369032383,
"reward_before_std": 0.48160199262201786,
"reward_change_max": 0.0,
"reward_change_mean": -0.1890143509954214,
"reward_change_min": -0.3234012946486473,
"reward_change_std": 0.120835080742836,
"reward_std": 0.513449190184474,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.1781759625300765,
"step": 174
},
{
"clip_fraction": 0.0,
"completion_length": 3009.333366394043,
"epoch": 0.2,
"grad_norm": 0.18382960557937622,
"kl": 0.06249094009399414,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0123,
"reward": 0.026797737926244736,
"reward_after_mean": 0.026797737926244736,
"reward_after_std": 0.4659009985625744,
"reward_before_mean": 0.3378504253923893,
"reward_before_std": 0.48004232347011566,
"reward_change_max": 0.0,
"reward_change_mean": -0.3110526669770479,
"reward_change_min": -0.5042965784668922,
"reward_change_std": 0.2051269579678774,
"reward_std": 0.46590100042521954,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/cosine_scaled_reward": 0.06701706536114216,
"step": 175
},
{
"clip_fraction": 0.0,
"completion_length": 3047.583366394043,
"epoch": 0.20114285714285715,
"grad_norm": 0.26642507314682007,
"kl": 0.01358795166015625,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.4216149583350755e-07,
"loss": 0.0936,
"reward": -0.019476521760225296,
"reward_after_mean": -0.019476521760225296,
"reward_after_std": 0.6982401926070452,
"reward_before_mean": 0.24331039190292358,
"reward_before_std": 0.731185233220458,
"reward_change_max": 0.0,
"reward_change_mean": -0.2627869173884392,
"reward_change_min": -0.5728251449763775,
"reward_change_std": 0.22166733164340258,
"reward_std": 0.6982402224093676,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/cosine_scaled_reward": -0.02752294298261404,
"step": 176
},
{
"clip_fraction": 0.0,
"completion_length": 3382.166748046875,
"epoch": 0.2022857142857143,
"grad_norm": 0.19716955721378326,
"kl": 0.006411075592041016,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0274,
"reward": -0.14210877695586532,
"reward_after_mean": -0.14210877695586532,
"reward_after_std": 0.4888409748673439,
"reward_before_mean": 0.09973286651074886,
"reward_before_std": 0.4782033069059253,
"reward_change_max": 0.0,
"reward_change_mean": -0.2418416514992714,
"reward_change_min": -0.4065688345581293,
"reward_change_std": 0.16381614096462727,
"reward_std": 0.4888409972190857,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.06693380372598767,
"step": 177
},
{
"clip_fraction": 0.0,
"completion_length": 3090.229217529297,
"epoch": 0.20342857142857143,
"grad_norm": 0.17369556427001953,
"kl": 0.011487960815429688,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.3577281594640182e-07,
"loss": -0.0599,
"reward": -0.03678634762763977,
"reward_after_mean": -0.03678634762763977,
"reward_after_std": 0.481352299451828,
"reward_before_mean": 0.24844557233154774,
"reward_before_std": 0.49319163616746664,
"reward_change_max": 0.0,
"reward_change_mean": -0.2852319311350584,
"reward_change_min": -0.47761483304202557,
"reward_change_std": 0.19820824172347784,
"reward_std": 0.48135231621563435,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": 0.01927890069782734,
"step": 178
},
{
"clip_fraction": 0.0,
"completion_length": 3365.750030517578,
"epoch": 0.20457142857142857,
"grad_norm": 0.1561906337738037,
"kl": 0.0033054351806640625,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.3276726544494571e-07,
"loss": -0.0253,
"reward": -0.40081705152988434,
"reward_after_mean": -0.40081705152988434,
"reward_after_std": 0.29850259609520435,
"reward_before_mean": -0.23233469319529831,
"reward_before_std": 0.2521834969520569,
"reward_change_max": 0.0,
"reward_change_mean": -0.16848235949873924,
"reward_change_min": -0.2541333418339491,
"reward_change_std": 0.09144267160445452,
"reward_std": 0.29850260354578495,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.25316802971065044,
"step": 179
},
{
"clip_fraction": 0.0,
"completion_length": 2725.0833587646484,
"epoch": 0.2057142857142857,
"grad_norm": 0.19090747833251953,
"kl": 0.011870384216308594,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.2988880807625927e-07,
"loss": 0.0402,
"reward": 0.11349364370107651,
"reward_after_mean": 0.11349364370107651,
"reward_after_std": 0.312832809984684,
"reward_before_mean": 0.4614496771246195,
"reward_before_std": 0.17704920517280698,
"reward_change_max": 0.0,
"reward_change_mean": -0.3479560390114784,
"reward_change_min": -0.4591350872069597,
"reward_change_std": 0.17630962189286947,
"reward_std": 0.3128328137099743,
"rewards/accuracy_reward": 0.39583333395421505,
"rewards/cosine_scaled_reward": 0.0656163152307272,
"step": 180
},
{
"clip_fraction": 0.0,
"completion_length": 3343.1458435058594,
"epoch": 0.20685714285714285,
"grad_norm": 0.1572800576686859,
"kl": 0.004780292510986328,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0256,
"reward": -0.07740832306444645,
"reward_after_mean": -0.07740832306444645,
"reward_after_std": 0.4118566922843456,
"reward_before_mean": 0.19327654596418142,
"reward_before_std": 0.3510447759181261,
"reward_change_max": 0.0,
"reward_change_mean": -0.2706848792731762,
"reward_change_min": -0.419716427102685,
"reward_change_std": 0.15835009142756462,
"reward_std": 0.41185671649873257,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/cosine_scaled_reward": -0.015056788921356201,
"step": 181
},
{
"clip_fraction": 0.0,
"completion_length": 2883.9166870117188,
"epoch": 0.208,
"grad_norm": 0.1602364033460617,
"kl": 0.00803762674331665,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.2451664098030743e-07,
"loss": -0.0218,
"reward": -0.044728060718625784,
"reward_after_mean": -0.044728060718625784,
"reward_after_std": 0.42621047236025333,
"reward_before_mean": 0.23698624456301332,
"reward_before_std": 0.3748616073280573,
"reward_change_max": 0.0,
"reward_change_mean": -0.28171432204544544,
"reward_change_min": -0.42212608829140663,
"reward_change_std": 0.16775514092296362,
"reward_std": 0.4262104816734791,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": 0.007819579914212227,
"step": 182
},
{
"clip_fraction": 0.0,
"completion_length": 3177.2500610351562,
"epoch": 0.20914285714285713,
"grad_norm": 0.1729901134967804,
"kl": 0.046076297760009766,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.220245676671809e-07,
"loss": -0.0188,
"reward": -0.0808305200189352,
"reward_after_mean": -0.0808305200189352,
"reward_after_std": 0.47396935522556305,
"reward_before_mean": 0.1872396506369114,
"reward_before_std": 0.4797352682799101,
"reward_change_max": 0.0,
"reward_change_mean": -0.2680701520293951,
"reward_change_min": -0.47748228162527084,
"reward_change_std": 0.18410342279821634,
"reward_std": 0.47396937943995,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/cosine_scaled_reward": -0.02109370008111,
"step": 183
},
{
"clip_fraction": 0.0,
"completion_length": 3157.9166870117188,
"epoch": 0.2102857142857143,
"grad_norm": 0.19053320586681366,
"kl": 0.003658771514892578,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1966285981663407e-07,
"loss": -0.0184,
"reward": -0.3129790127277374,
"reward_after_mean": -0.3129790127277374,
"reward_after_std": 0.2926772404462099,
"reward_before_mean": -0.10822748765349388,
"reward_before_std": 0.2700198283419013,
"reward_change_max": 0.0,
"reward_change_mean": -0.2047515194863081,
"reward_change_min": -0.31184492260217667,
"reward_change_std": 0.12345388997346163,
"reward_std": 0.29267724975943565,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.19156083092093468,
"step": 184
},
{
"clip_fraction": 0.0,
"completion_length": 3116.875,
"epoch": 0.21142857142857144,
"grad_norm": 0.23735588788986206,
"kl": 0.011540412902832031,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0255,
"reward": -0.27555170468986034,
"reward_after_mean": -0.27555170468986034,
"reward_after_std": 0.37195089366286993,
"reward_before_mean": -0.0702610481530428,
"reward_before_std": 0.34090360533446074,
"reward_change_max": 0.0,
"reward_change_mean": -0.2052906509488821,
"reward_change_min": -0.31347635202109814,
"reward_change_std": 0.12224087584763765,
"reward_std": 0.3719508945941925,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.1744277123361826,
"step": 185
},
{
"clip_fraction": 0.0,
"completion_length": 3370.000030517578,
"epoch": 0.21257142857142858,
"grad_norm": 0.23551151156425476,
"kl": 0.004643917083740234,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1533337816991931e-07,
"loss": 0.0291,
"reward": -0.020450815558433533,
"reward_after_mean": -0.020450815558433533,
"reward_after_std": 0.43135653622448444,
"reward_before_mean": 0.27618860453367233,
"reward_before_std": 0.42817492596805096,
"reward_change_max": 0.0,
"reward_change_mean": -0.29663942754268646,
"reward_change_min": -0.45596072264015675,
"reward_change_std": 0.18588601425290108,
"reward_std": 0.4313565380871296,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/cosine_scaled_reward": -0.015478070825338364,
"step": 186
},
{
"clip_fraction": 0.0,
"completion_length": 3190.541717529297,
"epoch": 0.21371428571428572,
"grad_norm": 0.23826000094413757,
"kl": 0.0100555419921875,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0412,
"reward": -0.22396802809089422,
"reward_after_mean": -0.22396802809089422,
"reward_after_std": 0.3726756442338228,
"reward_before_mean": 0.001834167167544365,
"reward_before_std": 0.3483778089284897,
"reward_change_max": 0.0,
"reward_change_mean": -0.22580219060182571,
"reward_change_min": -0.34733813256025314,
"reward_change_std": 0.13398846238851547,
"reward_std": 0.37267564609646797,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.12316584587097168,
"step": 187
},
{
"clip_fraction": 0.0,
"completion_length": 3546.5625,
"epoch": 0.21485714285714286,
"grad_norm": 0.15860070288181305,
"kl": 0.0028228759765625,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1153347084664419e-07,
"loss": 0.023,
"reward": -0.3123414749279618,
"reward_after_mean": -0.3123414749279618,
"reward_after_std": 0.4071262162178755,
"reward_before_mean": -0.1276569024194032,
"reward_before_std": 0.36976926028728485,
"reward_change_max": 0.0,
"reward_change_mean": -0.18468456342816353,
"reward_change_min": -0.2755823079496622,
"reward_change_std": 0.10156355146318674,
"reward_std": 0.4071262273937464,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.16932356543838978,
"step": 188
},
{
"clip_fraction": 0.0,
"completion_length": 3185.979217529297,
"epoch": 0.216,
"grad_norm": 0.17385567724704742,
"kl": 0.005602836608886719,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0385,
"reward": -0.16023958660662174,
"reward_after_mean": -0.16023958660662174,
"reward_after_std": 0.5064278487116098,
"reward_before_mean": 0.07299725105985999,
"reward_before_std": 0.5011386927217245,
"reward_change_max": 0.0,
"reward_change_mean": -0.23323685117065907,
"reward_change_min": -0.41370217502117157,
"reward_change_std": 0.15955450013279915,
"reward_std": 0.5064278729259968,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.09366941265761852,
"step": 189
},
{
"clip_fraction": 0.0,
"completion_length": 3201.375030517578,
"epoch": 0.21714285714285714,
"grad_norm": 0.14921893179416656,
"kl": 0.009759902954101562,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0826776744855121e-07,
"loss": -0.0151,
"reward": -0.11116122780367732,
"reward_after_mean": -0.11116122780367732,
"reward_after_std": 0.42286976985633373,
"reward_before_mean": 0.14645380340516567,
"reward_before_std": 0.37099423445761204,
"reward_change_max": 0.0,
"reward_change_mean": -0.2576150503009558,
"reward_change_min": -0.4218386374413967,
"reward_change_std": 0.15646861772984266,
"reward_std": 0.42286976985633373,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/cosine_scaled_reward": -0.041046179831027985,
"step": 190
},
{
"clip_fraction": 0.0,
"completion_length": 2774.8333587646484,
"epoch": 0.21828571428571428,
"grad_norm": 0.18684116005897522,
"kl": 0.02057647705078125,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.068365111445064e-07,
"loss": -0.0042,
"reward": -0.05179838836193085,
"reward_after_mean": -0.05179838836193085,
"reward_after_std": 0.40950570069253445,
"reward_before_mean": 0.22851569892372936,
"reward_before_std": 0.35111323557794094,
"reward_change_max": 0.0,
"reward_change_mean": -0.280314102768898,
"reward_change_min": -0.4044586792588234,
"reward_change_std": 0.15675450582057238,
"reward_std": 0.4095057025551796,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": -0.02148430421948433,
"step": 191
},
{
"clip_fraction": 0.0,
"completion_length": 3424.0,
"epoch": 0.21942857142857142,
"grad_norm": 0.15981534123420715,
"kl": 0.002536296844482422,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0554024673218806e-07,
"loss": 0.0249,
"reward": -0.2669062092900276,
"reward_after_mean": -0.2669062092900276,
"reward_after_std": 0.2870886046439409,
"reward_before_mean": -0.046382976695895195,
"reward_before_std": 0.2581921275705099,
"reward_change_max": 0.0,
"reward_change_mean": -0.22052323259413242,
"reward_change_min": -0.3341723680496216,
"reward_change_std": 0.1262814048677683,
"reward_std": 0.2870886102318764,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.15054964646697044,
"step": 192
},
{
"clip_fraction": 0.0,
"completion_length": 3204.416702270508,
"epoch": 0.22057142857142858,
"grad_norm": 0.17141887545585632,
"kl": 0.0038933753967285156,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0145,
"reward": -0.23873800691217184,
"reward_after_mean": -0.23873800691217184,
"reward_after_std": 0.4927209597080946,
"reward_before_mean": -0.037118949461728334,
"reward_before_std": 0.4673497211188078,
"reward_change_max": 0.0,
"reward_change_mean": -0.20161907002329826,
"reward_change_min": -0.3459734059870243,
"reward_change_std": 0.12958138808608055,
"reward_std": 0.4927209783345461,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/cosine_scaled_reward": -0.1412856113165617,
"step": 193
},
{
"clip_fraction": 0.0,
"completion_length": 3230.7083740234375,
"epoch": 0.22171428571428572,
"grad_norm": 0.17490136623382568,
"kl": 0.0017626285552978516,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0335423176140511e-07,
"loss": 0.0291,
"reward": 0.22551898658275604,
"reward_after_mean": 0.22551898658275604,
"reward_after_std": 0.6343558058142662,
"reward_before_mean": 0.5872141793370247,
"reward_before_std": 0.653206454589963,
"reward_change_max": 0.0,
"reward_change_mean": -0.36169516295194626,
"reward_change_min": -0.5719391945749521,
"reward_change_std": 0.23771104868501425,
"reward_std": 0.6343558225780725,
"rewards/accuracy_reward": 0.39583334885537624,
"rewards/cosine_scaled_reward": 0.19138083781581372,
"step": 194
},
{
"clip_fraction": 0.0,
"completion_length": 3455.5208435058594,
"epoch": 0.22285714285714286,
"grad_norm": 0.16181236505508423,
"kl": 0.002017974853515625,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0155,
"reward": -0.26167353615164757,
"reward_after_mean": -0.26167353615164757,
"reward_after_std": 0.3914919700473547,
"reward_before_mean": -0.0532490611076355,
"reward_before_std": 0.36043376103043556,
"reward_change_max": 0.0,
"reward_change_mean": -0.20842446759343147,
"reward_change_min": -0.3214227482676506,
"reward_change_std": 0.1241202037781477,
"reward_std": 0.3914919812232256,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.15741572994738817,
"step": 195
},
{
"clip_fraction": 0.0,
"completion_length": 3523.500030517578,
"epoch": 0.224,
"grad_norm": 0.16005918383598328,
"kl": 0.0024111270904541016,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.017123858587145e-07,
"loss": 0.0115,
"reward": -0.28297955449670553,
"reward_after_mean": -0.28297955449670553,
"reward_after_std": 0.40249344892799854,
"reward_before_mean": -0.08251086995005608,
"reward_before_std": 0.3783824220299721,
"reward_change_max": 0.0,
"reward_change_mean": -0.20046869292855263,
"reward_change_min": -0.32744314707815647,
"reward_change_std": 0.12627543695271015,
"reward_std": 0.40249346010386944,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.18667752863257192,
"step": 196
},
{
"clip_fraction": 0.0,
"completion_length": 3195.062530517578,
"epoch": 0.22514285714285714,
"grad_norm": 0.21763946115970612,
"kl": 0.11150646209716797,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0734,
"reward": 0.14665383007377386,
"reward_after_mean": 0.14665383007377386,
"reward_after_std": 0.775845367461443,
"reward_before_mean": 0.46151648461818695,
"reward_before_std": 0.8179350979626179,
"reward_change_max": 0.0,
"reward_change_mean": -0.31486267410218716,
"reward_change_min": -0.6177782695740461,
"reward_change_std": 0.24829469621181488,
"reward_std": 0.7758453991264105,
"rewards/accuracy_reward": 0.35416667722165585,
"rewards/cosine_scaled_reward": 0.10734981670975685,
"step": 197
},
{
"clip_fraction": 0.0,
"completion_length": 3091.125,
"epoch": 0.22628571428571428,
"grad_norm": 0.17106440663337708,
"kl": 0.005677938461303711,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0061670936044178e-07,
"loss": 0.0149,
"reward": -0.12829723954200745,
"reward_after_mean": -0.12829723954200745,
"reward_after_std": 0.3686715345829725,
"reward_before_mean": 0.13360333256423473,
"reward_before_std": 0.35308009944856167,
"reward_change_max": 0.0,
"reward_change_mean": -0.2619005683809519,
"reward_change_min": -0.4114610478281975,
"reward_change_std": 0.16121340077370405,
"reward_std": 0.36867155507206917,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.05389668419957161,
"step": 198
},
{
"clip_fraction": 0.0,
"completion_length": 3546.1666870117188,
"epoch": 0.22742857142857142,
"grad_norm": 0.15505783259868622,
"kl": 0.0044994354248046875,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.002741278414069e-07,
"loss": -0.0064,
"reward": -0.3825234118849039,
"reward_after_mean": -0.3825234118849039,
"reward_after_std": 0.2886116597801447,
"reward_before_mean": -0.2058500237762928,
"reward_before_std": 0.2376685068011284,
"reward_change_max": 0.0,
"reward_change_mean": -0.17667338997125626,
"reward_change_min": -0.2671913430094719,
"reward_change_std": 0.0952356569468975,
"reward_std": 0.28861166536808014,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.22668336611241102,
"step": 199
},
{
"clip_fraction": 0.0,
"completion_length": 2987.3541870117188,
"epoch": 0.22857142857142856,
"grad_norm": 0.18018940091133118,
"kl": 0.0025665760040283203,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0006853717962393e-07,
"loss": 0.0384,
"reward": 0.1265381295233965,
"reward_after_mean": 0.1265381295233965,
"reward_after_std": 0.6119197029620409,
"reward_before_mean": 0.44888845831155777,
"reward_before_std": 0.5870621418580413,
"reward_change_max": 0.0,
"reward_change_mean": -0.32235031202435493,
"reward_change_min": -0.5511045381426811,
"reward_change_std": 0.2166948076337576,
"reward_std": 0.6119197197258472,
"rewards/accuracy_reward": 0.33333333767950535,
"rewards/cosine_scaled_reward": 0.11555509176105261,
"step": 200
},
{
"epoch": 0.22857142857142856,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.0197520790877752,
"train_runtime": 32053.2934,
"train_samples_per_second": 0.3,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}