{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.22857142857142856,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_fraction": 0.0,
      "completion_length": 2571.2083587646484,
      "epoch": 0.001142857142857143,
      "grad_norm": 0.22715197503566742,
      "kl": 0.0,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 0.0,
      "loss": 0.0941,
      "reward": -0.05164351873099804,
      "reward_after_mean": -0.05164351873099804,
      "reward_after_std": 0.5470927599817514,
      "reward_before_mean": 0.21363236638717353,
      "reward_before_std": 0.541789973154664,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2652758788317442,
      "reward_change_min": -0.4747793525457382,
      "reward_change_std": 0.18003974109888077,
      "reward_std": 0.5470927748829126,
      "rewards/accuracy_reward": 0.22916667349636555,
      "rewards/cosine_scaled_reward": -0.015534311532974243,
      "step": 1
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2804.395881652832,
      "epoch": 0.002285714285714286,
      "grad_norm": 0.21158379316329956,
      "kl": 0.0,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 5e-08,
      "loss": 0.0288,
      "reward": -0.10030801966786385,
      "reward_after_mean": -0.10030801966786385,
      "reward_after_std": 0.2960502114146948,
      "reward_before_mean": 0.179365461692214,
      "reward_before_std": 0.2432677550241351,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.27967348881065845,
      "reward_change_min": -0.40170327201485634,
      "reward_change_std": 0.154046350158751,
      "reward_std": 0.29605022072792053,
      "rewards/accuracy_reward": 0.2291666716337204,
      "rewards/cosine_scaled_reward": -0.04980122856795788,
      "step": 2
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3309.8541717529297,
      "epoch": 0.0034285714285714284,
      "grad_norm": 0.16393998265266418,
      "kl": 4.11495566368103e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 1e-07,
      "loss": -0.0221,
      "reward": -0.3395198956131935,
      "reward_after_mean": -0.3395198956131935,
      "reward_after_std": 0.30841588601469994,
      "reward_before_mean": -0.14859933033585548,
      "reward_before_std": 0.2812267681583762,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.19092058949172497,
      "reward_change_min": -0.3236966449767351,
      "reward_change_std": 0.11970376130193472,
      "reward_std": 0.30841588601469994,
      "rewards/accuracy_reward": 0.0625,
      "rewards/cosine_scaled_reward": -0.2110993228852749,
      "step": 3
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2138.9791717529297,
      "epoch": 0.004571428571428572,
      "grad_norm": 0.3178713321685791,
      "kl": 3.808736801147461e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 1.5e-07,
      "loss": 0.0416,
      "reward": -0.010286737233400345,
      "reward_after_mean": -0.010286737233400345,
      "reward_after_std": 0.6836549900472164,
      "reward_before_mean": 0.25457675755023956,
      "reward_before_std": 0.7051120875403285,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.26486349664628506,
      "reward_change_min": -0.5560955684632063,
      "reward_change_std": 0.2087760465219617,
      "reward_std": 0.6836549993604422,
      "rewards/accuracy_reward": 0.25000000558793545,
      "rewards/cosine_scaled_reward": 0.004576747305691242,
      "step": 4
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3392.854217529297,
      "epoch": 0.005714285714285714,
      "grad_norm": 0.16369687020778656,
      "kl": 4.531443119049072e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 2e-07,
      "loss": -0.0361,
      "reward": -0.23667924478650093,
      "reward_after_mean": -0.23667924478650093,
      "reward_after_std": 0.3961083684116602,
      "reward_before_mean": -0.015432212501764297,
      "reward_before_std": 0.3830429194495082,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.22124702110886574,
      "reward_change_min": -0.3868277929723263,
      "reward_change_std": 0.1524029728025198,
      "reward_std": 0.39610837027430534,
      "rewards/accuracy_reward": 0.125,
      "rewards/cosine_scaled_reward": -0.1404322199523449,
      "step": 5
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2958.354217529297,
      "epoch": 0.006857142857142857,
      "grad_norm": 0.1846713125705719,
      "kl": 4.254281520843506e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 2.5e-07,
      "loss": 0.0131,
      "reward": -0.188864734955132,
      "reward_after_mean": -0.188864734955132,
      "reward_after_std": 0.4999152459204197,
      "reward_before_mean": 0.03220596443861723,
      "reward_before_std": 0.4855938693508506,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2210706938058138,
      "reward_change_min": -0.401586489751935,
      "reward_change_std": 0.15394792892038822,
      "reward_std": 0.4999152459204197,
      "rewards/accuracy_reward": 0.12500000186264515,
      "rewards/cosine_scaled_reward": -0.09279404580593109,
      "step": 6
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3168.375030517578,
      "epoch": 0.008,
      "grad_norm": 0.14882907271385193,
      "kl": 2.9325485229492188e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 3e-07,
      "loss": 0.0437,
      "reward": -0.12195562478154898,
      "reward_after_mean": -0.12195562478154898,
      "reward_after_std": 0.5116294100880623,
      "reward_before_mean": 0.11521910736337304,
      "reward_before_std": 0.42447544634342194,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.23717475309967995,
      "reward_change_min": -0.3341871611773968,
      "reward_change_std": 0.12419951800256968,
      "reward_std": 0.5116294212639332,
      "rewards/accuracy_reward": 0.18750000186264515,
      "rewards/cosine_scaled_reward": -0.07228088192641735,
      "step": 7
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2867.7708740234375,
      "epoch": 0.009142857142857144,
      "grad_norm": 0.18323290348052979,
      "kl": 1.5871599316596985e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 3.5e-07,
      "loss": -0.0588,
      "reward": 0.05553785338997841,
      "reward_after_mean": 0.05553785338997841,
      "reward_after_std": 0.6334318313747644,
      "reward_before_mean": 0.3454107344150543,
      "reward_before_std": 0.5995671562850475,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.28987287171185017,
      "reward_change_min": -0.49663791060447693,
      "reward_change_std": 0.19086131732910872,
      "reward_std": 0.633431838825345,
      "rewards/accuracy_reward": 0.2916666679084301,
      "rewards/cosine_scaled_reward": 0.05374404788017273,
      "step": 8
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3102.6250915527344,
      "epoch": 0.010285714285714285,
      "grad_norm": 0.19616039097309113,
      "kl": 3.674440085887909e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 4e-07,
      "loss": -0.0234,
      "reward": -0.16837336937896907,
      "reward_after_mean": -0.16837336937896907,
      "reward_after_std": 0.4944228585809469,
      "reward_before_mean": 0.06263772025704384,
      "reward_before_std": 0.4829963054507971,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.23101108148694038,
      "reward_change_min": -0.42128635197877884,
      "reward_change_std": 0.15471239853650331,
      "reward_std": 0.4944228623062372,
      "rewards/accuracy_reward": 0.14583333767950535,
      "rewards/cosine_scaled_reward": -0.08319561230018735,
      "step": 9
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2681.9791717529297,
      "epoch": 0.011428571428571429,
      "grad_norm": 0.18401016294956207,
      "kl": 3.248453140258789e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 4.5e-07,
      "loss": -0.0059,
      "reward": -0.11956031061708927,
      "reward_after_mean": -0.11956031061708927,
      "reward_after_std": 0.5619859658181667,
      "reward_before_mean": 0.11641145590692759,
      "reward_before_std": 0.5482429880648851,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.23597176373004913,
      "reward_change_min": -0.4013804756104946,
      "reward_change_std": 0.15929853450506926,
      "reward_std": 0.5619859807193279,
      "rewards/accuracy_reward": 0.1875000037252903,
      "rewards/cosine_scaled_reward": -0.0710885627195239,
      "step": 10
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3318.6041870117188,
      "epoch": 0.012571428571428572,
      "grad_norm": 0.1448400914669037,
      "kl": 3.1188130378723145e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 5e-07,
      "loss": 0.0127,
      "reward": -0.2986925896257162,
      "reward_after_mean": -0.2986925896257162,
      "reward_after_std": 0.47236273624002934,
      "reward_before_mean": -0.11717952135950327,
      "reward_before_std": 0.43173919059336185,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.18151307478547096,
      "reward_change_min": -0.3272790275514126,
      "reward_change_std": 0.11354650370776653,
      "reward_std": 0.472362769767642,
      "rewards/accuracy_reward": 0.0833333358168602,
      "rewards/cosine_scaled_reward": -0.20051286462694407,
      "step": 11
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2220.812545776367,
      "epoch": 0.013714285714285714,
      "grad_norm": 0.24356995522975922,
      "kl": 4.044920206069946e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 5.5e-07,
      "loss": -0.0058,
      "reward": -0.1285993792116642,
      "reward_after_mean": -0.1285993792116642,
      "reward_after_std": 0.4768393710255623,
      "reward_before_mean": 0.12047314643859863,
      "reward_before_std": 0.4788372376933694,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.24907256104052067,
      "reward_change_min": -0.4614252410829067,
      "reward_change_std": 0.1735817501321435,
      "reward_std": 0.47683937288820744,
      "rewards/accuracy_reward": 0.2083333395421505,
      "rewards/cosine_scaled_reward": -0.08786016795784235,
      "step": 12
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3018.354217529297,
      "epoch": 0.014857142857142857,
      "grad_norm": 0.2400161623954773,
      "kl": 3.460049629211426e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 6e-07,
      "loss": 0.0728,
      "reward": -0.10721326060593128,
      "reward_after_mean": -0.10721326060593128,
      "reward_after_std": 0.5508640371263027,
      "reward_before_mean": 0.13581005320884287,
      "reward_before_std": 0.5371037218719721,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2430233098566532,
      "reward_change_min": -0.4148747492581606,
      "reward_change_std": 0.16182870231568813,
      "reward_std": 0.5508640389889479,
      "rewards/accuracy_reward": 0.1875000037252903,
      "rewards/cosine_scaled_reward": -0.05168995447456837,
      "step": 13
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2948.187545776367,
      "epoch": 0.016,
      "grad_norm": 0.20983624458312988,
      "kl": 2.816319465637207e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 6.5e-07,
      "loss": -0.0293,
      "reward": -0.1505213938653469,
      "reward_after_mean": -0.1505213938653469,
      "reward_after_std": 0.48780051805078983,
      "reward_before_mean": 0.08621821040287614,
      "reward_before_std": 0.4805648783221841,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2367396280169487,
      "reward_change_min": -0.401450265198946,
      "reward_change_std": 0.15510203130543232,
      "reward_std": 0.48780052177608013,
      "rewards/accuracy_reward": 0.16666667349636555,
      "rewards/cosine_scaled_reward": -0.08044843003153801,
      "step": 14
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2626.2291679382324,
      "epoch": 0.017142857142857144,
      "grad_norm": 0.2066204994916916,
      "kl": 1.849886029958725e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 7e-07,
      "loss": -0.038,
      "reward": -0.07881241291761398,
      "reward_after_mean": -0.07881241291761398,
      "reward_after_std": 0.3058329503983259,
      "reward_before_mean": 0.20712529122829437,
      "reward_before_std": 0.24552945792675018,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.28593770414590836,
      "reward_change_min": -0.4088184628635645,
      "reward_change_std": 0.15766743291169405,
      "reward_std": 0.3058329652994871,
      "rewards/accuracy_reward": 0.2291666716337204,
      "rewards/cosine_scaled_reward": -0.022041399031877518,
      "step": 15
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3519.6458435058594,
      "epoch": 0.018285714285714287,
      "grad_norm": 0.16457392275333405,
      "kl": 4.182755947113037e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 7.5e-07,
      "loss": 0.002,
      "reward": -0.3867794554680586,
      "reward_after_mean": -0.3867794554680586,
      "reward_after_std": 0.27169811353087425,
      "reward_before_mean": -0.21112521784380078,
      "reward_before_std": 0.21979969623498619,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.17565423250198364,
      "reward_change_min": -0.2535594701766968,
      "reward_change_std": 0.0948435440659523,
      "reward_std": 0.271698116324842,
      "rewards/accuracy_reward": 0.02083333395421505,
      "rewards/cosine_scaled_reward": -0.2319585494697094,
      "step": 16
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2438.541702270508,
      "epoch": 0.019428571428571427,
      "grad_norm": 0.2491314560174942,
      "kl": 3.923475742340088e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8e-07,
      "loss": 0.0261,
      "reward": -0.09273135662078857,
      "reward_after_mean": -0.09273135662078857,
      "reward_after_std": 0.533635126426816,
      "reward_before_mean": 0.16233503818511963,
      "reward_before_std": 0.5344762653112411,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.25506639294326305,
      "reward_change_min": -0.48770347610116005,
      "reward_change_std": 0.18604809325188398,
      "reward_std": 0.5336351562291384,
      "rewards/accuracy_reward": 0.18750000186264515,
      "rewards/cosine_scaled_reward": -0.02516496740281582,
      "step": 17
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2808.416732788086,
      "epoch": 0.02057142857142857,
      "grad_norm": 0.18383286893367767,
      "kl": 2.2854655981063843e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.499999999999999e-07,
      "loss": 0.0188,
      "reward": -0.1808744166046381,
      "reward_after_mean": -0.1808744166046381,
      "reward_after_std": 0.4772877935320139,
      "reward_before_mean": 0.04505238076671958,
      "reward_before_std": 0.45604391396045685,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.22592679969966412,
      "reward_change_min": -0.3993493113666773,
      "reward_change_std": 0.1486664516851306,
      "reward_std": 0.4772877972573042,
      "rewards/accuracy_reward": 0.16666667349636555,
      "rewards/cosine_scaled_reward": -0.1216142950579524,
      "step": 18
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2772.4166717529297,
      "epoch": 0.021714285714285714,
      "grad_norm": 0.18437886238098145,
      "kl": 2.4475157260894775e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9e-07,
      "loss": 0.0301,
      "reward": 0.15320486947894096,
      "reward_after_mean": 0.15320486947894096,
      "reward_after_std": 0.6356023158878088,
      "reward_before_mean": 0.4862675927579403,
      "reward_before_std": 0.6554644731804729,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.3330627214163542,
      "reward_change_min": -0.6119302771985531,
      "reward_change_std": 0.23800678364932537,
      "reward_std": 0.6356023401021957,
      "rewards/accuracy_reward": 0.3750000149011612,
      "rewards/cosine_scaled_reward": 0.1112675853073597,
      "step": 19
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2494.0209197998047,
      "epoch": 0.022857142857142857,
      "grad_norm": 0.2170894891023636,
      "kl": 1.3802200555801392e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.499999999999999e-07,
      "loss": -0.0054,
      "reward": 0.16672552144154906,
      "reward_after_mean": 0.16672552144154906,
      "reward_after_std": 0.5955907795578241,
      "reward_before_mean": 0.5104347411543131,
      "reward_before_std": 0.6022562235593796,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.3437092248350382,
      "reward_change_min": -0.6036390513181686,
      "reward_change_std": 0.24417879804968834,
      "reward_std": 0.5955907888710499,
      "rewards/accuracy_reward": 0.3958333395421505,
      "rewards/cosine_scaled_reward": 0.11460138857364655,
      "step": 20
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2875.125015258789,
      "epoch": 0.024,
      "grad_norm": 0.18273918330669403,
      "kl": 4.053860902786255e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 1e-06,
      "loss": -0.0532,
      "reward": -0.10901273542549461,
      "reward_after_mean": -0.10901273542549461,
      "reward_after_std": 0.47225300781428814,
      "reward_before_mean": 0.14461494609713554,
      "reward_before_std": 0.4534954270347953,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2536276988685131,
      "reward_change_min": -0.41233821772038937,
      "reward_change_std": 0.1598598938435316,
      "reward_std": 0.47225301899015903,
      "rewards/accuracy_reward": 0.18750000558793545,
      "rewards/cosine_scaled_reward": -0.042885048780590296,
      "step": 21
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 1771.5000381469727,
      "epoch": 0.025142857142857144,
      "grad_norm": 0.31784588098526,
      "kl": 2.108141779899597e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.99931462820376e-07,
      "loss": 0.0008,
      "reward": -0.08861424401402473,
      "reward_after_mean": -0.08861424401402473,
      "reward_after_std": 0.3277482558041811,
      "reward_before_mean": 0.19050541147589684,
      "reward_before_std": 0.27969441190361977,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2791196908801794,
      "reward_change_min": -0.4190730433911085,
      "reward_change_std": 0.15838530845940113,
      "reward_std": 0.3277482632547617,
      "rewards/accuracy_reward": 0.2291666716337204,
      "rewards/cosine_scaled_reward": -0.03866123594343662,
      "step": 22
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2396.791702270508,
      "epoch": 0.026285714285714287,
      "grad_norm": 0.18681152164936066,
      "kl": 2.3262575268745422e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.997258721585931e-07,
      "loss": 0.0211,
      "reward": -0.23647188395261765,
      "reward_after_mean": -0.23647188395261765,
      "reward_after_std": 0.399081664159894,
      "reward_before_mean": -0.021006003953516483,
      "reward_before_std": 0.3748807581141591,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.21546588093042374,
      "reward_change_min": -0.38881424628198147,
      "reward_change_std": 0.14373441133648157,
      "reward_std": 0.39908168464899063,
      "rewards/accuracy_reward": 0.1041666679084301,
      "rewards/cosine_scaled_reward": -0.12517266999930143,
      "step": 23
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2746.104202270508,
      "epoch": 0.027428571428571427,
      "grad_norm": 0.2844712734222412,
      "kl": 1.5251338481903076e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.993832906395582e-07,
      "loss": 0.0208,
      "reward": 0.0761333703994751,
      "reward_after_mean": 0.0761333703994751,
      "reward_after_std": 0.6143560092896223,
      "reward_before_mean": 0.3813359132036567,
      "reward_before_std": 0.6187122687697411,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.30520253628492355,
      "reward_change_min": -0.5352295599877834,
      "reward_change_std": 0.2117024352774024,
      "reward_std": 0.6143560204654932,
      "rewards/accuracy_reward": 0.31250000931322575,
      "rewards/cosine_scaled_reward": 0.06883592065423727,
      "step": 24
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2599.2708435058594,
      "epoch": 0.02857142857142857,
      "grad_norm": 0.2062840610742569,
      "kl": 3.396719694137573e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.0158,
      "reward": -0.17240291833877563,
      "reward_after_mean": -0.17240291833877563,
      "reward_after_std": 0.5134213641285896,
      "reward_before_mean": 0.05743051879107952,
      "reward_before_std": 0.5324400179088116,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.22983343712985516,
      "reward_change_min": -0.439667459577322,
      "reward_change_std": 0.1796932201832533,
      "reward_std": 0.5134213827550411,
      "rewards/accuracy_reward": 0.1875,
      "rewards/cosine_scaled_reward": -0.1300694877281785,
      "step": 25
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2904.1458740234375,
      "epoch": 0.029714285714285714,
      "grad_norm": 0.16000084578990936,
      "kl": 2.2212974727153778e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.982876141412855e-07,
      "loss": -0.0014,
      "reward": -0.05851639807224274,
      "reward_after_mean": -0.05851639807224274,
      "reward_after_std": 0.44963656924664974,
      "reward_before_mean": 0.2166665024124086,
      "reward_before_std": 0.4241899009793997,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2751829121261835,
      "reward_change_min": -0.41147186793386936,
      "reward_change_std": 0.16059752739965916,
      "reward_std": 0.44963658042252064,
      "rewards/accuracy_reward": 0.22916667722165585,
      "rewards/cosine_scaled_reward": -0.012500176206231117,
      "step": 26
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2987.2083740234375,
      "epoch": 0.030857142857142857,
      "grad_norm": 0.17558637261390686,
      "kl": 1.9339844584465027e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.975348529157229e-07,
      "loss": -0.0073,
      "reward": -0.2276617707684636,
      "reward_after_mean": -0.2276617707684636,
      "reward_after_std": 0.49568176455795765,
      "reward_before_mean": -0.021468112245202065,
      "reward_before_std": 0.4707259628921747,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.20619365386664867,
      "reward_change_min": -0.3916598744690418,
      "reward_change_std": 0.1389320297166705,
      "reward_std": 0.49568178318440914,
      "rewards/accuracy_reward": 0.10416666977107525,
      "rewards/cosine_scaled_reward": -0.12563478108495474,
      "step": 27
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2722.8958740234375,
      "epoch": 0.032,
      "grad_norm": 0.2014327049255371,
      "kl": 2.8021633625030518e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.96645768238595e-07,
      "loss": 0.0061,
      "reward": 0.040278345346450806,
      "reward_after_mean": 0.040278345346450806,
      "reward_after_std": 0.41374246776103973,
      "reward_before_mean": 0.35379676637239754,
      "reward_before_std": 0.34872716292738914,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.3135183919221163,
      "reward_change_min": -0.45590174198150635,
      "reward_change_std": 0.1750109540298581,
      "reward_std": 0.4137424696236849,
      "rewards/accuracy_reward": 0.35416666977107525,
      "rewards/cosine_scaled_reward": -0.00036993250250816345,
      "step": 28
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3334.1458740234375,
      "epoch": 0.03314285714285714,
      "grad_norm": 0.1652597337961197,
      "kl": 4.1700899600982666e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.956206309337066e-07,
      "loss": 0.0112,
      "reward": -0.39186385460197926,
      "reward_after_mean": -0.39186385460197926,
      "reward_after_std": 0.31510886177420616,
      "reward_before_mean": -0.2214842550456524,
      "reward_before_std": 0.2841310743242502,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.17037959955632687,
      "reward_change_min": -0.3085148986428976,
      "reward_change_std": 0.10925670620054007,
      "reward_std": 0.3151088785380125,
      "rewards/accuracy_reward": 0.0416666679084301,
      "rewards/cosine_scaled_reward": -0.2631509155035019,
      "step": 29
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2979.2084045410156,
      "epoch": 0.03428571428571429,
      "grad_norm": 0.23167316615581512,
      "kl": 2.894178032875061e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.944597532678119e-07,
      "loss": 0.0743,
      "reward": 0.06503391482692678,
      "reward_after_mean": 0.06503391482692678,
      "reward_after_std": 0.6253597214818001,
      "reward_before_mean": 0.36115007381886244,
      "reward_before_std": 0.6058835834264755,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2961161620914936,
      "reward_change_min": -0.4820784516632557,
      "reward_change_std": 0.18900468945503235,
      "reward_std": 0.6253597438335419,
      "rewards/accuracy_reward": 0.291666679084301,
      "rewards/cosine_scaled_reward": 0.06948341536917724,
      "step": 30
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2974.250015258789,
      "epoch": 0.03542857142857143,
      "grad_norm": 0.259880930185318,
      "kl": 2.1339859813451767e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.931634888554935e-07,
      "loss": -0.1239,
      "reward": -0.17607227340340614,
      "reward_after_mean": -0.17607227340340614,
      "reward_after_std": 0.49117584340274334,
      "reward_before_mean": 0.05198503099381924,
      "reward_before_std": 0.4788073133677244,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.22805731743574142,
      "reward_change_min": -0.39622381143271923,
      "reward_change_std": 0.1568627143278718,
      "reward_std": 0.4911758713424206,
      "rewards/accuracy_reward": 0.14583333395421505,
      "rewards/cosine_scaled_reward": -0.09384830202907324,
      "step": 31
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3248.7500610351562,
      "epoch": 0.036571428571428574,
      "grad_norm": 0.17636683583259583,
      "kl": 4.393048584461212e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.917322325514487e-07,
      "loss": -0.005,
      "reward": -0.042430607602000237,
      "reward_after_mean": -0.042430607602000237,
      "reward_after_std": 0.40656069852411747,
      "reward_before_mean": 0.24141232948750257,
      "reward_before_std": 0.3450923506170511,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.28384293988347054,
      "reward_change_min": -0.4027772378176451,
      "reward_change_std": 0.15636454056948423,
      "reward_std": 0.40656072460114956,
      "rewards/accuracy_reward": 0.25000000558793545,
      "rewards/cosine_scaled_reward": -0.008587680757045746,
      "step": 32
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3381.8333740234375,
      "epoch": 0.037714285714285714,
      "grad_norm": 0.14724509418010712,
      "kl": 4.9639493227005005e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.901664203302124e-07,
      "loss": -0.0222,
      "reward": -0.16744333039969206,
      "reward_after_mean": -0.16744333039969206,
      "reward_after_std": 0.5987622756510973,
      "reward_before_mean": 0.04793749377131462,
      "reward_before_std": 0.5893303733319044,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.21538082137703896,
      "reward_change_min": -0.4106915630400181,
      "reward_change_std": 0.1567707946524024,
      "reward_std": 0.5987622793763876,
      "rewards/accuracy_reward": 0.1458333358168602,
      "rewards/cosine_scaled_reward": -0.0978958396390226,
      "step": 33
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2492.5208740234375,
      "epoch": 0.038857142857142854,
      "grad_norm": 0.2158302515745163,
      "kl": 0.00010519847273826599,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.88466529153356e-07,
      "loss": 0.0103,
      "reward": 0.2191654071211815,
      "reward_after_mean": 0.2191654071211815,
      "reward_after_std": 0.7464860528707504,
      "reward_before_mean": 0.5621518101543188,
      "reward_before_std": 0.7706933673471212,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.342986399307847,
      "reward_change_min": -0.6137207373976707,
      "reward_change_std": 0.249726596288383,
      "reward_std": 0.7464860621839762,
      "rewards/accuracy_reward": 0.39583334140479565,
      "rewards/cosine_scaled_reward": 0.16631846246309578,
      "step": 34
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3080.25004196167,
      "epoch": 0.04,
      "grad_norm": 0.19586136937141418,
      "kl": 4.8315152525901794e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.866330768241983e-07,
      "loss": 0.0329,
      "reward": -0.11165531259030104,
      "reward_after_mean": -0.11165531259030104,
      "reward_after_std": 0.6846011225134134,
      "reward_before_mean": 0.11056250985711813,
      "reward_before_std": 0.662305686622858,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.22221781872212887,
      "reward_change_min": -0.41959609277546406,
      "reward_change_std": 0.15651744045317173,
      "reward_std": 0.6846011485904455,
      "rewards/accuracy_reward": 0.16666666977107525,
      "rewards/cosine_scaled_reward": -0.05610416317358613,
      "step": 35
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3382.4583435058594,
      "epoch": 0.04114285714285714,
      "grad_norm": 0.1657496988773346,
      "kl": 6.525218486785889e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.846666218300807e-07,
      "loss": 0.0225,
      "reward": -0.3704317416995764,
      "reward_after_mean": -0.3704317416995764,
      "reward_after_std": 0.31464754045009613,
      "reward_before_mean": -0.19473244110122323,
      "reward_before_std": 0.2709705028682947,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.17569930106401443,
      "reward_change_min": -0.2673074584454298,
      "reward_change_std": 0.09599933121353388,
      "reward_std": 0.3146475479006767,
      "rewards/accuracy_reward": 0.02083333395421505,
      "rewards/cosine_scaled_reward": -0.215565774589777,
      "step": 36
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3370.0833740234375,
      "epoch": 0.04228571428571429,
      "grad_norm": 0.15995879471302032,
      "kl": 2.010539174079895e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.0199,
      "reward": -0.4515352062880993,
      "reward_after_mean": -0.4515352062880993,
      "reward_after_std": 0.21112178452312946,
      "reward_before_mean": -0.28895667381584644,
      "reward_before_std": 0.17021457478404045,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.162578534334898,
      "reward_change_min": -0.2480195052921772,
      "reward_change_std": 0.08829918596893549,
      "reward_std": 0.2111217901110649,
      "rewards/accuracy_reward": 0.0,
      "rewards/cosine_scaled_reward": -0.28895667754113674,
      "step": 37
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3305.812530517578,
      "epoch": 0.04342857142857143,
      "grad_norm": 0.18195205926895142,
      "kl": 4.641711711883545e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.80337140183366e-07,
      "loss": 0.0274,
      "reward": -0.2661134898662567,
      "reward_after_mean": -0.2661134898662567,
      "reward_after_std": 0.26150001399219036,
      "reward_before_mean": -0.042573800310492516,
      "reward_before_std": 0.2240951219573617,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.22353969514369965,
      "reward_change_min": -0.33214454911649227,
      "reward_change_std": 0.12396880332380533,
      "reward_std": 0.26150002889335155,
      "rewards/accuracy_reward": 0.1041666716337204,
      "rewards/cosine_scaled_reward": -0.14674047753214836,
      "step": 38
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2746.2500381469727,
      "epoch": 0.044571428571428574,
      "grad_norm": 0.20554743707180023,
      "kl": 5.049258470535278e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.779754323328192e-07,
      "loss": -0.0103,
      "reward": -0.08482712507247925,
      "reward_after_mean": -0.08482712507247925,
      "reward_after_std": 0.2759235240519047,
      "reward_before_mean": 0.1990803610533476,
      "reward_before_std": 0.17641383409500122,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2839074842631817,
      "reward_change_min": -0.394864222034812,
      "reward_change_std": 0.14654383901506662,
      "reward_std": 0.2759235333651304,
      "rewards/accuracy_reward": 0.25,
      "rewards/cosine_scaled_reward": -0.0509196612983942,
      "step": 39
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2545.0833740234375,
      "epoch": 0.045714285714285714,
      "grad_norm": 0.177109956741333,
      "kl": 0.0001279488205909729,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.754833590196926e-07,
      "loss": 0.0214,
      "reward": -0.12675099074840546,
      "reward_after_mean": -0.12675099074840546,
      "reward_after_std": 0.42964968644082546,
      "reward_before_mean": 0.1206003911793232,
      "reward_before_std": 0.3606878248974681,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.24735137075185776,
      "reward_change_min": -0.39622214064002037,
      "reward_change_std": 0.1445015063509345,
      "reward_std": 0.4296496883034706,
      "rewards/accuracy_reward": 0.18750000186264515,
      "rewards/cosine_scaled_reward": -0.0668996311724186,
      "step": 40
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3163.7709045410156,
      "epoch": 0.046857142857142854,
      "grad_norm": 0.16450874507427216,
      "kl": 7.349532097578049e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.0068,
      "reward": -0.27329863607883453,
      "reward_after_mean": -0.27329863607883453,
      "reward_after_std": 0.382920210249722,
      "reward_before_mean": -0.06760533340275288,
      "reward_before_std": 0.36998474691063166,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2056933008134365,
      "reward_change_min": -0.3910033367574215,
      "reward_change_std": 0.13763669785112143,
      "reward_std": 0.38292021211236715,
      "rewards/accuracy_reward": 0.0833333358168602,
      "rewards/cosine_scaled_reward": -0.15093865152448416,
      "step": 41
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2923.2708492279053,
      "epoch": 0.048,
      "grad_norm": 0.22062529623508453,
      "kl": 3.405660390853882e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.701111919237408e-07,
      "loss": 0.0386,
      "reward": -0.43281109537929296,
      "reward_after_mean": -0.43281109537929296,
      "reward_after_std": 0.29586860351264477,
      "reward_before_mean": -0.2786172227934003,
      "reward_before_std": 0.23877727705985308,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.15419389307498932,
      "reward_change_min": -0.2224500197917223,
      "reward_change_std": 0.0808606967329979,
      "reward_std": 0.2958686240017414,
      "rewards/accuracy_reward": 0.02083333395421505,
      "rewards/cosine_scaled_reward": -0.2994505534879863,
      "step": 42
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3085.625015258789,
      "epoch": 0.04914285714285714,
      "grad_norm": 0.18678560853004456,
      "kl": 3.5848468542099e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.672327345550543e-07,
      "loss": 0.0173,
      "reward": -0.19783002510666847,
      "reward_after_mean": -0.19783002510666847,
      "reward_after_std": 0.37531200610101223,
      "reward_before_mean": 0.03791181556880474,
      "reward_before_std": 0.3623821344226599,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.23574184253811836,
      "reward_change_min": -0.40054079331457615,
      "reward_change_std": 0.15123076178133488,
      "reward_std": 0.375312015414238,
      "rewards/accuracy_reward": 0.1458333395421505,
      "rewards/cosine_scaled_reward": -0.1079215258359909,
      "step": 43
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2848.187530517578,
      "epoch": 0.05028571428571429,
      "grad_norm": 0.23331570625305176,
      "kl": 0.00020481832325458527,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.64227184053598e-07,
      "loss": 0.0656,
      "reward": -0.09341166913509369,
      "reward_after_mean": -0.09341166913509369,
      "reward_after_std": 0.504422040656209,
      "reward_before_mean": 0.16584731824696064,
      "reward_before_std": 0.5199198350310326,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2592589948326349,
      "reward_change_min": -0.47888655215501785,
      "reward_change_std": 0.18998684082180262,
      "reward_std": 0.5044220443814993,
      "rewards/accuracy_reward": 0.1875,
      "rewards/cosine_scaled_reward": -0.021652692928910255,
      "step": 44
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3421.7083740234375,
      "epoch": 0.05142857142857143,
      "grad_norm": 0.1464652270078659,
      "kl": 3.8933008909225464e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.610954559391704e-07,
      "loss": -0.0099,
      "reward": -0.18143506534397602,
      "reward_after_mean": -0.18143506534397602,
      "reward_after_std": 0.48556733317673206,
      "reward_before_mean": 0.0428056214004755,
      "reward_before_std": 0.4591532591730356,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.22424068115651608,
      "reward_change_min": -0.3489771634340286,
      "reward_change_std": 0.1350772501900792,
      "reward_std": 0.4855673424899578,
      "rewards/accuracy_reward": 0.1458333395421505,
      "rewards/cosine_scaled_reward": -0.10302771534770727,
      "step": 45
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3407.2083435058594,
      "epoch": 0.052571428571428575,
      "grad_norm": 0.1929873824119568,
      "kl": 0.0002593100070953369,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.578385041664925e-07,
      "loss": 0.0301,
      "reward": -0.3940184600651264,
      "reward_after_mean": -0.3940184600651264,
      "reward_after_std": 0.27828204818069935,
      "reward_before_mean": -0.22154150530695915,
      "reward_before_std": 0.22950039338320494,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.17247696220874786,
      "reward_change_min": -0.2641389053314924,
      "reward_change_std": 0.09335798770189285,
      "reward_std": 0.27828205190598965,
      "rewards/accuracy_reward": 0.02083333395421505,
      "rewards/cosine_scaled_reward": -0.24237483832985163,
      "step": 46
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2974.5000762939453,
      "epoch": 0.053714285714285714,
      "grad_norm": 0.26433587074279785,
      "kl": 0.00010039284825325012,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.1271,
      "reward": 0.045076385140419006,
      "reward_after_mean": 0.045076385140419006,
      "reward_after_std": 0.562898326665163,
      "reward_before_mean": 0.3511660899966955,
      "reward_before_std": 0.5953826615586877,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.3060897272080183,
      "reward_change_min": -0.5509906392544508,
      "reward_change_std": 0.22344755101948977,
      "reward_std": 0.5628983462229371,
      "rewards/accuracy_reward": 0.2916666716337204,
      "rewards/cosine_scaled_reward": 0.05949941836297512,
      "step": 47
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2807.687530517578,
      "epoch": 0.054857142857142854,
      "grad_norm": 0.25756871700286865,
      "kl": 0.000245068222284317,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.509529358847654e-07,
      "loss": 0.0432,
      "reward": -0.17351538315415382,
      "reward_after_mean": -0.17351538315415382,
      "reward_after_std": 0.39601418375968933,
      "reward_before_mean": 0.06836471986025572,
      "reward_before_std": 0.3816844457760453,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.24188010767102242,
      "reward_change_min": -0.41022371128201485,
      "reward_change_std": 0.15736299566924572,
      "reward_std": 0.3960142061114311,
      "rewards/accuracy_reward": 0.1666666716337204,
      "rewards/cosine_scaled_reward": -0.09830194525420666,
      "step": 48
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2470.6875762939453,
      "epoch": 0.056,
      "grad_norm": 0.2077207863330841,
      "kl": 0.00011547654867172241,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.0475,
      "reward": -0.004357508383691311,
      "reward_after_mean": -0.004357508383691311,
      "reward_after_std": 0.660737868398428,
      "reward_before_mean": 0.2607364095747471,
      "reward_before_std": 0.6347355041652918,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2650939039885998,
      "reward_change_min": -0.4125436320900917,
      "reward_change_std": 0.16354067996144295,
      "reward_std": 0.6607378907501698,
      "rewards/accuracy_reward": 0.25000000931322575,
      "rewards/cosine_scaled_reward": 0.01073638815432787,
      "step": 49
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3005.458366394043,
      "epoch": 0.05714285714285714,
      "grad_norm": 0.14878875017166138,
      "kl": 0.0001731663942337036,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.43578868212728e-07,
      "loss": 0.0165,
      "reward": -0.014921462163329124,
      "reward_after_mean": -0.014921462163329124,
      "reward_after_std": 0.35436189733445644,
      "reward_before_mean": 0.2836841717362404,
      "reward_before_std": 0.25718711968511343,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2986056488007307,
      "reward_change_min": -0.41532920859754086,
      "reward_change_std": 0.15457267686724663,
      "reward_std": 0.35436189733445644,
      "rewards/accuracy_reward": 0.27083333395421505,
      "rewards/cosine_scaled_reward": 0.01285084243863821,
      "step": 50
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2303.520851135254,
      "epoch": 0.05828571428571429,
      "grad_norm": 0.23037534952163696,
      "kl": 0.0005000531673431396,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.397114317029974e-07,
      "loss": 0.0274,
      "reward": -0.23243643390014768,
      "reward_after_mean": -0.23243643390014768,
      "reward_after_std": 0.4816542100161314,
      "reward_before_mean": -0.028105121105909348,
      "reward_before_std": 0.4414081573486328,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2043313141912222,
      "reward_change_min": -0.3245617300271988,
      "reward_change_std": 0.1267829779535532,
      "reward_std": 0.4816542211920023,
      "rewards/accuracy_reward": 0.1041666679084301,
      "rewards/cosine_scaled_reward": -0.13227178994566202,
      "step": 51
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2936.5417289733887,
      "epoch": 0.05942857142857143,
      "grad_norm": 0.21160194277763367,
      "kl": 0.0004085935652256012,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.357252853159505e-07,
      "loss": 0.0235,
      "reward": -0.22832631319761276,
      "reward_after_mean": -0.22832631319761276,
      "reward_after_std": 0.596066826954484,
      "reward_before_mean": -0.03995312686311081,
      "reward_before_std": 0.555262666195631,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.18837318196892738,
      "reward_change_min": -0.3354088496416807,
      "reward_change_std": 0.12037093937397003,
      "reward_std": 0.5960668455809355,
      "rewards/accuracy_reward": 0.10416666977107525,
      "rewards/cosine_scaled_reward": -0.14411979634314775,
      "step": 52
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2943.395866394043,
      "epoch": 0.060571428571428575,
      "grad_norm": 0.19955258071422577,
      "kl": 0.0003105923533439636,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.316216432703916e-07,
      "loss": 0.0486,
      "reward": 0.046732327435165644,
      "reward_after_mean": 0.046732327435165644,
      "reward_after_std": 0.572412297129631,
      "reward_before_mean": 0.3493400067090988,
      "reward_before_std": 0.581750338897109,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.30260770581662655,
      "reward_change_min": -0.48020620830357075,
      "reward_change_std": 0.20280153769999743,
      "reward_std": 0.5724122989922762,
      "rewards/accuracy_reward": 0.29166667722165585,
      "rewards/cosine_scaled_reward": 0.05767334741540253,
      "step": 53
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2875.0208740234375,
      "epoch": 0.061714285714285715,
      "grad_norm": 0.16428104043006897,
      "kl": 8.340924978256226e-05,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.274017555754407e-07,
      "loss": 0.0482,
      "reward": 0.17428431287407875,
      "reward_after_mean": 0.17428431287407875,
      "reward_after_std": 0.5909074582159519,
      "reward_before_mean": 0.5155755765736103,
      "reward_before_std": 0.5545622715726495,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.34129126369953156,
      "reward_change_min": -0.6125206407159567,
      "reward_change_std": 0.22563873324543238,
      "reward_std": 0.590907484292984,
      "rewards/accuracy_reward": 0.3750000074505806,
      "rewards/cosine_scaled_reward": 0.1405755653977394,
      "step": 54
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3093.875030517578,
      "epoch": 0.06285714285714286,
      "grad_norm": 0.17089791595935822,
      "kl": 0.0005162432789802551,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.230669076497687e-07,
      "loss": 0.0094,
      "reward": -0.11113542690873146,
      "reward_after_mean": -0.11113542690873146,
      "reward_after_std": 0.3386560436338186,
      "reward_before_mean": 0.15820150449872017,
      "reward_before_std": 0.29461055248975754,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.26933695189654827,
      "reward_change_min": -0.4236486293375492,
      "reward_change_std": 0.15788063406944275,
      "reward_std": 0.3386560510843992,
      "rewards/accuracy_reward": 0.1875,
      "rewards/cosine_scaled_reward": -0.029298486188054085,
      "step": 55
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2877.291717529297,
      "epoch": 0.064,
      "grad_norm": 0.20696526765823364,
      "kl": 0.00024941563606262207,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.186184199300463e-07,
      "loss": 0.0124,
      "reward": -0.11615562066435814,
      "reward_after_mean": -0.11615562066435814,
      "reward_after_std": 0.3605691157281399,
      "reward_before_mean": 0.15260330587625504,
      "reward_before_std": 0.3486750479787588,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2687589228153229,
      "reward_change_min": -0.3955491781234741,
      "reward_change_std": 0.16233802400529385,
      "reward_std": 0.36056913062930107,
      "rewards/accuracy_reward": 0.2083333432674408,
      "rewards/cosine_scaled_reward": -0.055730052292346954,
      "step": 56
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3307.875,
      "epoch": 0.06514285714285714,
      "grad_norm": 0.13051925599575043,
      "kl": 0.00013627856969833374,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.140576474687263e-07,
      "loss": 0.0235,
      "reward": -0.2993467375636101,
      "reward_after_mean": -0.2993467375636101,
      "reward_after_std": 0.4061383567750454,
      "reward_before_mean": -0.10633787885308266,
      "reward_before_std": 0.3765461528673768,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.19300885125994682,
      "reward_change_min": -0.3217271789908409,
      "reward_change_std": 0.12175194267183542,
      "reward_std": 0.40613835863769054,
      "rewards/accuracy_reward": 0.08333333395421505,
      "rewards/cosine_scaled_reward": -0.1896712128072977,
      "step": 57
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2274.1667251586914,
      "epoch": 0.06628571428571428,
      "grad_norm": 0.3047843277454376,
      "kl": 0.0021878480911254883,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.093859795212817e-07,
      "loss": 0.147,
      "reward": 0.014255084563046694,
      "reward_after_mean": 0.014255084563046694,
      "reward_after_std": 0.49078281223773956,
      "reward_before_mean": 0.30834553577005863,
      "reward_before_std": 0.4538161437958479,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2940904349088669,
      "reward_change_min": -0.4698449205607176,
      "reward_change_std": 0.18156961910426617,
      "reward_std": 0.4907828290015459,
      "rewards/accuracy_reward": 0.29166667349636555,
      "rewards/cosine_scaled_reward": 0.016678830608725548,
      "step": 58
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2838.6041870117188,
      "epoch": 0.06742857142857143,
      "grad_norm": 0.15764220058918,
      "kl": 0.0009508654475212097,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 9.046048391230247e-07,
      "loss": 0.05,
      "reward": -0.15274246037006378,
      "reward_after_mean": -0.15274246037006378,
      "reward_after_std": 0.37365361116826534,
      "reward_before_mean": 0.1003800667822361,
      "reward_before_std": 0.3644466046243906,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2531225271522999,
      "reward_change_min": -0.41390063986182213,
      "reward_change_std": 0.16076032724231482,
      "reward_std": 0.3736536279320717,
      "rewards/accuracy_reward": 0.1666666716337204,
      "rewards/cosine_scaled_reward": -0.0662866085767746,
      "step": 59
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3020.4375534057617,
      "epoch": 0.06857142857142857,
      "grad_norm": 0.17373518645763397,
      "kl": 0.00023894011974334717,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.997156826556369e-07,
      "loss": -0.0036,
      "reward": -0.28059580083936453,
      "reward_after_mean": -0.28059580083936453,
      "reward_after_std": 0.38212408497929573,
      "reward_before_mean": -0.07820725813508034,
      "reward_before_std": 0.35481286235153675,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.20238853991031647,
      "reward_change_min": -0.3260616082698107,
      "reward_change_std": 0.12590447254478931,
      "reward_std": 0.3821241036057472,
      "rewards/accuracy_reward": 0.08333333395421505,
      "rewards/cosine_scaled_reward": -0.16154058929532766,
      "step": 60
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3286.6250610351562,
      "epoch": 0.06971428571428571,
      "grad_norm": 0.1478388011455536,
      "kl": 0.00047288229689002037,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.9471999940354e-07,
      "loss": -0.0065,
      "reward": -0.1639209073036909,
      "reward_after_mean": -0.1639209073036909,
      "reward_after_std": 0.5073207020759583,
      "reward_before_mean": 0.07017608173191547,
      "reward_before_std": 0.5215496774762869,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.23409699089825153,
      "reward_change_min": -0.45022542029619217,
      "reward_change_std": 0.17784226965159178,
      "reward_std": 0.5073207188397646,
      "rewards/accuracy_reward": 0.2083333358168602,
      "rewards/cosine_scaled_reward": -0.13815725967288017,
      "step": 61
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2659.3959045410156,
      "epoch": 0.07085714285714285,
      "grad_norm": 0.15675584971904755,
      "kl": 0.002586759626865387,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.896193111002475e-07,
      "loss": -0.0003,
      "reward": 0.0647954298183322,
      "reward_after_mean": 0.0647954298183322,
      "reward_after_std": 0.7668796423822641,
      "reward_before_mean": 0.3486335091292858,
      "reward_before_std": 0.797910291235894,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.28383809328079224,
      "reward_change_min": -0.6051791943609715,
      "reward_change_std": 0.23171952739357948,
      "reward_std": 0.7668796591460705,
      "rewards/accuracy_reward": 0.31250000558793545,
      "rewards/cosine_scaled_reward": 0.03613350400701165,
      "step": 62
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2394.916717529297,
      "epoch": 0.072,
      "grad_norm": 0.2152957320213318,
      "kl": 0.0014880448579788208,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.844151714648274e-07,
      "loss": 0.0886,
      "reward": 0.14790401607751846,
      "reward_after_mean": 0.14790401607751846,
      "reward_after_std": 0.5827827583998442,
      "reward_before_mean": 0.4811950605362654,
      "reward_before_std": 0.5497365463525057,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.3332910742610693,
      "reward_change_min": -0.560102928429842,
      "reward_change_std": 0.21325735840946436,
      "reward_std": 0.5827827733010054,
      "rewards/accuracy_reward": 0.35416667349636555,
      "rewards/cosine_scaled_reward": 0.12702841963618994,
      "step": 63
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3006.4791870117188,
      "epoch": 0.07314285714285715,
      "grad_norm": 0.1582987755537033,
      "kl": 0.001294851303100586,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.791091657286267e-07,
      "loss": 0.048,
      "reward": -0.125318787060678,
      "reward_after_mean": -0.125318787060678,
      "reward_after_std": 0.5649810526520014,
      "reward_before_mean": 0.10997231677174568,
      "reward_before_std": 0.557425320148468,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.23529109917581081,
      "reward_change_min": -0.4151240587234497,
      "reward_change_std": 0.16171532310545444,
      "reward_std": 0.5649810750037432,
      "rewards/accuracy_reward": 0.1666666716337204,
      "rewards/cosine_scaled_reward": -0.05669435299932957,
      "step": 64
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2798.354202270508,
      "epoch": 0.07428571428571429,
      "grad_norm": 0.2439664602279663,
      "kl": 0.002039670944213867,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.737029101523929e-07,
      "loss": -0.0404,
      "reward": -0.1746473447419703,
      "reward_after_mean": -0.1746473447419703,
      "reward_after_std": 0.42232158221304417,
      "reward_before_mean": 0.05733271440840326,
      "reward_before_std": 0.3576160566881299,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2319800667464733,
      "reward_change_min": -0.3824646957218647,
      "reward_change_std": 0.13665939681231976,
      "reward_std": 0.4223215878009796,
      "rewards/accuracy_reward": 0.18750000186264515,
      "rewards/cosine_scaled_reward": -0.13016729429364204,
      "step": 65
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2135.3541679382324,
      "epoch": 0.07542857142857143,
      "grad_norm": 0.23610518872737885,
      "kl": 0.0015348196029663086,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.681980515339463e-07,
      "loss": -0.052,
      "reward": -0.01672324660466984,
      "reward_after_mean": -0.01672324660466984,
      "reward_after_std": 0.5297550391405821,
      "reward_before_mean": 0.26884749345481396,
      "reward_before_std": 0.5452183573506773,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.285570727661252,
      "reward_change_min": -0.5179183334112167,
      "reward_change_std": 0.2046450274065137,
      "reward_std": 0.5297550512477756,
      "rewards/accuracy_reward": 0.2916666753590107,
      "rewards/cosine_scaled_reward": -0.02281918376684189,
      "step": 66
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 3525.2291870117188,
      "epoch": 0.07657142857142857,
      "grad_norm": 0.13342344760894775,
      "kl": 0.0009595267474651337,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.625962667065487e-07,
      "loss": -0.0164,
      "reward": -0.4475428834557533,
      "reward_after_mean": -0.4475428834557533,
      "reward_after_std": 0.19988763332366943,
      "reward_before_mean": -0.28256511874496937,
      "reward_before_std": 0.15135450195521116,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.16497775353491306,
      "reward_change_min": -0.24302313476800919,
      "reward_change_std": 0.0873506860807538,
      "reward_std": 0.19988763891160488,
      "rewards/accuracy_reward": 0.0,
      "rewards/cosine_scaled_reward": -0.2825651131570339,
      "step": 67
    },
    {
      "clip_fraction": 0.0,
      "completion_length": 2284.6042098999023,
      "epoch": 0.07771428571428571,
      "grad_norm": 0.33302831649780273,
      "kl": 0.005011081695556641,
      "lambda_div_used": 0.7000000000000001,
      "learning_rate": 8.568992620281243e-07,
      "loss": 0.095,
      "reward": 0.04241333995014429,
      "reward_after_mean": 0.04241333995014429,
      "reward_after_std": 0.7362328171730042,
      "reward_before_mean": 0.3158540027216077,
      "reward_before_std": 0.7369763031601906,
      "reward_change_max": 0.0,
      "reward_change_mean": -0.2734406590461731,
      "reward_change_min": -0.4872976951301098,
      "reward_change_std": 0.19121473841369152,
      "reward_std": 0.7362328246235847,
      "rewards/accuracy_reward": 0.29166667722165585,
      "rewards/cosine_scaled_reward": 0.02418732549995184,
      "step": 68
    },
    {
      "clip_fraction": 0.0,
| "completion_length": 2774.187545776367, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.21714705228805542, | |
| "kl": 0.003237783908843994, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": -0.0021, | |
| "reward": -0.3341971240006387, | |
| "reward_after_mean": -0.3341971240006387, | |
| "reward_after_std": 0.476218955591321, | |
| "reward_before_mean": -0.17050418560393155, | |
| "reward_before_std": 0.4226016802713275, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.16369293443858624, | |
| "reward_change_min": -0.24435340613126755, | |
| "reward_change_std": 0.08748195692896843, | |
| "reward_std": 0.4762189742177725, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.2330041821114719, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3101.041702270508, | |
| "epoch": 0.08, | |
| "grad_norm": 0.17803345620632172, | |
| "kl": 0.0014088600873947144, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.452265630457282e-07, | |
| "loss": 0.0245, | |
| "reward": -0.23275760933756828, | |
| "reward_after_mean": -0.23275760933756828, | |
| "reward_after_std": 0.4238963555544615, | |
| "reward_before_mean": -0.015425082296133041, | |
| "reward_before_std": 0.4194338507950306, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2173325251787901, | |
| "reward_change_min": -0.4142337366938591, | |
| "reward_change_std": 0.1531017581000924, | |
| "reward_std": 0.4238963592797518, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/cosine_scaled_reward": -0.11959175206720829, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2711.166679382324, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.2796599268913269, | |
| "kl": 0.001299545168876648, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": -0.0292, | |
| "reward": -0.10017471015453339, | |
| "reward_after_mean": -0.10017471015453339, | |
| "reward_after_std": 0.4897888842970133, | |
| "reward_before_mean": 0.15967663563787937, | |
| "reward_before_std": 0.4954986907541752, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25985134579241276, | |
| "reward_change_min": -0.4646740537136793, | |
| "reward_change_std": 0.18658488430082798, | |
| "reward_std": 0.48978889361023903, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/cosine_scaled_reward": -0.02782335691154003, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3315.937530517578, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.17189651727676392, | |
| "kl": 0.0025037527084350586, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.331941759724268e-07, | |
| "loss": 0.0035, | |
| "reward": -0.321637025102973, | |
| "reward_after_mean": -0.321637025102973, | |
| "reward_after_std": 0.36918958090245724, | |
| "reward_before_mean": -0.13817780697718263, | |
| "reward_before_std": 0.3153585446998477, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18345921859145164, | |
| "reward_change_min": -0.26327282935380936, | |
| "reward_change_std": 0.0971462931483984, | |
| "reward_std": 0.369189590215683, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.1798444762825966, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3520.187530517578, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.14897483587265015, | |
| "kl": 0.0005079209804534912, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0132, | |
| "reward": -0.16560733504593372, | |
| "reward_after_mean": -0.16560733504593372, | |
| "reward_after_std": 0.5065639726817608, | |
| "reward_before_mean": 0.06735767424106598, | |
| "reward_before_std": 0.5171879883855581, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23296500742435455, | |
| "reward_change_min": -0.46446412801742554, | |
| "reward_change_std": 0.17644464783370495, | |
| "reward_std": 0.5065639745444059, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.09930899925529957, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3304.5625, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.1551615595817566, | |
| "kl": 0.0015359818935394287, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.208167604184217e-07, | |
| "loss": 0.0443, | |
| "reward": -0.07261835690587759, | |
| "reward_after_mean": -0.07261835690587759, | |
| "reward_after_std": 0.4282707963138819, | |
| "reward_before_mean": 0.1986630754545331, | |
| "reward_before_std": 0.3714675856754184, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27128143049776554, | |
| "reward_change_min": -0.4079938605427742, | |
| "reward_change_std": 0.1546249119564891, | |
| "reward_std": 0.4282708205282688, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/cosine_scaled_reward": -0.030503608286380768, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2993.7084045410156, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.16756100952625275, | |
| "kl": 0.004540964961051941, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0082, | |
| "reward": -0.19706540927290916, | |
| "reward_after_mean": -0.19706540927290916, | |
| "reward_after_std": 0.2995363511145115, | |
| "reward_before_mean": 0.04089047887828201, | |
| "reward_before_std": 0.20250952150672674, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23795590549707413, | |
| "reward_change_min": -0.3240286745131016, | |
| "reward_change_std": 0.12177529092878103, | |
| "reward_std": 0.2995363622903824, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.10494285449385643, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3052.187530517578, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.15915194153785706, | |
| "kl": 0.00043383240699768066, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.081093963579707e-07, | |
| "loss": 0.0524, | |
| "reward": -0.3360134717077017, | |
| "reward_after_mean": -0.3360134717077017, | |
| "reward_after_std": 0.3105210345238447, | |
| "reward_before_mean": -0.14231654070317745, | |
| "reward_before_std": 0.29032292775809765, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19369693472981453, | |
| "reward_change_min": -0.33717145025730133, | |
| "reward_change_std": 0.12174026388674974, | |
| "reward_std": 0.31052104756236076, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.1839832067489624, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3304.375030517578, | |
| "epoch": 0.088, | |
| "grad_norm": 0.14390531182289124, | |
| "kl": 0.0005064904689788818, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0526, | |
| "reward": -0.2539611868560314, | |
| "reward_after_mean": -0.2539611868560314, | |
| "reward_after_std": 0.39100789465010166, | |
| "reward_before_mean": -0.043096862733364105, | |
| "reward_before_std": 0.35905097983777523, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21086432039737701, | |
| "reward_change_min": -0.35267689265310764, | |
| "reward_change_std": 0.12909309566020966, | |
| "reward_std": 0.3910079039633274, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/cosine_scaled_reward": -0.1472635231912136, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3308.541717529297, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.15253493189811707, | |
| "kl": 0.0008769482374191284, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.950875657567621e-07, | |
| "loss": 0.0074, | |
| "reward": -0.0791936544701457, | |
| "reward_after_mean": -0.0791936544701457, | |
| "reward_after_std": 0.5702229253947735, | |
| "reward_before_mean": 0.17483498714864254, | |
| "reward_before_std": 0.5539351883344352, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25402865186333656, | |
| "reward_change_min": -0.41291412711143494, | |
| "reward_change_std": 0.16963480412960052, | |
| "reward_std": 0.5702229347079992, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/cosine_scaled_reward": -0.03349835175322369, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2278.75004196167, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.29192036390304565, | |
| "kl": 0.0021656155586242676, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": -0.0445, | |
| "reward": -0.05769808869808912, | |
| "reward_after_mean": -0.05769808869808912, | |
| "reward_after_std": 0.5233242809772491, | |
| "reward_before_mean": 0.20565658761188388, | |
| "reward_before_std": 0.5024187322705984, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.26335467025637627, | |
| "reward_change_min": -0.40742968022823334, | |
| "reward_change_std": 0.1620514988899231, | |
| "reward_std": 0.52332429215312, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/cosine_scaled_reward": -0.023510104045271873, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3350.5833587646484, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.150605708360672, | |
| "kl": 0.001107722520828247, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.817671337095244e-07, | |
| "loss": -0.0069, | |
| "reward": -0.25659373961389065, | |
| "reward_after_mean": -0.25659373961389065, | |
| "reward_after_std": 0.37472186982631683, | |
| "reward_before_mean": -0.04372098250314593, | |
| "reward_before_std": 0.3439793400466442, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21287276968359947, | |
| "reward_change_min": -0.33237724378705025, | |
| "reward_change_std": 0.1219419464468956, | |
| "reward_std": 0.37472187355160713, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/cosine_scaled_reward": -0.16872098669409752, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3116.270866394043, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.23280547559261322, | |
| "kl": 0.0033311843872070312, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0373, | |
| "reward": -0.26977309957146645, | |
| "reward_after_mean": -0.26977309957146645, | |
| "reward_after_std": 0.28102972730994225, | |
| "reward_before_mean": -0.04979459196329117, | |
| "reward_before_std": 0.25032276660203934, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21997853554785252, | |
| "reward_change_min": -0.33210898377001286, | |
| "reward_change_std": 0.12461962644010782, | |
| "reward_std": 0.2810297291725874, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/cosine_scaled_reward": -0.15396124683320522, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2953.9166717529297, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.17104879021644592, | |
| "kl": 0.002446308732032776, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.681643291108517e-07, | |
| "loss": -0.0279, | |
| "reward": -0.029579367488622665, | |
| "reward_after_mean": -0.029579367488622665, | |
| "reward_after_std": 0.5071045402437449, | |
| "reward_before_mean": 0.24649053468601778, | |
| "reward_before_std": 0.4566717045381665, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27606993727386, | |
| "reward_change_min": -0.46944032795727253, | |
| "reward_change_std": 0.17568331584334373, | |
| "reward_std": 0.5071045681834221, | |
| "rewards/accuracy_reward": 0.25000000186264515, | |
| "rewards/cosine_scaled_reward": -0.003509456291794777, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2885.208366394043, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.21742479503154755, | |
| "kl": 0.0008513182401657104, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0297, | |
| "reward": -0.30521316826343536, | |
| "reward_after_mean": -0.30521316826343536, | |
| "reward_after_std": 0.3845277652144432, | |
| "reward_before_mean": -0.11277107335627079, | |
| "reward_before_std": 0.354814812541008, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19244208745658398, | |
| "reward_change_min": -0.30643659457564354, | |
| "reward_change_std": 0.1196863753721118, | |
| "reward_std": 0.3845277763903141, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/cosine_scaled_reward": -0.19610440777614713, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3133.1666870117188, | |
| "epoch": 0.096, | |
| "grad_norm": 0.16427645087242126, | |
| "kl": 0.00032585859298706055, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.54295724882796e-07, | |
| "loss": 0.0304, | |
| "reward": -0.038457803428173065, | |
| "reward_after_mean": -0.038457803428173065, | |
| "reward_after_std": 0.4437688495963812, | |
| "reward_before_mean": 0.24993115104734898, | |
| "reward_before_std": 0.44302100967615843, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28838893957436085, | |
| "reward_change_min": -0.47144375927746296, | |
| "reward_change_std": 0.18405816424638033, | |
| "reward_std": 0.4437688793987036, | |
| "rewards/accuracy_reward": 0.2500000111758709, | |
| "rewards/cosine_scaled_reward": -6.886757910251617e-05, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3301.979217529297, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.1837218999862671, | |
| "kl": 0.00045623257756233215, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0153, | |
| "reward": -0.0646021788707003, | |
| "reward_after_mean": -0.0646021788707003, | |
| "reward_after_std": 0.5770941227674484, | |
| "reward_before_mean": 0.1956373080611229, | |
| "reward_before_std": 0.5833419263362885, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.260239502415061, | |
| "reward_change_min": -0.4684751071035862, | |
| "reward_change_std": 0.18584690615534782, | |
| "reward_std": 0.5770941376686096, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/cosine_scaled_reward": -0.03352935239672661, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3119.625030517578, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.1819085329771042, | |
| "kl": 0.0014518499374389648, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.401782177833147e-07, | |
| "loss": -0.0187, | |
| "reward": -0.30323445051908493, | |
| "reward_after_mean": -0.30323445051908493, | |
| "reward_after_std": 0.3614001404494047, | |
| "reward_before_mean": -0.10566247068345547, | |
| "reward_before_std": 0.33295972365885973, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19757197797298431, | |
| "reward_change_min": -0.3210353907197714, | |
| "reward_change_std": 0.12041187938302755, | |
| "reward_std": 0.36140014231204987, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.20982913300395012, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3044.104217529297, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.17637749016284943, | |
| "kl": 0.002590000629425049, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0096, | |
| "reward": -0.14765978045761585, | |
| "reward_after_mean": -0.14765978045761585, | |
| "reward_after_std": 0.4970177672803402, | |
| "reward_before_mean": 0.08903376385569572, | |
| "reward_before_std": 0.48575887829065323, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23669353872537613, | |
| "reward_change_min": -0.40174689143896103, | |
| "reward_change_std": 0.15881517250090837, | |
| "reward_std": 0.49701779522001743, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.07763291290029883, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2976.3125610351562, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.2443528175354004, | |
| "kl": 0.0038170814514160156, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.258290078201731e-07, | |
| "loss": 0.1182, | |
| "reward": -0.06454395316541195, | |
| "reward_after_mean": -0.06454395316541195, | |
| "reward_after_std": 0.7364202737808228, | |
| "reward_before_mean": 0.16822397490614094, | |
| "reward_before_std": 0.7222829144448042, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23276793025434017, | |
| "reward_change_min": -0.48013338819146156, | |
| "reward_change_std": 0.17427105363458395, | |
| "reward_std": 0.7364202737808228, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/cosine_scaled_reward": -0.04010936478152871, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3386.9583435058594, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.16150832176208496, | |
| "kl": 0.0024889707565307617, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0032, | |
| "reward": -0.31388526782393456, | |
| "reward_after_mean": -0.31388526782393456, | |
| "reward_after_std": 0.37776545993983746, | |
| "reward_before_mean": -0.12356989830732346, | |
| "reward_before_std": 0.3436039094813168, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1903153732419014, | |
| "reward_change_min": -0.3229862004518509, | |
| "reward_change_std": 0.11718899849802256, | |
| "reward_std": 0.37776547484099865, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.18606988992542028, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2772.937530517578, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.2691808044910431, | |
| "kl": 0.004205763339996338, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.11265577295385e-07, | |
| "loss": 0.0159, | |
| "reward": -0.3370555378496647, | |
| "reward_after_mean": -0.3370555378496647, | |
| "reward_after_std": 0.3809457756578922, | |
| "reward_before_mean": -0.15994180366396904, | |
| "reward_before_std": 0.3365004351362586, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17711373046040535, | |
| "reward_change_min": -0.26847982592880726, | |
| "reward_change_std": 0.09736619610339403, | |
| "reward_std": 0.3809457868337631, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.20160847809165716, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3308.937530517578, | |
| "epoch": 0.104, | |
| "grad_norm": 0.20113322138786316, | |
| "kl": 0.001620747148990631, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.054, | |
| "reward": -0.1898544393479824, | |
| "reward_after_mean": -0.1898544393479824, | |
| "reward_after_std": 0.4037452917546034, | |
| "reward_before_mean": 0.04781687818467617, | |
| "reward_before_std": 0.40877903811633587, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23767131194472313, | |
| "reward_change_min": -0.40195638686418533, | |
| "reward_change_std": 0.16346925124526024, | |
| "reward_std": 0.4037453029304743, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.0980164622887969, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3016.812530517578, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.22104471921920776, | |
| "kl": 0.09951108694076538, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.965056695057204e-07, | |
| "loss": 0.0342, | |
| "reward": -0.19481445383280516, | |
| "reward_after_mean": -0.19481445383280516, | |
| "reward_after_std": 0.4913271814584732, | |
| "reward_before_mean": 0.025329099036753178, | |
| "reward_before_std": 0.468422326259315, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22014355659484863, | |
| "reward_change_min": -0.38161665946245193, | |
| "reward_change_std": 0.14695494808256626, | |
| "reward_std": 0.4913271926343441, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.1205042446963489, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3383.375, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.15710371732711792, | |
| "kl": 0.0020842552185058594, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0015, | |
| "reward": -0.4476584121584892, | |
| "reward_after_mean": -0.4476584121584892, | |
| "reward_after_std": 0.2138187699019909, | |
| "reward_before_mean": -0.28413364104926586, | |
| "reward_before_std": 0.17518659960478544, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.16352477483451366, | |
| "reward_change_min": -0.25845394283533096, | |
| "reward_change_std": 0.08956557791680098, | |
| "reward_std": 0.21381877548992634, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/cosine_scaled_reward": -0.2841336391866207, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3182.1458587646484, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.17469479143619537, | |
| "kl": 0.028481706976890564, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.815672671252315e-07, | |
| "loss": 0.0183, | |
| "reward": -0.1544804833829403, | |
| "reward_after_mean": -0.1544804833829403, | |
| "reward_after_std": 0.4560176860541105, | |
| "reward_before_mean": 0.08422049786895514, | |
| "reward_before_std": 0.4216147158294916, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23870097286999226, | |
| "reward_change_min": -0.40013051964342594, | |
| "reward_change_std": 0.1476370794698596, | |
| "reward_std": 0.4560176897794008, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/cosine_scaled_reward": -0.08244619239121675, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3389.1875, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.1376781314611435, | |
| "kl": 0.0006309226155281067, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": -0.0017, | |
| "reward": -0.35236234590411186, | |
| "reward_after_mean": -0.35236234590411186, | |
| "reward_after_std": 0.4059419594705105, | |
| "reward_before_mean": -0.18372074887156487, | |
| "reward_before_std": 0.34809670504182577, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1686416082084179, | |
| "reward_change_min": -0.2610710132867098, | |
| "reward_change_std": 0.0913059962913394, | |
| "reward_std": 0.4059419725090265, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.2253874186426401, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2967.5208587646484, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.18132349848747253, | |
| "kl": 0.008769378066062927, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.664685702961344e-07, | |
| "loss": 0.0782, | |
| "reward": -0.009053988382220268, | |
| "reward_after_mean": -0.009053988382220268, | |
| "reward_after_std": 0.5677688978612423, | |
| "reward_before_mean": 0.2716766329249367, | |
| "reward_before_std": 0.5742009859532118, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28073061630129814, | |
| "reward_change_min": -0.4879560321569443, | |
| "reward_change_std": 0.19168910942971706, | |
| "reward_std": 0.5677689034491777, | |
| "rewards/accuracy_reward": 0.25000000931322575, | |
| "rewards/cosine_scaled_reward": 0.021676627919077873, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3335.9166870117188, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.15675108134746552, | |
| "kl": 0.0016013942658901215, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0705, | |
| "reward": -0.20349452830851078, | |
| "reward_after_mean": -0.20349452830851078, | |
| "reward_after_std": 0.4865495152771473, | |
| "reward_before_mean": 0.01759020984172821, | |
| "reward_before_std": 0.480413525365293, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2210847306996584, | |
| "reward_change_min": -0.38277435675263405, | |
| "reward_change_std": 0.15406140219420195, | |
| "reward_std": 0.48654952459037304, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.14907646551728249, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3094.6666870117188, | |
| "epoch": 0.112, | |
| "grad_norm": 0.20178250968456268, | |
| "kl": 0.000708162784576416, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.512279744547392e-07, | |
| "loss": 0.0154, | |
| "reward": -0.2618050053715706, | |
| "reward_after_mean": -0.2618050053715706, | |
| "reward_after_std": 0.22535480838268995, | |
| "reward_before_mean": -0.03537908382713795, | |
| "reward_before_std": 0.14867154462262988, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2264259122312069, | |
| "reward_change_min": -0.31195950135588646, | |
| "reward_change_std": 0.11560725700110197, | |
| "reward_std": 0.22535481490194798, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/cosine_scaled_reward": -0.16037909872829914, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2847.812515258789, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.19928348064422607, | |
| "kl": 0.001722574234008789, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": -0.0071, | |
| "reward": -0.23697317391633987, | |
| "reward_after_mean": -0.23697317391633987, | |
| "reward_after_std": 0.22447119280695915, | |
| "reward_before_mean": -0.0008997507393360138, | |
| "reward_before_std": 0.14804014191031456, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23607343435287476, | |
| "reward_change_min": -0.3225790597498417, | |
| "reward_change_std": 0.12129600439220667, | |
| "reward_std": 0.2244712058454752, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/cosine_scaled_reward": -0.12589975725859404, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2983.666702270508, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.14032700657844543, | |
| "kl": 0.025766372680664062, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.358640479194451e-07, | |
| "loss": 0.0376, | |
| "reward": -0.2477741176262498, | |
| "reward_after_mean": -0.2477741176262498, | |
| "reward_after_std": 0.4157449547201395, | |
| "reward_before_mean": -0.03449610248208046, | |
| "reward_before_std": 0.4026447180658579, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2132780011743307, | |
| "reward_change_min": -0.3858350533992052, | |
| "reward_change_std": 0.1465577408671379, | |
| "reward_std": 0.41574496403336525, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.15949611051473767, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3004.4166717529297, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.17880059778690338, | |
| "kl": 0.0014008283615112305, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0046, | |
| "reward": -0.20459227031096816, | |
| "reward_after_mean": -0.20459227031096816, | |
| "reward_after_std": 0.33806266263127327, | |
| "reward_before_mean": 0.02772543951869011, | |
| "reward_before_std": 0.2651213016360998, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23231769353151321, | |
| "reward_change_min": -0.32845479622483253, | |
| "reward_change_std": 0.12321093957871199, | |
| "reward_std": 0.33806267008185387, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.11810789071023464, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2936.8959045410156, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.26506590843200684, | |
| "kl": 0.005680441856384277, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.203955092681039e-07, | |
| "loss": 0.0551, | |
| "reward": -0.15760041342582554, | |
| "reward_after_mean": -0.15760041342582554, | |
| "reward_after_std": 0.5418837126344442, | |
| "reward_before_mean": 0.0681868263927754, | |
| "reward_before_std": 0.5268151368945837, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22578725591301918, | |
| "reward_change_min": -0.3727243058383465, | |
| "reward_change_std": 0.14929345156997442, | |
| "reward_std": 0.5418837182223797, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.05681317043490708, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3200.9375610351562, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.16663618385791779, | |
| "kl": 0.006103813648223877, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0462, | |
| "reward": -0.21803143620491028, | |
| "reward_after_mean": -0.21803143620491028, | |
| "reward_after_std": 0.5001228470355272, | |
| "reward_before_mean": -0.009146178141236305, | |
| "reward_before_std": 0.4853008156642318, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20888524316251278, | |
| "reward_change_min": -0.39318533055484295, | |
| "reward_change_std": 0.14420427940785885, | |
| "reward_std": 0.5001228544861078, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/cosine_scaled_reward": -0.1341461860574782, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2683.6250381469727, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.19750936329364777, | |
| "kl": 0.0069179534912109375, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.048412045323164e-07, | |
| "loss": -0.0106, | |
| "reward": -0.20132402144372463, | |
| "reward_after_mean": -0.20132402144372463, | |
| "reward_after_std": 0.3386107739061117, | |
| "reward_before_mean": 0.03512360900640488, | |
| "reward_before_std": 0.28090421110391617, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2364476341754198, | |
| "reward_change_min": -0.3883332423865795, | |
| "reward_change_std": 0.13935024105012417, | |
| "reward_std": 0.33861077949404716, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.13154308311641216, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3068.687530517578, | |
| "epoch": 0.12, | |
| "grad_norm": 0.18446093797683716, | |
| "kl": 0.004405617713928223, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0333, | |
| "reward": -0.049622944090515375, | |
| "reward_after_mean": -0.049622944090515375, | |
| "reward_after_std": 0.6895017940551043, | |
| "reward_before_mean": 0.2017551939934492, | |
| "reward_before_std": 0.7053272109478712, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25137814693152905, | |
| "reward_change_min": -0.5445310994982719, | |
| "reward_change_std": 0.2017376320436597, | |
| "reward_std": 0.6895018108189106, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/cosine_scaled_reward": -0.027411479502916336, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2494.3334007263184, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.16192932426929474, | |
| "kl": 0.01092296838760376, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.892200842364462e-07, | |
| "loss": 0.0417, | |
| "reward": 0.27581100910902023, | |
| "reward_after_mean": 0.27581100910902023, | |
| "reward_after_std": 0.5071319099515676, | |
| "reward_before_mean": 0.6641694903373718, | |
| "reward_before_std": 0.4399477792903781, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3883584924042225, | |
| "reward_change_min": -0.5921457093209028, | |
| "reward_change_std": 0.2340643797069788, | |
| "reward_std": 0.5071319285780191, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/cosine_scaled_reward": 0.18500283360481262, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3063.854217529297, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.2792545557022095, | |
| "kl": 0.0023031234741210938, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": -0.0245, | |
| "reward": -0.181765828281641, | |
| "reward_after_mean": -0.181765828281641, | |
| "reward_after_std": 0.338009238243103, | |
| "reward_before_mean": 0.06094588339328766, | |
| "reward_before_std": 0.2771811536513269, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24271169770509005, | |
| "reward_change_min": -0.40003350004553795, | |
| "reward_change_std": 0.14317026268690825, | |
| "reward_std": 0.33800926245748997, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.10572081245481968, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3032.0000610351562, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.22073347866535187, | |
| "kl": 0.0028527379035949707, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.735511803093248e-07, | |
| "loss": 0.0812, | |
| "reward": -0.06596326269209385, | |
| "reward_after_mean": -0.06596326269209385, | |
| "reward_after_std": 0.6328165102750063, | |
| "reward_before_mean": 0.18634681031107903, | |
| "reward_before_std": 0.6565964054316282, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2523100730031729, | |
| "reward_change_min": -0.5472348593175411, | |
| "reward_change_std": 0.2049607066437602, | |
| "reward_std": 0.632816543802619, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/cosine_scaled_reward": -0.021986512932926416, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3071.3958435058594, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.1601523756980896, | |
| "kl": 0.0012826323509216309, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0382, | |
| "reward": -0.2346851173788309, | |
| "reward_after_mean": -0.2346851173788309, | |
| "reward_after_std": 0.3800167478621006, | |
| "reward_before_mean": -0.01405545324087143, | |
| "reward_before_std": 0.35714889597147703, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22062966413795948, | |
| "reward_change_min": -0.34099637903273106, | |
| "reward_change_std": 0.13036640547215939, | |
| "reward_std": 0.38001675345003605, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/cosine_scaled_reward": -0.13905547931790352, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3079.645866394043, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.20363496243953705, | |
| "kl": 0.0030457042157649994, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.578535828967777e-07, | |
| "loss": 0.0427, | |
| "reward": -0.24088630080223083, | |
| "reward_after_mean": -0.24088630080223083, | |
| "reward_after_std": 0.578212320804596, | |
| "reward_before_mean": -0.056965045630931854, | |
| "reward_before_std": 0.5253217183053493, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18392126075923443, | |
| "reward_change_min": -0.3033385146409273, | |
| "reward_change_std": 0.10721975099295378, | |
| "reward_std": 0.5782123357057571, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/cosine_scaled_reward": -0.18196504982188344, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3483.5625610351562, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.1459619253873825, | |
| "kl": 0.002910614013671875, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0093, | |
| "reward": -0.20540679898113012, | |
| "reward_after_mean": -0.20540679898113012, | |
| "reward_after_std": 0.5167342368513346, | |
| "reward_before_mean": 0.008995672687888145, | |
| "reward_before_std": 0.5046728178858757, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21440248005092144, | |
| "reward_change_min": -0.4584047421813011, | |
| "reward_change_std": 0.15795970242470503, | |
| "reward_std": 0.5167342405766249, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.11600433615967631, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3405.0833740234375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.16170577704906464, | |
| "kl": 0.0009909868240356445, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.421464171032224e-07, | |
| "loss": 0.0451, | |
| "reward": -0.09846091084182262, | |
| "reward_after_mean": -0.09846091084182262, | |
| "reward_after_std": 0.5994726214557886, | |
| "reward_before_mean": 0.14584718085825443, | |
| "reward_before_std": 0.6081715226173401, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24430808424949646, | |
| "reward_change_min": -0.46415193751454353, | |
| "reward_change_std": 0.18282546661794186, | |
| "reward_std": 0.5994726475328207, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/cosine_scaled_reward": -0.06248616240918636, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3235.9166870117188, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.21097467839717865, | |
| "kl": 0.004243135452270508, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0744, | |
| "reward": -0.23292838409543037, | |
| "reward_after_mean": -0.23292838409543037, | |
| "reward_after_std": 0.41003835014998913, | |
| "reward_before_mean": -0.012828025966882706, | |
| "reward_before_std": 0.4060424007475376, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22010035812854767, | |
| "reward_change_min": -0.3953362423926592, | |
| "reward_change_std": 0.15193824656307697, | |
| "reward_std": 0.41003837063908577, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.13782802782952785, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2685.479217529297, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.18926027417182922, | |
| "kl": 0.0038733482360839844, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.264488196906752e-07, | |
| "loss": -0.0345, | |
| "reward": -0.3609940633177757, | |
| "reward_after_mean": -0.3609940633177757, | |
| "reward_after_std": 0.31448194198310375, | |
| "reward_before_mean": -0.17894649133086205, | |
| "reward_before_std": 0.2785217398777604, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18204755894839764, | |
| "reward_change_min": -0.3147698640823364, | |
| "reward_change_std": 0.1119089126586914, | |
| "reward_std": 0.31448194943368435, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.22061316156759858, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3032.2916870117188, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.20660527050495148, | |
| "kl": 0.004292488098144531, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0133, | |
| "reward": -0.25222317641600966, | |
| "reward_after_mean": -0.25222317641600966, | |
| "reward_after_std": 0.3675101902335882, | |
| "reward_before_mean": -0.038438186049461365, | |
| "reward_before_std": 0.3253793818876147, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21378498524427414, | |
| "reward_change_min": -0.32515949942171574, | |
| "reward_change_std": 0.12277536746114492, | |
| "reward_std": 0.36751021072268486, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/cosine_scaled_reward": -0.1634381867479533, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3507.2291870117188, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.14657403528690338, | |
| "kl": 0.0013550519943237305, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.107799157635538e-07, | |
| "loss": 0.0071, | |
| "reward": -0.3162567066028714, | |
| "reward_after_mean": -0.3162567066028714, | |
| "reward_after_std": 0.37543779239058495, | |
| "reward_before_mean": -0.12292648106813431, | |
| "reward_before_std": 0.3578766481950879, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19333022460341454, | |
| "reward_change_min": -0.37804919853806496, | |
| "reward_change_std": 0.1362152397632599, | |
| "reward_std": 0.37543781008571386, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/cosine_scaled_reward": -0.2062598317861557, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3425.5, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.17479689419269562, | |
| "kl": 0.0026268959045410156, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": -0.0087, | |
| "reward": -0.39809813536703587, | |
| "reward_after_mean": -0.39809813536703587, | |
| "reward_after_std": 0.29817865043878555, | |
| "reward_before_mean": -0.22966473922133446, | |
| "reward_before_std": 0.25032039918005466, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.168433403596282, | |
| "reward_change_min": -0.254726717248559, | |
| "reward_change_std": 0.09154631663113832, | |
| "reward_std": 0.298178656026721, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/cosine_scaled_reward": -0.2504980657249689, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3336.7709045410156, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.1754436492919922, | |
| "kl": 0.0014584064483642578, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.951587954676837e-07, | |
| "loss": 0.0358, | |
| "reward": 0.049660567194223404, | |
| "reward_after_mean": 0.049660567194223404, | |
| "reward_after_std": 0.6818990353494883, | |
| "reward_before_mean": 0.33523961436003447, | |
| "reward_before_std": 0.6836029682308435, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2855790425091982, | |
| "reward_change_min": -0.5425571762025356, | |
| "reward_change_std": 0.20386025682091713, | |
| "reward_std": 0.6818990539759398, | |
| "rewards/accuracy_reward": 0.27083334140479565, | |
| "rewards/cosine_scaled_reward": 0.06440626340918243, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2581.8333892822266, | |
| "epoch": 0.136, | |
| "grad_norm": 0.19370344281196594, | |
| "kl": 0.008479833602905273, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0367, | |
| "reward": 0.004929829388856888, | |
| "reward_after_mean": 0.004929829388856888, | |
| "reward_after_std": 0.4302307050675154, | |
| "reward_before_mean": 0.30591132678091526, | |
| "reward_before_std": 0.3950154595077038, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3009814973920584, | |
| "reward_change_min": -0.4940522387623787, | |
| "reward_change_std": 0.18756380956619978, | |
| "reward_std": 0.43023071624338627, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": 0.05591131933033466, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2806.541717529297, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.31839409470558167, | |
| "kl": 0.005864620208740234, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.79604490731896e-07, | |
| "loss": 0.0281, | |
| "reward": -0.14376608515158296, | |
| "reward_after_mean": -0.14376608515158296, | |
| "reward_after_std": 0.4351091645658016, | |
| "reward_before_mean": 0.10138015681877732, | |
| "reward_before_std": 0.41372954845428467, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2451462484896183, | |
| "reward_change_min": -0.4056401252746582, | |
| "reward_change_std": 0.1532444702461362, | |
| "reward_std": 0.43510917387902737, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/cosine_scaled_reward": -0.06528650567634031, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2467.1042251586914, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.2131020575761795, | |
| "kl": 0.01170969009399414, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": -0.0323, | |
| "reward": -0.13892671652138233, | |
| "reward_after_mean": -0.13892671652138233, | |
| "reward_after_std": 0.42193593084812164, | |
| "reward_before_mean": 0.10583998123183846, | |
| "reward_before_std": 0.3574620336294174, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24476669542491436, | |
| "reward_change_min": -0.39448612928390503, | |
| "reward_change_std": 0.14441118016839027, | |
| "reward_std": 0.42193594202399254, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/cosine_scaled_reward": -0.08166004437953234, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3173.8333740234375, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.19904720783233643, | |
| "kl": 0.0028328895568847656, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.641359520805548e-07, | |
| "loss": 0.0668, | |
| "reward": -0.009627103805541992, | |
| "reward_after_mean": -0.009627103805541992, | |
| "reward_after_std": 0.4583488889038563, | |
| "reward_before_mean": 0.2879569726064801, | |
| "reward_before_std": 0.4659885112196207, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.29758409410715103, | |
| "reward_change_min": -0.47819996625185013, | |
| "reward_change_std": 0.19638753589242697, | |
| "reward_std": 0.45834890380501747, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/cosine_scaled_reward": 0.03795698191970587, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3264.0208435058594, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.152817502617836, | |
| "kl": 0.002018570899963379, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0387, | |
| "reward": -0.19304709881544113, | |
| "reward_after_mean": -0.19304709881544113, | |
| "reward_after_std": 0.4907000754028559, | |
| "reward_before_mean": 0.028430650010704994, | |
| "reward_before_std": 0.4807597752660513, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22147774696350098, | |
| "reward_change_min": -0.4099309202283621, | |
| "reward_change_std": 0.1521046319976449, | |
| "reward_std": 0.490700077265501, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.11740268766880035, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2913.812530517578, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.18517668545246124, | |
| "kl": 0.03206205368041992, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.4877202554526084e-07, | |
| "loss": -0.0244, | |
| "reward": 0.025332925841212273, | |
| "reward_after_mean": 0.025332925841212273, | |
| "reward_after_std": 0.5710857696831226, | |
| "reward_before_mean": 0.30887394258752465, | |
| "reward_before_std": 0.5023922696709633, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28354101814329624, | |
| "reward_change_min": -0.4145784545689821, | |
| "reward_change_std": 0.1590277198702097, | |
| "reward_std": 0.5710857976227999, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/cosine_scaled_reward": 0.017207262571901083, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2971.520866394043, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.18670791387557983, | |
| "kl": 0.005125522613525391, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": -0.0218, | |
| "reward": -0.09776409342885017, | |
| "reward_after_mean": -0.09776409342885017, | |
| "reward_after_std": 0.3319449555128813, | |
| "reward_before_mean": 0.17860279604792595, | |
| "reward_before_std": 0.2927938736975193, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.276366900652647, | |
| "reward_change_min": -0.4111520666629076, | |
| "reward_change_std": 0.1613099630922079, | |
| "reward_std": 0.3319449629634619, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/cosine_scaled_reward": -0.029730526730418205, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3012.0209045410156, | |
| "epoch": 0.144, | |
| "grad_norm": 0.1566249281167984, | |
| "kl": 0.0010142326354980469, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.3353142970386557e-07, | |
| "loss": 0.0173, | |
| "reward": -0.037185917142778635, | |
| "reward_after_mean": -0.037185917142778635, | |
| "reward_after_std": 0.4496050179004669, | |
| "reward_before_mean": 0.24768859706819057, | |
| "reward_before_std": 0.43565093353390694, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28487453050911427, | |
| "reward_change_min": -0.47056199237704277, | |
| "reward_change_std": 0.1813699882477522, | |
| "reward_std": 0.4496050253510475, | |
| "rewards/accuracy_reward": 0.2500000111758709, | |
| "rewards/cosine_scaled_reward": -0.0023114103823900223, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3442.541748046875, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.1765584796667099, | |
| "kl": 0.0017638206481933594, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": -0.0108, | |
| "reward": -0.41788332164287567, | |
| "reward_after_mean": -0.41788332164287567, | |
| "reward_after_std": 0.21475844830274582, | |
| "reward_before_mean": -0.2430284023284912, | |
| "reward_before_std": 0.1823626570403576, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17485490441322327, | |
| "reward_change_min": -0.2704111132770777, | |
| "reward_change_std": 0.09725910797715187, | |
| "reward_std": 0.21475845575332642, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/cosine_scaled_reward": -0.2430284097790718, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2990.0208587646484, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.16409148275852203, | |
| "kl": 0.0043749213218688965, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.1843273287476854e-07, | |
| "loss": 0.0516, | |
| "reward": 0.06294518522918224, | |
| "reward_after_mean": 0.06294518522918224, | |
| "reward_after_std": 0.5372496526688337, | |
| "reward_before_mean": 0.37420000694692135, | |
| "reward_before_std": 0.5208405908197165, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31125483103096485, | |
| "reward_change_min": -0.5556082502007484, | |
| "reward_change_std": 0.2155110565945506, | |
| "reward_std": 0.5372496694326401, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": 0.08253331389278173, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3551.75, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.13626347482204437, | |
| "kl": 0.0023354291915893555, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": -0.0121, | |
| "reward": -0.3392734844237566, | |
| "reward_after_mean": -0.3392734844237566, | |
| "reward_after_std": 0.30535995587706566, | |
| "reward_before_mean": -0.14702198840677738, | |
| "reward_before_std": 0.277429336681962, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19225149974226952, | |
| "reward_change_min": -0.3303426429629326, | |
| "reward_change_std": 0.11987168062478304, | |
| "reward_std": 0.30535995960235596, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.18868865631520748, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3362.312530517578, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.16789855062961578, | |
| "kl": 0.006592273712158203, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.034943304942796e-07, | |
| "loss": 0.0295, | |
| "reward": -0.2279136423021555, | |
| "reward_after_mean": -0.2279136423021555, | |
| "reward_after_std": 0.35067533142864704, | |
| "reward_before_mean": -0.0015783179551362991, | |
| "reward_before_std": 0.3160779979079962, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22633531503379345, | |
| "reward_change_min": -0.33433668687939644, | |
| "reward_change_std": 0.13016977813094854, | |
| "reward_std": 0.35067535378038883, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/cosine_scaled_reward": -0.1265783249400556, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2975.583396911621, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.23556658625602722, | |
| "kl": 0.008499383926391602, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0364, | |
| "reward": 0.07813497632741928, | |
| "reward_after_mean": 0.07813497632741928, | |
| "reward_after_std": 0.5219608303159475, | |
| "reward_before_mean": 0.39421502873301506, | |
| "reward_before_std": 0.48203556798398495, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3160800375044346, | |
| "reward_change_min": -0.5246662721037865, | |
| "reward_change_std": 0.20450247451663017, | |
| "reward_std": 0.521960835903883, | |
| "rewards/accuracy_reward": 0.3125000037252903, | |
| "rewards/cosine_scaled_reward": 0.08171500638127327, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3205.5416870117188, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.1620175540447235, | |
| "kl": 0.00755995512008667, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.8873442270461485e-07, | |
| "loss": 0.0057, | |
| "reward": -0.14543947018682957, | |
| "reward_after_mean": -0.14543947018682957, | |
| "reward_after_std": 0.47973890602588654, | |
| "reward_before_mean": 0.09642863646149635, | |
| "reward_before_std": 0.4733261279761791, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24186810292303562, | |
| "reward_change_min": -0.4151979051530361, | |
| "reward_change_std": 0.16245489288121462, | |
| "reward_std": 0.47973891720175743, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/cosine_scaled_reward": -0.09107136679813266, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3379.6666870117188, | |
| "epoch": 0.152, | |
| "grad_norm": 0.16600805521011353, | |
| "kl": 0.0018353462219238281, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": -0.0087, | |
| "reward": -0.2287772335112095, | |
| "reward_after_mean": -0.2287772335112095, | |
| "reward_after_std": 0.3634595964103937, | |
| "reward_before_mean": -0.006547695025801659, | |
| "reward_before_std": 0.3184118759818375, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22222953848540783, | |
| "reward_change_min": -0.3860268648713827, | |
| "reward_change_std": 0.13445519004017115, | |
| "reward_std": 0.363459600135684, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.17321436238125898, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2977.5416717529297, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.18056733906269073, | |
| "kl": 0.007370948791503906, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.7417099217982686e-07, | |
| "loss": -0.0006, | |
| "reward": -0.13877355121076107, | |
| "reward_after_mean": -0.13877355121076107, | |
| "reward_after_std": 0.41921201907098293, | |
| "reward_before_mean": 0.10657497371721547, | |
| "reward_before_std": 0.34318217262625694, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24534854479134083, | |
| "reward_change_min": -0.37725237011909485, | |
| "reward_change_std": 0.13643485959619284, | |
| "reward_std": 0.4192120339721441, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/cosine_scaled_reward": -0.08092502504587173, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2650.5833740234375, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.24054281413555145, | |
| "kl": 0.00955343246459961, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.1139, | |
| "reward": 0.22294577676802874, | |
| "reward_after_mean": 0.22294577676802874, | |
| "reward_after_std": 0.6538244057446718, | |
| "reward_before_mean": 0.5760083859786391, | |
| "reward_before_std": 0.6345205157995224, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3530625980347395, | |
| "reward_change_min": -0.5656783059239388, | |
| "reward_change_std": 0.22699419222772121, | |
| "reward_std": 0.653824420645833, | |
| "rewards/accuracy_reward": 0.4166666753590107, | |
| "rewards/cosine_scaled_reward": 0.1593416929244995, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3299.041717529297, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.22330515086650848, | |
| "kl": 0.0055866241455078125, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.5982178221668533e-07, | |
| "loss": 0.051, | |
| "reward": -0.04042801447212696, | |
| "reward_after_mean": -0.04042801447212696, | |
| "reward_after_std": 0.6660582087934017, | |
| "reward_before_mean": 0.213487334898673, | |
| "reward_before_std": 0.6636907681822777, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.253915349021554, | |
| "reward_change_min": -0.48380811884999275, | |
| "reward_change_std": 0.18504780530929565, | |
| "reward_std": 0.6660582162439823, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/cosine_scaled_reward": -0.01567934500053525, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3480.1875, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.14716432988643646, | |
| "kl": 0.0042552947998046875, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": -0.0185, | |
| "reward": -0.3442451786249876, | |
| "reward_after_mean": -0.3442451786249876, | |
| "reward_after_std": 0.2829555720090866, | |
| "reward_before_mean": -0.15177570283412933, | |
| "reward_before_std": 0.2524709850549698, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19246947765350342, | |
| "reward_change_min": -0.3142265174537897, | |
| "reward_change_std": 0.1153265843167901, | |
| "reward_std": 0.28295557759702206, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.1934423731872812, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2989.8125610351562, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.17452307045459747, | |
| "kl": 0.006173610687255859, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.45704275117204e-07, | |
| "loss": 0.0592, | |
| "reward": -0.17894627060741186, | |
| "reward_after_mean": -0.17894627060741186, | |
| "reward_after_std": 0.4771917462348938, | |
| "reward_before_mean": 0.04556870646774769, | |
| "reward_before_std": 0.4428609721362591, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22451498359441757, | |
| "reward_change_min": -0.34395742416381836, | |
| "reward_change_std": 0.13081647735089064, | |
| "reward_std": 0.4771917574107647, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/cosine_scaled_reward": -0.10026463610120118, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3251.3333435058594, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.1768011748790741, | |
| "kl": 0.0027399063110351562, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": -0.0126, | |
| "reward": -0.3432940714992583, | |
| "reward_after_mean": -0.3432940714992583, | |
| "reward_after_std": 0.29503406770527363, | |
| "reward_before_mean": -0.1541894283145666, | |
| "reward_before_std": 0.24837601464241743, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18910463713109493, | |
| "reward_change_min": -0.27739391289651394, | |
| "reward_change_std": 0.102206707932055, | |
| "reward_std": 0.2950340714305639, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/cosine_scaled_reward": -0.17502276599407196, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3437.5833435058594, | |
| "epoch": 0.16, | |
| "grad_norm": 0.17263667285442352, | |
| "kl": 0.00865936279296875, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.3183567088914833e-07, | |
| "loss": 0.023, | |
| "reward": -0.13906344398856163, | |
| "reward_after_mean": -0.13906344398856163, | |
| "reward_after_std": 0.39415651746094227, | |
| "reward_before_mean": 0.10775475949048996, | |
| "reward_before_std": 0.3045333120971918, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24681820906698704, | |
| "reward_change_min": -0.3400336131453514, | |
| "reward_change_std": 0.1301204552873969, | |
| "reward_std": 0.39415652118623257, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.05891190283000469, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3324.5416870117188, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.15513010323047638, | |
| "kl": 0.004058837890625, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0228, | |
| "reward": -0.2757847439497709, | |
| "reward_after_mean": -0.2757847439497709, | |
| "reward_after_std": 0.5616568475961685, | |
| "reward_before_mean": -0.10039510112255812, | |
| "reward_before_std": 0.5163733661174774, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17538963817059994, | |
| "reward_change_min": -0.3024909198284149, | |
| "reward_change_std": 0.1056766239926219, | |
| "reward_std": 0.5616568718105555, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.20456177182495594, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3058.2708587646484, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.17221488058567047, | |
| "kl": 0.004315614700317383, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.182328662904756e-07, | |
| "loss": 0.0362, | |
| "reward": -0.15579389221966267, | |
| "reward_after_mean": -0.15579389221966267, | |
| "reward_after_std": 0.4785591959953308, | |
| "reward_before_mean": 0.07918158546090126, | |
| "reward_before_std": 0.46206824481487274, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23497546650469303, | |
| "reward_change_min": -0.3963175192475319, | |
| "reward_change_std": 0.15385002456605434, | |
| "reward_std": 0.4785592146217823, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.08748509921133518, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3342.750030517578, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.2854948937892914, | |
| "kl": 0.006926536560058594, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0212, | |
| "reward": -0.37999577447772026, | |
| "reward_after_mean": -0.37999577447772026, | |
| "reward_after_std": 0.3082189168781042, | |
| "reward_before_mean": -0.2052184585481882, | |
| "reward_before_std": 0.2710018716752529, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1747773140668869, | |
| "reward_change_min": -0.27538760751485825, | |
| "reward_change_std": 0.0982753811404109, | |
| "reward_std": 0.3082189206033945, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/cosine_scaled_reward": -0.22605179494712502, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3177.875, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.19952303171157837, | |
| "kl": 0.010839700698852539, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.0491243424323783e-07, | |
| "loss": -0.0267, | |
| "reward": -0.07587263640016317, | |
| "reward_after_mean": -0.07587263640016317, | |
| "reward_after_std": 0.5351244080811739, | |
| "reward_before_mean": 0.18004657654091716, | |
| "reward_before_std": 0.5149005856364965, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25591920129954815, | |
| "reward_change_min": -0.3980993516743183, | |
| "reward_change_std": 0.15681752562522888, | |
| "reward_std": 0.5351244378834963, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/cosine_scaled_reward": -0.0491200964897871, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2733.541717529297, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.2626601457595825, | |
| "kl": 0.0073996782302856445, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": -0.0461, | |
| "reward": 0.024279942736029625, | |
| "reward_after_mean": 0.024279942736029625, | |
| "reward_after_std": 0.4396217279136181, | |
| "reward_before_mean": 0.3221998196095228, | |
| "reward_before_std": 0.3181461291387677, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.29791986010968685, | |
| "reward_change_min": -0.40559068880975246, | |
| "reward_change_std": 0.15221791248768568, | |
| "reward_std": 0.4396217316389084, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": 0.030533142387866974, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3020.479232788086, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.16485413908958435, | |
| "kl": 0.0025103092193603516, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.918906036420294e-07, | |
| "loss": 0.0027, | |
| "reward": -0.39893655199557543, | |
| "reward_after_mean": -0.39893655199557543, | |
| "reward_after_std": 0.3212998090311885, | |
| "reward_before_mean": -0.23035122826695442, | |
| "reward_before_std": 0.293486341368407, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.16858533024787903, | |
| "reward_change_min": -0.318555012345314, | |
| "reward_change_std": 0.11151506658643484, | |
| "reward_std": 0.3212998118251562, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.2720178929157555, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3294.8333740234375, | |
| "epoch": 0.168, | |
| "grad_norm": 0.1880612075328827, | |
| "kl": 0.0035572052001953125, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0363, | |
| "reward": -0.20948936231434345, | |
| "reward_after_mean": -0.20948936231434345, | |
| "reward_after_std": 0.5007001124322414, | |
| "reward_before_mean": 0.00404274370521307, | |
| "reward_before_std": 0.48459853883832693, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21353211253881454, | |
| "reward_change_min": -0.40709931403398514, | |
| "reward_change_std": 0.15078270249068737, | |
| "reward_std": 0.5007001329213381, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/cosine_scaled_reward": -0.12095727026462555, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3102.437530517578, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.17371371388435364, | |
| "kl": 0.003039836883544922, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.791832395815782e-07, | |
| "loss": 0.039, | |
| "reward": -0.17386498861014843, | |
| "reward_after_mean": -0.17386498861014843, | |
| "reward_after_std": 0.33598186261951923, | |
| "reward_before_mean": 0.07372774556279182, | |
| "reward_before_std": 0.2765905484557152, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2475927509367466, | |
| "reward_change_min": -0.4050207640975714, | |
| "reward_change_std": 0.14486555475741625, | |
| "reward_std": 0.3359818644821644, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.09293892048299313, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3192.541717529297, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.14574302732944489, | |
| "kl": 0.002967357635498047, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.052, | |
| "reward": -0.1852257801219821, | |
| "reward_after_mean": -0.1852257801219821, | |
| "reward_after_std": 0.49382951483130455, | |
| "reward_before_mean": 0.04067722149193287, | |
| "reward_before_std": 0.48999650962650776, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.225903008133173, | |
| "reward_change_min": -0.4073325898498297, | |
| "reward_change_std": 0.15895004384219646, | |
| "reward_std": 0.4938295166939497, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.10515611711889505, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3230.937530517578, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.17936724424362183, | |
| "kl": 0.006089210510253906, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.6680582402757324e-07, | |
| "loss": 0.073, | |
| "reward": -0.1742450948804617, | |
| "reward_after_mean": -0.1742450948804617, | |
| "reward_after_std": 0.46301714703440666, | |
| "reward_before_mean": 0.058416103944182396, | |
| "reward_before_std": 0.4464929485693574, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23266121558845043, | |
| "reward_change_min": -0.4085942395031452, | |
| "reward_change_std": 0.15767131559550762, | |
| "reward_std": 0.4630171600729227, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/cosine_scaled_reward": -0.08741722581908107, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3226.8333740234375, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.1740923374891281, | |
| "kl": 0.004552721977233887, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0624, | |
| "reward": -0.014221547171473503, | |
| "reward_after_mean": -0.014221547171473503, | |
| "reward_after_std": 0.5023653339594603, | |
| "reward_before_mean": 0.26862012315541506, | |
| "reward_before_std": 0.44354164972901344, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2828416656702757, | |
| "reward_change_min": -0.4648453965783119, | |
| "reward_change_std": 0.1752566946670413, | |
| "reward_std": 0.5023653507232666, | |
| "rewards/accuracy_reward": 0.25000000186264515, | |
| "rewards/cosine_scaled_reward": 0.018620114773511887, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2961.770854949951, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.23375259339809418, | |
| "kl": 0.0030716657638549805, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.547734369542718e-07, | |
| "loss": -0.0393, | |
| "reward": -0.3499955367296934, | |
| "reward_after_mean": -0.3499955367296934, | |
| "reward_after_std": 0.39264952577650547, | |
| "reward_before_mean": -0.17821490950882435, | |
| "reward_before_std": 0.34836819861084223, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17178061790764332, | |
| "reward_change_min": -0.26522634364664555, | |
| "reward_change_std": 0.09525439888238907, | |
| "reward_std": 0.3926495313644409, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.21988157741725445, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3206.6458740234375, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.21046775579452515, | |
| "kl": 0.07404422760009766, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0096, | |
| "reward": -0.26564084365963936, | |
| "reward_after_mean": -0.26564084365963936, | |
| "reward_after_std": 0.37918250635266304, | |
| "reward_before_mean": -0.05798601661808789, | |
| "reward_before_std": 0.3497645128518343, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20765481144189835, | |
| "reward_change_min": -0.3374109137803316, | |
| "reward_change_std": 0.12641454488039017, | |
| "reward_std": 0.3791825193911791, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.16215269826352596, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3517.7708740234375, | |
| "epoch": 0.176, | |
| "grad_norm": 0.14556263387203217, | |
| "kl": 0.002063751220703125, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.4310073797187573e-07, | |
| "loss": 0.007, | |
| "reward": -0.034340658225119114, | |
| "reward_after_mean": -0.034340658225119114, | |
| "reward_after_std": 0.5944497399032116, | |
| "reward_before_mean": 0.23581288009881973, | |
| "reward_before_std": 0.6102161034941673, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27015353739261627, | |
| "reward_change_min": -0.5426580011844635, | |
| "reward_change_std": 0.20289112720638514, | |
| "reward_std": 0.594449769705534, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/cosine_scaled_reward": 0.006646204274147749, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2816.2916717529297, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.2057914435863495, | |
| "kl": 0.04213452339172363, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0263, | |
| "reward": 0.010001560673117638, | |
| "reward_after_mean": 0.010001560673117638, | |
| "reward_after_std": 0.3674746323376894, | |
| "reward_before_mean": 0.31472931057214737, | |
| "reward_before_std": 0.2635038308799267, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30472773127257824, | |
| "reward_change_min": -0.4399359282106161, | |
| "reward_change_std": 0.16246692463755608, | |
| "reward_std": 0.36747463420033455, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": 0.023062625899910927, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3464.187530517578, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.14835189282894135, | |
| "kl": 0.0031020641326904297, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.3180194846605364e-07, | |
| "loss": 0.0117, | |
| "reward": -0.21181728318333626, | |
| "reward_after_mean": -0.21181728318333626, | |
| "reward_after_std": 0.47288844734430313, | |
| "reward_before_mean": 0.003191556199453771, | |
| "reward_before_std": 0.44311563204973936, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21500884927809238, | |
| "reward_change_min": -0.3760814815759659, | |
| "reward_change_std": 0.14059338811784983, | |
| "reward_std": 0.472888458520174, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/cosine_scaled_reward": -0.14264179207384586, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3396.875030517578, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.16735728085041046, | |
| "kl": 0.004496216773986816, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0078, | |
| "reward": -0.26899631321430206, | |
| "reward_after_mean": -0.26899631321430206, | |
| "reward_after_std": 0.28598711267113686, | |
| "reward_before_mean": -0.04634012281894684, | |
| "reward_before_std": 0.2616874389350414, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22265619039535522, | |
| "reward_change_min": -0.32796306163072586, | |
| "reward_change_std": 0.1285459529608488, | |
| "reward_std": 0.2859871182590723, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/cosine_scaled_reward": -0.15050679631531239, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3461.1250610351562, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.1638031005859375, | |
| "kl": 0.002267122268676758, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.2089083427137329e-07, | |
| "loss": 0.0207, | |
| "reward": -0.06732317310525104, | |
| "reward_after_mean": -0.06732317310525104, | |
| "reward_after_std": 0.4877306241542101, | |
| "reward_before_mean": 0.19353781951213023, | |
| "reward_before_std": 0.40885435976088047, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2608609963208437, | |
| "reward_change_min": -0.40801957063376904, | |
| "reward_change_std": 0.15057573933154345, | |
| "reward_std": 0.48773064091801643, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/cosine_scaled_reward": -0.014795510563999414, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3426.4166870117188, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.14988620579242706, | |
| "kl": 0.015199661254882812, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0272, | |
| "reward": -0.375995397567749, | |
| "reward_after_mean": -0.375995397567749, | |
| "reward_after_std": 0.30748917162418365, | |
| "reward_before_mean": -0.19884050451219082, | |
| "reward_before_std": 0.27203916758298874, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1771549005061388, | |
| "reward_change_min": -0.3032604958862066, | |
| "reward_change_std": 0.108861212618649, | |
| "reward_std": 0.3074891772121191, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.24050716683268547, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3183.500045776367, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.1655987948179245, | |
| "kl": 0.009595870971679688, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.1038068889975259e-07, | |
| "loss": -0.0121, | |
| "reward": -0.09110978245735168, | |
| "reward_after_mean": -0.09110978245735168, | |
| "reward_after_std": 0.6802807692438364, | |
| "reward_before_mean": 0.14124558059847914, | |
| "reward_before_std": 0.6676155887544155, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23235537484288216, | |
| "reward_change_min": -0.43460565991699696, | |
| "reward_change_std": 0.16776727978140116, | |
| "reward_std": 0.6802807692438364, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/cosine_scaled_reward": -0.08792108716443181, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3223.041717529297, | |
| "epoch": 0.184, | |
| "grad_norm": 0.16486133635044098, | |
| "kl": 0.032108306884765625, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.065, | |
| "reward": -0.03917538747191429, | |
| "reward_after_mean": -0.03917538747191429, | |
| "reward_after_std": 0.4343126844614744, | |
| "reward_before_mean": 0.24449945986270905, | |
| "reward_before_std": 0.38117840187624097, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2836748603731394, | |
| "reward_change_min": -0.4806545842438936, | |
| "reward_change_std": 0.17813984956592321, | |
| "reward_std": 0.43431270122528076, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/cosine_scaled_reward": 0.015332793816924095, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3437.6458740234375, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.2203022539615631, | |
| "kl": 0.00640869140625, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.0028431734436308e-07, | |
| "loss": 0.0205, | |
| "reward": -0.13686673156917095, | |
| "reward_after_mean": -0.13686673156917095, | |
| "reward_after_std": 0.4702302608639002, | |
| "reward_before_mean": 0.10987010970711708, | |
| "reward_before_std": 0.4584337314590812, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24673686362802982, | |
| "reward_change_min": -0.39340290054678917, | |
| "reward_change_std": 0.1597390165552497, | |
| "reward_std": 0.4702302720397711, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/cosine_scaled_reward": -0.09846324659883976, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2954.437545776367, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.1848427951335907, | |
| "kl": 0.021541118621826172, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0242, | |
| "reward": 0.15330182015895844, | |
| "reward_after_mean": 0.15330182015895844, | |
| "reward_after_std": 0.32201647013425827, | |
| "reward_before_mean": 0.5199118303135037, | |
| "reward_before_std": 0.22552276588976383, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3666100241243839, | |
| "reward_change_min": -0.5059312395751476, | |
| "reward_change_std": 0.1948932707309723, | |
| "reward_std": 0.3220164868980646, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/cosine_scaled_reward": 0.16574516613036394, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3017.562530517578, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.34415239095687866, | |
| "kl": 0.005047798156738281, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.9061402047871833e-07, | |
| "loss": 0.0798, | |
| "reward": -0.09205959737300873, | |
| "reward_after_mean": -0.09205959737300873, | |
| "reward_after_std": 0.3648469839245081, | |
| "reward_before_mean": 0.18029571324586868, | |
| "reward_before_std": 0.3099568961188197, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27235531620681286, | |
| "reward_change_min": -0.41520175337791443, | |
| "reward_change_std": 0.1586802341043949, | |
| "reward_std": 0.36484698951244354, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/cosine_scaled_reward": -0.048870958387851715, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3212.062530517578, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.2587808668613434, | |
| "kl": 0.005465984344482422, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0868, | |
| "reward": -0.16799346357584, | |
| "reward_after_mean": -0.16799346357584, | |
| "reward_after_std": 0.5451687276363373, | |
| "reward_before_mean": 0.05025888653472066, | |
| "reward_before_std": 0.5034487005323172, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21825237199664116, | |
| "reward_change_min": -0.3237415961921215, | |
| "reward_change_std": 0.1249654246494174, | |
| "reward_std": 0.545168736949563, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/cosine_scaled_reward": -0.11640777194406837, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3372.4791870117188, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.15933357179164886, | |
| "kl": 0.0029315948486328125, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.8138158006995363e-07, | |
| "loss": 0.0526, | |
| "reward": -0.01668019499629736, | |
| "reward_after_mean": -0.01668019499629736, | |
| "reward_after_std": 0.5762727987021208, | |
| "reward_before_mean": 0.2592314127832651, | |
| "reward_before_std": 0.5716875530779362, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27591159753501415, | |
| "reward_change_min": -0.48107208497822285, | |
| "reward_change_std": 0.1874433197081089, | |
| "reward_std": 0.5762728247791529, | |
| "rewards/accuracy_reward": 0.25000000931322575, | |
| "rewards/cosine_scaled_reward": 0.009231406031176448, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2937.5833892822266, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.15646594762802124, | |
| "kl": 0.022654056549072266, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0779, | |
| "reward": -0.2482462339103222, | |
| "reward_after_mean": -0.2482462339103222, | |
| "reward_after_std": 0.40840402990579605, | |
| "reward_before_mean": -0.035066988319158554, | |
| "reward_before_std": 0.3997558169066906, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21317926235496998, | |
| "reward_change_min": -0.3921571187674999, | |
| "reward_change_std": 0.1473480286076665, | |
| "reward_std": 0.40840404108166695, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/cosine_scaled_reward": -0.13923365552909672, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3454.979217529297, | |
| "epoch": 0.192, | |
| "grad_norm": 0.15156163275241852, | |
| "kl": 0.0032837390899658203, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.7259824442455923e-07, | |
| "loss": 0.0294, | |
| "reward": 0.038655126467347145, | |
| "reward_after_mean": 0.038655126467347145, | |
| "reward_after_std": 0.5489779710769653, | |
| "reward_before_mean": 0.3417097292840481, | |
| "reward_before_std": 0.5534890927374363, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3030546009540558, | |
| "reward_change_min": -0.5080073494464159, | |
| "reward_change_std": 0.2078774282708764, | |
| "reward_std": 0.5489780027419329, | |
| "rewards/accuracy_reward": 0.2916666753590107, | |
| "rewards/cosine_scaled_reward": 0.05004306013870519, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2757.8333587646484, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.20235563814640045, | |
| "kl": 0.0031278133392333984, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.016, | |
| "reward": 0.299284853041172, | |
| "reward_after_mean": 0.299284853041172, | |
| "reward_after_std": 0.3482187706977129, | |
| "reward_before_mean": 0.7110624257475138, | |
| "reward_before_std": 0.21563976723700762, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.41177756898105145, | |
| "reward_change_min": -0.5598046462982893, | |
| "reward_change_std": 0.2105923229828477, | |
| "reward_std": 0.3482187818735838, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/cosine_scaled_reward": 0.21106241270899773, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2691.5000381469727, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.19220662117004395, | |
| "kl": 0.012215614318847656, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.6427471468404952e-07, | |
| "loss": 0.001, | |
| "reward": -0.0186636159196496, | |
| "reward_after_mean": -0.0186636159196496, | |
| "reward_after_std": 0.37129569984972477, | |
| "reward_before_mean": 0.2765026893466711, | |
| "reward_before_std": 0.2835942036472261, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2951663248240948, | |
| "reward_change_min": -0.4515752512961626, | |
| "reward_change_std": 0.16774821933358908, | |
| "reward_std": 0.37129571102559566, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": -0.015163990668952465, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3110.604217529297, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.16165022552013397, | |
| "kl": 0.002307415008544922, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0289, | |
| "reward": 0.09521710127592087, | |
| "reward_after_mean": 0.09521710127592087, | |
| "reward_after_std": 0.39058924093842506, | |
| "reward_before_mean": 0.4361223494634032, | |
| "reward_before_std": 0.35083947516977787, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.34090524166822433, | |
| "reward_change_min": -0.5021266676485538, | |
| "reward_change_std": 0.19604680873453617, | |
| "reward_std": 0.3905892614275217, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/cosine_scaled_reward": 0.10278898943215609, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3107.166702270508, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.18883849680423737, | |
| "kl": 0.0051097869873046875, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.5642113178727193e-07, | |
| "loss": 0.0285, | |
| "reward": -0.044252441730350256, | |
| "reward_after_mean": -0.044252441730350256, | |
| "reward_after_std": 0.4415896963328123, | |
| "reward_before_mean": 0.2377479849383235, | |
| "reward_before_std": 0.41535679809749126, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28200042992830276, | |
| "reward_change_min": -0.4121879041194916, | |
| "reward_change_std": 0.1653971141204238, | |
| "reward_std": 0.4415897000581026, | |
| "rewards/accuracy_reward": 0.22916667722165585, | |
| "rewards/cosine_scaled_reward": 0.00858130888082087, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2682.958351135254, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.20074620842933655, | |
| "kl": 0.010714054107666016, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.058, | |
| "reward": -0.23622408602386713, | |
| "reward_after_mean": -0.23622408602386713, | |
| "reward_after_std": 0.5661862269043922, | |
| "reward_before_mean": -0.04538543475791812, | |
| "reward_before_std": 0.539402324706316, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19083864893764257, | |
| "reward_change_min": -0.3895431775599718, | |
| "reward_change_std": 0.1356267612427473, | |
| "reward_std": 0.5661862548440695, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.1703854314982891, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3129.750030517578, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.17718930542469025, | |
| "kl": 0.013014793395996094, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.4904706411523448e-07, | |
| "loss": 0.0326, | |
| "reward": -0.2630236418917775, | |
| "reward_after_mean": -0.2630236418917775, | |
| "reward_after_std": 0.5134491641074419, | |
| "reward_before_mean": -0.07400929369032383, | |
| "reward_before_std": 0.48160199262201786, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1890143509954214, | |
| "reward_change_min": -0.3234012946486473, | |
| "reward_change_std": 0.120835080742836, | |
| "reward_std": 0.513449190184474, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/cosine_scaled_reward": -0.1781759625300765, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3009.333366394043, | |
| "epoch": 0.2, | |
| "grad_norm": 0.18382960557937622, | |
| "kl": 0.06249094009399414, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0123, | |
| "reward": 0.026797737926244736, | |
| "reward_after_mean": 0.026797737926244736, | |
| "reward_after_std": 0.4659009985625744, | |
| "reward_before_mean": 0.3378504253923893, | |
| "reward_before_std": 0.48004232347011566, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3110526669770479, | |
| "reward_change_min": -0.5042965784668922, | |
| "reward_change_std": 0.2051269579678774, | |
| "reward_std": 0.46590100042521954, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/cosine_scaled_reward": 0.06701706536114216, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3047.583366394043, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.26642507314682007, | |
| "kl": 0.01358795166015625, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.4216149583350755e-07, | |
| "loss": 0.0936, | |
| "reward": -0.019476521760225296, | |
| "reward_after_mean": -0.019476521760225296, | |
| "reward_after_std": 0.6982401926070452, | |
| "reward_before_mean": 0.24331039190292358, | |
| "reward_before_std": 0.731185233220458, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2627869173884392, | |
| "reward_change_min": -0.5728251449763775, | |
| "reward_change_std": 0.22166733164340258, | |
| "reward_std": 0.6982402224093676, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/cosine_scaled_reward": -0.02752294298261404, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3382.166748046875, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.19716955721378326, | |
| "kl": 0.006411075592041016, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0274, | |
| "reward": -0.14210877695586532, | |
| "reward_after_mean": -0.14210877695586532, | |
| "reward_after_std": 0.4888409748673439, | |
| "reward_before_mean": 0.09973286651074886, | |
| "reward_before_std": 0.4782033069059253, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2418416514992714, | |
| "reward_change_min": -0.4065688345581293, | |
| "reward_change_std": 0.16381614096462727, | |
| "reward_std": 0.4888409972190857, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.06693380372598767, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3090.229217529297, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.17369556427001953, | |
| "kl": 0.011487960815429688, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.3577281594640182e-07, | |
| "loss": -0.0599, | |
| "reward": -0.03678634762763977, | |
| "reward_after_mean": -0.03678634762763977, | |
| "reward_after_std": 0.481352299451828, | |
| "reward_before_mean": 0.24844557233154774, | |
| "reward_before_std": 0.49319163616746664, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2852319311350584, | |
| "reward_change_min": -0.47761483304202557, | |
| "reward_change_std": 0.19820824172347784, | |
| "reward_std": 0.48135231621563435, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/cosine_scaled_reward": 0.01927890069782734, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3365.750030517578, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.1561906337738037, | |
| "kl": 0.0033054351806640625, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": -0.0253, | |
| "reward": -0.40081705152988434, | |
| "reward_after_mean": -0.40081705152988434, | |
| "reward_after_std": 0.29850259609520435, | |
| "reward_before_mean": -0.23233469319529831, | |
| "reward_before_std": 0.2521834969520569, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.16848235949873924, | |
| "reward_change_min": -0.2541333418339491, | |
| "reward_change_std": 0.09144267160445452, | |
| "reward_std": 0.29850260354578495, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/cosine_scaled_reward": -0.25316802971065044, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2725.0833587646484, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.19090747833251953, | |
| "kl": 0.011870384216308594, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.2988880807625927e-07, | |
| "loss": 0.0402, | |
| "reward": 0.11349364370107651, | |
| "reward_after_mean": 0.11349364370107651, | |
| "reward_after_std": 0.312832809984684, | |
| "reward_before_mean": 0.4614496771246195, | |
| "reward_before_std": 0.17704920517280698, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3479560390114784, | |
| "reward_change_min": -0.4591350872069597, | |
| "reward_change_std": 0.17630962189286947, | |
| "reward_std": 0.3128328137099743, | |
| "rewards/accuracy_reward": 0.39583333395421505, | |
| "rewards/cosine_scaled_reward": 0.0656163152307272, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3343.1458435058594, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.1572800576686859, | |
| "kl": 0.004780292510986328, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0256, | |
| "reward": -0.07740832306444645, | |
| "reward_after_mean": -0.07740832306444645, | |
| "reward_after_std": 0.4118566922843456, | |
| "reward_before_mean": 0.19327654596418142, | |
| "reward_before_std": 0.3510447759181261, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2706848792731762, | |
| "reward_change_min": -0.419716427102685, | |
| "reward_change_std": 0.15835009142756462, | |
| "reward_std": 0.41185671649873257, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/cosine_scaled_reward": -0.015056788921356201, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2883.9166870117188, | |
| "epoch": 0.208, | |
| "grad_norm": 0.1602364033460617, | |
| "kl": 0.00803762674331665, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.2451664098030743e-07, | |
| "loss": -0.0218, | |
| "reward": -0.044728060718625784, | |
| "reward_after_mean": -0.044728060718625784, | |
| "reward_after_std": 0.42621047236025333, | |
| "reward_before_mean": 0.23698624456301332, | |
| "reward_before_std": 0.3748616073280573, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28171432204544544, | |
| "reward_change_min": -0.42212608829140663, | |
| "reward_change_std": 0.16775514092296362, | |
| "reward_std": 0.4262104816734791, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/cosine_scaled_reward": 0.007819579914212227, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3177.2500610351562, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.1729901134967804, | |
| "kl": 0.046076297760009766, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": -0.0188, | |
| "reward": -0.0808305200189352, | |
| "reward_after_mean": -0.0808305200189352, | |
| "reward_after_std": 0.47396935522556305, | |
| "reward_before_mean": 0.1872396506369114, | |
| "reward_before_std": 0.4797352682799101, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2680701520293951, | |
| "reward_change_min": -0.47748228162527084, | |
| "reward_change_std": 0.18410342279821634, | |
| "reward_std": 0.47396937943995, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/cosine_scaled_reward": -0.02109370008111, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3157.9166870117188, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.19053320586681366, | |
| "kl": 0.003658771514892578, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1966285981663407e-07, | |
| "loss": -0.0184, | |
| "reward": -0.3129790127277374, | |
| "reward_after_mean": -0.3129790127277374, | |
| "reward_after_std": 0.2926772404462099, | |
| "reward_before_mean": -0.10822748765349388, | |
| "reward_before_std": 0.2700198283419013, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2047515194863081, | |
| "reward_change_min": -0.31184492260217667, | |
| "reward_change_std": 0.12345388997346163, | |
| "reward_std": 0.29267724975943565, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/cosine_scaled_reward": -0.19156083092093468, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3116.875, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.23735588788986206, | |
| "kl": 0.011540412902832031, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0255, | |
| "reward": -0.27555170468986034, | |
| "reward_after_mean": -0.27555170468986034, | |
| "reward_after_std": 0.37195089366286993, | |
| "reward_before_mean": -0.0702610481530428, | |
| "reward_before_std": 0.34090360533446074, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2052906509488821, | |
| "reward_change_min": -0.31347635202109814, | |
| "reward_change_std": 0.12224087584763765, | |
| "reward_std": 0.3719508945941925, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.1744277123361826, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3370.000030517578, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.23551151156425476, | |
| "kl": 0.004643917083740234, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1533337816991931e-07, | |
| "loss": 0.0291, | |
| "reward": -0.020450815558433533, | |
| "reward_after_mean": -0.020450815558433533, | |
| "reward_after_std": 0.43135653622448444, | |
| "reward_before_mean": 0.27618860453367233, | |
| "reward_before_std": 0.42817492596805096, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.29663942754268646, | |
| "reward_change_min": -0.45596072264015675, | |
| "reward_change_std": 0.18588601425290108, | |
| "reward_std": 0.4313565380871296, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/cosine_scaled_reward": -0.015478070825338364, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3190.541717529297, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.23826000094413757, | |
| "kl": 0.0100555419921875, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0412, | |
| "reward": -0.22396802809089422, | |
| "reward_after_mean": -0.22396802809089422, | |
| "reward_after_std": 0.3726756442338228, | |
| "reward_before_mean": 0.001834167167544365, | |
| "reward_before_std": 0.3483778089284897, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22580219060182571, | |
| "reward_change_min": -0.34733813256025314, | |
| "reward_change_std": 0.13398846238851547, | |
| "reward_std": 0.37267564609646797, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/cosine_scaled_reward": -0.12316584587097168, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3546.5625, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.15860070288181305, | |
| "kl": 0.0028228759765625, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1153347084664419e-07, | |
| "loss": 0.023, | |
| "reward": -0.3123414749279618, | |
| "reward_after_mean": -0.3123414749279618, | |
| "reward_after_std": 0.4071262162178755, | |
| "reward_before_mean": -0.1276569024194032, | |
| "reward_before_std": 0.36976926028728485, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18468456342816353, | |
| "reward_change_min": -0.2755823079496622, | |
| "reward_change_std": 0.10156355146318674, | |
| "reward_std": 0.4071262273937464, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.16932356543838978, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3185.979217529297, | |
| "epoch": 0.216, | |
| "grad_norm": 0.17385567724704742, | |
| "kl": 0.005602836608886719, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0385, | |
| "reward": -0.16023958660662174, | |
| "reward_after_mean": -0.16023958660662174, | |
| "reward_after_std": 0.5064278487116098, | |
| "reward_before_mean": 0.07299725105985999, | |
| "reward_before_std": 0.5011386927217245, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23323685117065907, | |
| "reward_change_min": -0.41370217502117157, | |
| "reward_change_std": 0.15955450013279915, | |
| "reward_std": 0.5064278729259968, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.09366941265761852, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3201.375030517578, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.14921893179416656, | |
| "kl": 0.009759902954101562, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0826776744855121e-07, | |
| "loss": -0.0151, | |
| "reward": -0.11116122780367732, | |
| "reward_after_mean": -0.11116122780367732, | |
| "reward_after_std": 0.42286976985633373, | |
| "reward_before_mean": 0.14645380340516567, | |
| "reward_before_std": 0.37099423445761204, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2576150503009558, | |
| "reward_change_min": -0.4218386374413967, | |
| "reward_change_std": 0.15646861772984266, | |
| "reward_std": 0.42286976985633373, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/cosine_scaled_reward": -0.041046179831027985, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2774.8333587646484, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.18684116005897522, | |
| "kl": 0.02057647705078125, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": -0.0042, | |
| "reward": -0.05179838836193085, | |
| "reward_after_mean": -0.05179838836193085, | |
| "reward_after_std": 0.40950570069253445, | |
| "reward_before_mean": 0.22851569892372936, | |
| "reward_before_std": 0.35111323557794094, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.280314102768898, | |
| "reward_change_min": -0.4044586792588234, | |
| "reward_change_std": 0.15675450582057238, | |
| "reward_std": 0.4095057025551796, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/cosine_scaled_reward": -0.02148430421948433, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3424.0, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.15981534123420715, | |
| "kl": 0.002536296844482422, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0554024673218806e-07, | |
| "loss": 0.0249, | |
| "reward": -0.2669062092900276, | |
| "reward_after_mean": -0.2669062092900276, | |
| "reward_after_std": 0.2870886046439409, | |
| "reward_before_mean": -0.046382976695895195, | |
| "reward_before_std": 0.2581921275705099, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22052323259413242, | |
| "reward_change_min": -0.3341723680496216, | |
| "reward_change_std": 0.1262814048677683, | |
| "reward_std": 0.2870886102318764, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/cosine_scaled_reward": -0.15054964646697044, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3204.416702270508, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.17141887545585632, | |
| "kl": 0.0038933753967285156, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0145, | |
| "reward": -0.23873800691217184, | |
| "reward_after_mean": -0.23873800691217184, | |
| "reward_after_std": 0.4927209597080946, | |
| "reward_before_mean": -0.037118949461728334, | |
| "reward_before_std": 0.4673497211188078, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20161907002329826, | |
| "reward_change_min": -0.3459734059870243, | |
| "reward_change_std": 0.12958138808608055, | |
| "reward_std": 0.4927209783345461, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/cosine_scaled_reward": -0.1412856113165617, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3230.7083740234375, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.17490136623382568, | |
| "kl": 0.0017626285552978516, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0335423176140511e-07, | |
| "loss": 0.0291, | |
| "reward": 0.22551898658275604, | |
| "reward_after_mean": 0.22551898658275604, | |
| "reward_after_std": 0.6343558058142662, | |
| "reward_before_mean": 0.5872141793370247, | |
| "reward_before_std": 0.653206454589963, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.36169516295194626, | |
| "reward_change_min": -0.5719391945749521, | |
| "reward_change_std": 0.23771104868501425, | |
| "reward_std": 0.6343558225780725, | |
| "rewards/accuracy_reward": 0.39583334885537624, | |
| "rewards/cosine_scaled_reward": 0.19138083781581372, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3455.5208435058594, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.16181236505508423, | |
| "kl": 0.002017974853515625, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0155, | |
| "reward": -0.26167353615164757, | |
| "reward_after_mean": -0.26167353615164757, | |
| "reward_after_std": 0.3914919700473547, | |
| "reward_before_mean": -0.0532490611076355, | |
| "reward_before_std": 0.36043376103043556, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20842446759343147, | |
| "reward_change_min": -0.3214227482676506, | |
| "reward_change_std": 0.1241202037781477, | |
| "reward_std": 0.3914919812232256, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.15741572994738817, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3523.500030517578, | |
| "epoch": 0.224, | |
| "grad_norm": 0.16005918383598328, | |
| "kl": 0.0024111270904541016, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.017123858587145e-07, | |
| "loss": 0.0115, | |
| "reward": -0.28297955449670553, | |
| "reward_after_mean": -0.28297955449670553, | |
| "reward_after_std": 0.40249344892799854, | |
| "reward_before_mean": -0.08251086995005608, | |
| "reward_before_std": 0.3783824220299721, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20046869292855263, | |
| "reward_change_min": -0.32744314707815647, | |
| "reward_change_std": 0.12627543695271015, | |
| "reward_std": 0.40249346010386944, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.18667752863257192, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3195.062530517578, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.21763946115970612, | |
| "kl": 0.11150646209716797, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0734, | |
| "reward": 0.14665383007377386, | |
| "reward_after_mean": 0.14665383007377386, | |
| "reward_after_std": 0.775845367461443, | |
| "reward_before_mean": 0.46151648461818695, | |
| "reward_before_std": 0.8179350979626179, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31486267410218716, | |
| "reward_change_min": -0.6177782695740461, | |
| "reward_change_std": 0.24829469621181488, | |
| "reward_std": 0.7758453991264105, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/cosine_scaled_reward": 0.10734981670975685, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3091.125, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.17106440663337708, | |
| "kl": 0.005677938461303711, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0061670936044178e-07, | |
| "loss": 0.0149, | |
| "reward": -0.12829723954200745, | |
| "reward_after_mean": -0.12829723954200745, | |
| "reward_after_std": 0.3686715345829725, | |
| "reward_before_mean": 0.13360333256423473, | |
| "reward_before_std": 0.35308009944856167, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2619005683809519, | |
| "reward_change_min": -0.4114610478281975, | |
| "reward_change_std": 0.16121340077370405, | |
| "reward_std": 0.36867155507206917, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/cosine_scaled_reward": -0.05389668419957161, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3546.1666870117188, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.15505783259868622, | |
| "kl": 0.0044994354248046875, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": -0.0064, | |
| "reward": -0.3825234118849039, | |
| "reward_after_mean": -0.3825234118849039, | |
| "reward_after_std": 0.2886116597801447, | |
| "reward_before_mean": -0.2058500237762928, | |
| "reward_before_std": 0.2376685068011284, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17667338997125626, | |
| "reward_change_min": -0.2671913430094719, | |
| "reward_change_std": 0.0952356569468975, | |
| "reward_std": 0.28861166536808014, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/cosine_scaled_reward": -0.22668336611241102, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2987.3541870117188, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.18018940091133118, | |
| "kl": 0.0025665760040283203, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0006853717962393e-07, | |
| "loss": 0.0384, | |
| "reward": 0.1265381295233965, | |
| "reward_after_mean": 0.1265381295233965, | |
| "reward_after_std": 0.6119197029620409, | |
| "reward_before_mean": 0.44888845831155777, | |
| "reward_before_std": 0.5870621418580413, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.32235031202435493, | |
| "reward_change_min": -0.5511045381426811, | |
| "reward_change_std": 0.2166948076337576, | |
| "reward_std": 0.6119197197258472, | |
| "rewards/accuracy_reward": 0.33333333767950535, | |
| "rewards/cosine_scaled_reward": 0.11555509176105261, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.22857142857142856, | |
| "step": 200, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0197520790877752, | |
| "train_runtime": 32053.2934, | |
| "train_samples_per_second": 0.3, | |
| "train_steps_per_second": 0.006 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 200, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
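
The record above is the raw `trainer_state.json` written by the Hugging Face `Trainer` at the final step (200). As a minimal sketch of how the per-step metrics in `log_history` could be inspected, the snippet below loads the file and plots `reward` and `kl` against `step`; the filename `trainer_state.json` and the use of matplotlib are assumptions for illustration, not part of the saved state.

```python
# Minimal sketch (assumption: the state above is saved locally as "trainer_state.json").
# It reads the log_history entries and plots per-step reward and KL.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging records; the final summary record
# (train_loss, train_runtime, ...) has no "reward" key and is skipped.
logs = [entry for entry in state["log_history"] if "reward" in entry]

steps = [entry["step"] for entry in logs]
reward = [entry["reward"] for entry in logs]
kl = [entry["kl"] for entry in logs]

fig, (ax_reward, ax_kl) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_reward.plot(steps, reward)
ax_reward.set_ylabel("reward")
ax_kl.plot(steps, kl)
ax_kl.set_ylabel("kl")
ax_kl.set_xlabel("step")
fig.tight_layout()
plt.show()
```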