{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 523.625, "epoch": 8e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0158, "reward": 4.268636584281921, "reward_std": 1.4991721957921982, "rewards/mrr_reward": 0.18593749962747097, "rewards/rank_analyze_format_reward": 0.3353185895830393, "rewards/rank_answer_foramt_reward": 0.46484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.7578125, "rewards/rank_verify_format_reward": 0.9678308814764023, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 466.453125, "epoch": 0.00016, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0114, "reward": 4.433048129081726, "reward_std": 1.5993161797523499, "rewards/mrr_reward": 0.26510416716337204, "rewards/rank_analyze_format_reward": 0.07943290937691927, "rewards/rank_answer_foramt_reward": 0.484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.84375, "rewards/rank_verify_format_reward": 0.9669117629528046, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 472.359375, "epoch": 0.00024, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0194, "reward": 4.567503452301025, "reward_std": 1.8688987493515015, "rewards/mrr_reward": 0.2818080373108387, "rewards/rank_analyze_format_reward": 0.15285338647663593, "rewards/rank_answer_foramt_reward": 0.53125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9679276347160339, "rewards/rank_overall_format_reward_more": 0.8515625, "rewards/rank_verify_format_reward": 0.9366776347160339, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 500.421875, "epoch": 0.00032, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0195, "reward": 4.613201439380646, "reward_std": 1.5687459707260132, "rewards/mrr_reward": 0.23981275036931038, "rewards/rank_analyze_format_reward": 0.280788647942245, "rewards/rank_answer_foramt_reward": 0.5546875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 487.859375, "epoch": 0.0004, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0273, "reward": 4.728272438049316, "reward_std": 1.583241879940033, "rewards/mrr_reward": 0.2686818018555641, "rewards/rank_analyze_format_reward": 0.2374146804213524, "rewards/rank_answer_foramt_reward": 0.55078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.8671875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 436.890625, "epoch": 0.00048, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0471, "reward": 4.336846709251404, "reward_std": 1.7730951607227325, "rewards/mrr_reward": 0.23513144627213478, "rewards/rank_analyze_format_reward": 0.042381967417895794, "rewards/rank_answer_foramt_reward": 0.505859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.8671875, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 468.625, "epoch": 0.00056, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0278, "reward": 4.507535874843597, "reward_std": 1.6193090677261353, "rewards/mrr_reward": 0.26395088993012905, "rewards/rank_analyze_format_reward": 0.17591837421059608, "rewards/rank_answer_foramt_reward": 0.466796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9826335161924362, "rewards/rank_overall_format_reward_more": 0.859375, "rewards/rank_verify_format_reward": 0.9670085161924362, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 456.734375, "epoch": 0.00064, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0658, "reward": 4.438833832740784, "reward_std": 1.7729081213474274, "rewards/mrr_reward": 0.252535967156291, "rewards/rank_analyze_format_reward": 0.11423671059310436, "rewards/rank_answer_foramt_reward": 0.494140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.8515625, "rewards/rank_verify_format_reward": 0.984375, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 496.71875, "epoch": 0.00072, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0132, "reward": 4.36483633518219, "reward_std": 1.4995907545089722, "rewards/mrr_reward": 0.19198909029364586, "rewards/rank_analyze_format_reward": 0.2855374999344349, "rewards/rank_answer_foramt_reward": 0.46875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9994212985038757, "rewards/rank_overall_format_reward_more": 0.84375, "rewards/rank_verify_format_reward": 0.9994212985038757, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 504.359375, "epoch": 0.0008, "grad_norm": 0.01934855990111828, "kl": 0.0, "learning_rate": 1.999980260856137e-05, "loss": -0.0499, "reward": 4.575979113578796, "reward_std": 1.572792112827301, "rewards/mrr_reward": 0.24622396752238274, "rewards/rank_analyze_format_reward": 0.2335465494543314, "rewards/rank_answer_foramt_reward": 0.5859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.7890625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 494.96875, "epoch": 0.00088, "grad_norm": 0.01934855990111828, "kl": -6.407499313354492e-06, "learning_rate": 1.999980260856137e-05, "loss": -0.027, "reward": 4.225071430206299, "reward_std": 1.3561933636665344, "rewards/mrr_reward": 0.1566592250019312, "rewards/rank_analyze_format_reward": 0.2257399633526802, "rewards/rank_answer_foramt_reward": 0.591796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9920112937688828, "rewards/rank_overall_format_reward_more": 0.84375, "rewards/rank_verify_format_reward": 0.9451362937688828, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 486.828125, "epoch": 0.00096, "grad_norm": 0.023287516087293625, "kl": -5.766749382019043e-06, "learning_rate": 1.9999210442038164e-05, "loss": -0.0527, "reward": 4.396093428134918, "reward_std": 1.4473387897014618, "rewards/mrr_reward": 0.20078745484352112, "rewards/rank_analyze_format_reward": 0.22184984013438225, "rewards/rank_answer_foramt_reward": 0.54296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.84375, "rewards/rank_verify_format_reward": 0.984375, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 467.21875, "epoch": 0.00104, "grad_norm": 0.021872082725167274, "kl": -5.856156349182129e-06, "learning_rate": 1.9998223523808092e-05, "loss": -0.0083, "reward": 4.451628923416138, "reward_std": 1.8520284295082092, "rewards/mrr_reward": 0.26994047313928604, "rewards/rank_analyze_format_reward": 0.05534587614238262, "rewards/rank_answer_foramt_reward": 0.552734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.8125, "rewards/rank_verify_format_reward": 0.9678308814764023, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 474.140625, "epoch": 0.00112, "grad_norm": 0.02027149125933647, "kl": -5.066394805908203e-06, "learning_rate": 1.9996841892833e-05, "loss": -0.0509, "reward": 4.311561226844788, "reward_std": 1.447220355272293, "rewards/mrr_reward": 0.18456721305847168, "rewards/rank_analyze_format_reward": 0.19243288412690163, "rewards/rank_answer_foramt_reward": 0.458984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 1.0, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 480.96875, "epoch": 0.0012, "grad_norm": 0.021198658272624016, "kl": -2.60770320892334e-06, "learning_rate": 1.9995065603657317e-05, "loss": -0.0229, "reward": 4.577821969985962, "reward_std": 1.547965168952942, "rewards/mrr_reward": 0.2568142432719469, "rewards/rank_analyze_format_reward": 0.1280731037259102, "rewards/rank_answer_foramt_reward": 0.54296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 475.6875, "epoch": 0.00128, "grad_norm": 0.021198658272624016, "kl": -6.705522537231445e-07, "learning_rate": 1.9995065603657317e-05, "loss": -0.0167, "reward": 4.51800012588501, "reward_std": 1.4131008833646774, "rewards/mrr_reward": 0.21607143804430962, "rewards/rank_analyze_format_reward": 0.18856214731931686, "rewards/rank_answer_foramt_reward": 0.583984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 458.390625, "epoch": 0.00136, "grad_norm": 0.02181260474026203, "kl": 1.0579824447631836e-06, "learning_rate": 1.9992894726405894e-05, "loss": 0.017, "reward": 4.306349337100983, "reward_std": 1.5264191627502441, "rewards/mrr_reward": 0.22886284813284874, "rewards/rank_analyze_format_reward": 0.0859806090593338, "rewards/rank_answer_foramt_reward": 0.48828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9825367629528046, "rewards/rank_overall_format_reward_more": 0.8515625, "rewards/rank_verify_format_reward": 0.9825367629528046, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 439.640625, "epoch": 0.00144, "grad_norm": 0.022151973098516464, "kl": 6.258487701416016e-07, "learning_rate": 1.999032934678125e-05, "loss": -0.0244, "reward": 4.710850834846497, "reward_std": 1.90812349319458, "rewards/mrr_reward": 0.30260416865348816, "rewards/rank_analyze_format_reward": 0.10762626118957996, "rewards/rank_answer_foramt_reward": 0.513671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 473.953125, "epoch": 0.00152, "grad_norm": 0.021077511832118034, "kl": 7.18235969543457e-06, "learning_rate": 1.998736956606018e-05, "loss": -0.015, "reward": 5.3719645738601685, "reward_std": 2.198553115129471, "rewards/mrr_reward": 0.4290364645421505, "rewards/rank_analyze_format_reward": 0.1585360560566187, "rewards/rank_answer_foramt_reward": 0.609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.998641312122345, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.998641312122345, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 495.375, "epoch": 0.0016, "grad_norm": 0.02298896014690399, "kl": 7.599592208862305e-06, "learning_rate": 1.998401550108975e-05, "loss": 0.012, "reward": 4.880205452442169, "reward_std": 2.090783953666687, "rewards/mrr_reward": 0.3513454757630825, "rewards/rank_analyze_format_reward": 0.19162042438983917, "rewards/rank_answer_foramt_reward": 0.533203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.96875, "rewards/rank_overall_format_reward_more": 0.8125, "rewards/rank_verify_format_reward": 0.96875, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 470.984375, "epoch": 0.00168, "grad_norm": 0.02298896014690399, "kl": 1.6704201698303223e-05, "learning_rate": 1.998401550108975e-05, "loss": -0.0064, "reward": 4.279784142971039, "reward_std": 1.3316340446472168, "rewards/mrr_reward": 0.2225694488734007, "rewards/rank_analyze_format_reward": 0.12974069267511368, "rewards/rank_answer_foramt_reward": 0.462890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.828125, "rewards/rank_verify_format_reward": 0.984375, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 501.34375, "epoch": 0.00176, "grad_norm": 0.021596167236566544, "kl": 1.1667609214782715e-05, "learning_rate": 1.9980267284282718e-05, "loss": -0.0209, "reward": 4.450152933597565, "reward_std": 1.6975745856761932, "rewards/mrr_reward": 0.2311818040907383, "rewards/rank_analyze_format_reward": 0.19451052881777287, "rewards/rank_answer_foramt_reward": 0.51953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9838169664144516, "rewards/rank_overall_format_reward_more": 0.84375, "rewards/rank_verify_format_reward": 0.9838169664144516, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 487.0, "epoch": 0.00184, "grad_norm": 0.021107474341988564, "kl": 1.1980533599853516e-05, "learning_rate": 1.9976125063612254e-05, "loss": -0.0022, "reward": 5.114363670349121, "reward_std": 2.0303279757499695, "rewards/mrr_reward": 0.36928942799568176, "rewards/rank_analyze_format_reward": 0.2502689193934202, "rewards/rank_answer_foramt_reward": 0.5234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.994249701499939, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.962999701499939, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 502.453125, "epoch": 0.00192, "grad_norm": 0.02096448838710785, "kl": 1.8477439880371094e-05, "learning_rate": 1.997158900260614e-05, "loss": -0.0241, "reward": 5.348981976509094, "reward_std": 1.8361419141292572, "rewards/mrr_reward": 0.382440485060215, "rewards/rank_analyze_format_reward": 0.29289935901761055, "rewards/rank_answer_foramt_reward": 0.607421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9985119104385376, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 473.28125, "epoch": 0.002, "grad_norm": 0.02216312661767006, "kl": 2.364814281463623e-05, "learning_rate": 1.99666592803403e-05, "loss": -0.0251, "reward": 5.035452842712402, "reward_std": 1.6332820057868958, "rewards/mrr_reward": 0.3567708358168602, "rewards/rank_analyze_format_reward": 0.1980982944369316, "rewards/rank_answer_foramt_reward": 0.537109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 489.21875, "epoch": 0.00208, "grad_norm": 0.020999694243073463, "kl": 2.6658177375793457e-05, "learning_rate": 1.9961336091431728e-05, "loss": -0.0222, "reward": 5.5917370319366455, "reward_std": 1.5427958369255066, "rewards/mrr_reward": 0.47931547835469246, "rewards/rank_analyze_format_reward": 0.14616985991597176, "rewards/rank_answer_foramt_reward": 0.677734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9955979138612747, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9799729138612747, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 479.796875, "epoch": 0.00216, "grad_norm": 0.02143612876534462, "kl": 3.142654895782471e-05, "learning_rate": 1.99556196460308e-05, "loss": -0.0063, "reward": 4.361609876155853, "reward_std": 1.5810727179050446, "rewards/mrr_reward": 0.19448785111308098, "rewards/rank_analyze_format_reward": 0.23178751021623611, "rewards/rank_answer_foramt_reward": 0.548828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9679276347160339, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9523026347160339, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 480.109375, "epoch": 0.00224, "grad_norm": 0.02168097533285618, "kl": 2.8908252716064453e-05, "learning_rate": 1.9949510169813006e-05, "loss": -0.0212, "reward": 5.391945242881775, "reward_std": 2.08759868144989, "rewards/mrr_reward": 0.42109374701976776, "rewards/rank_analyze_format_reward": 0.16069528087973595, "rewards/rank_answer_foramt_reward": 0.640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.984375, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 492.78125, "epoch": 0.00232, "grad_norm": 0.02250671572983265, "kl": 4.798173904418945e-05, "learning_rate": 1.994300790396999e-05, "loss": -0.0194, "reward": 4.489228665828705, "reward_std": 1.5738151967525482, "rewards/mrr_reward": 0.23200645111501217, "rewards/rank_analyze_format_reward": 0.2226954996585846, "rewards/rank_answer_foramt_reward": 0.505859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9827302694320679, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9671052694320679, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 500.84375, "epoch": 0.0024, "grad_norm": 0.021114060655236244, "kl": 6.276369094848633e-05, "learning_rate": 1.9936113105200085e-05, "loss": -0.0154, "reward": 4.461996555328369, "reward_std": 1.45789235830307, "rewards/mrr_reward": 0.22769097983837128, "rewards/rank_analyze_format_reward": 0.1932162782177329, "rewards/rank_answer_foramt_reward": 0.4921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9993206560611725, "rewards/rank_overall_format_reward_more": 0.8671875, "rewards/rank_verify_format_reward": 0.9993206560611725, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 505.078125, "epoch": 0.00248, "grad_norm": 0.020794546231627464, "kl": 5.78761100769043e-05, "learning_rate": 1.9928826045698138e-05, "loss": -0.0243, "reward": 5.217773675918579, "reward_std": 1.7885856330394745, "rewards/mrr_reward": 0.39644098468124866, "rewards/rank_analyze_format_reward": 0.2804472893476486, "rewards/rank_answer_foramt_reward": 0.6015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.796875, "rewards/rank_verify_format_reward": 0.96875, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 469.046875, "epoch": 0.00256, "grad_norm": 0.02324531599879265, "kl": 6.254017353057861e-05, "learning_rate": 1.9921147013144782e-05, "loss": -0.0303, "reward": 4.419146180152893, "reward_std": 1.6707628667354584, "rewards/mrr_reward": 0.2174479216337204, "rewards/rank_analyze_format_reward": 0.15677634999155998, "rewards/rank_answer_foramt_reward": 0.541015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.984375, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 502.171875, "epoch": 0.00264, "grad_norm": 0.022716142237186432, "kl": 7.861852645874023e-05, "learning_rate": 1.9913076310695068e-05, "loss": -0.0482, "reward": 4.03111606836319, "reward_std": 1.3852024376392365, "rewards/mrr_reward": 0.14283853769302368, "rewards/rank_analyze_format_reward": 0.2838657721877098, "rewards/rank_answer_foramt_reward": 0.294921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 508.734375, "epoch": 0.00272, "grad_norm": 0.021141935139894485, "kl": 7.855892181396484e-05, "learning_rate": 1.9904614256966514e-05, "loss": -0.0444, "reward": 4.860435843467712, "reward_std": 1.487735778093338, "rewards/mrr_reward": 0.26710689440369606, "rewards/rank_analyze_format_reward": 0.321305263787508, "rewards/rank_answer_foramt_reward": 0.619140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.8515625, "rewards/rank_verify_format_reward": 1.0, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 465.09375, "epoch": 0.0028, "grad_norm": 0.024621378630399704, "kl": 0.00012558698654174805, "learning_rate": 1.989576118602651e-05, "loss": -0.0142, "reward": 4.314668953418732, "reward_std": 1.6532922685146332, "rewards/mrr_reward": 0.2037760466337204, "rewards/rank_analyze_format_reward": 0.1608275342732668, "rewards/rank_answer_foramt_reward": 0.470703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9808920323848724, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.9808920323848724, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 489.328125, "epoch": 0.00288, "grad_norm": 0.022213207557797432, "kl": 0.0001096576452255249, "learning_rate": 1.988651744737914e-05, "loss": -0.0529, "reward": 4.881228446960449, "reward_std": 1.7375118732452393, "rewards/mrr_reward": 0.2879960313439369, "rewards/rank_analyze_format_reward": 0.2330354005098343, "rewards/rank_answer_foramt_reward": 0.599609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 487.359375, "epoch": 0.00296, "grad_norm": 0.02224157378077507, "kl": 0.000247761607170105, "learning_rate": 1.9876883405951378e-05, "loss": -0.05, "reward": 4.619623780250549, "reward_std": 1.290152132511139, "rewards/mrr_reward": 0.21312625147402287, "rewards/rank_analyze_format_reward": 0.29346670489758253, "rewards/rank_answer_foramt_reward": 0.59765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9965915977954865, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9965915977954865, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 473.125, "epoch": 0.00304, "grad_norm": 0.023519087582826614, "kl": 0.00016179680824279785, "learning_rate": 1.986685944207868e-05, "loss": -0.0503, "reward": 4.062397718429565, "reward_std": 0.7998931109905243, "rewards/mrr_reward": 0.09428943460807204, "rewards/rank_analyze_format_reward": 0.20867734402418137, "rewards/rank_answer_foramt_reward": 0.5390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 1.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 468.203125, "epoch": 0.00312, "grad_norm": 0.024895187467336655, "kl": 0.0001550912857055664, "learning_rate": 1.9856445951489984e-05, "loss": -0.0331, "reward": 4.113680422306061, "reward_std": 1.4379215836524963, "rewards/mrr_reward": 0.15409846603870392, "rewards/rank_analyze_format_reward": 0.19410677254199982, "rewards/rank_answer_foramt_reward": 0.451171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9767851084470749, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.9767851084470749, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 483.1875, "epoch": 0.0032, "grad_norm": 0.021718382835388184, "kl": 0.0002021193504333496, "learning_rate": 1.9845643345292055e-05, "loss": -0.0467, "reward": 4.880340218544006, "reward_std": 1.675790935754776, "rewards/mrr_reward": 0.3467881977558136, "rewards/rank_analyze_format_reward": 0.16176080331206322, "rewards/rank_answer_foramt_reward": 0.53515625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.995791420340538, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.964541420340538, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 476.609375, "epoch": 0.00328, "grad_norm": 0.022685538977384567, "kl": 0.00019112229347229004, "learning_rate": 1.98344520499533e-05, "loss": -0.0541, "reward": 4.650174260139465, "reward_std": 1.5386116206645966, "rewards/mrr_reward": 0.244723467156291, "rewards/rank_analyze_format_reward": 0.20142508298158646, "rewards/rank_answer_foramt_reward": 0.607421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9819979667663574, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.9819979667663574, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 476.390625, "epoch": 0.00336, "grad_norm": 0.02372577041387558, "kl": 0.00022074580192565918, "learning_rate": 1.982287250728689e-05, "loss": -0.0468, "reward": 5.371613025665283, "reward_std": 1.9288093447685242, "rewards/mrr_reward": 0.4294270798563957, "rewards/rank_analyze_format_reward": 0.18626116588711739, "rewards/rank_answer_foramt_reward": 0.615234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9808920323848724, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9808920323848724, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 487.828125, "epoch": 0.00344, "grad_norm": 0.023288395255804062, "kl": 0.00044661760330200195, "learning_rate": 1.981090517443334e-05, "loss": -0.0373, "reward": 4.478259801864624, "reward_std": 1.4162674695253372, "rewards/mrr_reward": 0.19718502089381218, "rewards/rank_analyze_format_reward": 0.25561754032969475, "rewards/rank_answer_foramt_reward": 0.576171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9679276347160339, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 471.078125, "epoch": 0.00352, "grad_norm": 0.023235054686665535, "kl": 0.00023183226585388184, "learning_rate": 1.979855052384247e-05, "loss": -0.006, "reward": 4.296527624130249, "reward_std": 1.2175047099590302, "rewards/mrr_reward": 0.17834202013909817, "rewards/rank_analyze_format_reward": 0.15524698793888092, "rewards/rank_answer_foramt_reward": 0.548828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.998135969042778, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.982510969042778, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 462.25, "epoch": 0.0036, "grad_norm": 0.0241495780646801, "kl": 0.00029909610748291016, "learning_rate": 1.978580904325472e-05, "loss": -0.0119, "reward": 4.520259380340576, "reward_std": 1.918333262205124, "rewards/mrr_reward": 0.2582031190395355, "rewards/rank_analyze_format_reward": 0.12209766078740358, "rewards/rank_answer_foramt_reward": 0.4921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 501.671875, "epoch": 0.00368, "grad_norm": 0.02564031258225441, "kl": 0.0002841353416442871, "learning_rate": 1.9772681235681936e-05, "loss": -0.0258, "reward": 5.341574311256409, "reward_std": 1.786019891500473, "rewards/mrr_reward": 0.36770833283662796, "rewards/rank_analyze_format_reward": 0.30837564170360565, "rewards/rank_answer_foramt_reward": 0.6328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9960263818502426, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9960263818502426, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 474.484375, "epoch": 0.00376, "grad_norm": 0.021510787308216095, "kl": 0.0003498196601867676, "learning_rate": 1.9759167619387474e-05, "loss": 0.0131, "reward": 5.549093842506409, "reward_std": 1.2790928333997726, "rewards/mrr_reward": 0.4385354593396187, "rewards/rank_analyze_format_reward": 0.14538356475532055, "rewards/rank_answer_foramt_reward": 0.759765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.984375, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 488.25, "epoch": 0.00384, "grad_norm": 0.022778302431106567, "kl": 0.00038307905197143555, "learning_rate": 1.9745268727865774e-05, "loss": -0.0252, "reward": 4.785283327102661, "reward_std": 1.3980281352996826, "rewards/mrr_reward": 0.275279026478529, "rewards/rank_analyze_format_reward": 0.24080759286880493, "rewards/rank_answer_foramt_reward": 0.599609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.96875, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 484.875, "epoch": 0.00392, "grad_norm": 0.02475767768919468, "kl": 0.0003733038902282715, "learning_rate": 1.9730985109821268e-05, "loss": -0.0449, "reward": 4.799848556518555, "reward_std": 1.5266623198986053, "rewards/mrr_reward": 0.281094990670681, "rewards/rank_analyze_format_reward": 0.19753827899694443, "rewards/rank_answer_foramt_reward": 0.576171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9977542459964752, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9821292459964752, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 498.015625, "epoch": 0.004, "grad_norm": 0.02530953846871853, "kl": 0.0005131959915161133, "learning_rate": 1.971631732914674e-05, "loss": -0.0202, "reward": 4.629918158054352, "reward_std": 1.451842188835144, "rewards/mrr_reward": 0.27032490260899067, "rewards/rank_analyze_format_reward": 0.11081032548099756, "rewards/rank_answer_foramt_reward": 0.556640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 509.34375, "epoch": 0.00408, "grad_norm": 0.024670109152793884, "kl": 0.00045734643936157227, "learning_rate": 1.970126596490106e-05, "loss": -0.0215, "reward": 4.508874177932739, "reward_std": 1.7198016345500946, "rewards/mrr_reward": 0.21750991977751255, "rewards/rank_analyze_format_reward": 0.2599283829331398, "rewards/rank_answer_foramt_reward": 0.51171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.984375, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 465.078125, "epoch": 0.00416, "grad_norm": 0.02744327113032341, "kl": 0.0005243420600891113, "learning_rate": 1.9685831611286312e-05, "loss": -0.0477, "reward": 4.945425391197205, "reward_std": 1.8826305270195007, "rewards/mrr_reward": 0.3301525413990021, "rewards/rank_analyze_format_reward": 0.12849162705242634, "rewards/rank_answer_foramt_reward": 0.6015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 493.359375, "epoch": 0.00424, "grad_norm": 0.024239320307970047, "kl": 0.0005372762680053711, "learning_rate": 1.9670014877624353e-05, "loss": -0.0327, "reward": 4.791419863700867, "reward_std": 1.7941501140594482, "rewards/mrr_reward": 0.26473214477300644, "rewards/rank_analyze_format_reward": 0.3062448501586914, "rewards/rank_answer_foramt_reward": 0.576171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.96875, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 515.984375, "epoch": 0.00432, "grad_norm": 0.024595744907855988, "kl": 0.0005403757095336914, "learning_rate": 1.965381638833274e-05, "loss": -0.0047, "reward": 5.216132044792175, "reward_std": 1.5839519798755646, "rewards/mrr_reward": 0.31073908880352974, "rewards/rank_analyze_format_reward": 0.3495345562696457, "rewards/rank_answer_foramt_reward": 0.671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9993206560611725, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9993206560611725, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 520.109375, "epoch": 0.0044, "grad_norm": 0.02356778457760811, "kl": 0.00048214197158813477, "learning_rate": 1.96372367829001e-05, "loss": -0.0331, "reward": 4.874648034572601, "reward_std": 1.472237765789032, "rewards/mrr_reward": 0.26940104737877846, "rewards/rank_analyze_format_reward": 0.18540729489177465, "rewards/rank_answer_foramt_reward": 0.71484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 508.28125, "epoch": 0.00448, "grad_norm": 0.022624438628554344, "kl": 0.0006122589111328125, "learning_rate": 1.962027671586086e-05, "loss": -0.0578, "reward": 5.635180950164795, "reward_std": 2.130846619606018, "rewards/mrr_reward": 0.4570746533572674, "rewards/rank_analyze_format_reward": 0.2382146641612053, "rewards/rank_answer_foramt_reward": 0.6953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 457.828125, "epoch": 0.00456, "grad_norm": 0.02831670083105564, "kl": 0.0008362531661987305, "learning_rate": 1.9602936856769432e-05, "loss": -0.0584, "reward": 4.708496928215027, "reward_std": 1.579804465174675, "rewards/mrr_reward": 0.2832403276115656, "rewards/rank_analyze_format_reward": 0.13998863846063614, "rewards/rank_answer_foramt_reward": 0.537109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.984375, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 464.515625, "epoch": 0.00464, "grad_norm": 0.027137793600559235, "kl": 0.0007263422012329102, "learning_rate": 1.958521789017376e-05, "loss": -0.0173, "reward": 4.78651237487793, "reward_std": 1.6631978452205658, "rewards/mrr_reward": 0.3081163205206394, "rewards/rank_analyze_format_reward": 0.08264920394867659, "rewards/rank_answer_foramt_reward": 0.546875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974177181720734, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9817927181720734, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 480.96875, "epoch": 0.00472, "grad_norm": 0.024516701698303223, "kl": 0.0007790327072143555, "learning_rate": 1.9567120515588307e-05, "loss": -0.0204, "reward": 4.638575911521912, "reward_std": 1.3084321022033691, "rewards/mrr_reward": 0.2515377104282379, "rewards/rank_analyze_format_reward": 0.08313189167529345, "rewards/rank_answer_foramt_reward": 0.66796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9523809552192688, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 528.8125, "epoch": 0.0048, "grad_norm": 0.02359704300761223, "kl": 0.0008711814880371094, "learning_rate": 1.9548645447466433e-05, "loss": -0.0374, "reward": 5.067440032958984, "reward_std": 1.3677224814891815, "rewards/mrr_reward": 0.28026413917541504, "rewards/rank_analyze_format_reward": 0.3526333197951317, "rewards/rank_answer_foramt_reward": 0.640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 1.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 475.203125, "epoch": 0.00488, "grad_norm": 0.02714117057621479, "kl": 0.0010082721710205078, "learning_rate": 1.952979341517219e-05, "loss": -0.0346, "reward": 4.782275915145874, "reward_std": 1.6448620557785034, "rewards/mrr_reward": 0.2842448018491268, "rewards/rank_analyze_format_reward": 0.16287481412291527, "rewards/rank_answer_foramt_reward": 0.552734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.984375, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 490.25, "epoch": 0.00496, "grad_norm": 0.0244311336427927, "kl": 0.0010666847229003906, "learning_rate": 1.9510565162951538e-05, "loss": -0.0086, "reward": 5.099712491035461, "reward_std": 1.6541226208209991, "rewards/mrr_reward": 0.3355220854282379, "rewards/rank_analyze_format_reward": 0.1826736368238926, "rewards/rank_answer_foramt_reward": 0.634765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 482.6875, "epoch": 0.00504, "grad_norm": 0.026524651795625687, "kl": 0.001157999038696289, "learning_rate": 1.9490961449902946e-05, "loss": -0.0453, "reward": 4.736083149909973, "reward_std": 1.4805326759815216, "rewards/mrr_reward": 0.26874380093067884, "rewards/rank_analyze_format_reward": 0.16470579244196415, "rewards/rank_answer_foramt_reward": 0.552734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 514.65625, "epoch": 0.00512, "grad_norm": 0.025870759040117264, "kl": 0.0010292530059814453, "learning_rate": 1.9470983049947446e-05, "loss": -0.0195, "reward": 5.119088530540466, "reward_std": 1.6753787100315094, "rewards/mrr_reward": 0.30148809775710106, "rewards/rank_analyze_format_reward": 0.32103073969483376, "rewards/rank_answer_foramt_reward": 0.703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 503.65625, "epoch": 0.0052, "grad_norm": 0.02715775929391384, "kl": 0.00127410888671875, "learning_rate": 1.945063075179805e-05, "loss": -0.0171, "reward": 5.094496250152588, "reward_std": 1.6873414516448975, "rewards/mrr_reward": 0.33283110707998276, "rewards/rank_analyze_format_reward": 0.218135098926723, "rewards/rank_answer_foramt_reward": 0.6953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9678308814764023, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 471.265625, "epoch": 0.00528, "grad_norm": 0.029613258317112923, "kl": 0.0014154911041259766, "learning_rate": 1.9429905358928648e-05, "loss": -0.0483, "reward": 4.703411102294922, "reward_std": 1.624811589717865, "rewards/mrr_reward": 0.2835937477648258, "rewards/rank_analyze_format_reward": 0.12854894250631332, "rewards/rank_answer_foramt_reward": 0.51953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.984375, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 484.359375, "epoch": 0.00536, "grad_norm": 0.026188354939222336, "kl": 0.0011410713195800781, "learning_rate": 1.9408807689542257e-05, "loss": -0.0161, "reward": 5.06879860162735, "reward_std": 2.0131714940071106, "rewards/mrr_reward": 0.3669270910322666, "rewards/rank_analyze_format_reward": 0.0931629491969943, "rewards/rank_answer_foramt_reward": 0.626953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.9678308814764023, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 483.453125, "epoch": 0.00544, "grad_norm": 0.030339252203702927, "kl": 0.001415252685546875, "learning_rate": 1.9387338576538743e-05, "loss": -0.0417, "reward": 5.697641015052795, "reward_std": 1.6298598051071167, "rewards/mrr_reward": 0.4446800611913204, "rewards/rank_analyze_format_reward": 0.23119097389280796, "rewards/rank_answer_foramt_reward": 0.73828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 511.328125, "epoch": 0.00552, "grad_norm": 0.0270386952906847, "kl": 0.0012700557708740234, "learning_rate": 1.9365498867481926e-05, "loss": -0.0277, "reward": 5.50212836265564, "reward_std": 1.753544569015503, "rewards/mrr_reward": 0.4216145873069763, "rewards/rank_analyze_format_reward": 0.2727011814713478, "rewards/rank_answer_foramt_reward": 0.66796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.96875, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 511.765625, "epoch": 0.0056, "grad_norm": 0.02776823565363884, "kl": 0.0012090206146240234, "learning_rate": 1.9343289424566122e-05, "loss": -0.0327, "reward": 5.284577131271362, "reward_std": 1.5522978603839874, "rewards/mrr_reward": 0.3268353193998337, "rewards/rank_analyze_format_reward": 0.30267586559057236, "rewards/rank_answer_foramt_reward": 0.734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 512.109375, "epoch": 0.00568, "grad_norm": 0.028361720964312553, "kl": 0.0014176368713378906, "learning_rate": 1.932071112458211e-05, "loss": -0.0295, "reward": 5.189886093139648, "reward_std": 1.5692519545555115, "rewards/mrr_reward": 0.32628969103097916, "rewards/rank_analyze_format_reward": 0.2592931464314461, "rewards/rank_answer_foramt_reward": 0.72265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982638955116272, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9826388955116272, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 502.84375, "epoch": 0.00576, "grad_norm": 0.02568405121564865, "kl": 0.0022258758544921875, "learning_rate": 1.9297764858882516e-05, "loss": -0.0394, "reward": 5.312269806861877, "reward_std": 1.6550668478012085, "rewards/mrr_reward": 0.3765625134110451, "rewards/rank_analyze_format_reward": 0.1592270117253065, "rewards/rank_answer_foramt_reward": 0.6796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 513.546875, "epoch": 0.00584, "grad_norm": 0.028704112395644188, "kl": 0.0016345977783203125, "learning_rate": 1.9274451533346617e-05, "loss": -0.0243, "reward": 5.064012050628662, "reward_std": 1.5234301090240479, "rewards/mrr_reward": 0.3003906384110451, "rewards/rank_analyze_format_reward": 0.2176099345088005, "rewards/rank_answer_foramt_reward": 0.748046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 491.609375, "epoch": 0.00592, "grad_norm": 0.02670414373278618, "kl": 0.00135040283203125, "learning_rate": 1.925077206834458e-05, "loss": -0.0157, "reward": 5.475552201271057, "reward_std": 1.6367623507976532, "rewards/mrr_reward": 0.4143601320683956, "rewards/rank_analyze_format_reward": 0.1258587222546339, "rewards/rank_answer_foramt_reward": 0.72265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9965170174837112, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9965170174837112, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 523.375, "epoch": 0.006, "grad_norm": 0.026078149676322937, "kl": 0.001718282699584961, "learning_rate": 1.922672739870115e-05, "loss": -0.025, "reward": 5.39314603805542, "reward_std": 1.488810956478119, "rewards/mrr_reward": 0.3355220817029476, "rewards/rank_analyze_format_reward": 0.35876308754086494, "rewards/rank_answer_foramt_reward": 0.759765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.997514471411705, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.981889471411705, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 526.296875, "epoch": 0.00608, "grad_norm": 0.02692423015832901, "kl": 0.0018053054809570312, "learning_rate": 1.9202318473658707e-05, "loss": -0.0014, "reward": 5.168847322463989, "reward_std": 1.5299666672945023, "rewards/mrr_reward": 0.2954613119363785, "rewards/rank_analyze_format_reward": 0.35911886021494865, "rewards/rank_answer_foramt_reward": 0.716796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9828869104385376, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 530.8125, "epoch": 0.00616, "grad_norm": 0.028736066073179245, "kl": 0.0021300315856933594, "learning_rate": 1.9177546256839814e-05, "loss": -0.0678, "reward": 4.734722018241882, "reward_std": 1.2097884118556976, "rewards/mrr_reward": 0.19413442723453045, "rewards/rank_analyze_format_reward": 0.395684190094471, "rewards/rank_answer_foramt_reward": 0.6640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.984375, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 510.046875, "epoch": 0.00624, "grad_norm": 0.02716856822371483, "kl": 0.002942800521850586, "learning_rate": 1.9152411726209176e-05, "loss": -0.0131, "reward": 5.530459046363831, "reward_std": 1.5224389135837555, "rewards/mrr_reward": 0.4035342466086149, "rewards/rank_analyze_format_reward": 0.279373524710536, "rewards/rank_answer_foramt_reward": 0.75, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9669117629528046, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 571.125, "epoch": 0.00632, "grad_norm": 0.026340851560235023, "kl": 0.0021643638610839844, "learning_rate": 1.912691587403503e-05, "loss": -0.004, "reward": 5.89811098575592, "reward_std": 1.5359521359205246, "rewards/mrr_reward": 0.4553075507283211, "rewards/rank_analyze_format_reward": 0.40509650111198425, "rewards/rank_answer_foramt_reward": 0.771484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9970249533653259, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9813999533653259, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 521.1875, "epoch": 0.0064, "grad_norm": 0.029624229297041893, "kl": 0.0025739669799804688, "learning_rate": 1.9101059706849957e-05, "loss": -0.0441, "reward": 5.700886368751526, "reward_std": 1.3499284535646439, "rewards/mrr_reward": 0.3991629406809807, "rewards/rank_analyze_format_reward": 0.45958859845995903, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9522058814764023, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 538.0, "epoch": 0.00648, "grad_norm": 0.027097424492239952, "kl": 0.0020627975463867188, "learning_rate": 1.907484424541117e-05, "loss": -0.0252, "reward": 5.453540682792664, "reward_std": 1.326317012310028, "rewards/mrr_reward": 0.328590027987957, "rewards/rank_analyze_format_reward": 0.41121166571974754, "rewards/rank_answer_foramt_reward": 0.77734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9987500011920929, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9987500011920929, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 512.8125, "epoch": 0.00656, "grad_norm": 0.029180927202105522, "kl": 0.002402782440185547, "learning_rate": 1.9048270524660197e-05, "loss": -0.0303, "reward": 5.211138844490051, "reward_std": 1.5096509754657745, "rewards/mrr_reward": 0.3459635451436043, "rewards/rank_analyze_format_reward": 0.230496846139431, "rewards/rank_answer_foramt_reward": 0.654296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9907767027616501, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9907767027616501, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 521.640625, "epoch": 0.00664, "grad_norm": 0.0309749785810709, "kl": 0.0024437904357910156, "learning_rate": 1.902133959368203e-05, "loss": -0.0342, "reward": 5.663545250892639, "reward_std": 1.5470882952213287, "rewards/mrr_reward": 0.40711185336112976, "rewards/rank_analyze_format_reward": 0.3408084437251091, "rewards/rank_answer_foramt_reward": 0.720703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9985119104385376, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 514.109375, "epoch": 0.00672, "grad_norm": 0.031366463750600815, "kl": 0.0027179718017578125, "learning_rate": 1.899405251566371e-05, "loss": 0.0076, "reward": 5.422890543937683, "reward_std": 1.6431918144226074, "rewards/mrr_reward": 0.4013392776250839, "rewards/rank_analyze_format_reward": 0.18793605640530586, "rewards/rank_answer_foramt_reward": 0.7109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9944862127304077, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9944862127304077, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 538.75, "epoch": 0.0068, "grad_norm": 0.02822289988398552, "kl": 0.0026798248291015625, "learning_rate": 1.896641036785236e-05, "loss": -0.0181, "reward": 5.379911422729492, "reward_std": 1.5721017718315125, "rewards/mrr_reward": 0.36321304738521576, "rewards/rank_analyze_format_reward": 0.2962614744901657, "rewards/rank_answer_foramt_reward": 0.759765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9784848242998123, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9628598242998123, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 534.125, "epoch": 0.00688, "grad_norm": 0.03269508108496666, "kl": 0.004064083099365234, "learning_rate": 1.893841424151264e-05, "loss": -0.0292, "reward": 5.269747972488403, "reward_std": 1.5305809080600739, "rewards/mrr_reward": 0.3544270917773247, "rewards/rank_analyze_format_reward": 0.33377680554986, "rewards/rank_answer_foramt_reward": 0.65625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9935063272714615, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9778813272714615, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 499.78125, "epoch": 0.00696, "grad_norm": 0.02979106642305851, "kl": 0.0027294158935546875, "learning_rate": 1.891006524188368e-05, "loss": -0.0168, "reward": 5.833852052688599, "reward_std": 1.696144312620163, "rewards/mrr_reward": 0.49704860895872116, "rewards/rank_analyze_format_reward": 0.1344117820262909, "rewards/rank_answer_foramt_reward": 0.759765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 508.65625, "epoch": 0.00704, "grad_norm": 0.03072383813560009, "kl": 0.0026140213012695312, "learning_rate": 1.8881364488135448e-05, "loss": 0.0314, "reward": 5.977005839347839, "reward_std": 1.5344423651695251, "rewards/mrr_reward": 0.46953124552965164, "rewards/rank_analyze_format_reward": 0.33869196474552155, "rewards/rank_answer_foramt_reward": 0.794921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 542.984375, "epoch": 0.00712, "grad_norm": 0.031293075531721115, "kl": 0.002574920654296875, "learning_rate": 1.8852313113324553e-05, "loss": -0.0392, "reward": 5.124444305896759, "reward_std": 1.0976988822221756, "rewards/mrr_reward": 0.26930804550647736, "rewards/rank_analyze_format_reward": 0.37338413298130035, "rewards/rank_answer_foramt_reward": 0.751953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.984375, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 531.578125, "epoch": 0.0072, "grad_norm": 0.028959596529603004, "kl": 0.0031723976135253906, "learning_rate": 1.8822912264349535e-05, "loss": -0.007, "reward": 5.53035843372345, "reward_std": 1.5363159328699112, "rewards/mrr_reward": 0.357527282088995, "rewards/rank_analyze_format_reward": 0.3372920826077461, "rewards/rank_answer_foramt_reward": 0.791015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9976895451545715, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9976895451545715, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 524.421875, "epoch": 0.00728, "grad_norm": 0.030261706560850143, "kl": 0.0035152435302734375, "learning_rate": 1.8793163101905562e-05, "loss": -0.0673, "reward": 5.005836725234985, "reward_std": 1.2413936257362366, "rewards/mrr_reward": 0.27682292088866234, "rewards/rank_analyze_format_reward": 0.28269378654658794, "rewards/rank_answer_foramt_reward": 0.666015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9827302694320679, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 510.0625, "epoch": 0.00736, "grad_norm": 0.030052699148654938, "kl": 0.004012107849121094, "learning_rate": 1.8763066800438638e-05, "loss": -0.0292, "reward": 5.606707572937012, "reward_std": 1.5678272545337677, "rewards/mrr_reward": 0.40956100821495056, "rewards/rank_analyze_format_reward": 0.30623918399214745, "rewards/rank_answer_foramt_reward": 0.734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 547.71875, "epoch": 0.00744, "grad_norm": 0.03216289356350899, "kl": 0.00304412841796875, "learning_rate": 1.8732624548099204e-05, "loss": 0.0231, "reward": 6.239851951599121, "reward_std": 1.7388965487480164, "rewards/mrr_reward": 0.5096540227532387, "rewards/rank_analyze_format_reward": 0.45483387261629105, "rewards/rank_answer_foramt_reward": 0.779296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 550.765625, "epoch": 0.00752, "grad_norm": 0.02862965501844883, "kl": 0.002460956573486328, "learning_rate": 1.870183754669526e-05, "loss": -0.015, "reward": 5.881725430488586, "reward_std": 1.3550093621015549, "rewards/mrr_reward": 0.44217510521411896, "rewards/rank_analyze_format_reward": 0.3357362039387226, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9970428347587585, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9970428347587585, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 585.4375, "epoch": 0.0076, "grad_norm": 0.02570868283510208, "kl": 0.0027370452880859375, "learning_rate": 1.86707070116449e-05, "loss": -0.0218, "reward": 5.226902365684509, "reward_std": 1.2030094340443611, "rewards/mrr_reward": 0.22709573805332184, "rewards/rank_analyze_format_reward": 0.6206915825605392, "rewards/rank_answer_foramt_reward": 0.787109375, "rewards/rank_contrast_format_reward": 0.015467965044081211, "rewards/rank_initial_format_reward": 0.9941282570362091, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9636223018169403, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 560.359375, "epoch": 0.00768, "grad_norm": 0.02751641720533371, "kl": 0.0033593177795410156, "learning_rate": 1.8639234171928355e-05, "loss": -0.0335, "reward": 5.1298346519470215, "reward_std": 1.307349056005478, "rewards/mrr_reward": 0.2678137496113777, "rewards/rank_analyze_format_reward": 0.398000106215477, "rewards/rank_answer_foramt_reward": 0.734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 523.328125, "epoch": 0.00776, "grad_norm": 0.029069727286696434, "kl": 0.004078865051269531, "learning_rate": 1.860742027003944e-05, "loss": -0.0148, "reward": 5.347498893737793, "reward_std": 1.6848171651363373, "rewards/mrr_reward": 0.35104167461395264, "rewards/rank_analyze_format_reward": 0.31607048213481903, "rewards/rank_answer_foramt_reward": 0.69140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 536.1875, "epoch": 0.00784, "grad_norm": 0.03183495253324509, "kl": 0.0038280487060546875, "learning_rate": 1.8575266561936526e-05, "loss": -0.006, "reward": 5.511651277542114, "reward_std": 1.5372964441776276, "rewards/mrr_reward": 0.40600818395614624, "rewards/rank_analyze_format_reward": 0.16032418236136436, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9818111509084702, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 566.296875, "epoch": 0.00792, "grad_norm": 0.028568414971232414, "kl": 0.003868579864501953, "learning_rate": 1.8542774316992953e-05, "loss": -0.0132, "reward": 5.6311503648757935, "reward_std": 1.7834250628948212, "rewards/mrr_reward": 0.42723215371370316, "rewards/rank_analyze_format_reward": 0.3120732717216015, "rewards/rank_answer_foramt_reward": 0.68359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 516.84375, "epoch": 0.008, "grad_norm": 0.031800925731658936, "kl": 0.0034732818603515625, "learning_rate": 1.850994481794692e-05, "loss": 0.0088, "reward": 5.197633624076843, "reward_std": 1.3270320296287537, "rewards/mrr_reward": 0.3310081884264946, "rewards/rank_analyze_format_reward": 0.21934139728546143, "rewards/rank_answer_foramt_reward": 0.732421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.980449989438057, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.980449989438057, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 553.359375, "epoch": 0.00808, "grad_norm": 0.03231408819556236, "kl": 0.0036554336547851562, "learning_rate": 1.8476779360850833e-05, "loss": -0.0035, "reward": 5.128762245178223, "reward_std": 1.6566264033317566, "rewards/mrr_reward": 0.27104414999485016, "rewards/rank_analyze_format_reward": 0.46976545453071594, "rewards/rank_answer_foramt_reward": 0.63671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9963944852352142, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9963944852352142, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 526.71875, "epoch": 0.00816, "grad_norm": 0.032530225813388824, "kl": 0.004127025604248047, "learning_rate": 1.8443279255020153e-05, "loss": -0.0068, "reward": 5.845712304115295, "reward_std": 1.6123826652765274, "rewards/mrr_reward": 0.4531250074505806, "rewards/rank_analyze_format_reward": 0.28228217363357544, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9985119104385376, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 540.75, "epoch": 0.00824, "grad_norm": 0.033591341227293015, "kl": 0.004050254821777344, "learning_rate": 1.8409445822981694e-05, "loss": -0.0144, "reward": 5.395983934402466, "reward_std": 1.3699063658714294, "rewards/mrr_reward": 0.35555556416511536, "rewards/rank_analyze_format_reward": 0.2890353724360466, "rewards/rank_answer_foramt_reward": 0.76171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.996660053730011, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.996660053730011, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 542.84375, "epoch": 0.00832, "grad_norm": 0.029276415705680847, "kl": 0.0040340423583984375, "learning_rate": 1.837528040042142e-05, "loss": -0.0123, "reward": 6.147975206375122, "reward_std": 1.7016419172286987, "rewards/mrr_reward": 0.5155567899346352, "rewards/rank_analyze_format_reward": 0.38848236110061407, "rewards/rank_answer_foramt_reward": 0.744140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 1.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 554.109375, "epoch": 0.0084, "grad_norm": 0.03044695220887661, "kl": 0.0034012794494628906, "learning_rate": 1.8340784336131715e-05, "loss": 0.0012, "reward": 5.674815535545349, "reward_std": 1.338746190071106, "rewards/mrr_reward": 0.39229291677474976, "rewards/rank_analyze_format_reward": 0.3825643416494131, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 563.046875, "epoch": 0.00848, "grad_norm": 0.02915959618985653, "kl": 0.003988742828369141, "learning_rate": 1.830595899195813e-05, "loss": -0.0183, "reward": 5.638059496879578, "reward_std": 1.5509518682956696, "rewards/mrr_reward": 0.36933284625411034, "rewards/rank_analyze_format_reward": 0.422447107732296, "rewards/rank_answer_foramt_reward": 0.74609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 551.046875, "epoch": 0.00856, "grad_norm": 0.028021838515996933, "kl": 0.0035920143127441406, "learning_rate": 1.827080574274562e-05, "loss": -0.017, "reward": 4.963303804397583, "reward_std": 0.9844066947698593, "rewards/mrr_reward": 0.2226934563368559, "rewards/rank_analyze_format_reward": 0.33229546807706356, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 1.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 522.9375, "epoch": 0.00864, "grad_norm": 0.05274343118071556, "kl": 0.005379676818847656, "learning_rate": 1.8235325976284276e-05, "loss": -0.0195, "reward": 5.9380234479904175, "reward_std": 1.6530174016952515, "rewards/mrr_reward": 0.4407738000154495, "rewards/rank_analyze_format_reward": 0.4156523421406746, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9948723018169403, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9948723018169403, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 551.390625, "epoch": 0.00872, "grad_norm": 0.03185953572392464, "kl": 0.004192352294921875, "learning_rate": 1.8199521093254524e-05, "loss": -0.0409, "reward": 5.470573306083679, "reward_std": 1.2188910990953445, "rewards/mrr_reward": 0.3295076973736286, "rewards/rank_analyze_format_reward": 0.4123082235455513, "rewards/rank_answer_foramt_reward": 0.763671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 540.671875, "epoch": 0.0088, "grad_norm": 0.030663657933473587, "kl": 0.004244804382324219, "learning_rate": 1.816339250717184e-05, "loss": -0.0232, "reward": 6.13028359413147, "reward_std": 1.3835676908493042, "rewards/mrr_reward": 0.4888206869363785, "rewards/rank_analyze_format_reward": 0.4102887883782387, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137707233429, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9809887707233429, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 566.09375, "epoch": 0.00888, "grad_norm": 0.029465649276971817, "kl": 0.00360107421875, "learning_rate": 1.812694164433094e-05, "loss": 0.0109, "reward": 5.707750916481018, "reward_std": 1.284827172756195, "rewards/mrr_reward": 0.4140811040997505, "rewards/rank_analyze_format_reward": 0.3466198369860649, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9744735509157181, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9744735509157181, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 550.59375, "epoch": 0.00896, "grad_norm": 0.029467904940247536, "kl": 0.004141807556152344, "learning_rate": 1.8090169943749477e-05, "loss": -0.0018, "reward": 5.9052029848098755, "reward_std": 1.6688864529132843, "rewards/mrr_reward": 0.43764881789684296, "rewards/rank_analyze_format_reward": 0.41590335220098495, "rewards/rank_answer_foramt_reward": 0.765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 593.1875, "epoch": 0.00904, "grad_norm": 0.028183218091726303, "kl": 0.003570079803466797, "learning_rate": 1.8053078857111218e-05, "loss": 0.0146, "reward": 5.618794202804565, "reward_std": 1.03659488260746, "rewards/mrr_reward": 0.3117559552192688, "rewards/rank_analyze_format_reward": 0.5591553151607513, "rewards/rank_answer_foramt_reward": 0.830078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 565.296875, "epoch": 0.00912, "grad_norm": 0.02837606891989708, "kl": 0.003941059112548828, "learning_rate": 1.8015669848708768e-05, "loss": -0.006, "reward": 6.164483904838562, "reward_std": 1.8252239227294922, "rewards/mrr_reward": 0.49081721156835556, "rewards/rank_analyze_format_reward": 0.4622950553894043, "rewards/rank_answer_foramt_reward": 0.7890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9944599568843842, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9944599568843842, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 570.953125, "epoch": 0.0092, "grad_norm": 0.03188610076904297, "kl": 0.00440216064453125, "learning_rate": 1.7977944395385713e-05, "loss": 0.005, "reward": 5.40685248374939, "reward_std": 1.3315111100673676, "rewards/mrr_reward": 0.33445560559630394, "rewards/rank_analyze_format_reward": 0.4321964457631111, "rewards/rank_answer_foramt_reward": 0.716796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 530.5, "epoch": 0.00928, "grad_norm": 0.035740312188863754, "kl": 0.005484580993652344, "learning_rate": 1.7939903986478354e-05, "loss": 0.0026, "reward": 5.0470871925354, "reward_std": 1.3495656102895737, "rewards/mrr_reward": 0.28152902517467737, "rewards/rank_analyze_format_reward": 0.3473521089181304, "rewards/rank_answer_foramt_reward": 0.650390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9967704266309738, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9811454266309738, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 536.078125, "epoch": 0.00936, "grad_norm": 0.033542290329933167, "kl": 0.006087303161621094, "learning_rate": 1.7901550123756906e-05, "loss": -0.0487, "reward": 5.8007895946502686, "reward_std": 1.6038341969251633, "rewards/mrr_reward": 0.40520213916897774, "rewards/rank_analyze_format_reward": 0.46123120933771133, "rewards/rank_answer_foramt_reward": 0.765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.984375, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 538.34375, "epoch": 0.00944, "grad_norm": 0.032972656190395355, "kl": 0.005175590515136719, "learning_rate": 1.786288432136619e-05, "loss": -0.0416, "reward": 5.9379658699035645, "reward_std": 1.0582128167152405, "rewards/mrr_reward": 0.40219494327902794, "rewards/rank_analyze_format_reward": 0.45497939735651016, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966736733913422, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9966736733913422, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 565.6875, "epoch": 0.00952, "grad_norm": 0.03251040726900101, "kl": 0.005161285400390625, "learning_rate": 1.7823908105765883e-05, "loss": -0.0058, "reward": 5.529563307762146, "reward_std": 1.545524537563324, "rewards/mrr_reward": 0.3502108156681061, "rewards/rank_analyze_format_reward": 0.44849435053765774, "rewards/rank_answer_foramt_reward": 0.748046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9817143976688385, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 551.46875, "epoch": 0.0096, "grad_norm": 0.030254479497671127, "kl": 0.0051136016845703125, "learning_rate": 1.7784623015670237e-05, "loss": 0.0084, "reward": 5.68108344078064, "reward_std": 1.4055528044700623, "rewards/mrr_reward": 0.39509548246860504, "rewards/rank_analyze_format_reward": 0.4017583131790161, "rewards/rank_answer_foramt_reward": 0.77734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9959558844566345, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9803308844566345, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 558.5, "epoch": 0.00968, "grad_norm": 0.029839498922228813, "kl": 0.0052356719970703125, "learning_rate": 1.7745030601987338e-05, "loss": -0.0391, "reward": 4.900716543197632, "reward_std": 1.4889276921749115, "rewards/mrr_reward": 0.23041915893554688, "rewards/rank_analyze_format_reward": 0.389343686401844, "rewards/rank_answer_foramt_reward": 0.611328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9969965815544128, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9969965815544128, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 561.359375, "epoch": 0.00976, "grad_norm": 0.031139617785811424, "kl": 0.004563331604003906, "learning_rate": 1.7705132427757895e-05, "loss": -0.0024, "reward": 6.621507525444031, "reward_std": 1.76107919216156, "rewards/mrr_reward": 0.5932291746139526, "rewards/rank_analyze_format_reward": 0.44585638865828514, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 1.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 571.9375, "epoch": 0.00984, "grad_norm": 0.029514476656913757, "kl": 0.005183219909667969, "learning_rate": 1.76649300680935e-05, "loss": -0.0173, "reward": 6.09501588344574, "reward_std": 0.9353384003043175, "rewards/mrr_reward": 0.43046875298023224, "rewards/rank_analyze_format_reward": 0.47569501772522926, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9955979138612747, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9955979138612747, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 548.65625, "epoch": 0.00992, "grad_norm": 0.03300454095005989, "kl": 0.005576133728027344, "learning_rate": 1.762442511011448e-05, "loss": 0.0006, "reward": 5.726975083351135, "reward_std": 1.6468315720558167, "rewards/mrr_reward": 0.39826390892267227, "rewards/rank_analyze_format_reward": 0.39954447001218796, "rewards/rank_answer_foramt_reward": 0.765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 1.0, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 577.609375, "epoch": 0.01, "grad_norm": 0.034321434795856476, "kl": 0.004874229431152344, "learning_rate": 1.7583619152887222e-05, "loss": -0.0315, "reward": 6.236923098564148, "reward_std": 1.866959035396576, "rewards/mrr_reward": 0.501798115670681, "rewards/rank_analyze_format_reward": 0.49014563485980034, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9820971935987473, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9977221935987473, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 561.984375, "epoch": 0.01008, "grad_norm": 0.03288774937391281, "kl": 0.004177093505859375, "learning_rate": 1.754251380736104e-05, "loss": -0.0457, "reward": 5.8624351024627686, "reward_std": 1.3166293054819107, "rewards/mrr_reward": 0.4008122459053993, "rewards/rank_analyze_format_reward": 0.4229400232434273, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 575.515625, "epoch": 0.01016, "grad_norm": 0.02873223088681698, "kl": 0.004273891448974609, "learning_rate": 1.7501110696304598e-05, "loss": 0.0081, "reward": 6.8257163763046265, "reward_std": 1.3797296285629272, "rewards/mrr_reward": 0.6047929003834724, "rewards/rank_analyze_format_reward": 0.5578483566641808, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137856245041, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9809887856245041, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 564.03125, "epoch": 0.01024, "grad_norm": 0.0341954343020916, "kl": 0.005435466766357422, "learning_rate": 1.7459411454241822e-05, "loss": -0.0212, "reward": 5.7786431312561035, "reward_std": 1.5796920359134674, "rewards/mrr_reward": 0.39012279361486435, "rewards/rank_analyze_format_reward": 0.45386332646012306, "rewards/rank_answer_foramt_reward": 0.775390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 569.8125, "epoch": 0.01032, "grad_norm": 0.0335434228181839, "kl": 0.005364894866943359, "learning_rate": 1.7417417727387392e-05, "loss": -0.0446, "reward": 6.519121050834656, "reward_std": 1.615359753370285, "rewards/mrr_reward": 0.5382998660206795, "rewards/rank_analyze_format_reward": 0.5694946646690369, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9958697408437729, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9958697408437729, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 571.40625, "epoch": 0.0104, "grad_norm": 0.034590717405080795, "kl": 0.005486965179443359, "learning_rate": 1.737513117358174e-05, "loss": -0.0285, "reward": 5.513644099235535, "reward_std": 1.2361056506633759, "rewards/mrr_reward": 0.3074776791036129, "rewards/rank_analyze_format_reward": 0.5335964635014534, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9951855838298798, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9951855838298798, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 527.375, "epoch": 0.01048, "grad_norm": 0.03462553396821022, "kl": 0.0054836273193359375, "learning_rate": 1.7332553462225604e-05, "loss": -0.0024, "reward": 6.27109682559967, "reward_std": 1.4046059399843216, "rewards/mrr_reward": 0.5207465216517448, "rewards/rank_analyze_format_reward": 0.36311861872673035, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 557.0, "epoch": 0.01056, "grad_norm": 0.031150279566645622, "kl": 0.005171775817871094, "learning_rate": 1.7289686274214116e-05, "loss": -0.0292, "reward": 6.205300688743591, "reward_std": 1.4860585927963257, "rewards/mrr_reward": 0.47241443395614624, "rewards/rank_analyze_format_reward": 0.4847182631492615, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9965170323848724, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9965170323848724, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 556.421875, "epoch": 0.01064, "grad_norm": 0.03107454627752304, "kl": 0.0052852630615234375, "learning_rate": 1.7246531301870467e-05, "loss": -0.031, "reward": 5.376925587654114, "reward_std": 0.8234259486198425, "rewards/mrr_reward": 0.2710193581879139, "rewards/rank_analyze_format_reward": 0.39582546800374985, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9973393976688385, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 573.609375, "epoch": 0.01072, "grad_norm": 0.03103426657617092, "kl": 0.0062732696533203125, "learning_rate": 1.720309024887907e-05, "loss": -0.0231, "reward": 5.918271780014038, "reward_std": 1.3870733082294464, "rewards/mrr_reward": 0.4325892850756645, "rewards/rank_analyze_format_reward": 0.4667883366346359, "rewards/rank_answer_foramt_reward": 0.787109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 552.796875, "epoch": 0.0108, "grad_norm": 0.030996868386864662, "kl": 0.005532264709472656, "learning_rate": 1.7159364830218312e-05, "loss": -0.0354, "reward": 6.0459229946136475, "reward_std": 1.8146674633026123, "rewards/mrr_reward": 0.46649305522441864, "rewards/rank_analyze_format_reward": 0.497793473303318, "rewards/rank_answer_foramt_reward": 0.732421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9979332089424133, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9830522537231445, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 565.59375, "epoch": 0.01088, "grad_norm": 0.033554572612047195, "kl": 0.00591278076171875, "learning_rate": 1.7115356772092858e-05, "loss": 0.0011, "reward": 5.612787961959839, "reward_std": 1.117805376648903, "rewards/mrr_reward": 0.34348339214921, "rewards/rank_analyze_format_reward": 0.4954646974802017, "rewards/rank_answer_foramt_reward": 0.80859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9947417825460434, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9947417825460434, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 582.65625, "epoch": 0.01096, "grad_norm": 0.03443153202533722, "kl": 0.006453514099121094, "learning_rate": 1.7071067811865477e-05, "loss": 0.0088, "reward": 6.655031681060791, "reward_std": 1.6121004223823547, "rewards/mrr_reward": 0.5930121541023254, "rewards/rank_analyze_format_reward": 0.4836224094033241, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992897808551788, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9992897808551788, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 586.078125, "epoch": 0.01104, "grad_norm": 0.03265470266342163, "kl": 0.006455421447753906, "learning_rate": 1.7026499697988496e-05, "loss": -0.0047, "reward": 5.641196846961975, "reward_std": 1.2902484983205795, "rewards/mrr_reward": 0.3412078395485878, "rewards/rank_analyze_format_reward": 0.5842561349272728, "rewards/rank_answer_foramt_reward": 0.771484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9993750005960464, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9837500005960464, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 601.1875, "epoch": 0.01112, "grad_norm": 0.03656386211514473, "kl": 0.00684356689453125, "learning_rate": 1.698165418993473e-05, "loss": -0.001, "reward": 5.204757332801819, "reward_std": 1.590161681175232, "rewards/mrr_reward": 0.2827877067029476, "rewards/rank_analyze_format_reward": 0.499110646545887, "rewards/rank_answer_foramt_reward": 0.6953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9786541759967804, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9786541759967804, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 595.546875, "epoch": 0.0112, "grad_norm": 0.0316004641354084, "kl": 0.0055179595947265625, "learning_rate": 1.693653305812805e-05, "loss": -0.0265, "reward": 6.290101528167725, "reward_std": 0.8225864917039871, "rewards/mrr_reward": 0.44620535895228386, "rewards/rank_analyze_format_reward": 0.6547454744577408, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975329041481018, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9819079041481018, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 558.4375, "epoch": 0.01128, "grad_norm": 0.03095630370080471, "kl": 0.005917549133300781, "learning_rate": 1.6891138083873486e-05, "loss": -0.0297, "reward": 6.904297590255737, "reward_std": 1.2176014333963394, "rewards/mrr_reward": 0.6421006992459297, "rewards/rank_analyze_format_reward": 0.46468614786863327, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 569.6875, "epoch": 0.01136, "grad_norm": 0.03015676885843277, "kl": 0.0056896209716796875, "learning_rate": 1.684547105928689e-05, "loss": -0.0281, "reward": 6.165593385696411, "reward_std": 0.8665569871664047, "rewards/mrr_reward": 0.4452628940343857, "rewards/rank_analyze_format_reward": 0.49287670850753784, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137856245041, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9966137856245041, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 565.328125, "epoch": 0.01144, "grad_norm": 0.03297252953052521, "kl": 0.0056705474853515625, "learning_rate": 1.6799533787224192e-05, "loss": -0.0115, "reward": 6.573715448379517, "reward_std": 1.222780704498291, "rewards/mrr_reward": 0.5543588846921921, "rewards/rank_analyze_format_reward": 0.5295134037733078, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9993206560611725, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9993206560611725, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 538.71875, "epoch": 0.01152, "grad_norm": 0.03429215028882027, "kl": 0.007515907287597656, "learning_rate": 1.6753328081210244e-05, "loss": -0.0238, "reward": 6.36323094367981, "reward_std": 1.3417619168758392, "rewards/mrr_reward": 0.5290364548563957, "rewards/rank_analyze_format_reward": 0.43458499759435654, "rewards/rank_answer_foramt_reward": 0.8359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 575.21875, "epoch": 0.0116, "grad_norm": 0.033866554498672485, "kl": 0.006348609924316406, "learning_rate": 1.6706855765367202e-05, "loss": -0.0501, "reward": 6.455610156059265, "reward_std": 1.2715766429901123, "rewards/mrr_reward": 0.5324404761195183, "rewards/rank_analyze_format_reward": 0.520428977906704, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 589.0, "epoch": 0.01168, "grad_norm": 0.03349940851330757, "kl": 0.00588226318359375, "learning_rate": 1.666011867434252e-05, "loss": -0.0104, "reward": 6.319154739379883, "reward_std": 1.2373842000961304, "rewards/mrr_reward": 0.49678821861743927, "rewards/rank_analyze_format_reward": 0.5474087037146091, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9967888444662094, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.9967888444662094, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 637.96875, "epoch": 0.01176, "grad_norm": 0.03079179674386978, "kl": 0.005105018615722656, "learning_rate": 1.661311865323652e-05, "loss": 0.0098, "reward": 6.185090184211731, "reward_std": 1.3005170226097107, "rewards/mrr_reward": 0.3938988223671913, "rewards/rank_analyze_format_reward": 0.8048074841499329, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 1.0, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 606.5625, "epoch": 0.01184, "grad_norm": 0.030851203948259354, "kl": 0.00566864013671875, "learning_rate": 1.6565857557529567e-05, "loss": -0.0224, "reward": 6.025929689407349, "reward_std": 1.4896414875984192, "rewards/mrr_reward": 0.41127852350473404, "rewards/rank_analyze_format_reward": 0.5112152397632599, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9963235259056091, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9963235259056091, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 595.828125, "epoch": 0.01192, "grad_norm": 0.030933480709791183, "kl": 0.006669044494628906, "learning_rate": 1.651833725300879e-05, "loss": -0.0119, "reward": 6.044245362281799, "reward_std": 1.4313164055347443, "rewards/mrr_reward": 0.41648685932159424, "rewards/rank_analyze_format_reward": 0.5746514052152634, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9955731779336929, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9955731779336929, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 577.296875, "epoch": 0.012, "grad_norm": 0.03206388279795647, "kl": 0.005721092224121094, "learning_rate": 1.6470559615694445e-05, "loss": 0.0001, "reward": 6.099404215812683, "reward_std": 1.641929566860199, "rewards/mrr_reward": 0.4632812514901161, "rewards/rank_analyze_format_reward": 0.46816740930080414, "rewards/rank_answer_foramt_reward": 0.841796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9798761606216431, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9798761606216431, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 594.96875, "epoch": 0.01208, "grad_norm": 0.035973358899354935, "kl": 0.006251335144042969, "learning_rate": 1.6422526531765846e-05, "loss": -0.0005, "reward": 5.8869417905807495, "reward_std": 1.4181711375713348, "rewards/mrr_reward": 0.38827505707740784, "rewards/rank_analyze_format_reward": 0.5360363647341728, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9985119104385376, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 574.53125, "epoch": 0.01216, "grad_norm": 0.03355732187628746, "kl": 0.005978584289550781, "learning_rate": 1.63742398974869e-05, "loss": -0.0083, "reward": 6.842912316322327, "reward_std": 1.6922471225261688, "rewards/mrr_reward": 0.6038318425416946, "rewards/rank_analyze_format_reward": 0.5811449587345123, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.011609229259192944, "rewards/rank_initial_format_reward": 0.9965170323848724, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9965170323848724, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 568.75, "epoch": 0.01224, "grad_norm": 0.03528546541929245, "kl": 0.007189750671386719, "learning_rate": 1.6325701619131246e-05, "loss": 0.0101, "reward": 6.01763129234314, "reward_std": 1.5633238703012466, "rewards/mrr_reward": 0.4449838772416115, "rewards/rank_analyze_format_reward": 0.5096804201602936, "rewards/rank_answer_foramt_reward": 0.771484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9899842143058777, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9899842143058777, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 568.625, "epoch": 0.01232, "grad_norm": 0.032814644277095795, "kl": 0.00629425048828125, "learning_rate": 1.6276913612907005e-05, "loss": -0.0075, "reward": 5.502658128738403, "reward_std": 1.473616749048233, "rewards/mrr_reward": 0.34048859775066376, "rewards/rank_analyze_format_reward": 0.4369618855416775, "rewards/rank_answer_foramt_reward": 0.78515625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9827302694320679, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9827302694320679, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 599.203125, "epoch": 0.0124, "grad_norm": 0.030942142009735107, "kl": 0.0059185028076171875, "learning_rate": 1.6227877804881126e-05, "loss": -0.0135, "reward": 6.150311231613159, "reward_std": 1.3842836022377014, "rewards/mrr_reward": 0.4085627496242523, "rewards/rank_analyze_format_reward": 0.6939246654510498, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9833333343267441, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9989583343267441, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 541.609375, "epoch": 0.01248, "grad_norm": 0.036237865686416626, "kl": 0.007732391357421875, "learning_rate": 1.6178596130903345e-05, "loss": -0.0202, "reward": 7.824005961418152, "reward_std": 0.8896946907043457, "rewards/mrr_reward": 0.8803571313619614, "rewards/rank_analyze_format_reward": 0.3638391522690654, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9976895451545715, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9976895451545715, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 569.96875, "epoch": 0.01256, "grad_norm": 0.033276233822107315, "kl": 0.00786590576171875, "learning_rate": 1.6129070536529767e-05, "loss": -0.0323, "reward": 6.17628014087677, "reward_std": 1.2629180029034615, "rewards/mrr_reward": 0.43666915595531464, "rewards/rank_analyze_format_reward": 0.5974572598934174, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 559.640625, "epoch": 0.01264, "grad_norm": 0.03208913654088974, "kl": 0.0061359405517578125, "learning_rate": 1.6079302976946055e-05, "loss": -0.0149, "reward": 7.196354031562805, "reward_std": 1.3507582545280457, "rewards/mrr_reward": 0.7078993171453476, "rewards/rank_analyze_format_reward": 0.43116307258605957, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 536.03125, "epoch": 0.01272, "grad_norm": 0.0373532772064209, "kl": 0.008005142211914062, "learning_rate": 1.602929541689025e-05, "loss": -0.0095, "reward": 6.174514055252075, "reward_std": 1.5039510130882263, "rewards/mrr_reward": 0.4937500022351742, "rewards/rank_analyze_format_reward": 0.35319407656788826, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 528.796875, "epoch": 0.0128, "grad_norm": 0.035693973302841187, "kl": 0.007555961608886719, "learning_rate": 1.597904983057519e-05, "loss": -0.0096, "reward": 6.581840634346008, "reward_std": 1.4593189358711243, "rewards/mrr_reward": 0.5941840261220932, "rewards/rank_analyze_format_reward": 0.4255777597427368, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9825367629528046, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 556.515625, "epoch": 0.01288, "grad_norm": 0.03452171012759209, "kl": 0.006771087646484375, "learning_rate": 1.5928568201610593e-05, "loss": 0.0078, "reward": 6.382677316665649, "reward_std": 1.3916960656642914, "rewards/mrr_reward": 0.5152777880430222, "rewards/rank_analyze_format_reward": 0.5301270186901093, "rewards/rank_answer_foramt_reward": 0.826171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 591.421875, "epoch": 0.01296, "grad_norm": 0.03284810855984688, "kl": 0.006465911865234375, "learning_rate": 1.5877852522924733e-05, "loss": -0.0104, "reward": 5.920182585716248, "reward_std": 1.1881801784038544, "rewards/mrr_reward": 0.36235737428069115, "rewards/rank_analyze_format_reward": 0.6010741665959358, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9973393976688385, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 584.46875, "epoch": 0.01304, "grad_norm": 0.034803055226802826, "kl": 0.006131172180175781, "learning_rate": 1.5826904796685763e-05, "loss": -0.0434, "reward": 6.171157360076904, "reward_std": 1.1627227067947388, "rewards/mrr_reward": 0.4615575298666954, "rewards/rank_analyze_format_reward": 0.47822096198797226, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9936655461788177, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9936655461788177, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 585.734375, "epoch": 0.01312, "grad_norm": 0.031842779368162155, "kl": 0.0071163177490234375, "learning_rate": 1.5775727034222675e-05, "loss": -0.0231, "reward": 6.161918997764587, "reward_std": 0.9850689172744751, "rewards/mrr_reward": 0.45631202310323715, "rewards/rank_analyze_format_reward": 0.4493050128221512, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9895813316106796, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9895813316106796, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 588.890625, "epoch": 0.0132, "grad_norm": 0.03469368442893028, "kl": 0.007396697998046875, "learning_rate": 1.572432125594591e-05, "loss": -0.0202, "reward": 6.6021623611450195, "reward_std": 1.4077537953853607, "rewards/mrr_reward": 0.5369295701384544, "rewards/rank_analyze_format_reward": 0.5988607704639435, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 595.65625, "epoch": 0.01328, "grad_norm": 0.037268709391355515, "kl": 0.008581161499023438, "learning_rate": 1.567268949126757e-05, "loss": -0.0097, "reward": 6.477015256881714, "reward_std": 1.300216481089592, "rewards/mrr_reward": 0.5178385674953461, "rewards/rank_analyze_format_reward": 0.5814422965049744, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 587.421875, "epoch": 0.01336, "grad_norm": 0.03310069069266319, "kl": 0.0058498382568359375, "learning_rate": 1.5620833778521306e-05, "loss": 0.0128, "reward": 6.108349561691284, "reward_std": 1.0274000614881516, "rewards/mrr_reward": 0.3897755593061447, "rewards/rank_analyze_format_reward": 0.6581573709845543, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 637.453125, "epoch": 0.01344, "grad_norm": 0.03266943618655205, "kl": 0.0059185028076171875, "learning_rate": 1.556875616488188e-05, "loss": 0.0049, "reward": 6.826786279678345, "reward_std": 0.6903130635619164, "rewards/mrr_reward": 0.5530568063259125, "rewards/rank_analyze_format_reward": 0.7122153341770172, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 1.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 587.78125, "epoch": 0.01352, "grad_norm": 0.033783189952373505, "kl": 0.006679534912109375, "learning_rate": 1.5516458706284306e-05, "loss": -0.0256, "reward": 5.9577813148498535, "reward_std": 1.7437602877616882, "rewards/mrr_reward": 0.4260416626930237, "rewards/rank_analyze_format_reward": 0.49166587740182877, "rewards/rank_answer_foramt_reward": 0.7890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 575.359375, "epoch": 0.0136, "grad_norm": 0.03344302624464035, "kl": 0.0066394805908203125, "learning_rate": 1.5463943467342694e-05, "loss": -0.03, "reward": 6.153290033340454, "reward_std": 1.4977188408374786, "rewards/mrr_reward": 0.42273685336112976, "rewards/rank_analyze_format_reward": 0.6419153958559036, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 570.84375, "epoch": 0.01368, "grad_norm": 0.032901864498853683, "kl": 0.006804466247558594, "learning_rate": 1.541121252126876e-05, "loss": -0.0356, "reward": 6.095202207565308, "reward_std": 1.6976246535778046, "rewards/mrr_reward": 0.4567708298563957, "rewards/rank_analyze_format_reward": 0.5074281394481659, "rewards/rank_answer_foramt_reward": 0.78125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975329041481018, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9975329041481018, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 580.453125, "epoch": 0.01376, "grad_norm": 0.035641156136989594, "kl": 0.0066776275634765625, "learning_rate": 1.5358267949789968e-05, "loss": 0.0231, "reward": 6.122780799865723, "reward_std": 1.4794773757457733, "rewards/mrr_reward": 0.4195932522416115, "rewards/rank_analyze_format_reward": 0.6257399246096611, "rewards/rank_answer_foramt_reward": 0.8359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 573.875, "epoch": 0.01384, "grad_norm": 0.03523016348481178, "kl": 0.0072040557861328125, "learning_rate": 1.5305111843067343e-05, "loss": -0.0158, "reward": 6.5374016761779785, "reward_std": 1.196079671382904, "rewards/mrr_reward": 0.5444630570709705, "rewards/rank_analyze_format_reward": 0.5388138145208359, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9826335161924362, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9513835161924362, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 591.1875, "epoch": 0.01392, "grad_norm": 0.033210430294275284, "kl": 0.006505012512207031, "learning_rate": 1.5251746299612959e-05, "loss": 0.0111, "reward": 6.4858158826828, "reward_std": 1.148768663406372, "rewards/mrr_reward": 0.5117621794342995, "rewards/rank_analyze_format_reward": 0.5615842193365097, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 599.953125, "epoch": 0.014, "grad_norm": 0.03484224155545235, "kl": 0.006390571594238281, "learning_rate": 1.5198173426207095e-05, "loss": -0.0077, "reward": 5.973394751548767, "reward_std": 1.187799260020256, "rewards/mrr_reward": 0.3873387798666954, "rewards/rank_analyze_format_reward": 0.5737537667155266, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.012578125111758709, "rewards/rank_initial_format_reward": 0.9950257539749146, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9950257539749146, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 608.28125, "epoch": 0.01408, "grad_norm": 0.029736081138253212, "kl": 0.006103515625, "learning_rate": 1.5144395337815066e-05, "loss": -0.0059, "reward": 6.961119174957275, "reward_std": 1.4198603332042694, "rewards/mrr_reward": 0.629464291036129, "rewards/rank_analyze_format_reward": 0.5694418847560883, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 582.265625, "epoch": 0.01416, "grad_norm": 0.03364516794681549, "kl": 0.006920814514160156, "learning_rate": 1.5090414157503715e-05, "loss": -0.05, "reward": 5.788230657577515, "reward_std": 1.0193384140729904, "rewards/mrr_reward": 0.3578063100576401, "rewards/rank_analyze_format_reward": 0.456499919295311, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 586.25, "epoch": 0.01424, "grad_norm": 0.0315423421561718, "kl": 0.006535530090332031, "learning_rate": 1.503623201635761e-05, "loss": -0.0208, "reward": 6.086873650550842, "reward_std": 1.2373364567756653, "rewards/mrr_reward": 0.440817229449749, "rewards/rank_analyze_format_reward": 0.5261917412281036, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9973393976688385, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 611.25, "epoch": 0.01432, "grad_norm": 0.03188354894518852, "kl": 0.0071659088134765625, "learning_rate": 1.498185105339491e-05, "loss": -0.0136, "reward": 6.844085216522217, "reward_std": 1.3567530810832977, "rewards/mrr_reward": 0.5900793671607971, "rewards/rank_analyze_format_reward": 0.6632254719734192, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 575.75, "epoch": 0.0144, "grad_norm": 0.03355080261826515, "kl": 0.00963592529296875, "learning_rate": 1.4927273415482916e-05, "loss": -0.0141, "reward": 6.776444315910339, "reward_std": 1.585608810186386, "rewards/mrr_reward": 0.5911458507180214, "rewards/rank_analyze_format_reward": 0.5980076417326927, "rewards/rank_answer_foramt_reward": 0.8359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9967704117298126, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9967704117298126, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 582.390625, "epoch": 0.01448, "grad_norm": 0.03208262100815773, "kl": 0.0070934295654296875, "learning_rate": 1.4872501257253325e-05, "loss": -0.0242, "reward": 6.444717884063721, "reward_std": 1.1609647572040558, "rewards/mrr_reward": 0.5165798738598824, "rewards/rank_analyze_format_reward": 0.4797312021255493, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 562.84375, "epoch": 0.01456, "grad_norm": 0.038560837507247925, "kl": 0.006739616394042969, "learning_rate": 1.4817536741017153e-05, "loss": -0.0292, "reward": 6.455266952514648, "reward_std": 1.4491407573223114, "rewards/mrr_reward": 0.4999814033508301, "rewards/rank_analyze_format_reward": 0.6045431345701218, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.998641312122345, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.998641312122345, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 589.375, "epoch": 0.01464, "grad_norm": 0.03276016190648079, "kl": 0.007555961608886719, "learning_rate": 1.4762382036679393e-05, "loss": -0.0179, "reward": 6.023438096046448, "reward_std": 1.1295487061142921, "rewards/mrr_reward": 0.4091145694255829, "rewards/rank_analyze_format_reward": 0.5913630276918411, "rewards/rank_answer_foramt_reward": 0.81640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974177181720734, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9974177181720734, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 566.015625, "epoch": 0.01472, "grad_norm": 0.03136757016181946, "kl": 0.010315895080566406, "learning_rate": 1.470703932165333e-05, "loss": -0.0393, "reward": 6.486603856086731, "reward_std": 1.2860779017210007, "rewards/mrr_reward": 0.5573040544986725, "rewards/rank_analyze_format_reward": 0.3760632053017616, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 577.25, "epoch": 0.0148, "grad_norm": 0.033665306866168976, "kl": 0.006897926330566406, "learning_rate": 1.4651510780774585e-05, "loss": -0.0423, "reward": 6.129797697067261, "reward_std": 1.1345724314451218, "rewards/mrr_reward": 0.4183593839406967, "rewards/rank_analyze_format_reward": 0.5632438734173775, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9817143976688385, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 608.671875, "epoch": 0.01488, "grad_norm": 0.033717554062604904, "kl": 0.008335113525390625, "learning_rate": 1.4595798606214882e-05, "loss": -0.0456, "reward": 6.490463733673096, "reward_std": 1.5740629434585571, "rewards/mrr_reward": 0.5045758858323097, "rewards/rank_analyze_format_reward": 0.6379460543394089, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 587.25, "epoch": 0.01496, "grad_norm": 0.03451388329267502, "kl": 0.00830841064453125, "learning_rate": 1.4539904997395468e-05, "loss": -0.0232, "reward": 6.77128529548645, "reward_std": 1.3209696114063263, "rewards/mrr_reward": 0.5951326936483383, "rewards/rank_analyze_format_reward": 0.5232585184276104, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 616.640625, "epoch": 0.01504, "grad_norm": 0.03379754349589348, "kl": 0.010415077209472656, "learning_rate": 1.4483832160900326e-05, "loss": 0.0116, "reward": 6.252481937408447, "reward_std": 1.1688702702522278, "rewards/mrr_reward": 0.4422247149050236, "rewards/rank_analyze_format_reward": 0.6675014421343803, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_contrast_format_reward": 0.014375000260770321, "rewards/rank_initial_format_reward": 0.9819079041481018, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9819079041481018, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 605.0, "epoch": 0.01512, "grad_norm": 0.03375570476055145, "kl": 0.008455276489257812, "learning_rate": 1.442758231038902e-05, "loss": 0.0194, "reward": 6.565407395362854, "reward_std": 1.2886944562196732, "rewards/mrr_reward": 0.5319196432828903, "rewards/rank_analyze_format_reward": 0.6408539563417435, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.96875, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 591.453125, "epoch": 0.0152, "grad_norm": 0.03769073262810707, "kl": 0.008349418640136719, "learning_rate": 1.437115766650933e-05, "loss": -0.0112, "reward": 6.194711208343506, "reward_std": 1.1796721369028091, "rewards/mrr_reward": 0.42529140785336494, "rewards/rank_analyze_format_reward": 0.6068268120288849, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 612.75, "epoch": 0.01528, "grad_norm": 0.03448958322405815, "kl": 0.007760047912597656, "learning_rate": 1.4314560456809592e-05, "loss": -0.0273, "reward": 5.8755223751068115, "reward_std": 1.0131285190582275, "rewards/mrr_reward": 0.3057849854230881, "rewards/rank_analyze_format_reward": 0.7399645447731018, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 605.953125, "epoch": 0.01536, "grad_norm": 0.03278854861855507, "kl": 0.008983612060546875, "learning_rate": 1.4257792915650728e-05, "loss": -0.0278, "reward": 6.080556154251099, "reward_std": 0.9068511724472046, "rewards/mrr_reward": 0.37085194140672684, "rewards/rank_analyze_format_reward": 0.6850389912724495, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 597.5, "epoch": 0.01544, "grad_norm": 0.034792449325323105, "kl": 0.008208274841308594, "learning_rate": 1.4200857284118067e-05, "loss": 0.0011, "reward": 7.220125198364258, "reward_std": 1.4368582963943481, "rewards/mrr_reward": 0.6703745126724243, "rewards/rank_analyze_format_reward": 0.6417193710803986, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 629.84375, "epoch": 0.01552, "grad_norm": 0.03334759920835495, "kl": 0.008100509643554688, "learning_rate": 1.4143755809932843e-05, "loss": 0.0016, "reward": 7.150384306907654, "reward_std": 0.9511245638132095, "rewards/mrr_reward": 0.6351190358400345, "rewards/rank_analyze_format_reward": 0.7146453708410263, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 617.59375, "epoch": 0.0156, "grad_norm": 0.035201530903577805, "kl": 0.009708404541015625, "learning_rate": 1.4086490747363492e-05, "loss": -0.0412, "reward": 5.7019020318984985, "reward_std": 1.2331421077251434, "rewards/mrr_reward": 0.3069382458925247, "rewards/rank_analyze_format_reward": 0.6959587037563324, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9965170323848724, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9965170323848724, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 617.609375, "epoch": 0.01568, "grad_norm": 0.03423590958118439, "kl": 0.007636070251464844, "learning_rate": 1.4029064357136628e-05, "loss": -0.0078, "reward": 7.051144123077393, "reward_std": 1.805798053741455, "rewards/mrr_reward": 0.638324648141861, "rewards/rank_analyze_format_reward": 0.6594932079315186, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_contrast_format_reward": 0.013648715801537037, "rewards/rank_initial_format_reward": 0.9934065341949463, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9934065341949463, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 598.59375, "epoch": 0.01576, "grad_norm": 0.032945919781923294, "kl": 0.0088653564453125, "learning_rate": 1.3971478906347806e-05, "loss": 0.0035, "reward": 5.926463842391968, "reward_std": 0.974092960357666, "rewards/mrr_reward": 0.34780507162213326, "rewards/rank_analyze_format_reward": 0.6289933621883392, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 628.984375, "epoch": 0.01584, "grad_norm": 0.0346326045691967, "kl": 0.007419586181640625, "learning_rate": 1.3913736668372027e-05, "loss": -0.0072, "reward": 5.591045498847961, "reward_std": 1.3578578382730484, "rewards/mrr_reward": 0.291989091783762, "rewards/rank_analyze_format_reward": 0.6496517956256866, "rewards/rank_answer_foramt_reward": 0.828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 594.25, "epoch": 0.01592, "grad_norm": 0.03434177115559578, "kl": 0.008726119995117188, "learning_rate": 1.3855839922773968e-05, "loss": -0.0242, "reward": 6.450882911682129, "reward_std": 1.2623412013053894, "rewards/mrr_reward": 0.5072668790817261, "rewards/rank_analyze_format_reward": 0.567683108150959, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 554.21875, "epoch": 0.016, "grad_norm": 0.03701239451766014, "kl": 0.0102386474609375, "learning_rate": 1.3797790955218014e-05, "loss": 0.0011, "reward": 6.486022472381592, "reward_std": 1.2553091049194336, "rewards/mrr_reward": 0.560825876891613, "rewards/rank_analyze_format_reward": 0.410379346460104, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 607.296875, "epoch": 0.01608, "grad_norm": 0.033033620566129684, "kl": 0.008151054382324219, "learning_rate": 1.3739592057378005e-05, "loss": 0.0044, "reward": 5.552197694778442, "reward_std": 1.1763339191675186, "rewards/mrr_reward": 0.2838045693933964, "rewards/rank_analyze_format_reward": 0.641008548438549, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9827118366956711, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9983368366956711, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 612.921875, "epoch": 0.01616, "grad_norm": 0.03353811055421829, "kl": 0.00972747802734375, "learning_rate": 1.3681245526846782e-05, "loss": -0.0107, "reward": 6.006348371505737, "reward_std": 1.2108041644096375, "rewards/mrr_reward": 0.3894965425133705, "rewards/rank_analyze_format_reward": 0.6179756820201874, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 625.5625, "epoch": 0.01624, "grad_norm": 0.032895009964704514, "kl": 0.007708549499511719, "learning_rate": 1.3622753667045459e-05, "loss": -0.0391, "reward": 6.472328782081604, "reward_std": 1.0835448950529099, "rewards/mrr_reward": 0.4733879044651985, "rewards/rank_analyze_format_reward": 0.6490897387266159, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 583.0625, "epoch": 0.01632, "grad_norm": 0.03540404513478279, "kl": 0.008310317993164062, "learning_rate": 1.3564118787132507e-05, "loss": -0.0078, "reward": 6.018280386924744, "reward_std": 1.3106110841035843, "rewards/mrr_reward": 0.4133184626698494, "rewards/rank_analyze_format_reward": 0.5459154918789864, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 653.96875, "epoch": 0.0164, "grad_norm": 0.031230907887220383, "kl": 0.008515357971191406, "learning_rate": 1.350534320191259e-05, "loss": -0.0201, "reward": 5.963681578636169, "reward_std": 1.103264644742012, "rewards/mrr_reward": 0.354712288826704, "rewards/rank_analyze_format_reward": 0.6953528821468353, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9989583343267441, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9989583343267441, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 618.4375, "epoch": 0.01648, "grad_norm": 0.032825764268636703, "kl": 0.009063720703125, "learning_rate": 1.344642923174517e-05, "loss": -0.0478, "reward": 6.054850339889526, "reward_std": 1.114606112241745, "rewards/mrr_reward": 0.3818700462579727, "rewards/rank_analyze_format_reward": 0.6812862306833267, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9962841123342514, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9962841123342514, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 585.484375, "epoch": 0.01656, "grad_norm": 0.035573042929172516, "kl": 0.009103775024414062, "learning_rate": 1.3387379202452917e-05, "loss": -0.0207, "reward": 7.044532299041748, "reward_std": 1.047465205192566, "rewards/mrr_reward": 0.6281250268220901, "rewards/rank_analyze_format_reward": 0.582579180598259, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.013906249776482582, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 617.171875, "epoch": 0.01664, "grad_norm": 0.034511882811784744, "kl": 0.009004592895507812, "learning_rate": 1.3328195445229869e-05, "loss": -0.0051, "reward": 6.457865118980408, "reward_std": 1.1667783446609974, "rewards/mrr_reward": 0.46604663878679276, "rewards/rank_analyze_format_reward": 0.6706357970833778, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966776371002197, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9966776371002197, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 625.171875, "epoch": 0.01672, "grad_norm": 0.03478395193815231, "kl": 0.008777618408203125, "learning_rate": 1.3268880296549424e-05, "loss": -0.0271, "reward": 5.905049920082092, "reward_std": 1.0248078405857086, "rewards/mrr_reward": 0.35685763508081436, "rewards/rank_analyze_format_reward": 0.6553537249565125, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 1.0, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 612.5625, "epoch": 0.0168, "grad_norm": 0.03253443166613579, "kl": 0.008145332336425781, "learning_rate": 1.3209436098072095e-05, "loss": 0.0045, "reward": 6.248061418533325, "reward_std": 1.1693440824747086, "rewards/mrr_reward": 0.4291480556130409, "rewards/rank_analyze_format_reward": 0.6678794920444489, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 594.453125, "epoch": 0.01688, "grad_norm": 0.038207363337278366, "kl": 0.008943557739257812, "learning_rate": 1.3149865196553049e-05, "loss": -0.0179, "reward": 5.902388572692871, "reward_std": 1.0627788752317429, "rewards/mrr_reward": 0.39007317274808884, "rewards/rank_analyze_format_reward": 0.4827209860086441, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 1.0, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 606.328125, "epoch": 0.01696, "grad_norm": 0.03223838657140732, "kl": 0.009801864624023438, "learning_rate": 1.3090169943749475e-05, "loss": -0.0054, "reward": 7.29668128490448, "reward_std": 0.9951428025960922, "rewards/mrr_reward": 0.659281998872757, "rewards/rank_analyze_format_reward": 0.7411612272262573, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 598.328125, "epoch": 0.01704, "grad_norm": 0.03369125351309776, "kl": 0.009103775024414062, "learning_rate": 1.3030352696327741e-05, "loss": -0.0261, "reward": 7.164615988731384, "reward_std": 1.2512931823730469, "rewards/mrr_reward": 0.6658420264720917, "rewards/rank_analyze_format_reward": 0.6357051134109497, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 580.359375, "epoch": 0.01712, "grad_norm": 0.03930777311325073, "kl": 0.009029388427734375, "learning_rate": 1.297041581577035e-05, "loss": -0.0126, "reward": 7.0739957094192505, "reward_std": 1.461501270532608, "rewards/mrr_reward": 0.6658420115709305, "rewards/rank_analyze_format_reward": 0.5605533644556999, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 645.453125, "epoch": 0.0172, "grad_norm": 0.03185691684484482, "kl": 0.008588790893554688, "learning_rate": 1.2910361668282718e-05, "loss": -0.0298, "reward": 6.496977686882019, "reward_std": 1.269655779004097, "rewards/mrr_reward": 0.4930989518761635, "rewards/rank_analyze_format_reward": 0.7445924282073975, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9788619577884674, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9788619577884674, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 648.203125, "epoch": 0.01728, "grad_norm": 0.03207946568727493, "kl": 0.008671760559082031, "learning_rate": 1.2850192624699762e-05, "loss": -0.0151, "reward": 6.4450095891952515, "reward_std": 1.170839011669159, "rewards/mrr_reward": 0.4551587291061878, "rewards/rank_analyze_format_reward": 0.813712939620018, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 591.078125, "epoch": 0.01736, "grad_norm": 0.03578515723347664, "kl": 0.009796142578125, "learning_rate": 1.2789911060392295e-05, "loss": -0.0008, "reward": 5.849558115005493, "reward_std": 1.4805233478546143, "rewards/mrr_reward": 0.3869667761027813, "rewards/rank_analyze_format_reward": 0.5430506467819214, "rewards/rank_answer_foramt_reward": 0.775390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9916248768568039, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9916248768568039, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 659.984375, "epoch": 0.01744, "grad_norm": 0.03413332626223564, "kl": 0.009096145629882812, "learning_rate": 1.2729519355173254e-05, "loss": -0.0242, "reward": 6.31634783744812, "reward_std": 0.8450592309236526, "rewards/mrr_reward": 0.41633186116814613, "rewards/rank_analyze_format_reward": 0.7740670591592789, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 608.234375, "epoch": 0.01752, "grad_norm": 0.03580684959888458, "kl": 0.0093231201171875, "learning_rate": 1.2669019893203758e-05, "loss": -0.0046, "reward": 6.914145231246948, "reward_std": 1.2913256287574768, "rewards/mrr_reward": 0.5963417589664459, "rewards/rank_analyze_format_reward": 0.6810068488121033, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 619.140625, "epoch": 0.0176, "grad_norm": 0.03536830097436905, "kl": 0.010969161987304688, "learning_rate": 1.2608415062898971e-05, "loss": -0.0075, "reward": 6.057724118232727, "reward_std": 1.276400476694107, "rewards/mrr_reward": 0.395721722394228, "rewards/rank_analyze_format_reward": 0.6955401599407196, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 616.328125, "epoch": 0.01768, "grad_norm": 0.03289422765374184, "kl": 0.009067535400390625, "learning_rate": 1.2547707256833823e-05, "loss": -0.0107, "reward": 6.9917004108428955, "reward_std": 1.0728786289691925, "rewards/mrr_reward": 0.5950520858168602, "rewards/rank_analyze_format_reward": 0.7065781205892563, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 614.765625, "epoch": 0.01776, "grad_norm": 0.03429026901721954, "kl": 0.0088348388671875, "learning_rate": 1.2486898871648552e-05, "loss": 0.0155, "reward": 6.751330614089966, "reward_std": 1.3774945735931396, "rewards/mrr_reward": 0.5524739772081375, "rewards/rank_analyze_format_reward": 0.6996376663446426, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 590.75, "epoch": 0.01784, "grad_norm": 0.0418301522731781, "kl": 0.00893402099609375, "learning_rate": 1.2425992307954075e-05, "loss": 0.0168, "reward": 6.152212023735046, "reward_std": 1.2027994394302368, "rewards/mrr_reward": 0.4172246977686882, "rewards/rank_analyze_format_reward": 0.6294892318546772, "rewards/rank_answer_foramt_reward": 0.85546875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 640.546875, "epoch": 0.01792, "grad_norm": 0.03684825077652931, "kl": 0.009303092956542969, "learning_rate": 1.236498997023725e-05, "loss": -0.0016, "reward": 6.84193217754364, "reward_std": 1.457335740327835, "rewards/mrr_reward": 0.5667286813259125, "rewards/rank_analyze_format_reward": 0.7013539373874664, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 588.9375, "epoch": 0.018, "grad_norm": 0.03261956572532654, "kl": 0.010713577270507812, "learning_rate": 1.2303894266765908e-05, "loss": -0.0193, "reward": 6.265663504600525, "reward_std": 0.8513515889644623, "rewards/mrr_reward": 0.45071304962038994, "rewards/rank_analyze_format_reward": 0.5975766256451607, "rewards/rank_answer_foramt_reward": 0.880859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 587.53125, "epoch": 0.01808, "grad_norm": 0.03243914246559143, "kl": 0.011205673217773438, "learning_rate": 1.2242707609493814e-05, "loss": -0.0393, "reward": 7.010910987854004, "reward_std": 0.8175117075443268, "rewards/mrr_reward": 0.5879092290997505, "rewards/rank_analyze_format_reward": 0.7197061777114868, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 611.484375, "epoch": 0.01816, "grad_norm": 0.03738885000348091, "kl": 0.0115203857421875, "learning_rate": 1.2181432413965428e-05, "loss": -0.0205, "reward": 6.74303412437439, "reward_std": 1.154506966471672, "rewards/mrr_reward": 0.5482390895485878, "rewards/rank_analyze_format_reward": 0.717931717634201, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 592.09375, "epoch": 0.01824, "grad_norm": 0.037249598652124405, "kl": 0.0104522705078125, "learning_rate": 1.212007109922055e-05, "loss": -0.0125, "reward": 5.738320231437683, "reward_std": 1.3927419930696487, "rewards/mrr_reward": 0.339936763048172, "rewards/rank_analyze_format_reward": 0.6192561089992523, "rewards/rank_answer_foramt_reward": 0.791015625, "rewards/rank_contrast_format_reward": 0.012178309261798859, "rewards/rank_initial_format_reward": 0.9975927919149399, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9975927919149399, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 620.84375, "epoch": 0.01832, "grad_norm": 0.03598544001579285, "kl": 0.008494377136230469, "learning_rate": 1.2058626087698814e-05, "loss": -0.0105, "reward": 6.275591254234314, "reward_std": 1.0191064924001694, "rewards/mrr_reward": 0.43397197872400284, "rewards/rank_analyze_format_reward": 0.6505665332078934, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 659.03125, "epoch": 0.0184, "grad_norm": 0.030200203880667686, "kl": 0.008350372314453125, "learning_rate": 1.1997099805144071e-05, "loss": -0.0112, "reward": 7.059328317642212, "reward_std": 0.7555132284760475, "rewards/mrr_reward": 0.5876922160387039, "rewards/rank_analyze_format_reward": 0.7512356638908386, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.009943181648850441, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 641.5625, "epoch": 0.01848, "grad_norm": 0.03280678391456604, "kl": 0.008802413940429688, "learning_rate": 1.1935494680508606e-05, "loss": 0.0031, "reward": 6.11614203453064, "reward_std": 1.2392974197864532, "rewards/mrr_reward": 0.4011656865477562, "rewards/rank_analyze_format_reward": 0.7221871167421341, "rewards/rank_answer_foramt_reward": 0.80859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 620.609375, "epoch": 0.01856, "grad_norm": 0.03436440974473953, "kl": 0.009710311889648438, "learning_rate": 1.187381314585725e-05, "loss": 0.0183, "reward": 6.469323635101318, "reward_std": 0.8228216767311096, "rewards/mrr_reward": 0.46866319328546524, "rewards/rank_analyze_format_reward": 0.6747487634420395, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 606.03125, "epoch": 0.01864, "grad_norm": 0.032794322818517685, "kl": 0.013088226318359375, "learning_rate": 1.1812057636271374e-05, "loss": -0.0559, "reward": 6.536515951156616, "reward_std": 0.8844896629452705, "rewards/mrr_reward": 0.5136780887842178, "rewards/rank_analyze_format_reward": 0.6375885009765625, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9836309552192688, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9836309552192688, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 568.5, "epoch": 0.01872, "grad_norm": 0.03515826165676117, "kl": 0.010290145874023438, "learning_rate": 1.1750230589752763e-05, "loss": -0.0354, "reward": 6.66556990146637, "reward_std": 0.9657221138477325, "rewards/mrr_reward": 0.559157982468605, "rewards/rank_analyze_format_reward": 0.5167137011885643, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 629.125, "epoch": 0.0188, "grad_norm": 0.036613740026950836, "kl": 0.010158538818359375, "learning_rate": 1.1688334447127338e-05, "loss": 0.0009, "reward": 6.372735619544983, "reward_std": 1.2347291707992554, "rewards/mrr_reward": 0.45282118767499924, "rewards/rank_analyze_format_reward": 0.7127545475959778, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137856245041, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9809887856245041, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 620.71875, "epoch": 0.01888, "grad_norm": 0.04982118308544159, "kl": 0.015569686889648438, "learning_rate": 1.1626371651948839e-05, "loss": 0.021, "reward": 6.901611328125, "reward_std": 1.007930152118206, "rewards/mrr_reward": 0.5724516361951828, "rewards/rank_analyze_format_reward": 0.7467468529939651, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.99502894282341, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.99502894282341, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 633.96875, "epoch": 0.01896, "grad_norm": 0.03350101783871651, "kl": 0.008681297302246094, "learning_rate": 1.156434465040231e-05, "loss": 0.0062, "reward": 6.326759576797485, "reward_std": 0.6072492152452469, "rewards/mrr_reward": 0.3864273354411125, "rewards/rank_analyze_format_reward": 0.8293402343988419, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9973393976688385, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 628.578125, "epoch": 0.01904, "grad_norm": 0.033342353999614716, "kl": 0.011646270751953125, "learning_rate": 1.1502255891207572e-05, "loss": -0.0247, "reward": 6.36961042881012, "reward_std": 1.0631940811872482, "rewards/mrr_reward": 0.4433903731405735, "rewards/rank_analyze_format_reward": 0.6857778131961823, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 613.671875, "epoch": 0.01912, "grad_norm": 0.037100598216056824, "kl": 0.009510040283203125, "learning_rate": 1.1440107825522522e-05, "loss": 0.0078, "reward": 7.012573838233948, "reward_std": 1.218707486987114, "rewards/mrr_reward": 0.6299479156732559, "rewards/rank_analyze_format_reward": 0.6543532609939575, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9817143976688385, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 580.546875, "epoch": 0.0192, "grad_norm": 0.036013565957546234, "kl": 0.009958267211914062, "learning_rate": 1.137790290684638e-05, "loss": -0.0096, "reward": 6.823012590408325, "reward_std": 0.9740518927574158, "rewards/mrr_reward": 0.5885416679084301, "rewards/rank_analyze_format_reward": 0.6494416892528534, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975927919149399, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9975927919149399, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 624.875, "epoch": 0.01928, "grad_norm": 0.033712439239025116, "kl": 0.009075164794921875, "learning_rate": 1.1315643590922827e-05, "loss": -0.0098, "reward": 6.5401880741119385, "reward_std": 0.8591099269688129, "rewards/mrr_reward": 0.47449157014489174, "rewards/rank_analyze_format_reward": 0.7002828568220139, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992897808551788, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9992897808551788, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 610.84375, "epoch": 0.01936, "grad_norm": 0.034642983227968216, "kl": 0.009491920471191406, "learning_rate": 1.1253332335643043e-05, "loss": -0.0239, "reward": 6.54269015789032, "reward_std": 1.4378929883241653, "rewards/mrr_reward": 0.5039682611823082, "rewards/rank_analyze_format_reward": 0.642557792365551, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.01411533821374178, "rewards/rank_initial_format_reward": 0.9965953528881073, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9965953528881073, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 614.5625, "epoch": 0.01944, "grad_norm": 0.03675766661763191, "kl": 0.00836181640625, "learning_rate": 1.11909716009487e-05, "loss": 0.0047, "reward": 6.093774437904358, "reward_std": 0.8132295906543732, "rewards/mrr_reward": 0.384920634329319, "rewards/rank_analyze_format_reward": 0.6615138202905655, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 610.109375, "epoch": 0.01952, "grad_norm": 0.03454392030835152, "kl": 0.009915351867675781, "learning_rate": 1.1128563848734817e-05, "loss": -0.0159, "reward": 7.29901921749115, "reward_std": 1.3508446514606476, "rewards/mrr_reward": 0.6611111089587212, "rewards/rank_analyze_format_reward": 0.7444185465574265, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 606.046875, "epoch": 0.0196, "grad_norm": 0.037709299474954605, "kl": 0.010133743286132812, "learning_rate": 1.10661115427526e-05, "loss": 0.0007, "reward": 6.508638501167297, "reward_std": 1.0574052929878235, "rewards/mrr_reward": 0.4710255637764931, "rewards/rank_analyze_format_reward": 0.7313414663076401, "rewards/rank_answer_foramt_reward": 0.912109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 615.875, "epoch": 0.01968, "grad_norm": 0.03252964839339256, "kl": 0.008385658264160156, "learning_rate": 1.1003617148512149e-05, "loss": -0.0004, "reward": 5.982852816581726, "reward_std": 0.967894122004509, "rewards/mrr_reward": 0.35572296380996704, "rewards/rank_analyze_format_reward": 0.6874139159917831, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9948672950267792, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9948672950267792, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 638.859375, "epoch": 0.01976, "grad_norm": 0.032493218779563904, "kl": 0.008337020874023438, "learning_rate": 1.0941083133185146e-05, "loss": -0.0105, "reward": 6.534578561782837, "reward_std": 0.8327680304646492, "rewards/mrr_reward": 0.46357887238264084, "rewards/rank_analyze_format_reward": 0.7738983333110809, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 583.046875, "epoch": 0.01984, "grad_norm": 0.03498000279068947, "kl": 0.010382652282714844, "learning_rate": 1.0878511965507435e-05, "loss": -0.0044, "reward": 5.863273620605469, "reward_std": 1.2028781324625015, "rewards/mrr_reward": 0.3611855283379555, "rewards/rank_analyze_format_reward": 0.5884533822536469, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 617.75, "epoch": 0.01992, "grad_norm": 0.03203579783439636, "kl": 0.010402679443359375, "learning_rate": 1.0815906115681579e-05, "loss": -0.0101, "reward": 5.775153875350952, "reward_std": 0.9553007483482361, "rewards/mrr_reward": 0.31241320818662643, "rewards/rank_analyze_format_reward": 0.6676141172647476, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 602.34375, "epoch": 0.02, "grad_norm": 0.032924287021160126, "kl": 0.009527206420898438, "learning_rate": 1.0753268055279328e-05, "loss": -0.0195, "reward": 6.826016068458557, "reward_std": 1.3634543120861053, "rewards/mrr_reward": 0.5691840499639511, "rewards/rank_analyze_format_reward": 0.6899046450853348, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 619.40625, "epoch": 0.02008, "grad_norm": 0.03474834933876991, "kl": 0.009187698364257812, "learning_rate": 1.0690600257144062e-05, "loss": 0.0113, "reward": 6.13844633102417, "reward_std": 1.342699646949768, "rewards/mrr_reward": 0.39120785146951675, "rewards/rank_analyze_format_reward": 0.7396307289600372, "rewards/rank_answer_foramt_reward": 0.841796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 602.171875, "epoch": 0.02016, "grad_norm": 0.035354483872652054, "kl": 0.009152412414550781, "learning_rate": 1.0627905195293135e-05, "loss": -0.0182, "reward": 6.190269231796265, "reward_std": 1.6783248633146286, "rewards/mrr_reward": 0.4631262421607971, "rewards/rank_analyze_format_reward": 0.5699562430381775, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9825367629528046, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9669117629528046, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 558.265625, "epoch": 0.02024, "grad_norm": 0.03778017684817314, "kl": 0.007927894592285156, "learning_rate": 1.0565185344820248e-05, "loss": -0.0307, "reward": 6.815933346748352, "reward_std": 1.4374285638332367, "rewards/mrr_reward": 0.5922494977712631, "rewards/rank_analyze_format_reward": 0.5947561264038086, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 616.484375, "epoch": 0.02032, "grad_norm": 0.03456341475248337, "kl": 0.008115768432617188, "learning_rate": 1.0502443181797696e-05, "loss": -0.0113, "reward": 6.270037531852722, "reward_std": 0.6828516721725464, "rewards/mrr_reward": 0.4079117327928543, "rewards/rank_analyze_format_reward": 0.7165155410766602, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 607.078125, "epoch": 0.0204, "grad_norm": 0.03374814614653587, "kl": 0.008345603942871094, "learning_rate": 1.043968118317865e-05, "loss": 0.0006, "reward": 6.478882312774658, "reward_std": 1.0142697505652905, "rewards/mrr_reward": 0.48450521379709244, "rewards/rank_analyze_format_reward": 0.6263755261898041, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 589.3125, "epoch": 0.02048, "grad_norm": 0.03523356467485428, "kl": 0.008609771728515625, "learning_rate": 1.0376901826699349e-05, "loss": -0.0178, "reward": 6.468229651451111, "reward_std": 1.1832327842712402, "rewards/mrr_reward": 0.5023189447820187, "rewards/rank_analyze_format_reward": 0.613250732421875, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 618.390625, "epoch": 0.02056, "grad_norm": 0.03347039595246315, "kl": 0.010356903076171875, "learning_rate": 1.0314107590781284e-05, "loss": -0.0175, "reward": 6.459206938743591, "reward_std": 1.114880457520485, "rewards/mrr_reward": 0.4722842425107956, "rewards/rank_analyze_format_reward": 0.7281581908464432, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 589.515625, "epoch": 0.02064, "grad_norm": 0.03712701424956322, "kl": 0.008486747741699219, "learning_rate": 1.0251300954433377e-05, "loss": -0.0061, "reward": 6.525125622749329, "reward_std": 1.5343350768089294, "rewards/mrr_reward": 0.5073970928788185, "rewards/rank_analyze_format_reward": 0.6517873406410217, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990234375, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990234375, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 599.1875, "epoch": 0.02072, "grad_norm": 0.03250723332166672, "kl": 0.008466720581054688, "learning_rate": 1.0188484397154083e-05, "loss": -0.0186, "reward": 6.997652888298035, "reward_std": 1.1146360635757446, "rewards/mrr_reward": 0.5959821343421936, "rewards/rank_analyze_format_reward": 0.7026797086000443, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.997514471411705, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.997514471411705, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 599.0625, "epoch": 0.0208, "grad_norm": 0.037865590304136276, "kl": 0.008708953857421875, "learning_rate": 1.0125660398833528e-05, "loss": -0.0165, "reward": 6.580657601356506, "reward_std": 1.3034490644931793, "rewards/mrr_reward": 0.5210069417953491, "rewards/rank_analyze_format_reward": 0.5690314322710037, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.01146788988262415, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 576.765625, "epoch": 0.02088, "grad_norm": 0.03623431921005249, "kl": 0.009592056274414062, "learning_rate": 1.0062831439655591e-05, "loss": -0.0092, "reward": 6.1294450759887695, "reward_std": 0.9818276166915894, "rewards/mrr_reward": 0.4232886955142021, "rewards/rank_analyze_format_reward": 0.5435971990227699, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 591.359375, "epoch": 0.02096, "grad_norm": 0.032129354774951935, "kl": 0.007951736450195312, "learning_rate": 1e-05, "loss": -0.0151, "reward": 6.759868860244751, "reward_std": 0.6981609910726547, "rewards/mrr_reward": 0.5290364511311054, "rewards/rank_analyze_format_reward": 0.683406338095665, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.012742718681693077, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 611.84375, "epoch": 0.02104, "grad_norm": 0.03505862131714821, "kl": 0.008457183837890625, "learning_rate": 9.937168560344412e-06, "loss": -0.0444, "reward": 6.360743880271912, "reward_std": 1.2799600958824158, "rewards/mrr_reward": 0.47020088881254196, "rewards/rank_analyze_format_reward": 0.6649815738201141, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.99829962849617, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.99829962849617, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 598.484375, "epoch": 0.02112, "grad_norm": 0.03508510813117027, "kl": 0.00876617431640625, "learning_rate": 9.874339601166474e-06, "loss": -0.0171, "reward": 7.134628176689148, "reward_std": 1.1341713666915894, "rewards/mrr_reward": 0.6606026887893677, "rewards/rank_analyze_format_reward": 0.6343306750059128, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9836309552192688, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 613.90625, "epoch": 0.0212, "grad_norm": 0.03620074689388275, "kl": 0.009752273559570312, "learning_rate": 9.81151560284592e-06, "loss": -0.0038, "reward": 6.344595313072205, "reward_std": 1.0392844527959824, "rewards/mrr_reward": 0.444940485060215, "rewards/rank_analyze_format_reward": 0.7277408987283707, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9956946671009064, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9956946671009064, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 609.84375, "epoch": 0.02128, "grad_norm": 0.03558015078306198, "kl": 0.010416030883789062, "learning_rate": 9.748699045566626e-06, "loss": -0.0238, "reward": 6.37187659740448, "reward_std": 1.1155817210674286, "rewards/mrr_reward": 0.4424107037484646, "rewards/rank_analyze_format_reward": 0.736382469534874, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 659.703125, "epoch": 0.02136, "grad_norm": 0.03403196856379509, "kl": 0.008001327514648438, "learning_rate": 9.685892409218718e-06, "loss": -0.0039, "reward": 6.388388395309448, "reward_std": 0.8915980607271194, "rewards/mrr_reward": 0.4221044182777405, "rewards/rank_analyze_format_reward": 0.7507519423961639, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 573.625, "epoch": 0.02144, "grad_norm": 0.0370207354426384, "kl": 0.009798049926757812, "learning_rate": 9.623098173300655e-06, "loss": 0.0105, "reward": 6.413051247596741, "reward_std": 1.3421672135591507, "rewards/mrr_reward": 0.4918030798435211, "rewards/rank_analyze_format_reward": 0.6328675001859665, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973060339689255, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9973060339689255, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 634.59375, "epoch": 0.02152, "grad_norm": 0.032293617725372314, "kl": 0.008089065551757812, "learning_rate": 9.560318816821354e-06, "loss": -0.0188, "reward": 6.482900857925415, "reward_std": 0.8697319701313972, "rewards/mrr_reward": 0.4679315537214279, "rewards/rank_analyze_format_reward": 0.7565037906169891, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9956946671009064, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9800696671009064, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 624.734375, "epoch": 0.0216, "grad_norm": 0.03470136597752571, "kl": 0.009441375732421875, "learning_rate": 9.497556818202306e-06, "loss": -0.0027, "reward": 6.912677049636841, "reward_std": 1.208983063697815, "rewards/mrr_reward": 0.5806547477841377, "rewards/rank_analyze_format_reward": 0.7326363176107407, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 584.015625, "epoch": 0.02168, "grad_norm": 0.03612437844276428, "kl": 0.0095062255859375, "learning_rate": 9.434814655179756e-06, "loss": -0.0221, "reward": 5.728673934936523, "reward_std": 1.0402158945798874, "rewards/mrr_reward": 0.29209451377391815, "rewards/rank_analyze_format_reward": 0.726196676492691, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 604.234375, "epoch": 0.02176, "grad_norm": 0.037423085421323776, "kl": 0.0115814208984375, "learning_rate": 9.372094804706867e-06, "loss": -0.0295, "reward": 6.138251185417175, "reward_std": 0.9122535288333893, "rewards/mrr_reward": 0.371589794754982, "rewards/rank_analyze_format_reward": 0.7316619157791138, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 582.3125, "epoch": 0.02184, "grad_norm": 0.038006190210580826, "kl": 0.009668350219726562, "learning_rate": 9.309399742855943e-06, "loss": -0.0188, "reward": 6.551778435707092, "reward_std": 1.4611459486186504, "rewards/mrr_reward": 0.5049107447266579, "rewards/rank_analyze_format_reward": 0.6832826882600784, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.996692106127739, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.996692106127739, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 623.5625, "epoch": 0.02192, "grad_norm": 0.03509177267551422, "kl": 0.008122444152832031, "learning_rate": 9.246731944720675e-06, "loss": -0.0263, "reward": 6.217320084571838, "reward_std": 1.235550969839096, "rewards/mrr_reward": 0.42896206490695477, "rewards/rank_analyze_format_reward": 0.7137094661593437, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9964202791452408, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9807952791452408, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 606.96875, "epoch": 0.022, "grad_norm": 0.035237036645412445, "kl": 0.008487701416015625, "learning_rate": 9.184093884318426e-06, "loss": -0.0229, "reward": 5.953908681869507, "reward_std": 0.9819855093955994, "rewards/mrr_reward": 0.3739955499768257, "rewards/rank_analyze_format_reward": 0.6079726666212082, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9972426444292068, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9972426444292068, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 626.28125, "epoch": 0.02208, "grad_norm": 0.03610999137163162, "kl": 0.009578704833984375, "learning_rate": 9.121488034492569e-06, "loss": -0.0373, "reward": 6.458534240722656, "reward_std": 0.910767138004303, "rewards/mrr_reward": 0.46837178990244865, "rewards/rank_analyze_format_reward": 0.7256718277931213, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 605.5625, "epoch": 0.02216, "grad_norm": 0.03417709842324257, "kl": 0.008665084838867188, "learning_rate": 9.058916866814857e-06, "loss": -0.0297, "reward": 6.420701861381531, "reward_std": 1.0284410268068314, "rewards/mrr_reward": 0.45776910334825516, "rewards/rank_analyze_format_reward": 0.6865500882267952, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 594.15625, "epoch": 0.02224, "grad_norm": 0.03378501906991005, "kl": 0.009778976440429688, "learning_rate": 8.996382851487851e-06, "loss": -0.0126, "reward": 6.550466299057007, "reward_std": 1.2819590270519257, "rewards/mrr_reward": 0.5111111104488373, "rewards/rank_analyze_format_reward": 0.6680163145065308, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9678308814764023, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 590.046875, "epoch": 0.02232, "grad_norm": 0.03543354943394661, "kl": 0.009675979614257812, "learning_rate": 8.933888457247402e-06, "loss": -0.0418, "reward": 6.273947596549988, "reward_std": 1.0915377736091614, "rewards/mrr_reward": 0.41968007013201714, "rewards/rank_analyze_format_reward": 0.6909303814172745, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 620.53125, "epoch": 0.0224, "grad_norm": 0.034412283450365067, "kl": 0.008032798767089844, "learning_rate": 8.871436151265183e-06, "loss": -0.0073, "reward": 6.562302350997925, "reward_std": 0.9009479358792305, "rewards/mrr_reward": 0.48469121754169464, "rewards/rank_analyze_format_reward": 0.7504905164241791, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 617.1875, "epoch": 0.02248, "grad_norm": 0.033824581652879715, "kl": 0.008426666259765625, "learning_rate": 8.809028399051302e-06, "loss": -0.0115, "reward": 5.791995048522949, "reward_std": 0.8779707551002502, "rewards/mrr_reward": 0.2919456958770752, "rewards/rank_analyze_format_reward": 0.7479636520147324, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9947650283575058, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9947650283575058, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 599.84375, "epoch": 0.02256, "grad_norm": 0.03484233841300011, "kl": 0.0097198486328125, "learning_rate": 8.746667664356957e-06, "loss": -0.0143, "reward": 5.93380606174469, "reward_std": 1.071552962064743, "rewards/mrr_reward": 0.3823288567364216, "rewards/rank_analyze_format_reward": 0.6016412377357483, "rewards/rank_answer_foramt_reward": 0.8046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 621.90625, "epoch": 0.02264, "grad_norm": 0.0346490815281868, "kl": 0.010959625244140625, "learning_rate": 8.684356409077177e-06, "loss": -0.0204, "reward": 6.894137263298035, "reward_std": 1.4641993045806885, "rewards/mrr_reward": 0.5689174234867096, "rewards/rank_analyze_format_reward": 0.7035585567355156, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9965170323848724, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9965170323848724, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 641.0625, "epoch": 0.02272, "grad_norm": 0.03447302430868149, "kl": 0.00998687744140625, "learning_rate": 8.62209709315362e-06, "loss": 0.0057, "reward": 6.52327024936676, "reward_std": 0.9611741378903389, "rewards/mrr_reward": 0.44332218170166016, "rewards/rank_analyze_format_reward": 0.821939080953598, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 600.421875, "epoch": 0.0228, "grad_norm": 0.03485405445098877, "kl": 0.009679794311523438, "learning_rate": 8.559892174477478e-06, "loss": 0.0201, "reward": 6.434827446937561, "reward_std": 0.8777450993657112, "rewards/mrr_reward": 0.4580047130584717, "rewards/rank_analyze_format_reward": 0.6848397850990295, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.984375, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 576.140625, "epoch": 0.02288, "grad_norm": 0.03371486812829971, "kl": 0.009853363037109375, "learning_rate": 8.49774410879243e-06, "loss": -0.0234, "reward": 6.682247757911682, "reward_std": 1.1762762665748596, "rewards/mrr_reward": 0.5230840817093849, "rewards/rank_analyze_format_reward": 0.6620622426271439, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 606.546875, "epoch": 0.02296, "grad_norm": 0.034511007368564606, "kl": 0.009128570556640625, "learning_rate": 8.43565534959769e-06, "loss": -0.0085, "reward": 6.868860602378845, "reward_std": 0.8355295807123184, "rewards/mrr_reward": 0.5646825507283211, "rewards/rank_analyze_format_reward": 0.6624417304992676, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 629.453125, "epoch": 0.02304, "grad_norm": 0.03616128861904144, "kl": 0.010618209838867188, "learning_rate": 8.373628348051165e-06, "loss": 0.0011, "reward": 6.6424126625061035, "reward_std": 1.3670581132173538, "rewards/mrr_reward": 0.5132812634110451, "rewards/rank_analyze_format_reward": 0.6888970136642456, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 581.65625, "epoch": 0.02312, "grad_norm": 0.0351540707051754, "kl": 0.0092926025390625, "learning_rate": 8.311665552872662e-06, "loss": -0.0046, "reward": 7.051009774208069, "reward_std": 0.641165129840374, "rewards/mrr_reward": 0.6131138354539871, "rewards/rank_analyze_format_reward": 0.627851277589798, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 630.6875, "epoch": 0.0232, "grad_norm": 0.03694026172161102, "kl": 0.008800506591796875, "learning_rate": 8.249769410247239e-06, "loss": 0.0063, "reward": 5.9682828187942505, "reward_std": 1.0954477787017822, "rewards/mrr_reward": 0.35110368207097054, "rewards/rank_analyze_format_reward": 0.6924659460783005, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 596.453125, "epoch": 0.02328, "grad_norm": 0.03731337562203407, "kl": 0.010684967041015625, "learning_rate": 8.187942363728626e-06, "loss": -0.0003, "reward": 6.487108945846558, "reward_std": 1.1193221658468246, "rewards/mrr_reward": 0.49732763320207596, "rewards/rank_analyze_format_reward": 0.6694436743855476, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 594.703125, "epoch": 0.02336, "grad_norm": 0.0360497310757637, "kl": 0.010662078857421875, "learning_rate": 8.126186854142752e-06, "loss": -0.0017, "reward": 6.155300855636597, "reward_std": 1.1617285907268524, "rewards/mrr_reward": 0.4130828529596329, "rewards/rank_analyze_format_reward": 0.6708233654499054, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 594.796875, "epoch": 0.02344, "grad_norm": 0.036106646060943604, "kl": 0.008665084838867188, "learning_rate": 8.064505319491398e-06, "loss": 0.0043, "reward": 6.877136588096619, "reward_std": 1.1807830333709717, "rewards/mrr_reward": 0.5557167679071426, "rewards/rank_analyze_format_reward": 0.7323943823575974, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 594.265625, "epoch": 0.02352, "grad_norm": 0.035376593470573425, "kl": 0.011880874633789062, "learning_rate": 8.00290019485593e-06, "loss": -0.0206, "reward": 6.829400300979614, "reward_std": 1.2492860555648804, "rewards/mrr_reward": 0.5686569958925247, "rewards/rank_analyze_format_reward": 0.6731568202376366, "rewards/rank_answer_foramt_reward": 0.896484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9964717775583267, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9964717775583267, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 614.109375, "epoch": 0.0236, "grad_norm": 0.03899754583835602, "kl": 0.009807586669921875, "learning_rate": 7.94137391230119e-06, "loss": -0.0211, "reward": 6.770255446434021, "reward_std": 1.4081167876720428, "rewards/mrr_reward": 0.5702814757823944, "rewards/rank_analyze_format_reward": 0.6101082153618336, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 619.75, "epoch": 0.02368, "grad_norm": 0.0367281548678875, "kl": 0.010372161865234375, "learning_rate": 7.879928900779457e-06, "loss": -0.0235, "reward": 6.41059148311615, "reward_std": 0.8655820339918137, "rewards/mrr_reward": 0.41867558658123016, "rewards/rank_analyze_format_reward": 0.7627051472663879, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.012131605297327042, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 620.859375, "epoch": 0.02376, "grad_norm": 0.03433919697999954, "kl": 0.0086517333984375, "learning_rate": 7.818567586034578e-06, "loss": -0.012, "reward": 6.571309804916382, "reward_std": 0.789227120578289, "rewards/mrr_reward": 0.47058533132076263, "rewards/rank_analyze_format_reward": 0.7648319751024246, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 657.625, "epoch": 0.02384, "grad_norm": 0.031991150230169296, "kl": 0.009012222290039062, "learning_rate": 7.757292390506191e-06, "loss": 0.0109, "reward": 6.454801440238953, "reward_std": 1.196733683347702, "rewards/mrr_reward": 0.4277467578649521, "rewards/rank_analyze_format_reward": 0.8160154074430466, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.023501884192228317, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.984375, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 624.890625, "epoch": 0.02392, "grad_norm": 0.03431214392185211, "kl": 0.009241104125976562, "learning_rate": 7.696105733234099e-06, "loss": -0.0133, "reward": 6.268079876899719, "reward_std": 0.9799875319004059, "rewards/mrr_reward": 0.4241071380674839, "rewards/rank_analyze_format_reward": 0.6654012054204941, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 616.125, "epoch": 0.024, "grad_norm": 0.035013388842344284, "kl": 0.00870513916015625, "learning_rate": 7.635010029762755e-06, "loss": -0.0228, "reward": 6.986905932426453, "reward_std": 0.8502253144979477, "rewards/mrr_reward": 0.5640562921762466, "rewards/rank_analyze_format_reward": 0.7599775195121765, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 609.234375, "epoch": 0.02408, "grad_norm": 0.03479558601975441, "kl": 0.010095596313476562, "learning_rate": 7.574007692045928e-06, "loss": -0.0214, "reward": 6.7003912925720215, "reward_std": 1.3771826922893524, "rewards/mrr_reward": 0.5331969261169434, "rewards/rank_analyze_format_reward": 0.7176856696605682, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 597.96875, "epoch": 0.02416, "grad_norm": 0.035783056169748306, "kl": 0.008653640747070312, "learning_rate": 7.513101128351454e-06, "loss": -0.031, "reward": 6.923213839530945, "reward_std": 0.9379879832267761, "rewards/mrr_reward": 0.589682549238205, "rewards/rank_analyze_format_reward": 0.6501126140356064, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 600.828125, "epoch": 0.02424, "grad_norm": 0.03350872918963432, "kl": 0.009677886962890625, "learning_rate": 7.4522927431661805e-06, "loss": -0.0081, "reward": 6.277737736701965, "reward_std": 0.9385035932064056, "rewards/mrr_reward": 0.42640750855207443, "rewards/rank_analyze_format_reward": 0.646096795797348, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 593.015625, "epoch": 0.02432, "grad_norm": 0.0383150689303875, "kl": 0.010992050170898438, "learning_rate": 7.391584937101034e-06, "loss": -0.0242, "reward": 6.897738218307495, "reward_std": 1.2141970098018646, "rewards/mrr_reward": 0.5597098171710968, "rewards/rank_analyze_format_reward": 0.7648108601570129, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9968487471342087, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9968487471342087, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 603.015625, "epoch": 0.0244, "grad_norm": 0.03774309158325195, "kl": 0.010526657104492188, "learning_rate": 7.330980106796247e-06, "loss": -0.0115, "reward": 6.950722813606262, "reward_std": 1.4682192206382751, "rewards/mrr_reward": 0.580598995089531, "rewards/rank_analyze_format_reward": 0.7509506046772003, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 584.09375, "epoch": 0.02448, "grad_norm": 0.03783104196190834, "kl": 0.010644912719726562, "learning_rate": 7.27048064482675e-06, "loss": -0.0184, "reward": 6.647589683532715, "reward_std": 1.4444166421890259, "rewards/mrr_reward": 0.5297247096896172, "rewards/rank_analyze_format_reward": 0.6682709604501724, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975927919149399, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9975927919149399, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 631.875, "epoch": 0.02456, "grad_norm": 0.03669432923197746, "kl": 0.01029205322265625, "learning_rate": 7.210088939607709e-06, "loss": -0.0274, "reward": 5.5699498653411865, "reward_std": 0.8376500010490417, "rewards/mrr_reward": 0.24411582946777344, "rewards/rank_analyze_format_reward": 0.7595020979642868, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 615.828125, "epoch": 0.02464, "grad_norm": 0.037494342774152756, "kl": 0.008539199829101562, "learning_rate": 7.149807375300239e-06, "loss": -0.0051, "reward": 5.882465720176697, "reward_std": 1.2774192243814468, "rewards/mrr_reward": 0.37479541078209877, "rewards/rank_analyze_format_reward": 0.6367670595645905, "rewards/rank_answer_foramt_reward": 0.7890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9826335161924362, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 633.0625, "epoch": 0.02472, "grad_norm": 0.03609183803200722, "kl": 0.008897781372070312, "learning_rate": 7.0896383317172845e-06, "loss": -0.0018, "reward": 6.69511079788208, "reward_std": 1.0238350331783295, "rewards/mrr_reward": 0.510937511920929, "rewards/rank_analyze_format_reward": 0.7096461206674576, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 615.421875, "epoch": 0.0248, "grad_norm": 0.036191485822200775, "kl": 0.010578155517578125, "learning_rate": 7.029584184229653e-06, "loss": -0.0367, "reward": 6.708524942398071, "reward_std": 0.7616401016712189, "rewards/mrr_reward": 0.5263455025851727, "rewards/rank_analyze_format_reward": 0.6617367416620255, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 623.359375, "epoch": 0.02488, "grad_norm": 0.034229110926389694, "kl": 0.009495735168457031, "learning_rate": 6.969647303672262e-06, "loss": 0.0, "reward": 6.277475357055664, "reward_std": 1.0946559607982635, "rewards/mrr_reward": 0.3988219127058983, "rewards/rank_analyze_format_reward": 0.7778907120227814, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 618.625, "epoch": 0.02496, "grad_norm": 0.03546452149748802, "kl": 0.010522842407226562, "learning_rate": 6.909830056250527e-06, "loss": -0.0133, "reward": 5.894702076911926, "reward_std": 1.0494917929172516, "rewards/mrr_reward": 0.3269965313374996, "rewards/rank_analyze_format_reward": 0.6794258505105972, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.010690789669752121, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 599.265625, "epoch": 0.02504, "grad_norm": 0.03506970778107643, "kl": 0.009166717529296875, "learning_rate": 6.850134803446955e-06, "loss": -0.0205, "reward": 7.8615734577178955, "reward_std": 1.0825934708118439, "rewards/mrr_reward": 0.7979166805744171, "rewards/rank_analyze_format_reward": 0.7187349647283554, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 625.1875, "epoch": 0.02512, "grad_norm": 0.03662346303462982, "kl": 0.008745193481445312, "learning_rate": 6.790563901927907e-06, "loss": 0.0075, "reward": 6.225973844528198, "reward_std": 1.0001718550920486, "rewards/mrr_reward": 0.3985738977789879, "rewards/rank_analyze_format_reward": 0.7045020833611488, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9977678656578064, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9977678656578064, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 624.765625, "epoch": 0.0252, "grad_norm": 0.033335208892822266, "kl": 0.008983612060546875, "learning_rate": 6.731119703450577e-06, "loss": -0.0014, "reward": 7.485305547714233, "reward_std": 0.9429552778601646, "rewards/mrr_reward": 0.6808593943715096, "rewards/rank_analyze_format_reward": 0.806481346487999, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 613.859375, "epoch": 0.02528, "grad_norm": 0.03578289970755577, "kl": 0.011020660400390625, "learning_rate": 6.671804554770135e-06, "loss": -0.004, "reward": 6.955068826675415, "reward_std": 1.177416741847992, "rewards/mrr_reward": 0.6026041470468044, "rewards/rank_analyze_format_reward": 0.6764245927333832, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137707233429, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9966137707233429, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 593.34375, "epoch": 0.02536, "grad_norm": 0.03771011531352997, "kl": 0.010204315185546875, "learning_rate": 6.612620797547087e-06, "loss": -0.0037, "reward": 6.470096230506897, "reward_std": 1.2905988246202469, "rewards/mrr_reward": 0.4610429108142853, "rewards/rank_analyze_format_reward": 0.7326149865984917, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.997436136007309, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.997436136007309, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 600.8125, "epoch": 0.02544, "grad_norm": 0.03544236347079277, "kl": 0.010606765747070312, "learning_rate": 6.553570768254831e-06, "loss": -0.0217, "reward": 7.515864014625549, "reward_std": 1.1141066551208496, "rewards/mrr_reward": 0.7500000149011612, "rewards/rank_analyze_format_reward": 0.5861764326691628, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 647.625, "epoch": 0.02552, "grad_norm": 0.037698011845350266, "kl": 0.009312629699707031, "learning_rate": 6.494656798087412e-06, "loss": 0.0103, "reward": 6.85070276260376, "reward_std": 1.285613864660263, "rewards/mrr_reward": 0.568979412317276, "rewards/rank_analyze_format_reward": 0.7075974643230438, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 622.921875, "epoch": 0.0256, "grad_norm": 0.037663985043764114, "kl": 0.010009765625, "learning_rate": 6.435881212867494e-06, "loss": -0.014, "reward": 7.560014843940735, "reward_std": 1.2492257058620453, "rewards/mrr_reward": 0.7132812440395355, "rewards/rank_analyze_format_reward": 0.7761619389057159, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137856245041, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9966137856245041, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 633.15625, "epoch": 0.02568, "grad_norm": 0.03401071950793266, "kl": 0.010232925415039062, "learning_rate": 6.377246332954544e-06, "loss": -0.0042, "reward": 6.2611448764801025, "reward_std": 0.8378316983580589, "rewards/mrr_reward": 0.4037884473800659, "rewards/rank_analyze_format_reward": 0.7192105799913406, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.012603184208273888, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 585.265625, "epoch": 0.02576, "grad_norm": 0.038607846945524216, "kl": 0.011226654052734375, "learning_rate": 6.318754473153221e-06, "loss": -0.0237, "reward": 6.479233503341675, "reward_std": 0.898737758398056, "rewards/mrr_reward": 0.4837425537407398, "rewards/rank_analyze_format_reward": 0.6142673417925835, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 598.828125, "epoch": 0.02584, "grad_norm": 0.03643499314785004, "kl": 0.011678695678710938, "learning_rate": 6.260407942621998e-06, "loss": -0.033, "reward": 6.358767151832581, "reward_std": 0.9248391389846802, "rewards/mrr_reward": 0.4494357705116272, "rewards/rank_analyze_format_reward": 0.6235239952802658, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 619.875, "epoch": 0.02592, "grad_norm": 0.035200949758291245, "kl": 0.010311126708984375, "learning_rate": 6.202209044781991e-06, "loss": -0.002, "reward": 6.491126179695129, "reward_std": 1.1114313453435898, "rewards/mrr_reward": 0.470951147377491, "rewards/rank_analyze_format_reward": 0.7224409729242325, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 624.609375, "epoch": 0.026, "grad_norm": 0.038123924285173416, "kl": 0.011371612548828125, "learning_rate": 6.144160077226035e-06, "loss": -0.0034, "reward": 6.530197501182556, "reward_std": 1.0080813020467758, "rewards/mrr_reward": 0.4721726290881634, "rewards/rank_analyze_format_reward": 0.7212767750024796, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 662.734375, "epoch": 0.02608, "grad_norm": 0.03231590986251831, "kl": 0.008310317993164062, "learning_rate": 6.086263331627976e-06, "loss": -0.0218, "reward": 7.471861004829407, "reward_std": 1.26896370947361, "rewards/mrr_reward": 0.6869977712631226, "rewards/rank_analyze_format_reward": 0.8039481341838837, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 604.5625, "epoch": 0.02616, "grad_norm": 0.03686119616031647, "kl": 0.009931564331054688, "learning_rate": 6.028521093652195e-06, "loss": -0.0337, "reward": 6.72059953212738, "reward_std": 1.2022047340869904, "rewards/mrr_reward": 0.5316592305898666, "rewards/rank_analyze_format_reward": 0.7187327444553375, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 580.640625, "epoch": 0.02624, "grad_norm": 0.03718126565217972, "kl": 0.009431838989257812, "learning_rate": 5.970935642863375e-06, "loss": -0.0156, "reward": 6.806818127632141, "reward_std": 0.9255675077438354, "rewards/mrr_reward": 0.5923363044857979, "rewards/rank_analyze_format_reward": 0.546847976744175, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 611.4375, "epoch": 0.02632, "grad_norm": 0.03433491289615631, "kl": 0.01043701171875, "learning_rate": 5.913509252636511e-06, "loss": -0.0049, "reward": 6.354798913002014, "reward_std": 0.9331846535205841, "rewards/mrr_reward": 0.43651413917541504, "rewards/rank_analyze_format_reward": 0.6975409835577011, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975927770137787, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9975927770137787, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 648.171875, "epoch": 0.0264, "grad_norm": 0.03562232106924057, "kl": 0.010423660278320312, "learning_rate": 5.85624419006716e-06, "loss": 0.0002, "reward": 7.243241667747498, "reward_std": 0.9737610071897507, "rewards/mrr_reward": 0.6152715981006622, "rewards/rank_analyze_format_reward": 0.8469991683959961, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9968750029802322, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9968750029802322, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 628.703125, "epoch": 0.02648, "grad_norm": 0.10225867480039597, "kl": 0.035511016845703125, "learning_rate": 5.799142715881938e-06, "loss": 0.0198, "reward": 6.2833216190338135, "reward_std": 1.1008527427911758, "rewards/mrr_reward": 0.40704986080527306, "rewards/rank_analyze_format_reward": 0.7547315508127213, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 624.96875, "epoch": 0.02656, "grad_norm": 0.03582165390253067, "kl": 0.011472702026367188, "learning_rate": 5.742207084349274e-06, "loss": 0.0036, "reward": 6.44066846370697, "reward_std": 1.1891821920871735, "rewards/mrr_reward": 0.4519655257463455, "rewards/rank_analyze_format_reward": 0.7733165174722672, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 632.890625, "epoch": 0.02664, "grad_norm": 0.038169655948877335, "kl": 0.009614944458007812, "learning_rate": 5.685439543190409e-06, "loss": -0.0182, "reward": 6.4908305406570435, "reward_std": 0.8565306067466736, "rewards/mrr_reward": 0.4479600638151169, "rewards/rank_analyze_format_reward": 0.8512190878391266, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 601.796875, "epoch": 0.02672, "grad_norm": 0.03706088289618492, "kl": 0.01105499267578125, "learning_rate": 5.628842333490674e-06, "loss": 0.0176, "reward": 7.4073134660720825, "reward_std": 1.1913396269083023, "rewards/mrr_reward": 0.6971354186534882, "rewards/rank_analyze_format_reward": 0.6827599853277206, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 602.71875, "epoch": 0.0268, "grad_norm": 0.03910846635699272, "kl": 0.012481689453125, "learning_rate": 5.572417689610987e-06, "loss": -0.022, "reward": 6.7601906061172485, "reward_std": 1.1457826048135757, "rewards/mrr_reward": 0.543483380228281, "rewards/rank_analyze_format_reward": 0.6800069808959961, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 625.46875, "epoch": 0.02688, "grad_norm": 0.031742773950099945, "kl": 0.008184432983398438, "learning_rate": 5.516167839099679e-06, "loss": 0.0003, "reward": 7.035795331001282, "reward_std": 0.7620124816894531, "rewards/mrr_reward": 0.578131191432476, "rewards/rank_analyze_format_reward": 0.7634648084640503, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.012231691740453243, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 620.921875, "epoch": 0.02696, "grad_norm": 0.03347519040107727, "kl": 0.010087966918945312, "learning_rate": 5.460095002604533e-06, "loss": -0.0157, "reward": 7.338195204734802, "reward_std": 0.9315738379955292, "rewards/mrr_reward": 0.6716766208410263, "rewards/rank_analyze_format_reward": 0.7213779538869858, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 608.28125, "epoch": 0.02704, "grad_norm": 0.03307173773646355, "kl": 0.010683059692382812, "learning_rate": 5.404201393785123e-06, "loss": -0.015, "reward": 6.1055203676223755, "reward_std": 0.8149298634380102, "rewards/mrr_reward": 0.3720672056078911, "rewards/rank_analyze_format_reward": 0.7175754755735397, "rewards/rank_answer_foramt_reward": 0.912109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9976895451545715, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9976895451545715, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 649.375, "epoch": 0.02712, "grad_norm": 0.036583006381988525, "kl": 0.008082389831542969, "learning_rate": 5.348489219225417e-06, "loss": 0.0087, "reward": 6.978741407394409, "reward_std": 1.0568707874044776, "rewards/mrr_reward": 0.5587177574634552, "rewards/rank_analyze_format_reward": 0.8041087239980698, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 609.40625, "epoch": 0.0272, "grad_norm": 0.034769993275403976, "kl": 0.009695053100585938, "learning_rate": 5.292960678346674e-06, "loss": 0.0174, "reward": 7.053924083709717, "reward_std": 0.8420550748705864, "rewards/mrr_reward": 0.5800099223852158, "rewards/rank_analyze_format_reward": 0.7648259848356247, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 608.703125, "epoch": 0.02728, "grad_norm": 0.03784068673849106, "kl": 0.009168624877929688, "learning_rate": 5.237617963320608e-06, "loss": -0.0155, "reward": 6.456003546714783, "reward_std": 0.8597143590450287, "rewards/mrr_reward": 0.4546565040946007, "rewards/rank_analyze_format_reward": 0.7096430659294128, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 650.15625, "epoch": 0.02736, "grad_norm": 0.03336174413561821, "kl": 0.009044647216796875, "learning_rate": 5.1824632589828465e-06, "loss": 0.0051, "reward": 6.55331826210022, "reward_std": 0.8850341439247131, "rewards/mrr_reward": 0.45027901232242584, "rewards/rank_analyze_format_reward": 0.7795460075139999, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 612.015625, "epoch": 0.02744, "grad_norm": 0.03610434755682945, "kl": 0.008493423461914062, "learning_rate": 5.127498742746675e-06, "loss": -0.011, "reward": 6.6224623918533325, "reward_std": 0.8006821572780609, "rewards/mrr_reward": 0.4943762458860874, "rewards/rank_analyze_format_reward": 0.7283474206924438, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9993206560611725, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9993206560611725, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 587.109375, "epoch": 0.02752, "grad_norm": 0.03993268683552742, "kl": 0.013347625732421875, "learning_rate": 5.072726584517086e-06, "loss": -0.0161, "reward": 6.7273242473602295, "reward_std": 1.2600727677345276, "rewards/mrr_reward": 0.5446118488907814, "rewards/rank_analyze_format_reward": 0.6595312505960464, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983836114406586, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9983836114406586, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 582.265625, "epoch": 0.0276, "grad_norm": 0.03480137512087822, "kl": 0.0099334716796875, "learning_rate": 5.018148946605092e-06, "loss": -0.0253, "reward": 7.481507897377014, "reward_std": 0.9491081535816193, "rewards/mrr_reward": 0.7095238268375397, "rewards/rank_analyze_format_reward": 0.7210727632045746, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 586.1875, "epoch": 0.02768, "grad_norm": 0.0376473069190979, "kl": 0.009815216064453125, "learning_rate": 4.9637679836423926e-06, "loss": 0.0063, "reward": 6.882553815841675, "reward_std": 1.0581007301807404, "rewards/mrr_reward": 0.5559895858168602, "rewards/rank_analyze_format_reward": 0.7923161536455154, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975927919149399, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9819677919149399, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 606.140625, "epoch": 0.02776, "grad_norm": 0.04050064831972122, "kl": 0.012176513671875, "learning_rate": 4.909585842496287e-06, "loss": -0.008, "reward": 6.3230438232421875, "reward_std": 1.5663468837738037, "rewards/mrr_reward": 0.4587549418210983, "rewards/rank_analyze_format_reward": 0.6187685653567314, "rewards/rank_answer_foramt_reward": 0.87109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 590.640625, "epoch": 0.02784, "grad_norm": 0.04040512815117836, "kl": 0.011077880859375, "learning_rate": 4.855604662184935e-06, "loss": -0.0192, "reward": 6.102820634841919, "reward_std": 1.1095046699047089, "rewards/mrr_reward": 0.3781436085700989, "rewards/rank_analyze_format_reward": 0.6955999881029129, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 663.96875, "epoch": 0.02792, "grad_norm": 0.03195063769817352, "kl": 0.009430885314941406, "learning_rate": 4.801826573792905e-06, "loss": 0.0135, "reward": 6.144711256027222, "reward_std": 1.0616885423660278, "rewards/mrr_reward": 0.38415180146694183, "rewards/rank_analyze_format_reward": 0.7307276725769043, "rewards/rank_answer_foramt_reward": 0.896484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 630.328125, "epoch": 0.028, "grad_norm": 0.03630421310663223, "kl": 0.00982666015625, "learning_rate": 4.7482537003870425e-06, "loss": 0.0058, "reward": 6.388647079467773, "reward_std": 0.9659361243247986, "rewards/mrr_reward": 0.43986857682466507, "rewards/rank_analyze_format_reward": 0.7287820726633072, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.984375, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 594.59375, "epoch": 0.02808, "grad_norm": 0.03534599766135216, "kl": 0.009351730346679688, "learning_rate": 4.694888156932657e-06, "loss": 0.0086, "reward": 6.570671319961548, "reward_std": 1.0458246171474457, "rewards/mrr_reward": 0.4713975638151169, "rewards/rank_analyze_format_reward": 0.802268460392952, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.984375, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 618.203125, "epoch": 0.02816, "grad_norm": 0.036375947296619415, "kl": 0.008737564086914062, "learning_rate": 4.641732050210032e-06, "loss": -0.0228, "reward": 6.213122844696045, "reward_std": 0.8897507041692734, "rewards/mrr_reward": 0.38513146340847015, "rewards/rank_analyze_format_reward": 0.7735430300235748, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 620.453125, "epoch": 0.02824, "grad_norm": 0.033981017768383026, "kl": 0.009093284606933594, "learning_rate": 4.588787478731242e-06, "loss": 0.0088, "reward": 7.132024168968201, "reward_std": 1.1730490624904633, "rewards/mrr_reward": 0.6218005865812302, "rewards/rank_analyze_format_reward": 0.7281893491744995, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 615.453125, "epoch": 0.02832, "grad_norm": 0.03723985329270363, "kl": 0.010562896728515625, "learning_rate": 4.53605653265731e-06, "loss": -0.0289, "reward": 6.976137280464172, "reward_std": 1.249916672706604, "rewards/mrr_reward": 0.5726562514901161, "rewards/rank_analyze_format_reward": 0.7682479172945023, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9928118884563446, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9928118884563446, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 613.640625, "epoch": 0.0284, "grad_norm": 0.03720582649111748, "kl": 0.009979248046875, "learning_rate": 4.483541293715699e-06, "loss": -0.0122, "reward": 6.593631267547607, "reward_std": 0.8079218193888664, "rewards/mrr_reward": 0.4882316589355469, "rewards/rank_analyze_format_reward": 0.7051577717065811, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 642.171875, "epoch": 0.02848, "grad_norm": 0.03377283364534378, "kl": 0.009444236755371094, "learning_rate": 4.4312438351181246e-06, "loss": 0.0089, "reward": 6.717236280441284, "reward_std": 0.9809283912181854, "rewards/mrr_reward": 0.4922495186328888, "rewards/rank_analyze_format_reward": 0.8283162266016006, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 633.328125, "epoch": 0.02856, "grad_norm": 0.0329078733921051, "kl": 0.009710311889648438, "learning_rate": 4.379166221478697e-06, "loss": -0.0127, "reward": 6.455557823181152, "reward_std": 1.1159992516040802, "rewards/mrr_reward": 0.46347346156835556, "rewards/rank_analyze_format_reward": 0.7203813195228577, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 603.25, "epoch": 0.02864, "grad_norm": 0.03654990345239639, "kl": 0.00997161865234375, "learning_rate": 4.3273105087324375e-06, "loss": -0.0291, "reward": 7.5022441148757935, "reward_std": 1.070105716586113, "rewards/mrr_reward": 0.7200954854488373, "rewards/rank_analyze_format_reward": 0.6741493195295334, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9914346039295197, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9914346039295197, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 604.4375, "epoch": 0.02872, "grad_norm": 0.03658515587449074, "kl": 0.010080337524414062, "learning_rate": 4.275678744054094e-06, "loss": -0.027, "reward": 6.201189398765564, "reward_std": 0.977226585149765, "rewards/mrr_reward": 0.4017237201333046, "rewards/rank_analyze_format_reward": 0.6781641393899918, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 633.0625, "epoch": 0.0288, "grad_norm": 0.03946786746382713, "kl": 0.00982666015625, "learning_rate": 4.224272965777326e-06, "loss": -0.004, "reward": 6.33308732509613, "reward_std": 1.07747383415699, "rewards/mrr_reward": 0.41968005895614624, "rewards/rank_analyze_format_reward": 0.7598357796669006, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 614.6875, "epoch": 0.02888, "grad_norm": 0.03583682328462601, "kl": 0.010387420654296875, "learning_rate": 4.173095203314241e-06, "loss": -0.0061, "reward": 6.099523663520813, "reward_std": 1.1623765230178833, "rewards/mrr_reward": 0.37878844887018204, "rewards/rank_analyze_format_reward": 0.7050404101610184, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 602.3125, "epoch": 0.02896, "grad_norm": 0.03713896498084068, "kl": 0.01088714599609375, "learning_rate": 4.12214747707527e-06, "loss": -0.0364, "reward": 6.781999826431274, "reward_std": 1.1397517770528793, "rewards/mrr_reward": 0.5694382265210152, "rewards/rank_analyze_format_reward": 0.648046463727951, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 644.03125, "epoch": 0.02904, "grad_norm": 0.03442864865064621, "kl": 0.009798049926757812, "learning_rate": 4.071431798389408e-06, "loss": -0.015, "reward": 6.419280529022217, "reward_std": 0.9935359209775925, "rewards/mrr_reward": 0.4470672234892845, "rewards/rank_analyze_format_reward": 0.7556310743093491, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9962841123342514, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9962841123342514, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 611.796875, "epoch": 0.02912, "grad_norm": 0.03818686679005623, "kl": 0.0087432861328125, "learning_rate": 4.020950169424815e-06, "loss": -0.0195, "reward": 7.141218543052673, "reward_std": 0.7832182869315147, "rewards/mrr_reward": 0.5974578335881233, "rewards/rank_analyze_format_reward": 0.7961938977241516, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 610.96875, "epoch": 0.0292, "grad_norm": 0.0375281386077404, "kl": 0.010234832763671875, "learning_rate": 3.970704583109755e-06, "loss": 0.0078, "reward": 6.407486200332642, "reward_std": 1.1848650872707367, "rewards/mrr_reward": 0.4760354720056057, "rewards/rank_analyze_format_reward": 0.6728429794311523, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 598.046875, "epoch": 0.02928, "grad_norm": 0.04480404406785965, "kl": 0.010717391967773438, "learning_rate": 3.920697023053949e-06, "loss": 0.0126, "reward": 6.092200040817261, "reward_std": 1.5816690325737, "rewards/mrr_reward": 0.4796627089381218, "rewards/rank_analyze_format_reward": 0.5576980113983154, "rewards/rank_answer_foramt_reward": 0.837890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9358552694320679, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.9358552694320679, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 633.34375, "epoch": 0.02936, "grad_norm": 0.03479861095547676, "kl": 0.008795738220214844, "learning_rate": 3.8709294634702374e-06, "loss": -0.0017, "reward": 6.176755309104919, "reward_std": 1.414333239197731, "rewards/mrr_reward": 0.38570189103484154, "rewards/rank_analyze_format_reward": 0.8014931678771973, "rewards/rank_answer_foramt_reward": 0.84375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 579.625, "epoch": 0.02944, "grad_norm": 0.03566717356443405, "kl": 0.010448455810546875, "learning_rate": 3.821403869096658e-06, "loss": 0.0051, "reward": 7.180590748786926, "reward_std": 0.7666215635836124, "rewards/mrr_reward": 0.6469184011220932, "rewards/rank_analyze_format_reward": 0.635885939002037, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 609.34375, "epoch": 0.02952, "grad_norm": 0.03476927429437637, "kl": 0.010423660278320312, "learning_rate": 3.772122195118877e-06, "loss": -0.0132, "reward": 6.9105294942855835, "reward_std": 1.0616263002157211, "rewards/mrr_reward": 0.5765129029750824, "rewards/rank_analyze_format_reward": 0.6901070028543472, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 645.96875, "epoch": 0.0296, "grad_norm": 0.03369462862610817, "kl": 0.009494781494140625, "learning_rate": 3.723086387092997e-06, "loss": 0.0182, "reward": 6.838284850120544, "reward_std": 1.1357534378767014, "rewards/mrr_reward": 0.5285032242536545, "rewards/rank_analyze_format_reward": 0.8257192522287369, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 583.671875, "epoch": 0.02968, "grad_norm": 0.041327208280563354, "kl": 0.0105438232421875, "learning_rate": 3.674298380868756e-06, "loss": -0.0435, "reward": 7.2045464515686035, "reward_std": 1.1361991167068481, "rewards/mrr_reward": 0.6859375014901161, "rewards/rank_analyze_format_reward": 0.5190819948911667, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 593.296875, "epoch": 0.02976, "grad_norm": 0.036297328770160675, "kl": 0.011859893798828125, "learning_rate": 3.625760102513103e-06, "loss": -0.0416, "reward": 6.148580312728882, "reward_std": 1.7913504540920258, "rewards/mrr_reward": 0.4453311152756214, "rewards/rank_analyze_format_reward": 0.6309278607368469, "rewards/rank_answer_foramt_reward": 0.791015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 618.171875, "epoch": 0.02984, "grad_norm": 0.03555576875805855, "kl": 0.011425018310546875, "learning_rate": 3.5774734682341563e-06, "loss": -0.0192, "reward": 6.411505222320557, "reward_std": 1.4318915605545044, "rewards/mrr_reward": 0.45578497648239136, "rewards/rank_analyze_format_reward": 0.7688093036413193, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.996692106127739, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.996692106127739, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 629.71875, "epoch": 0.02992, "grad_norm": 0.0373053178191185, "kl": 0.009508132934570312, "learning_rate": 3.5294403843055604e-06, "loss": 0.0099, "reward": 7.484785676002502, "reward_std": 0.6732370555400848, "rewards/mrr_reward": 0.690730407834053, "rewards/rank_analyze_format_reward": 0.7667859047651291, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 612.109375, "epoch": 0.03, "grad_norm": 0.03780434653162956, "kl": 0.009645462036132812, "learning_rate": 3.4816627469912147e-06, "loss": -0.0354, "reward": 6.8859922885894775, "reward_std": 1.1529514789581299, "rewards/mrr_reward": 0.5565538331866264, "rewards/rank_analyze_format_reward": 0.7345536947250366, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9977678656578064, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9977678656578064, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 633.734375, "epoch": 0.03008, "grad_norm": 0.03804342821240425, "kl": 0.009288787841796875, "learning_rate": 3.4341424424704373e-06, "loss": 0.0207, "reward": 6.187745928764343, "reward_std": 1.1112108528614044, "rewards/mrr_reward": 0.3717943951487541, "rewards/rank_analyze_format_reward": 0.7976381182670593, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9963869154453278, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9963869154453278, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 579.28125, "epoch": 0.03016, "grad_norm": 0.04059469699859619, "kl": 0.012218475341796875, "learning_rate": 3.3868813467634833e-06, "loss": -0.0257, "reward": 6.707830786705017, "reward_std": 0.9580760449171066, "rewards/mrr_reward": 0.5683593824505806, "rewards/rank_analyze_format_reward": 0.6120125353336334, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 613.671875, "epoch": 0.03024, "grad_norm": 0.03483661636710167, "kl": 0.010251045227050781, "learning_rate": 3.3398813256574847e-06, "loss": -0.0056, "reward": 7.134432435035706, "reward_std": 0.9167252779006958, "rewards/mrr_reward": 0.6160156428813934, "rewards/rank_analyze_format_reward": 0.7152916193008423, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 610.3125, "epoch": 0.03032, "grad_norm": 0.03524739295244217, "kl": 0.009969711303710938, "learning_rate": 3.2931442346328e-06, "loss": -0.0247, "reward": 6.126248240470886, "reward_std": 0.920678161084652, "rewards/mrr_reward": 0.3877604305744171, "rewards/rank_analyze_format_reward": 0.6787221133708954, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.984375, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 613.484375, "epoch": 0.0304, "grad_norm": 0.03536607325077057, "kl": 0.01009368896484375, "learning_rate": 3.2466719187897555e-06, "loss": -0.0244, "reward": 6.265165448188782, "reward_std": 0.8990872912108898, "rewards/mrr_reward": 0.41821056604385376, "rewards/rank_analyze_format_reward": 0.6899793595075607, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 642.625, "epoch": 0.03048, "grad_norm": 0.0343351848423481, "kl": 0.009939193725585938, "learning_rate": 3.200466212775808e-06, "loss": 0.0027, "reward": 6.489507675170898, "reward_std": 1.303632378578186, "rewards/mrr_reward": 0.46700769662857056, "rewards/rank_analyze_format_reward": 0.7479704767465591, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9963235259056091, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9963235259056091, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 622.625, "epoch": 0.03056, "grad_norm": 0.03577445447444916, "kl": 0.00930023193359375, "learning_rate": 3.1545289407131128e-06, "loss": -0.0298, "reward": 6.44808030128479, "reward_std": 1.0737406611442566, "rewards/mrr_reward": 0.43384797126054764, "rewards/rank_analyze_format_reward": 0.780159518122673, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.997514471411705, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.997514471411705, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 571.890625, "epoch": 0.03064, "grad_norm": 0.1850980520248413, "kl": 0.03582954406738281, "learning_rate": 3.108861916126518e-06, "loss": -0.0107, "reward": 6.22871720790863, "reward_std": 0.7401619739830494, "rewards/mrr_reward": 0.421564981341362, "rewards/rank_analyze_format_reward": 0.6010510697960854, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 637.828125, "epoch": 0.03072, "grad_norm": 0.03250962123274803, "kl": 0.008481979370117188, "learning_rate": 3.063466941871952e-06, "loss": -0.0059, "reward": 6.834837317466736, "reward_std": 0.7794247586280107, "rewards/mrr_reward": 0.5347098335623741, "rewards/rank_analyze_format_reward": 0.758498027920723, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 611.765625, "epoch": 0.0308, "grad_norm": 0.03917098790407181, "kl": 0.009183883666992188, "learning_rate": 3.0183458100652752e-06, "loss": 0.0082, "reward": 6.392803072929382, "reward_std": 1.4655225276947021, "rewards/mrr_reward": 0.4779699891805649, "rewards/rank_analyze_format_reward": 0.5965095832943916, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_contrast_format_reward": 0.01247317623347044, "rewards/rank_initial_format_reward": 0.9965170323848724, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9965170323848724, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 587.703125, "epoch": 0.03088, "grad_norm": 0.039243102073669434, "kl": 0.009486198425292969, "learning_rate": 2.9735003020115095e-06, "loss": -0.0086, "reward": 5.71747624874115, "reward_std": 0.8861955106258392, "rewards/mrr_reward": 0.30232515186071396, "rewards/rank_analyze_format_reward": 0.6506390273571014, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 615.8125, "epoch": 0.03096, "grad_norm": 0.035459235310554504, "kl": 0.00905609130859375, "learning_rate": 2.9289321881345257e-06, "loss": -0.0395, "reward": 6.300573468208313, "reward_std": 1.4003549814224243, "rewards/mrr_reward": 0.4438368156552315, "rewards/rank_analyze_format_reward": 0.6784829795360565, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9809887856245041, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9809887856245041, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 641.46875, "epoch": 0.03104, "grad_norm": 0.0343979112803936, "kl": 0.010465621948242188, "learning_rate": 2.884643227907147e-06, "loss": -0.0135, "reward": 6.683073997497559, "reward_std": 1.1601893454790115, "rewards/mrr_reward": 0.516765870153904, "rewards/rank_analyze_format_reward": 0.7346126735210419, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9973393976688385, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 592.40625, "epoch": 0.03112, "grad_norm": 0.040942125022411346, "kl": 0.008916854858398438, "learning_rate": 2.840635169781688e-06, "loss": -0.0325, "reward": 7.064440608024597, "reward_std": 0.9985512457787991, "rewards/mrr_reward": 0.5985739231109619, "rewards/rank_analyze_format_reward": 0.7302684485912323, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 615.859375, "epoch": 0.0312, "grad_norm": 0.03723229840397835, "kl": 0.012697219848632812, "learning_rate": 2.796909751120931e-06, "loss": -0.0364, "reward": 7.294129252433777, "reward_std": 0.6092105805873871, "rewards/mrr_reward": 0.651909738779068, "rewards/rank_analyze_format_reward": 0.7553057968616486, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9978189468383789, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9978189468383789, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 600.0, "epoch": 0.03128, "grad_norm": 0.03801019862294197, "kl": 0.008825302124023438, "learning_rate": 2.7534686981295335e-06, "loss": -0.0388, "reward": 5.955231785774231, "reward_std": 0.8021261096000671, "rewards/mrr_reward": 0.3279885984957218, "rewards/rank_analyze_format_reward": 0.7077305614948273, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 632.859375, "epoch": 0.03136, "grad_norm": 0.0338691882789135, "kl": 0.009738922119140625, "learning_rate": 2.7103137257858867e-06, "loss": -0.029, "reward": 6.992282867431641, "reward_std": 1.0505472123622894, "rewards/mrr_reward": 0.589570939540863, "rewards/rank_analyze_format_reward": 0.7277490943670273, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.96875, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 619.5625, "epoch": 0.03144, "grad_norm": 0.03506048768758774, "kl": 0.009555816650390625, "learning_rate": 2.667446537774402e-06, "loss": -0.0089, "reward": 7.091682434082031, "reward_std": 1.136030226945877, "rewards/mrr_reward": 0.6293340623378754, "rewards/rank_analyze_format_reward": 0.7073224931955338, "rewards/rank_answer_foramt_reward": 0.87109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9979648143053055, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9979648143053055, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 615.625, "epoch": 0.03152, "grad_norm": 0.03391087427735329, "kl": 0.009349822998046875, "learning_rate": 2.624868826418262e-06, "loss": 0.0044, "reward": 6.959715723991394, "reward_std": 1.1997208297252655, "rewards/mrr_reward": 0.5721912235021591, "rewards/rank_analyze_format_reward": 0.7744666039943695, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 1.0, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 602.578125, "epoch": 0.0316, "grad_norm": 0.03866111487150192, "kl": 0.010669708251953125, "learning_rate": 2.5825822726126095e-06, "loss": -0.0307, "reward": 5.969284653663635, "reward_std": 1.157243937253952, "rewards/mrr_reward": 0.3815476205199957, "rewards/rank_analyze_format_reward": 0.6304792910814285, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 603.765625, "epoch": 0.03168, "grad_norm": 0.03797340765595436, "kl": 0.008867263793945312, "learning_rate": 2.5405885457581793e-06, "loss": 0.0062, "reward": 6.636697292327881, "reward_std": 0.7527401149272919, "rewards/mrr_reward": 0.4902529753744602, "rewards/rank_analyze_format_reward": 0.7323259860277176, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 599.90625, "epoch": 0.03176, "grad_norm": 0.035425517708063126, "kl": 0.010492324829101562, "learning_rate": 2.4988893036954045e-06, "loss": -0.0125, "reward": 6.985053658485413, "reward_std": 1.0101159363985062, "rewards/mrr_reward": 0.5865017548203468, "rewards/rank_analyze_format_reward": 0.6989770531654358, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 590.578125, "epoch": 0.03184, "grad_norm": 0.03650686517357826, "kl": 0.00948333740234375, "learning_rate": 2.4574861926389615e-06, "loss": -0.0212, "reward": 6.057408928871155, "reward_std": 1.0152454525232315, "rewards/mrr_reward": 0.3773561716079712, "rewards/rank_analyze_format_reward": 0.6631038039922714, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 603.515625, "epoch": 0.03192, "grad_norm": 0.036450207233428955, "kl": 0.008769989013671875, "learning_rate": 2.4163808471127815e-06, "loss": -0.0475, "reward": 6.1118505001068115, "reward_std": 0.7709715962409973, "rewards/mrr_reward": 0.4016121178865433, "rewards/rank_analyze_format_reward": 0.5518537759780884, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 603.34375, "epoch": 0.032, "grad_norm": 0.04163345322012901, "kl": 0.010300636291503906, "learning_rate": 2.37557488988552e-06, "loss": -0.0222, "reward": 7.210442543029785, "reward_std": 0.9683633595705032, "rewards/mrr_reward": 0.6489583253860474, "rewards/rank_analyze_format_reward": 0.6807069778442383, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 624.46875, "epoch": 0.03208, "grad_norm": 0.035172924399375916, "kl": 0.009805679321289062, "learning_rate": 2.335069931906503e-06, "loss": -0.0369, "reward": 7.072114825248718, "reward_std": 0.6663669645786285, "rewards/mrr_reward": 0.5996093451976776, "rewards/rank_analyze_format_reward": 0.7114098370075226, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.01457868330180645, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 603.8125, "epoch": 0.03216, "grad_norm": 0.037878427654504776, "kl": 0.01148223876953125, "learning_rate": 2.2948675722421086e-06, "loss": -0.0138, "reward": 6.7854098081588745, "reward_std": 1.3723297119140625, "rewards/mrr_reward": 0.5404080003499985, "rewards/rank_analyze_format_reward": 0.7189428806304932, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9973393976688385, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 636.328125, "epoch": 0.03224, "grad_norm": 0.037404462695121765, "kl": 0.009088516235351562, "learning_rate": 2.254969398012663e-06, "loss": -0.0129, "reward": 6.989399433135986, "reward_std": 1.2586904168128967, "rewards/mrr_reward": 0.6032304167747498, "rewards/rank_analyze_format_reward": 0.6896441280841827, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 640.203125, "epoch": 0.03232, "grad_norm": 0.03499702736735344, "kl": 0.00972747802734375, "learning_rate": 2.215376984329767e-06, "loss": 0.0128, "reward": 6.231714963912964, "reward_std": 1.7257316261529922, "rewards/mrr_reward": 0.45218875259160995, "rewards/rank_analyze_format_reward": 0.672342911362648, "rewards/rank_answer_foramt_reward": 0.81640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9827302694320679, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9827302694320679, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 611.703125, "epoch": 0.0324, "grad_norm": 0.038646895438432693, "kl": 0.010084152221679688, "learning_rate": 2.1760918942341193e-06, "loss": -0.0069, "reward": 6.684186339378357, "reward_std": 1.0474583506584167, "rewards/mrr_reward": 0.5014199018478394, "rewards/rank_analyze_format_reward": 0.789061650633812, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9828085899353027, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 616.796875, "epoch": 0.03248, "grad_norm": 0.03426486253738403, "kl": 0.0100555419921875, "learning_rate": 2.1371156786338108e-06, "loss": -0.0002, "reward": 6.611807346343994, "reward_std": 1.2847396433353424, "rewards/mrr_reward": 0.48984377086162567, "rewards/rank_analyze_format_reward": 0.7792705148458481, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 623.359375, "epoch": 0.03256, "grad_norm": 0.03667137771844864, "kl": 0.009508132934570312, "learning_rate": 2.098449876243096e-06, "loss": 0.0006, "reward": 6.135548949241638, "reward_std": 0.9647811651229858, "rewards/mrr_reward": 0.382998526096344, "rewards/rank_analyze_format_reward": 0.7095229774713516, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9938909858465195, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9938909858465195, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 639.9375, "epoch": 0.03264, "grad_norm": 0.03392645716667175, "kl": 0.009807586669921875, "learning_rate": 2.0600960135216463e-06, "loss": 0.0062, "reward": 6.379873037338257, "reward_std": 0.9333349615335464, "rewards/mrr_reward": 0.41111112385988235, "rewards/rank_analyze_format_reward": 0.8054327666759491, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 615.90625, "epoch": 0.03272, "grad_norm": 0.034899573773145676, "kl": 0.009767532348632812, "learning_rate": 2.022055604614289e-06, "loss": -0.0232, "reward": 6.8878679275512695, "reward_std": 1.175618052482605, "rewards/mrr_reward": 0.5766059085726738, "rewards/rank_analyze_format_reward": 0.6694720983505249, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9950486272573471, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9950486272573471, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 638.6875, "epoch": 0.0328, "grad_norm": 0.03311561793088913, "kl": 0.009159088134765625, "learning_rate": 1.984330151291233e-06, "loss": -0.0033, "reward": 6.117011308670044, "reward_std": 1.1643436253070831, "rewards/mrr_reward": 0.4042472764849663, "rewards/rank_analyze_format_reward": 0.6660377532243729, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 612.359375, "epoch": 0.03288, "grad_norm": 0.03604024276137352, "kl": 0.010951995849609375, "learning_rate": 1.9469211428887813e-06, "loss": -0.0202, "reward": 6.442740440368652, "reward_std": 0.8698384165763855, "rewards/mrr_reward": 0.44493427872657776, "rewards/rank_analyze_format_reward": 0.7637190371751785, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9965170323848724, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9965170323848724, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 606.3125, "epoch": 0.03296, "grad_norm": 0.03604024276137352, "kl": 0.009672164916992188, "learning_rate": 1.9469211428887813e-06, "loss": -0.0193, "reward": 6.506339073181152, "reward_std": 0.7897710353136063, "rewards/mrr_reward": 0.4584573544561863, "rewards/rank_analyze_format_reward": 0.7428222447633743, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 642.125, "epoch": 0.03304, "grad_norm": 0.03551711514592171, "kl": 0.010496139526367188, "learning_rate": 1.9098300562505266e-06, "loss": -0.0168, "reward": 6.8633400201797485, "reward_std": 1.0601329803466797, "rewards/mrr_reward": 0.5445498451590538, "rewards/rank_analyze_format_reward": 0.7944645285606384, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.013414634391665459, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 641.171875, "epoch": 0.03312, "grad_norm": 0.033040959388017654, "kl": 0.009469985961914062, "learning_rate": 1.8730583556690607e-06, "loss": -0.001, "reward": 6.547058701515198, "reward_std": 1.2053719013929367, "rewards/mrr_reward": 0.46091271191835403, "rewards/rank_analyze_format_reward": 0.7405171990394592, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 632.265625, "epoch": 0.0332, "grad_norm": 0.03571722283959389, "kl": 0.010266304016113281, "learning_rate": 1.8366074928281608e-06, "loss": -0.0129, "reward": 7.1048054695129395, "reward_std": 1.001551479101181, "rewards/mrr_reward": 0.5974144265055656, "rewards/rank_analyze_format_reward": 0.7951110303401947, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 629.4375, "epoch": 0.03328, "grad_norm": 0.03565621003508568, "kl": 0.011236190795898438, "learning_rate": 1.8004789067454763e-06, "loss": -0.0258, "reward": 6.165696740150452, "reward_std": 0.9990394413471222, "rewards/mrr_reward": 0.40422867238521576, "rewards/rank_analyze_format_reward": 0.6850776374340057, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 613.546875, "epoch": 0.03336, "grad_norm": 0.03054845705628395, "kl": 0.0113372802734375, "learning_rate": 1.7646740237157256e-06, "loss": -0.0031, "reward": 7.537035346031189, "reward_std": 0.841257207095623, "rewards/mrr_reward": 0.702126756310463, "rewards/rank_analyze_format_reward": 0.7714973986148834, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 624.34375, "epoch": 0.03344, "grad_norm": 0.03489568457007408, "kl": 0.009769439697265625, "learning_rate": 1.7291942572543806e-06, "loss": -0.0103, "reward": 7.2256258726119995, "reward_std": 0.8925280421972275, "rewards/mrr_reward": 0.644314244389534, "rewards/rank_analyze_format_reward": 0.6971971243619919, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 609.296875, "epoch": 0.03352, "grad_norm": 0.038806188851594925, "kl": 0.009914398193359375, "learning_rate": 1.6940410080418723e-06, "loss": 0.0084, "reward": 6.689801931381226, "reward_std": 0.9456233829259872, "rewards/mrr_reward": 0.52598587423563, "rewards/rank_analyze_format_reward": 0.6835145801305771, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.984375, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 645.046875, "epoch": 0.0336, "grad_norm": 0.033611051738262177, "kl": 0.007742881774902344, "learning_rate": 1.6592156638682887e-06, "loss": 0.0089, "reward": 6.4848939180374146, "reward_std": 0.5431230962276459, "rewards/mrr_reward": 0.4354538768529892, "rewards/rank_analyze_format_reward": 0.7645627856254578, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 584.65625, "epoch": 0.03368, "grad_norm": 0.037055887281894684, "kl": 0.011796951293945312, "learning_rate": 1.6247195995785836e-06, "loss": -0.0099, "reward": 6.415200710296631, "reward_std": 1.102502852678299, "rewards/mrr_reward": 0.49494048580527306, "rewards/rank_analyze_format_reward": 0.6444233506917953, "rewards/rank_answer_foramt_reward": 0.791015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 631.203125, "epoch": 0.03376, "grad_norm": 0.034259483218193054, "kl": 0.00965118408203125, "learning_rate": 1.5905541770183096e-06, "loss": -0.0123, "reward": 6.928329586982727, "reward_std": 1.423426702618599, "rewards/mrr_reward": 0.5699404925107956, "rewards/rank_analyze_format_reward": 0.7634178251028061, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9953093379735947, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9953093379735947, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 592.765625, "epoch": 0.03384, "grad_norm": 0.03593182563781738, "kl": 0.01035308837890625, "learning_rate": 1.5567207449798517e-06, "loss": -0.0218, "reward": 7.0378159284591675, "reward_std": 1.166471242904663, "rewards/mrr_reward": 0.6242559626698494, "rewards/rank_analyze_format_reward": 0.6754428595304489, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 616.328125, "epoch": 0.03392, "grad_norm": 0.03504238650202751, "kl": 0.009710311889648438, "learning_rate": 1.52322063914917e-06, "loss": -0.0014, "reward": 6.82245671749115, "reward_std": 1.6725111305713654, "rewards/mrr_reward": 0.5604600608348846, "rewards/rank_analyze_format_reward": 0.7439109534025192, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9955011606216431, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9955011606216431, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 617.390625, "epoch": 0.034, "grad_norm": 0.03463631868362427, "kl": 0.009096145629882812, "learning_rate": 1.490055182053083e-06, "loss": -0.0062, "reward": 5.61866819858551, "reward_std": 0.8437648266553879, "rewards/mrr_reward": 0.27474578842520714, "rewards/rank_analyze_format_reward": 0.6401620507240295, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 618.5, "epoch": 0.03408, "grad_norm": 0.03268309310078621, "kl": 0.010545730590820312, "learning_rate": 1.4572256830070497e-06, "loss": -0.0102, "reward": 6.763679027557373, "reward_std": 0.92563696205616, "rewards/mrr_reward": 0.5274677723646164, "rewards/rank_analyze_format_reward": 0.7003744468092918, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 594.859375, "epoch": 0.03416, "grad_norm": 0.03423745185136795, "kl": 0.008687973022460938, "learning_rate": 1.4247334380634792e-06, "loss": -0.026, "reward": 6.149406552314758, "reward_std": 0.9334520697593689, "rewards/mrr_reward": 0.386607151478529, "rewards/rank_analyze_format_reward": 0.65961854159832, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 653.625, "epoch": 0.03424, "grad_norm": 0.038005102425813675, "kl": 0.011844635009765625, "learning_rate": 1.3925797299605649e-06, "loss": -0.0388, "reward": 6.840047359466553, "reward_std": 0.8250576257705688, "rewards/mrr_reward": 0.5030567795038223, "rewards/rank_analyze_format_reward": 0.85711669921875, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 600.5, "epoch": 0.03432, "grad_norm": 0.035267122089862823, "kl": 0.010089874267578125, "learning_rate": 1.3607658280716474e-06, "loss": -0.0116, "reward": 6.638450741767883, "reward_std": 1.190705269575119, "rewards/mrr_reward": 0.498046875, "rewards/rank_analyze_format_reward": 0.7751696109771729, "rewards/rank_answer_foramt_reward": 0.87109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 618.5, "epoch": 0.0344, "grad_norm": 0.040324531495571136, "kl": 0.0123138427734375, "learning_rate": 1.3292929883550998e-06, "loss": -0.0029, "reward": 6.510358572006226, "reward_std": 1.168705016374588, "rewards/mrr_reward": 0.4863591343164444, "rewards/rank_analyze_format_reward": 0.742541715502739, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9893152564764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9893152564764023, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 623.046875, "epoch": 0.03448, "grad_norm": 0.03522971272468567, "kl": 0.010587692260742188, "learning_rate": 1.2981624533047432e-06, "loss": -0.0064, "reward": 6.658617973327637, "reward_std": 1.3042782470583916, "rewards/mrr_reward": 0.5222842358052731, "rewards/rank_analyze_format_reward": 0.7292925119400024, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9972426444292068, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9972426444292068, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 618.484375, "epoch": 0.03456, "grad_norm": 0.033920254558324814, "kl": 0.009311676025390625, "learning_rate": 1.2673754519008008e-06, "loss": -0.0111, "reward": 6.775153756141663, "reward_std": 0.9337569251656532, "rewards/mrr_reward": 0.5195684432983398, "rewards/rank_analyze_format_reward": 0.761218249797821, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 606.5, "epoch": 0.03464, "grad_norm": 0.03532830998301506, "kl": 0.009275436401367188, "learning_rate": 1.2369331995613664e-06, "loss": -0.0158, "reward": 7.530662536621094, "reward_std": 1.0039202123880386, "rewards/mrr_reward": 0.727083332836628, "rewards/rank_analyze_format_reward": 0.659323662519455, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 594.59375, "epoch": 0.03472, "grad_norm": 0.03901416435837746, "kl": 0.010721206665039062, "learning_rate": 1.206836898094439e-06, "loss": -0.0105, "reward": 6.700058579444885, "reward_std": 1.1378709971904755, "rewards/mrr_reward": 0.5464595705270767, "rewards/rank_analyze_format_reward": 0.6720000803470612, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 660.734375, "epoch": 0.0348, "grad_norm": 0.03564814478158951, "kl": 0.008474349975585938, "learning_rate": 1.1770877356504684e-06, "loss": 0.0263, "reward": 6.397605299949646, "reward_std": 1.089090034365654, "rewards/mrr_reward": 0.4306361712515354, "rewards/rank_analyze_format_reward": 0.8266727924346924, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9818111509084702, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 606.90625, "epoch": 0.03488, "grad_norm": 0.03534642979502678, "kl": 0.00969696044921875, "learning_rate": 1.1476868866754488e-06, "loss": 0.0114, "reward": 6.399877071380615, "reward_std": 0.921464130282402, "rewards/mrr_reward": 0.4511904865503311, "rewards/rank_analyze_format_reward": 0.7162086665630341, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 597.953125, "epoch": 0.03496, "grad_norm": 0.035875290632247925, "kl": 0.010082244873046875, "learning_rate": 1.1186355118645552e-06, "loss": 0.0145, "reward": 6.998130917549133, "reward_std": 0.9189672097563744, "rewards/mrr_reward": 0.5853918865323067, "rewards/rank_analyze_format_reward": 0.7498901337385178, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 575.671875, "epoch": 0.03504, "grad_norm": 0.03929242491722107, "kl": 0.010173797607421875, "learning_rate": 1.0899347581163222e-06, "loss": -0.0524, "reward": 6.882150292396545, "reward_std": 1.4270594418048859, "rewards/mrr_reward": 0.6054873615503311, "rewards/rank_analyze_format_reward": 0.5320434123277664, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 622.734375, "epoch": 0.03512, "grad_norm": 0.03167697787284851, "kl": 0.010068893432617188, "learning_rate": 1.0615857584873624e-06, "loss": -0.0152, "reward": 6.310707330703735, "reward_std": 0.9130070395767689, "rewards/mrr_reward": 0.3952566981315613, "rewards/rank_analyze_format_reward": 0.805852472782135, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 612.921875, "epoch": 0.0352, "grad_norm": 0.03580213338136673, "kl": 0.010387420654296875, "learning_rate": 1.0335896321476413e-06, "loss": -0.0229, "reward": 6.60002064704895, "reward_std": 1.2272609919309616, "rewards/mrr_reward": 0.5005332306027412, "rewards/rank_analyze_format_reward": 0.6589923650026321, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.010737558826804161, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 597.171875, "epoch": 0.03528, "grad_norm": 0.03527718037366867, "kl": 0.009809494018554688, "learning_rate": 1.0059474843362893e-06, "loss": 0.0001, "reward": 6.267144680023193, "reward_std": 0.9275566190481186, "rewards/mrr_reward": 0.42506199702620506, "rewards/rank_analyze_format_reward": 0.7266296148300171, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 650.359375, "epoch": 0.03536, "grad_norm": 0.036646001040935516, "kl": 0.009122848510742188, "learning_rate": 9.786604063179728e-07, "loss": -0.0093, "reward": 6.3471105098724365, "reward_std": 1.43297678232193, "rewards/mrr_reward": 0.44096602499485016, "rewards/rank_analyze_format_reward": 0.788094699382782, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 631.21875, "epoch": 0.03544, "grad_norm": 0.03925708681344986, "kl": 0.0123443603515625, "learning_rate": 9.517294753398066e-07, "loss": -0.0097, "reward": 6.7111616134643555, "reward_std": 0.9859621822834015, "rewards/mrr_reward": 0.5009238570928574, "rewards/rank_analyze_format_reward": 0.8187942802906036, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 608.53125, "epoch": 0.03552, "grad_norm": 0.03424806892871857, "kl": 0.0093231201171875, "learning_rate": 9.251557545888312e-07, "loss": -0.0482, "reward": 6.7897608280181885, "reward_std": 1.0599493011832237, "rewards/mrr_reward": 0.5596788078546524, "rewards/rank_analyze_format_reward": 0.64479561150074, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 609.609375, "epoch": 0.0356, "grad_norm": 0.033046841621398926, "kl": 0.009771347045898438, "learning_rate": 8.989402931500434e-07, "loss": -0.0208, "reward": 6.172217845916748, "reward_std": 0.7055819928646088, "rewards/mrr_reward": 0.3654824048280716, "rewards/rank_analyze_format_reward": 0.7532570362091064, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 656.9375, "epoch": 0.03568, "grad_norm": 0.03781959414482117, "kl": 0.009064674377441406, "learning_rate": 8.730841259649725e-07, "loss": -0.0158, "reward": 6.611280202865601, "reward_std": 1.0067766308784485, "rewards/mrr_reward": 0.4670758917927742, "rewards/rank_analyze_format_reward": 0.8445390909910202, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 635.609375, "epoch": 0.03576, "grad_norm": 0.03509828820824623, "kl": 0.009130477905273438, "learning_rate": 8.475882737908248e-07, "loss": 0.0066, "reward": 6.223300814628601, "reward_std": 1.1437466144561768, "rewards/mrr_reward": 0.3843750059604645, "rewards/rank_analyze_format_reward": 0.8202577084302902, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 605.21875, "epoch": 0.03584, "grad_norm": 0.03470684215426445, "kl": 0.011194229125976562, "learning_rate": 8.224537431601886e-07, "loss": -0.0225, "reward": 6.184381008148193, "reward_std": 1.2531072199344635, "rewards/mrr_reward": 0.4095486253499985, "rewards/rank_analyze_format_reward": 0.7081810683012009, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 592.4375, "epoch": 0.03592, "grad_norm": 0.03405920788645744, "kl": 0.009883880615234375, "learning_rate": 7.976815263412963e-07, "loss": -0.0354, "reward": 6.243209958076477, "reward_std": 1.0181850045919418, "rewards/mrr_reward": 0.43634671717882156, "rewards/rank_analyze_format_reward": 0.6361865997314453, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 630.953125, "epoch": 0.036, "grad_norm": 0.03643513098359108, "kl": 0.010174751281738281, "learning_rate": 7.732726012988512e-07, "loss": -0.0255, "reward": 6.52116322517395, "reward_std": 0.8739643394947052, "rewards/mrr_reward": 0.45942460745573044, "rewards/rank_analyze_format_reward": 0.7518241107463837, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.984375, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 581.453125, "epoch": 0.03608, "grad_norm": 0.037287477403879166, "kl": 0.01116180419921875, "learning_rate": 7.492279316554207e-07, "loss": -0.0188, "reward": 6.294661641120911, "reward_std": 1.3103710114955902, "rewards/mrr_reward": 0.4556175582110882, "rewards/rank_analyze_format_reward": 0.6205140501260757, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 611.671875, "epoch": 0.03616, "grad_norm": 0.039395540952682495, "kl": 0.01100921630859375, "learning_rate": 7.255484666533874e-07, "loss": 0.0032, "reward": 6.90537166595459, "reward_std": 0.7694846913218498, "rewards/mrr_reward": 0.5536458417773247, "rewards/rank_analyze_format_reward": 0.7124979794025421, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.015399639494717121, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 629.96875, "epoch": 0.03624, "grad_norm": 0.03781802952289581, "kl": 0.010683059692382812, "learning_rate": 7.022351411174866e-07, "loss": 0.0104, "reward": 5.850533366203308, "reward_std": 0.9768451899290085, "rewards/mrr_reward": 0.3016926981508732, "rewards/rank_analyze_format_reward": 0.829309344291687, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 1.0, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 615.96875, "epoch": 0.03632, "grad_norm": 0.03431001305580139, "kl": 0.009929656982421875, "learning_rate": 6.792888754178906e-07, "loss": -0.012, "reward": 6.1610002517700195, "reward_std": 0.8694123476743698, "rewards/mrr_reward": 0.39418403804302216, "rewards/rank_analyze_format_reward": 0.7350245714187622, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9949321895837784, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9793071895837784, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 613.671875, "epoch": 0.0364, "grad_norm": 0.03638536110520363, "kl": 0.010156631469726562, "learning_rate": 6.567105754338798e-07, "loss": -0.0044, "reward": 7.308522462844849, "reward_std": 0.9199118688702583, "rewards/mrr_reward": 0.6604166626930237, "rewards/rank_analyze_format_reward": 0.7781838029623032, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 598.859375, "epoch": 0.03648, "grad_norm": 0.03470359370112419, "kl": 0.009462356567382812, "learning_rate": 6.345011325180772e-07, "loss": -0.0033, "reward": 6.756022214889526, "reward_std": 1.15298330783844, "rewards/mrr_reward": 0.5096850246191025, "rewards/rank_analyze_format_reward": 0.7817352265119553, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 637.296875, "epoch": 0.03656, "grad_norm": 0.034352075308561325, "kl": 0.008943557739257812, "learning_rate": 6.126614234612593e-07, "loss": 0.0134, "reward": 6.162580370903015, "reward_std": 0.8458864092826843, "rewards/mrr_reward": 0.3814049959182739, "rewards/rank_analyze_format_reward": 0.7811831086874008, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9679276347160339, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 609.828125, "epoch": 0.03664, "grad_norm": 0.035168495029211044, "kl": 0.0104522705078125, "learning_rate": 5.911923104577455e-07, "loss": -0.0336, "reward": 6.1486005783081055, "reward_std": 1.0874281823635101, "rewards/mrr_reward": 0.4020957425236702, "rewards/rank_analyze_format_reward": 0.7062332183122635, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 644.359375, "epoch": 0.03672, "grad_norm": 0.03987110033631325, "kl": 0.011552810668945312, "learning_rate": 5.700946410713548e-07, "loss": 0.0247, "reward": 7.164777755737305, "reward_std": 0.8137249499559402, "rewards/mrr_reward": 0.6104786545038223, "rewards/rank_analyze_format_reward": 0.8068473786115646, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 638.21875, "epoch": 0.0368, "grad_norm": 0.0355086512863636, "kl": 0.009400367736816406, "learning_rate": 5.49369248201953e-07, "loss": 0.0035, "reward": 6.449381470680237, "reward_std": 0.8406965732574463, "rewards/mrr_reward": 0.46933283284306526, "rewards/rank_analyze_format_reward": 0.7418571263551712, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 622.515625, "epoch": 0.03688, "grad_norm": 0.0342673696577549, "kl": 0.010198593139648438, "learning_rate": 5.290169500525577e-07, "loss": -0.0001, "reward": 6.792163372039795, "reward_std": 0.8770151287317276, "rewards/mrr_reward": 0.532372273504734, "rewards/rank_analyze_format_reward": 0.7558075189590454, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 618.671875, "epoch": 0.03696, "grad_norm": 0.05343034863471985, "kl": 0.01750946044921875, "learning_rate": 5.090385500970551e-07, "loss": -0.0201, "reward": 6.158111333847046, "reward_std": 1.4306797087192535, "rewards/mrr_reward": 0.4081101268529892, "rewards/rank_analyze_format_reward": 0.7109093144536018, "rewards/rank_answer_foramt_reward": 0.87109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 588.484375, "epoch": 0.03704, "grad_norm": 0.03817480802536011, "kl": 0.009739875793457031, "learning_rate": 4.894348370484648e-07, "loss": -0.0067, "reward": 7.117258071899414, "reward_std": 1.0584193989634514, "rewards/mrr_reward": 0.6482390910387039, "rewards/rank_analyze_format_reward": 0.6141452640295029, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 603.34375, "epoch": 0.03712, "grad_norm": 0.03466404229402542, "kl": 0.012298583984375, "learning_rate": 4.702065848278126e-07, "loss": -0.0413, "reward": 7.422214508056641, "reward_std": 0.8890604227781296, "rewards/mrr_reward": 0.682440496981144, "rewards/rank_analyze_format_reward": 0.7002649903297424, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 585.109375, "epoch": 0.0372, "grad_norm": 0.07943949103355408, "kl": 0.028760910034179688, "learning_rate": 4.5135455253357053e-07, "loss": -0.0282, "reward": 7.254140734672546, "reward_std": 1.0048578381538391, "rewards/mrr_reward": 0.6743055582046509, "rewards/rank_analyze_format_reward": 0.6506686955690384, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 634.90625, "epoch": 0.03728, "grad_norm": 0.036992065608501434, "kl": 0.008815765380859375, "learning_rate": 4.3287948441169457e-07, "loss": -0.0208, "reward": 6.762825846672058, "reward_std": 1.450620323419571, "rewards/mrr_reward": 0.5584201440215111, "rewards/rank_analyze_format_reward": 0.6478208154439926, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9836309552192688, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 614.953125, "epoch": 0.03736, "grad_norm": 0.03635398671030998, "kl": 0.013956069946289062, "learning_rate": 4.1478210982624055e-07, "loss": -0.0085, "reward": 6.211694121360779, "reward_std": 1.2436326742172241, "rewards/mrr_reward": 0.3837921619415283, "rewards/rank_analyze_format_reward": 0.7839471846818924, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 601.296875, "epoch": 0.03744, "grad_norm": 0.03799626603722572, "kl": 0.0088653564453125, "learning_rate": 3.9706314323056936e-07, "loss": 0.009, "reward": 6.51821494102478, "reward_std": 1.0044037997722626, "rewards/mrr_reward": 0.5042286813259125, "rewards/rank_analyze_format_reward": 0.6204409003257751, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.96875, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 660.515625, "epoch": 0.03752, "grad_norm": 0.03617721423506737, "kl": 0.010265350341796875, "learning_rate": 3.7972328413914074e-07, "loss": -0.0196, "reward": 6.52497124671936, "reward_std": 1.3645110726356506, "rewards/mrr_reward": 0.49324777349829674, "rewards/rank_analyze_format_reward": 0.7527287006378174, "rewards/rank_answer_foramt_reward": 0.841796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 637.078125, "epoch": 0.0376, "grad_norm": 0.034526970237493515, "kl": 0.010463714599609375, "learning_rate": 3.627632170999029e-07, "loss": 0.0019, "reward": 6.943032145500183, "reward_std": 1.1558240801095963, "rewards/mrr_reward": 0.5853918790817261, "rewards/rank_analyze_format_reward": 0.7242032513022423, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 603.796875, "epoch": 0.03768, "grad_norm": 0.03654225543141365, "kl": 0.009853363037109375, "learning_rate": 3.4618361166726123e-07, "loss": -0.0186, "reward": 6.777468323707581, "reward_std": 0.8651553392410278, "rewards/mrr_reward": 0.5183097645640373, "rewards/rank_analyze_format_reward": 0.7641593515872955, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 634.984375, "epoch": 0.03776, "grad_norm": 0.037216901779174805, "kl": 0.010089874267578125, "learning_rate": 3.2998512237565005e-07, "loss": -0.0256, "reward": 6.442400097846985, "reward_std": 0.7653128877282143, "rewards/mrr_reward": 0.4249566048383713, "rewards/rank_analyze_format_reward": 0.8017018139362335, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.013137437403202057, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 625.65625, "epoch": 0.03784, "grad_norm": 0.03593156486749649, "kl": 0.009992599487304688, "learning_rate": 3.1416838871368925e-07, "loss": 0.0176, "reward": 6.9145790338516235, "reward_std": 1.224390022456646, "rewards/mrr_reward": 0.5557229667901993, "rewards/rank_analyze_format_reward": 0.8166871517896652, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 592.109375, "epoch": 0.03792, "grad_norm": 0.03700824826955795, "kl": 0.009860992431640625, "learning_rate": 2.987340350989421e-07, "loss": -0.0362, "reward": 6.408146023750305, "reward_std": 0.7867753058671951, "rewards/mrr_reward": 0.4782242253422737, "rewards/rank_analyze_format_reward": 0.5636085942387581, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 567.09375, "epoch": 0.038, "grad_norm": 0.038203008472919464, "kl": 0.010484695434570312, "learning_rate": 2.836826708532603e-07, "loss": -0.0324, "reward": 7.634762167930603, "reward_std": 1.2639281451702118, "rewards/mrr_reward": 0.7582031190395355, "rewards/rank_analyze_format_reward": 0.6603135764598846, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 618.5, "epoch": 0.03808, "grad_norm": 0.03985745459794998, "kl": 0.0098419189453125, "learning_rate": 2.6901489017873375e-07, "loss": -0.0205, "reward": 7.322023034095764, "reward_std": 0.9801450744271278, "rewards/mrr_reward": 0.6473462358117104, "rewards/rank_analyze_format_reward": 0.7939786314964294, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.996673658490181, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.996673658490181, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 626.140625, "epoch": 0.03816, "grad_norm": 0.036369435489177704, "kl": 0.010087966918945312, "learning_rate": 2.547312721342277e-07, "loss": -0.0221, "reward": 6.455911755561829, "reward_std": 0.8561778645962477, "rewards/mrr_reward": 0.45921996980905533, "rewards/rank_analyze_format_reward": 0.7321979850530624, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 636.359375, "epoch": 0.03824, "grad_norm": 0.0350259467959404, "kl": 0.010454177856445312, "learning_rate": 2.4083238061252565e-07, "loss": 0.0142, "reward": 6.237196445465088, "reward_std": 1.073834478855133, "rewards/mrr_reward": 0.3946118652820587, "rewards/rank_analyze_format_reward": 0.7696539908647537, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 583.25, "epoch": 0.03832, "grad_norm": 0.03830111399292946, "kl": 0.012111663818359375, "learning_rate": 2.273187643180652e-07, "loss": 0.0132, "reward": 6.970828056335449, "reward_std": 1.1110046803951263, "rewards/mrr_reward": 0.6242187544703484, "rewards/rank_analyze_format_reward": 0.6432948112487793, "rewards/rank_answer_foramt_reward": 0.896484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9827118366956711, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9827118366956711, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 630.625, "epoch": 0.0384, "grad_norm": 0.03383020684123039, "kl": 0.008692741394042969, "learning_rate": 2.1419095674527934e-07, "loss": -0.0146, "reward": 7.096667051315308, "reward_std": 1.0939511805772781, "rewards/mrr_reward": 0.6235615238547325, "rewards/rank_analyze_format_reward": 0.6802377104759216, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 631.671875, "epoch": 0.03848, "grad_norm": 0.03305520862340927, "kl": 0.00862884521484375, "learning_rate": 2.014494761575314e-07, "loss": -0.0096, "reward": 5.934294104576111, "reward_std": 0.5831545442342758, "rewards/mrr_reward": 0.31956226006150246, "rewards/rank_analyze_format_reward": 0.7341699302196503, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 625.75, "epoch": 0.03856, "grad_norm": 0.03797845542430878, "kl": 0.012144088745117188, "learning_rate": 1.8909482556666026e-07, "loss": 0.0039, "reward": 6.3423285484313965, "reward_std": 1.0918098539113998, "rewards/mrr_reward": 0.44094742834568024, "rewards/rank_analyze_format_reward": 0.6985926777124405, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137856245041, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9966137856245041, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 603.03125, "epoch": 0.03864, "grad_norm": 0.03884769231081009, "kl": 0.011470794677734375, "learning_rate": 1.7712749271311392e-07, "loss": -0.0123, "reward": 6.4430999755859375, "reward_std": 1.1809905469417572, "rewards/mrr_reward": 0.44941095635294914, "rewards/rank_analyze_format_reward": 0.7616034150123596, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137856245041, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9966137856245041, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 615.53125, "epoch": 0.03872, "grad_norm": 0.03682565316557884, "kl": 0.012861251831054688, "learning_rate": 1.6554795004670389e-07, "loss": -0.0146, "reward": 7.397425532341003, "reward_std": 1.2327516973018646, "rewards/mrr_reward": 0.7266927137970924, "rewards/rank_analyze_format_reward": 0.642998531460762, "rewards/rank_answer_foramt_reward": 0.87109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 646.671875, "epoch": 0.0388, "grad_norm": 0.034126777201890945, "kl": 0.009260177612304688, "learning_rate": 1.543566547079467e-07, "loss": -0.0285, "reward": 6.137201905250549, "reward_std": 1.233183205127716, "rewards/mrr_reward": 0.39816469699144363, "rewards/rank_analyze_format_reward": 0.7322130650281906, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9667119532823563, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9823369532823563, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 628.578125, "epoch": 0.03888, "grad_norm": 0.03833496570587158, "kl": 0.0088043212890625, "learning_rate": 1.4355404851001953e-07, "loss": -0.0292, "reward": 6.252779960632324, "reward_std": 0.6386439949274063, "rewards/mrr_reward": 0.3790612444281578, "rewards/rank_analyze_format_reward": 0.7674764841794968, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 610.953125, "epoch": 0.03896, "grad_norm": 0.03661027178168297, "kl": 0.009855270385742188, "learning_rate": 1.3314055792131964e-07, "loss": -0.0102, "reward": 6.001603722572327, "reward_std": 0.7615599036216736, "rewards/mrr_reward": 0.3329427130520344, "rewards/rank_analyze_format_reward": 0.7049891948699951, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 615.25, "epoch": 0.03904, "grad_norm": 0.03613339737057686, "kl": 0.010988235473632812, "learning_rate": 1.231165940486234e-07, "loss": -0.0115, "reward": 6.3978800773620605, "reward_std": 1.2636699676513672, "rewards/mrr_reward": 0.44477927312254906, "rewards/rank_analyze_format_reward": 0.7632941901683807, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 653.15625, "epoch": 0.03912, "grad_norm": 0.036585692316293716, "kl": 0.010234832763671875, "learning_rate": 1.134825526208605e-07, "loss": -0.0012, "reward": 6.1229846477508545, "reward_std": 1.0384633243083954, "rewards/mrr_reward": 0.3722408339381218, "rewards/rank_analyze_format_reward": 0.760859802365303, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 597.171875, "epoch": 0.0392, "grad_norm": 0.04195049777626991, "kl": 0.010122299194335938, "learning_rate": 1.0423881397349067e-07, "loss": -0.0278, "reward": 6.497667908668518, "reward_std": 1.2408464550971985, "rewards/mrr_reward": 0.4950520880520344, "rewards/rank_analyze_format_reward": 0.6833600848913193, "rewards/rank_answer_foramt_reward": 0.8359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 597.515625, "epoch": 0.03928, "grad_norm": 0.03897390887141228, "kl": 0.008887290954589844, "learning_rate": 9.538574303348813e-08, "loss": 0.0029, "reward": 6.999032139778137, "reward_std": 1.4336660504341125, "rewards/mrr_reward": 0.5883680731058121, "rewards/rank_analyze_format_reward": 0.687958374619484, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.014242256991565228, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 594.609375, "epoch": 0.03936, "grad_norm": 0.037622641772031784, "kl": 0.010972976684570312, "learning_rate": 8.692368930493522e-08, "loss": 0.0146, "reward": 6.304690718650818, "reward_std": 0.9507746249437332, "rewards/mrr_reward": 0.44383060559630394, "rewards/rank_analyze_format_reward": 0.6758524030447006, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.96875, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 646.375, "epoch": 0.03944, "grad_norm": 0.03410767763853073, "kl": 0.00960540771484375, "learning_rate": 7.885298685522235e-08, "loss": -0.0147, "reward": 6.207166433334351, "reward_std": 0.8575708866119385, "rewards/mrr_reward": 0.3724454566836357, "rewards/rank_analyze_format_reward": 0.8248064666986465, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 1.0, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 672.078125, "epoch": 0.03952, "grad_norm": 0.032211244106292725, "kl": 0.008607864379882812, "learning_rate": 7.117395430186414e-08, "loss": -0.0028, "reward": 6.419814825057983, "reward_std": 1.1731875389814377, "rewards/mrr_reward": 0.4230406805872917, "rewards/rank_analyze_format_reward": 0.8230469077825546, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 626.078125, "epoch": 0.0396, "grad_norm": 0.036501456052064896, "kl": 0.01013946533203125, "learning_rate": 6.388689479991606e-08, "loss": -0.0117, "reward": 6.6302900314331055, "reward_std": 1.359847217798233, "rewards/mrr_reward": 0.49904515594244003, "rewards/rank_analyze_format_reward": 0.8055609464645386, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 638.3125, "epoch": 0.03968, "grad_norm": 0.03620471432805061, "kl": 0.008656501770019531, "learning_rate": 5.699209603001077e-08, "loss": -0.0221, "reward": 7.734559535980225, "reward_std": 0.8384698703885078, "rewards/mrr_reward": 0.768446184694767, "rewards/rank_analyze_format_reward": 0.7662435621023178, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.96875, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 607.265625, "epoch": 0.03976, "grad_norm": 0.03670933097600937, "kl": 0.010890960693359375, "learning_rate": 5.048983018699827e-08, "loss": -0.0255, "reward": 6.960987329483032, "reward_std": 1.200978696346283, "rewards/mrr_reward": 0.5852182433009148, "rewards/rank_analyze_format_reward": 0.7158172428607941, "rewards/rank_answer_foramt_reward": 0.912109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 598.78125, "epoch": 0.03984, "grad_norm": 0.03467821702361107, "kl": 0.011655807495117188, "learning_rate": 4.438035396920004e-08, "loss": -0.0143, "reward": 6.67683732509613, "reward_std": 1.3541464805603027, "rewards/mrr_reward": 0.5460937470197678, "rewards/rank_analyze_format_reward": 0.6230134591460228, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 617.90625, "epoch": 0.03992, "grad_norm": 0.03608880192041397, "kl": 0.010183334350585938, "learning_rate": 3.866390856827495e-08, "loss": 0.0055, "reward": 6.798374891281128, "reward_std": 1.0321412235498428, "rewards/mrr_reward": 0.5455481261014938, "rewards/rank_analyze_format_reward": 0.7584522217512131, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 638.640625, "epoch": 0.04, "grad_norm": 0.03619951009750366, "kl": 0.009807586669921875, "learning_rate": 3.3340719659701315e-08, "loss": -0.0346, "reward": 6.6675262451171875, "reward_std": 0.9007093012332916, "rewards/mrr_reward": 0.48638393729925156, "rewards/rank_analyze_format_reward": 0.7921882122755051, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 500 }, { "epoch": 0.04, "step": 500, "total_flos": 0.0, "train_loss": -0.014922690442996099, "train_runtime": 140889.6657, "train_samples_per_second": 0.227, "train_steps_per_second": 0.004 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }