{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998452810727179, "eval_steps": 500, "global_step": 242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 668.921875, "completions/mean_terminated_length": 668.921875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.0041258380608561115, "grad_norm": 0.10799287259578705, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 566496.0, "reward": 0.1473214328289032, "reward_std": 0.15947088599205017, "rewards/code_format_reward/mean": 0.015625, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 661.0089721679688, "completions/mean_terminated_length": 661.0089721679688, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.008251676121712223, "grad_norm": 0.11381220817565918, "kl": 8.106231689453125e-05, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 1111968.0, "reward": 0.1540178656578064, "reward_std": 0.16666662693023682, "rewards/code_format_reward/mean": 0.0066964286379516125, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 671.9486694335938, "completions/mean_terminated_length": 671.9486694335938, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.012377514182568335, "grad_norm": 0.1249830350279808, "kl": 8.392333984375e-05, "learning_rate": 1e-06, "loss": -0.0116, "num_tokens": 1665400.0, "reward": 0.2008928656578064, "reward_std": 0.22945983707904816, "rewards/code_format_reward/mean": 0.013392857275903225, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.1875, "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 703.0803833007812, "completions/mean_terminated_length": 703.0803833007812, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.016503352243424446, "grad_norm": 0.11092118918895721, "kl": 8.213520050048828e-05, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 2260401.0, "reward": 0.1383928656578064, "reward_std": 0.17362166941165924, "rewards/code_format_reward/mean": 0.008928571827709675, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1294642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3360883891582489, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2443.0, "completions/mean_length": 737.0491333007812, "completions/mean_terminated_length": 729.53466796875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.020629190304280558, "grad_norm": 0.10656815767288208, "kl": 8.863210678100586e-05, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 2869542.0, "reward": 0.113839291036129, "reward_std": 0.15831470489501953, "rewards/code_format_reward/mean": 0.02008928544819355, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 688.6563110351562, "completions/mean_terminated_length": 681.0335693359375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.02475502836513667, "grad_norm": 0.1315726637840271, "kl": 9.709596633911133e-05, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 3453363.0, "reward": 0.212053582072258, "reward_std": 0.2333904653787613, "rewards/code_format_reward/mean": 0.02008928544819355, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 689.3348388671875, "completions/mean_terminated_length": 689.3348388671875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.02888086642599278, "grad_norm": 0.12198060005903244, "kl": 0.00011324882507324219, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 4042927.0, "reward": 0.1875000149011612, "reward_std": 0.21217122673988342, "rewards/code_format_reward/mean": 0.0223214291036129, "rewards/code_format_reward/std": 0.14789186418056488, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175676226615906, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 664.5491333007812, "completions/mean_terminated_length": 664.5491333007812, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.03300670448684889, "grad_norm": 0.1352839320898056, "kl": 0.00013583898544311523, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 4603978.0, "reward": 0.1763392984867096, "reward_std": 0.22943782806396484, "rewards/code_format_reward/mean": 0.02901785634458065, "rewards/code_format_reward/std": 0.16804419457912445, "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 716.2545166015625, "completions/mean_terminated_length": 708.6935424804688, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.037132542547705004, "grad_norm": 0.11896871030330658, "kl": 0.0001590251922607422, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 5186359.0, "reward": 0.1696428656578064, "reward_std": 0.2090558409690857, "rewards/code_format_reward/mean": 0.013392857275903225, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.15625, "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 672.8772583007812, "completions/mean_terminated_length": 672.8772583007812, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.041258380608561115, "grad_norm": 0.13409991562366486, "kl": 0.0002256631851196289, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 5758253.0, "reward": 0.1473214328289032, "reward_std": 0.2007039338350296, "rewards/code_format_reward/mean": 0.02678571455180645, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 643.1741333007812, "completions/mean_terminated_length": 643.1741333007812, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.04538421866941723, "grad_norm": 0.14812405407428741, "kl": 0.00032067298889160156, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 6311443.0, "reward": 0.2276785969734192, "reward_std": 0.26675671339035034, "rewards/code_format_reward/mean": 0.0513392873108387, "rewards/code_format_reward/std": 0.22093556821346283, "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 668.1942138671875, "completions/mean_terminated_length": 668.1942138671875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.04951005673027334, "grad_norm": 0.12490642070770264, "kl": 0.0004642009735107422, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 6878733.0, "reward": 0.1875000149011612, "reward_std": 0.22568528354167938, "rewards/code_format_reward/mean": 0.0535714291036129, "rewards/code_format_reward/std": 0.225421741604805, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 674.5982666015625, "completions/mean_terminated_length": 674.5982666015625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.05363589479112945, "grad_norm": 0.16906772553920746, "kl": 0.000667572021484375, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 7449452.0, "reward": 0.2589285969734192, "reward_std": 0.27419817447662354, "rewards/code_format_reward/mean": 0.1116071417927742, "rewards/code_format_reward/std": 0.31523454189300537, "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 640.0513916015625, "completions/mean_terminated_length": 640.0513916015625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.05776173285198556, "grad_norm": 0.1850261241197586, "kl": 0.0011305809020996094, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 8021298.0, "reward": 0.3727678656578064, "reward_std": 0.3690127730369568, "rewards/code_format_reward/mean": 0.1540178507566452, "rewards/code_format_reward/std": 0.36136937141418457, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.41923147439956665, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 630.685302734375, "completions/mean_terminated_length": 630.685302734375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.06188757091284167, "grad_norm": 0.2137249857187271, "kl": 0.0015087127685546875, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 8558421.0, "reward": 0.4040178954601288, "reward_std": 0.42001262307167053, "rewards/code_format_reward/mean": 0.1808035671710968, "rewards/code_format_reward/std": 0.3852856159210205, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 670.8147583007812, "completions/mean_terminated_length": 655.4552001953125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.06601340897369778, "grad_norm": 0.2017928510904312, "kl": 0.001926422119140625, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 9143538.0, "reward": 0.3950892984867096, "reward_std": 0.4540654122829437, "rewards/code_format_reward/mean": 0.2321428507566452, "rewards/code_format_reward/std": 0.4226716160774231, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 667.6428833007812, "completions/mean_terminated_length": 667.6428833007812, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.07013924703455389, "grad_norm": 0.21064649522304535, "kl": 0.0026302337646484375, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 9701705.0, "reward": 0.4464286267757416, "reward_std": 0.49284079670906067, "rewards/code_format_reward/mean": 0.2790178656578064, "rewards/code_format_reward/std": 0.449017733335495, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.38554468750953674, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 623.09375, "completions/mean_terminated_length": 623.09375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.07426508509541001, "grad_norm": 0.24700897932052612, "kl": 0.00388336181640625, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 10250232.0, "reward": 0.6383928656578064, "reward_std": 0.5543317198753357, "rewards/code_format_reward/mean": 0.4151785671710968, "rewards/code_format_reward/std": 0.49330368638038635, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 648.0067138671875, "completions/mean_terminated_length": 632.5448608398438, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.07839092315626611, "grad_norm": 0.24264982342720032, "kl": 0.00533294677734375, "learning_rate": 1e-06, "loss": 0.0486, "num_tokens": 10831018.0, "reward": 0.578125, "reward_std": 0.5442748069763184, "rewards/code_format_reward/mean": 0.5, "rewards/code_format_reward/std": 0.5005589723587036, "rewards/curriculum_aware_reward_fn/mean": 0.078125, "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 632.3392944335938, "completions/mean_terminated_length": 632.3392944335938, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.08251676121712223, "grad_norm": 0.21786393225193024, "kl": 0.00759124755859375, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 11383491.0, "reward": 0.8147321939468384, "reward_std": 0.5451315641403198, "rewards/code_format_reward/mean": 0.6383928656578064, "rewards/code_format_reward/std": 0.48100295662879944, "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 634.1763916015625, "completions/mean_terminated_length": 634.1763916015625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.08664259927797834, "grad_norm": 0.2021212875843048, "kl": 0.0095367431640625, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 11928583.0, "reward": 0.8928571939468384, "reward_std": 0.4793805480003357, "rewards/code_format_reward/mean": 0.7433035969734192, "rewards/code_format_reward/std": 0.4372987747192383, "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, "rewards/curriculum_aware_reward_fn/std": 0.3570319712162018, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 603.0892944335938, "completions/mean_terminated_length": 603.0892944335938, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.09076843733883445, "grad_norm": 0.1993289738893509, "kl": 0.01096343994140625, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 12455898.0, "reward": 1.0290179252624512, "reward_std": 0.4283655285835266, "rewards/code_format_reward/mean": 0.8526785969734192, "rewards/code_format_reward/std": 0.3548222780227661, "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, "rewards/curriculum_aware_reward_fn/std": 0.3873537480831146, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 616.935302734375, "completions/mean_terminated_length": 593.4808959960938, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.09489427539969056, "grad_norm": 0.1654028296470642, "kl": 0.01332855224609375, "learning_rate": 1e-06, "loss": 0.0484, "num_tokens": 12994834.0, "reward": 0.9866071939468384, "reward_std": 0.29616448283195496, "rewards/code_format_reward/mean": 0.8973214030265808, "rewards/code_format_reward/std": 0.30387791991233826, "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, "rewards/curriculum_aware_reward_fn/std": 0.2854745090007782, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 590.9442138671875, "completions/mean_terminated_length": 590.9442138671875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.09902011346054668, "grad_norm": 0.15793830156326294, "kl": 0.01537322998046875, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 13531842.0, "reward": 1.0982143878936768, "reward_std": 0.2986489236354828, "rewards/code_format_reward/mean": 0.9330357313156128, "rewards/code_format_reward/std": 0.2502395808696747, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175676226615906, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 581.3705444335938, "completions/mean_terminated_length": 581.3705444335938, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.10314595152140278, "grad_norm": 0.16621442139148712, "kl": 0.01451873779296875, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 14062713.0, "reward": 1.125, "reward_std": 0.2694872319698334, "rewards/code_format_reward/mean": 0.953125, "rewards/code_format_reward/std": 0.21160738170146942, "rewards/curriculum_aware_reward_fn/mean": 0.171875, "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 584.8549194335938, "completions/mean_terminated_length": 577.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1072717895822589, "grad_norm": 0.1315479427576065, "kl": 0.01580047607421875, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 14602047.0, "reward": 1.0848214626312256, "reward_std": 0.24191585183143616, "rewards/code_format_reward/mean": 0.953125, "rewards/code_format_reward/std": 0.21160738170146942, "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 587.4241333007812, "completions/mean_terminated_length": 579.574951171875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.111397627643115, "grad_norm": 0.1357060819864273, "kl": 0.0148773193359375, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 15140733.0, "reward": 1.1160714626312256, "reward_std": 0.20137658715248108, "rewards/code_format_reward/mean": 0.9620535969734192, "rewards/code_format_reward/std": 0.19128035008907318, "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 551.857177734375, "completions/mean_terminated_length": 551.857177734375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.11552346570397112, "grad_norm": 0.1520267277956009, "kl": 0.017547607421875, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 15659173.0, "reward": 1.118303656578064, "reward_std": 0.24786671996116638, "rewards/code_format_reward/mean": 0.9553571343421936, "rewards/code_format_reward/std": 0.2067493349313736, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 549.4642944335938, "completions/mean_terminated_length": 549.4642944335938, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.11964930376482723, "grad_norm": 0.13469964265823364, "kl": 0.01837158203125, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 16182323.0, "reward": 1.1607143878936768, "reward_std": 0.19862547516822815, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3526.0, "completions/mean_length": 599.1205444335938, "completions/mean_terminated_length": 591.2975463867188, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.12377514182568335, "grad_norm": 0.13613486289978027, "kl": 0.01648712158203125, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 16714097.0, "reward": 1.1205357313156128, "reward_std": 0.22188395261764526, "rewards/code_format_reward/mean": 0.96875, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1741.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 552.9420166015625, "completions/mean_terminated_length": 552.9420166015625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.12790097988653945, "grad_norm": 0.14838948845863342, "kl": 0.01715087890625, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 17243747.0, "reward": 1.171875, "reward_std": 0.23096007108688354, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 561.5402221679688, "completions/mean_terminated_length": 553.6331176757812, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.13202681794739557, "grad_norm": 0.11722318828105927, "kl": 0.01806640625, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 17765895.0, "reward": 1.1540179252624512, "reward_std": 0.17504097521305084, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 551.763427734375, "completions/mean_terminated_length": 551.763427734375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.1361526560082517, "grad_norm": 0.13597670197486877, "kl": 0.0184478759765625, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 18290726.0, "reward": 1.1540179252624512, "reward_std": 0.1896202266216278, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 585.1004638671875, "completions/mean_terminated_length": 577.24609375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.14027849406910778, "grad_norm": 0.12794998288154602, "kl": 0.0279083251953125, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 18814021.0, "reward": 1.1049107313156128, "reward_std": 0.16800154745578766, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774, "rewards/curriculum_aware_reward_fn/std": 0.32332828640937805, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 546.5670166015625, "completions/mean_terminated_length": 538.6264038085938, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1444043321299639, "grad_norm": 0.13102807104587555, "kl": 0.018310546875, "learning_rate": 1e-06, "loss": 0.0365, "num_tokens": 19343279.0, "reward": 1.1741071939468384, "reward_std": 0.18075977265834808, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.38894903659820557, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 562.859375, "completions/mean_terminated_length": 562.859375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.14853017019082002, "grad_norm": 0.12028873711824417, "kl": 0.0249481201171875, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 19864716.0, "reward": 1.133928656578064, "reward_std": 0.1538485586643219, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.3525845408439636, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 558.2991333007812, "completions/mean_terminated_length": 558.2991333007812, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.15265600825167613, "grad_norm": 0.1312171369791031, "kl": 0.0178375244140625, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 20382906.0, "reward": 1.1138393878936768, "reward_std": 0.2045470029115677, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2174.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 553.5892944335938, "completions/mean_terminated_length": 553.5892944335938, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.15678184631253222, "grad_norm": 0.11422319710254669, "kl": 0.018768310546875, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 20888402.0, "reward": 1.1875, "reward_std": 0.17206379771232605, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 581.9910888671875, "completions/mean_terminated_length": 574.1297607421875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.16090768437338834, "grad_norm": 0.1188901886343956, "kl": 0.017364501953125, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 21419775.0, "reward": 1.1160714626312256, "reward_std": 0.18699714541435242, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2615.0, "completions/max_terminated_length": 2615.0, "completions/mean_length": 571.1808471679688, "completions/mean_terminated_length": 571.1808471679688, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.16503352243424446, "grad_norm": 0.1219087690114975, "kl": 0.0186920166015625, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 21939305.0, "reward": 1.1227679252624512, "reward_std": 0.19861508905887604, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2388.0, "completions/mean_length": 579.6741333007812, "completions/mean_terminated_length": 571.8076171875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.16915936049510058, "grad_norm": 0.12759414315223694, "kl": 0.0255584716796875, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 22469619.0, "reward": 1.1473214626312256, "reward_std": 0.2073148787021637, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3233.0, "completions/max_terminated_length": 3233.0, "completions/mean_length": 520.4330444335938, "completions/mean_terminated_length": 520.4330444335938, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.17328519855595667, "grad_norm": 0.14117014408111572, "kl": 0.020721435546875, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 22944147.0, "reward": 1.2254464626312256, "reward_std": 0.22889530658721924, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3966.0, "completions/max_terminated_length": 3966.0, "completions/mean_length": 569.7076416015625, "completions/mean_terminated_length": 569.7076416015625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1774110366168128, "grad_norm": 0.11849313229322433, "kl": 0.018157958984375, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 23460056.0, "reward": 1.140625, "reward_std": 0.1712193638086319, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 576.7589721679688, "completions/mean_terminated_length": 568.8859252929688, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.1815368746776689, "grad_norm": 0.13535721600055695, "kl": 0.0171051025390625, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 23963941.0, "reward": 1.1741071939468384, "reward_std": 0.22849145531654358, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, "rewards/curriculum_aware_reward_fn/std": 0.3985181450843811, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 601.53125, "completions/mean_terminated_length": 593.713623046875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.18566271273852503, "grad_norm": 0.11846836656332016, "kl": 0.0178985595703125, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 24522849.0, "reward": 1.1406251192092896, "reward_std": 0.16945691406726837, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2539.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 563.6138916015625, "completions/mean_terminated_length": 563.6138916015625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.18978855079938112, "grad_norm": 0.1301787793636322, "kl": 0.017669677734375, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 25043497.0, "reward": 1.1428571939468384, "reward_std": 0.21056850254535675, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2100.0, "completions/mean_length": 582.7857666015625, "completions/mean_terminated_length": 574.9261474609375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.19391438886023724, "grad_norm": 0.13183555006980896, "kl": 0.025726318359375, "learning_rate": 1e-06, "loss": 0.0312, "num_tokens": 25574715.0, "reward": 1.1674107313156128, "reward_std": 0.2145567387342453, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 561.357177734375, "completions/mean_terminated_length": 561.357177734375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.19804022692109335, "grad_norm": 0.11861719936132431, "kl": 0.018646240234375, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 26080884.0, "reward": 1.1852679252624512, "reward_std": 0.17879967391490936, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 574.2142944335938, "completions/mean_terminated_length": 566.3355712890625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.20216606498194944, "grad_norm": 0.13924604654312134, "kl": 0.01712799072265625, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 26608066.0, "reward": 1.1830357313156128, "reward_std": 0.23577159643173218, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1875, "rewards/curriculum_aware_reward_fn/std": 0.402036190032959, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 569.015625, "completions/mean_terminated_length": 561.1253051757812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.20629190304280556, "grad_norm": 0.12972353398799896, "kl": 0.0221405029296875, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 27123441.0, "reward": 1.1875, "reward_std": 0.19132289290428162, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 554.4174194335938, "completions/mean_terminated_length": 554.4174194335938, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.21041774110366168, "grad_norm": 0.12087388336658478, "kl": 0.016815185546875, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 27635720.0, "reward": 1.165178656578064, "reward_std": 0.15248996019363403, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175676226615906, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 632.21875, "completions/mean_terminated_length": 632.21875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.2145435791645178, "grad_norm": 0.08631845563650131, "kl": 0.0154571533203125, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 28210007.0, "reward": 1.102678656578064, "reward_std": 0.12690326571464539, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.109375, "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2132.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 601.0357666015625, "completions/mean_terminated_length": 601.0357666015625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.2186694172253739, "grad_norm": 0.21726638078689575, "kl": 0.05242919921875, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 28743087.0, "reward": 1.1808037757873535, "reward_std": 0.1981913149356842, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 584.5982666015625, "completions/mean_terminated_length": 584.5982666015625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.22279525528623, "grad_norm": 0.12823925912380219, "kl": 0.017669677734375, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 29258499.0, "reward": 1.140625, "reward_std": 0.21394827961921692, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 612.7902221679688, "completions/mean_terminated_length": 612.7902221679688, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.22692109334708613, "grad_norm": 0.11272678524255753, "kl": 0.0160064697265625, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 29827351.0, "reward": 1.1316965818405151, "reward_std": 0.17087402939796448, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452, "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 611.3795166015625, "completions/mean_terminated_length": 611.3795166015625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.23104693140794225, "grad_norm": 0.10199145972728729, "kl": 0.016357421875, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 30390993.0, "reward": 1.0803571939468384, "reward_std": 0.11527875065803528, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 597.1027221679688, "completions/mean_terminated_length": 597.1027221679688, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.23517276946879834, "grad_norm": 0.12143676728010178, "kl": 0.01859283447265625, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 30936241.0, "reward": 1.21875, "reward_std": 0.20143161714076996, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.4326665699481964, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 575.671875, "completions/mean_terminated_length": 575.671875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.23929860752965446, "grad_norm": 0.12541887164115906, "kl": 0.01563262939453125, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 31455067.0, "reward": 1.1696429252624512, "reward_std": 0.18197356164455414, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, "rewards/curriculum_aware_reward_fn/std": 0.37962549924850464, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 616.9263916015625, "completions/mean_terminated_length": 609.1431884765625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.24342444559051057, "grad_norm": 0.11948662996292114, "kl": 0.0161285400390625, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 32001300.0, "reward": 1.165178656578064, "reward_std": 0.17752328515052795, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, "rewards/curriculum_aware_reward_fn/std": 0.37962549924850464, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 595.2678833007812, "completions/mean_terminated_length": 587.4362182617188, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2475502836513667, "grad_norm": 0.12722472846508026, "kl": 0.01483154296875, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 32549471.0, "reward": 1.196428656578064, "reward_std": 0.2070293128490448, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 631.9285888671875, "completions/mean_terminated_length": 624.178955078125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.2516761217122228, "grad_norm": 0.14178670942783356, "kl": 0.01406097412109375, "learning_rate": 1e-06, "loss": 0.0344, "num_tokens": 33092800.0, "reward": 1.2053571939468384, "reward_std": 0.2744157314300537, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2098214328289032, "rewards/curriculum_aware_reward_fn/std": 0.40763625502586365, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 603.2299194335938, "completions/mean_terminated_length": 595.4161376953125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.2558019597730789, "grad_norm": 0.13572575151920319, "kl": 0.016876220703125, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 33646834.0, "reward": 1.1897321939468384, "reward_std": 0.2664901316165924, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 611.3348388671875, "completions/mean_terminated_length": 595.7085571289062, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.259927797833935, "grad_norm": 0.11936476826667786, "kl": 0.01442718505859375, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 34183226.0, "reward": 1.203125, "reward_std": 0.21366269886493683, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 642.2455444335938, "completions/mean_terminated_length": 642.2455444335938, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.26405363589479114, "grad_norm": 0.11704245209693909, "kl": 0.01555633544921875, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 34750833.0, "reward": 1.1116071939468384, "reward_std": 0.19346334040164948, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 607.3147583007812, "completions/mean_terminated_length": 599.5100708007812, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.26817947395564723, "grad_norm": 0.12566031515598297, "kl": 0.02191925048828125, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 35292291.0, "reward": 1.1584821939468384, "reward_std": 0.19699904322624207, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 603.044677734375, "completions/mean_terminated_length": 595.2304077148438, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.2723053120165034, "grad_norm": 0.13625170290470123, "kl": 0.0141448974609375, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 35832494.0, "reward": 1.149553656578064, "reward_std": 0.2086959183216095, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, "rewards/curriculum_aware_reward_fn/std": 0.3757382929325104, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 602.109375, "completions/mean_terminated_length": 586.4417114257812, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.27643115007735947, "grad_norm": 0.1381009817123413, "kl": 0.013885498046875, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 36359328.0, "reward": 1.1763393878936768, "reward_std": 0.2517925798892975, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.426973819732666, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 602.9464721679688, "completions/mean_terminated_length": 595.1320190429688, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.28055698813821556, "grad_norm": 0.1229892149567604, "kl": 0.01766204833984375, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 36893726.0, "reward": 1.1674107313156128, "reward_std": 0.208026722073555, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 604.1763916015625, "completions/mean_terminated_length": 604.1763916015625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2846828261990717, "grad_norm": 0.1331387758255005, "kl": 0.01482391357421875, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 37422389.0, "reward": 1.1852679252624512, "reward_std": 0.19479824602603912, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 625.5892944335938, "completions/mean_terminated_length": 602.1932373046875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2888086642599278, "grad_norm": 0.134484201669693, "kl": 0.01519775390625, "learning_rate": 1e-06, "loss": 0.0574, "num_tokens": 37958272.0, "reward": 1.1383929252624512, "reward_std": 0.24126741290092468, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.15625, "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 587.359375, "completions/mean_terminated_length": 587.359375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2929345023207839, "grad_norm": 0.1287248283624649, "kl": 0.01821136474609375, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 38495081.0, "reward": 1.1875, "reward_std": 0.20400018990039825, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 625.84375, "completions/mean_terminated_length": 618.08056640625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.29706034038164003, "grad_norm": 0.13347220420837402, "kl": 0.0157470703125, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 39055914.0, "reward": 1.1383929252624512, "reward_std": 0.22964541614055634, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 589.4576416015625, "completions/mean_terminated_length": 589.4576416015625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3011861784424961, "grad_norm": 0.13391341269016266, "kl": 0.0176544189453125, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 39585242.0, "reward": 1.1428571939468384, "reward_std": 0.22163446247577667, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.14789186418056488, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.39509516954421997, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2017.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 638.2991333007812, "completions/mean_terminated_length": 638.2991333007812, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.30531201650335227, "grad_norm": 0.1166314035654068, "kl": 0.015380859375, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 40164003.0, "reward": 1.1227679252624512, "reward_std": 0.2151300609111786, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1637.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 612.3839721679688, "completions/mean_terminated_length": 612.3839721679688, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.30943785456420836, "grad_norm": 0.12206628918647766, "kl": 0.019195556640625, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 40704694.0, "reward": 1.1629464626312256, "reward_std": 0.19813279807567596, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.4200585186481476, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 619.9933471679688, "completions/mean_terminated_length": 619.9933471679688, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.31356369262506445, "grad_norm": 0.1261732131242752, "kl": 0.01723480224609375, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 41267428.0, "reward": 1.1450893878936768, "reward_std": 0.20383337140083313, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175676226615906, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 641.7076416015625, "completions/mean_terminated_length": 641.7076416015625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3176895306859206, "grad_norm": 0.11442097276449203, "kl": 0.01760101318359375, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 41834227.0, "reward": 1.243303656578064, "reward_std": 0.21721595525741577, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.2589285671710968, "rewards/curriculum_aware_reward_fn/std": 0.47289931774139404, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 632.2299194335938, "completions/mean_terminated_length": 632.2299194335938, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.3218153687467767, "grad_norm": 0.1199866384267807, "kl": 0.01998138427734375, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 42393014.0, "reward": 1.2053571939468384, "reward_std": 0.17362168431282043, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 616.8013916015625, "completions/mean_terminated_length": 616.8013916015625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.3259412068076328, "grad_norm": 0.13121961057186127, "kl": 0.01862335205078125, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 42940626.0, "reward": 1.140625, "reward_std": 0.19861890375614166, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 630.747802734375, "completions/mean_terminated_length": 615.2085571289062, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.3300670448684889, "grad_norm": 0.12364884465932846, "kl": 0.0167694091796875, "learning_rate": 1e-06, "loss": 0.0392, "num_tokens": 43485491.0, "reward": 1.1696430444717407, "reward_std": 0.18513384461402893, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 605.671875, "completions/mean_terminated_length": 597.863525390625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.334192882929345, "grad_norm": 0.1336803287267685, "kl": 0.017303466796875, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 44028236.0, "reward": 1.2522321939468384, "reward_std": 0.25795263051986694, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.2678571343421936, "rewards/curriculum_aware_reward_fn/std": 0.4726457893848419, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 638.4129638671875, "completions/mean_terminated_length": 630.6778564453125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.33831872099020116, "grad_norm": 0.11250525712966919, "kl": 0.0165863037109375, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 44582569.0, "reward": 1.1875, "reward_std": 0.19034792482852936, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.2098214328289032, "rewards/curriculum_aware_reward_fn/std": 0.40763622522354126, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 634.029052734375, "completions/mean_terminated_length": 634.029052734375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.34244455905105725, "grad_norm": 0.13481482863426208, "kl": 0.016998291015625, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 45158598.0, "reward": 1.1227679252624512, "reward_std": 0.21613793075084686, "rewards/code_format_reward/mean": 0.9642857313156128, "rewards/code_format_reward/std": 0.18578432500362396, "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 676.75, "completions/mean_terminated_length": 676.75, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.34657039711191334, "grad_norm": 0.10433954745531082, "kl": 0.016571044921875, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 45744715.0, "reward": 1.1004464626312256, "reward_std": 0.15173983573913574, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774, "rewards/curriculum_aware_reward_fn/std": 0.32332828640937805, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 616.357177734375, "completions/mean_terminated_length": 616.357177734375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3506962351727695, "grad_norm": 0.12722183763980865, "kl": 0.01569366455078125, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 46301571.0, "reward": 1.102678656578064, "reward_std": 0.21642474830150604, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.14789186418056488, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 602.5022583007812, "completions/mean_terminated_length": 602.5022583007812, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.3548220732336256, "grad_norm": 0.13291223347187042, "kl": 0.016021728515625, "learning_rate": 1e-06, "loss": 0.024, "num_tokens": 46829036.0, "reward": 1.1428571939468384, "reward_std": 0.20733731985092163, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 642.7210083007812, "completions/mean_terminated_length": 642.7210083007812, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.35894791129448167, "grad_norm": 0.16741511225700378, "kl": 0.03893280029296875, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 47401329.0, "reward": 1.1383929252624512, "reward_std": 0.24450711905956268, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 628.5424194335938, "completions/mean_terminated_length": 628.5424194335938, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.3630737493553378, "grad_norm": 0.12773050367832184, "kl": 0.01621246337890625, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 47958632.0, "reward": 1.203125, "reward_std": 0.23335763812065125, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 665.046875, "completions/mean_terminated_length": 665.046875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.3671995874161939, "grad_norm": 0.11331582814455032, "kl": 0.015838623046875, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 48538449.0, "reward": 1.15625, "reward_std": 0.19617268443107605, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175676226615906, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 634.9375, "completions/mean_terminated_length": 634.9375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.37132542547705005, "grad_norm": 0.11462749540805817, "kl": 0.01662445068359375, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 49102432.0, "reward": 1.1383929252624512, "reward_std": 0.18480999767780304, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 624.1473388671875, "completions/mean_terminated_length": 624.1473388671875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.37545126353790614, "grad_norm": 0.13003239035606384, "kl": 0.01912689208984375, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 49660230.0, "reward": 1.1361607313156128, "reward_std": 0.2249971330165863, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 642.044677734375, "completions/mean_terminated_length": 626.5560913085938, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.37957710159876223, "grad_norm": 0.13096508383750916, "kl": 0.01824951171875, "learning_rate": 1e-06, "loss": 0.0459, "num_tokens": 50206530.0, "reward": 1.1763393878936768, "reward_std": 0.22821438312530518, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.4439624845981598, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 604.8192138671875, "completions/mean_terminated_length": 604.8192138671875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3837029396596184, "grad_norm": 0.14180265367031097, "kl": 0.01837921142578125, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 50750958.0, "reward": 1.21875, "reward_std": 0.26692840456962585, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516, "rewards/curriculum_aware_reward_fn/std": 0.44197845458984375, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 634.872802734375, "completions/mean_terminated_length": 634.872802734375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.38782877772047447, "grad_norm": 0.11505790799856186, "kl": 0.0185699462890625, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 51300891.0, "reward": 1.1696429252624512, "reward_std": 0.19843201339244843, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39252740144729614, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 604.3013916015625, "completions/mean_terminated_length": 604.3013916015625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.39195461578133056, "grad_norm": 0.12556031346321106, "kl": 0.0222320556640625, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 51847348.0, "reward": 1.1741071939468384, "reward_std": 0.22430425882339478, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, "rewards/curriculum_aware_reward_fn/std": 0.3999190926551819, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 625.044677734375, "completions/mean_terminated_length": 625.044677734375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.3960804538421867, "grad_norm": 0.11164320260286331, "kl": 0.01787567138671875, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 52398597.0, "reward": 1.149553656578064, "reward_std": 0.18126830458641052, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3177.0, "completions/max_terminated_length": 3177.0, "completions/mean_length": 626.9710083007812, "completions/mean_terminated_length": 626.9710083007812, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.4002062919030428, "grad_norm": 0.1311129480600357, "kl": 0.0181884765625, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 52945459.0, "reward": 1.2165179252624512, "reward_std": 0.23649927973747253, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844924926758, "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516, "rewards/curriculum_aware_reward_fn/std": 0.4569111168384552, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 621.1830444335938, "completions/mean_terminated_length": 621.1830444335938, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4043321299638989, "grad_norm": 0.11989960819482803, "kl": 0.01808929443359375, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 53499423.0, "reward": 1.1897321939468384, "reward_std": 0.18616074323654175, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, "rewards/curriculum_aware_reward_fn/std": 0.3994380533695221, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 587.953125, "completions/mean_terminated_length": 580.1051635742188, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.40845796802475504, "grad_norm": 0.10423073917627335, "kl": 0.018463134765625, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 54033040.0, "reward": 1.1383929252624512, "reward_std": 0.15635327994823456, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 627.5714721679688, "completions/mean_terminated_length": 619.8120727539062, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.4125838060856111, "grad_norm": 0.1182645708322525, "kl": 0.024627685546875, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 54578827.0, "reward": 1.1607143878936768, "reward_std": 0.20138676464557648, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.38894903659820557, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 635.8951416015625, "completions/mean_terminated_length": 635.8951416015625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.4167096441464673, "grad_norm": 0.12514905631542206, "kl": 0.0258026123046875, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 55146718.0, "reward": 1.1227679252624512, "reward_std": 0.17645807564258575, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.14789186418056488, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 623.75, "completions/mean_terminated_length": 623.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.42083548220732336, "grad_norm": 0.11934695392847061, "kl": 0.016937255859375, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 55711506.0, "reward": 1.1897321939468384, "reward_std": 0.19643448293209076, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, "rewards/curriculum_aware_reward_fn/std": 0.4011160135269165, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 647.3973388671875, "completions/mean_terminated_length": 647.3973388671875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.42496132026817945, "grad_norm": 0.1259731650352478, "kl": 0.01915740966796875, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 56281974.0, "reward": 1.1875, "reward_std": 0.20970258116722107, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 653.1741333007812, "completions/mean_terminated_length": 653.1741333007812, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.4290871583290356, "grad_norm": 0.09783176332712173, "kl": 0.01717376708984375, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 56830579.0, "reward": 1.0602679252624512, "reward_std": 0.13404518365859985, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 566.125, "completions/mean_terminated_length": 566.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.4332129963898917, "grad_norm": 0.13474880158901215, "kl": 0.02336883544921875, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 57344281.0, "reward": 1.21875, "reward_std": 0.21563071012496948, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484, "rewards/curriculum_aware_reward_fn/std": 0.4268510043621063, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 624.140625, "completions/mean_terminated_length": 624.140625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4373388344507478, "grad_norm": 0.12046890705823898, "kl": 0.02100372314453125, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 57897932.0, "reward": 1.1383929252624512, "reward_std": 0.19550348818302155, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 616.5670166015625, "completions/mean_terminated_length": 616.5670166015625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.44146467251160393, "grad_norm": 0.12603536248207092, "kl": 0.01837158203125, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 58426959.0, "reward": 1.21875, "reward_std": 0.1976865828037262, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.234375, "rewards/curriculum_aware_reward_fn/std": 0.42408111691474915, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 611.6964721679688, "completions/mean_terminated_length": 603.9015502929688, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.44559051057246, "grad_norm": 0.13077113032341003, "kl": 0.021392822265625, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 58969127.0, "reward": 1.243303656578064, "reward_std": 0.2142733931541443, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2544642984867096, "rewards/curriculum_aware_reward_fn/std": 0.4360465705394745, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 602.03125, "completions/mean_terminated_length": 602.03125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.44971634863331617, "grad_norm": 0.23752839863300323, "kl": 0.047576904296875, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 59506216.0, "reward": 1.2366071939468384, "reward_std": 0.19941167533397675, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2455357164144516, "rewards/curriculum_aware_reward_fn/std": 0.43088552355766296, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 630.8080444335938, "completions/mean_terminated_length": 630.8080444335938, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.45384218669417226, "grad_norm": 0.12410211563110352, "kl": 0.017669677734375, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 60060838.0, "reward": 1.1919643878936768, "reward_std": 0.2088748961687088, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548, "rewards/curriculum_aware_reward_fn/std": 0.40441396832466125, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 590.529052734375, "completions/mean_terminated_length": 590.529052734375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.45796802475502835, "grad_norm": 0.1436229646205902, "kl": 0.0192108154296875, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 60570843.0, "reward": 1.2366071939468384, "reward_std": 0.2522122859954834, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3595.0, "completions/max_terminated_length": 3595.0, "completions/mean_length": 620.0535888671875, "completions/mean_terminated_length": 620.0535888671875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4620938628158845, "grad_norm": 0.12236104905605316, "kl": 0.0196990966796875, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 61130538.0, "reward": 1.1584821939468384, "reward_std": 0.20096704363822937, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 630.7142944335938, "completions/mean_terminated_length": 630.7142944335938, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.4662197008767406, "grad_norm": 0.1225244402885437, "kl": 0.0185546875, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 61698285.0, "reward": 1.1852679252624512, "reward_std": 0.19447438418865204, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 594.0201416015625, "completions/mean_terminated_length": 586.1856689453125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.4703455389375967, "grad_norm": 0.1265028417110443, "kl": 0.01844024658203125, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 62216595.0, "reward": 1.1540179252624512, "reward_std": 0.21466010808944702, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2905.0, "completions/max_terminated_length": 2905.0, "completions/mean_length": 616.450927734375, "completions/mean_terminated_length": 616.450927734375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.4744713769984528, "grad_norm": 0.13178738951683044, "kl": 0.01700592041015625, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 62754222.0, "reward": 1.25, "reward_std": 0.2588232457637787, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844924926758, "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, "rewards/curriculum_aware_reward_fn/std": 0.4608125388622284, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 623.7701416015625, "completions/mean_terminated_length": 616.0022583007812, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.4785972150593089, "grad_norm": 0.10957789421081543, "kl": 0.0179290771484375, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 63299125.0, "reward": 1.1830357313156128, "reward_std": 0.18335460126399994, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 620.732177734375, "completions/mean_terminated_length": 620.732177734375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.48272305312016506, "grad_norm": 0.12838409841060638, "kl": 0.017333984375, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 63856163.0, "reward": 1.2366071939468384, "reward_std": 0.2278459519147873, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.2544642984867096, "rewards/curriculum_aware_reward_fn/std": 0.4360465705394745, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 628.8303833007812, "completions/mean_terminated_length": 621.0738525390625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.48684889118102115, "grad_norm": 0.12741097807884216, "kl": 0.017852783203125, "learning_rate": 1e-06, "loss": 0.0342, "num_tokens": 64409101.0, "reward": 1.133928656578064, "reward_std": 0.2076984941959381, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2284.0, "completions/max_terminated_length": 2284.0, "completions/mean_length": 616.4866333007812, "completions/mean_terminated_length": 616.4866333007812, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.49097472924187724, "grad_norm": 0.13502834737300873, "kl": 0.01747894287109375, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 64957741.0, "reward": 1.1986607313156128, "reward_std": 0.25643712282180786, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.4722230136394501, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 593.984375, "completions/mean_terminated_length": 593.984375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.4951005673027334, "grad_norm": 0.13753342628479004, "kl": 0.0208587646484375, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 65486437.0, "reward": 1.1785714626312256, "reward_std": 0.24612975120544434, "rewards/code_format_reward/mean": 0.9665178656578064, "rewards/code_format_reward/std": 0.1800929754972458, "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968, "rewards/curriculum_aware_reward_fn/std": 0.40921953320503235, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 633.1942138671875, "completions/mean_terminated_length": 617.6659545898438, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.4992264053635895, "grad_norm": 0.12871219217777252, "kl": 0.018951416015625, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 66037256.0, "reward": 1.149553656578064, "reward_std": 0.23094110190868378, "rewards/code_format_reward/mean": 0.9665178656578064, "rewards/code_format_reward/std": 0.1800929754972458, "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 591.872802734375, "completions/mean_terminated_length": 591.872802734375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5033522434244456, "grad_norm": 0.1308499574661255, "kl": 0.01856231689453125, "learning_rate": 1e-06, "loss": -0.0144, "num_tokens": 66568310.0, "reward": 1.2232143878936768, "reward_std": 0.2293757200241089, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.234375, "rewards/curriculum_aware_reward_fn/std": 0.42408111691474915, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 634.4642944335938, "completions/mean_terminated_length": 634.4642944335938, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5074780814853017, "grad_norm": 0.1262272447347641, "kl": 0.0186920166015625, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 67134237.0, "reward": 1.1852679252624512, "reward_std": 0.21710634231567383, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 607.5, "completions/mean_terminated_length": 607.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5116039195461578, "grad_norm": 0.10594639182090759, "kl": 0.0202789306640625, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 67662464.0, "reward": 1.1517857313156128, "reward_std": 0.15240901708602905, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 614.2545166015625, "completions/mean_terminated_length": 614.2545166015625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5157297576070139, "grad_norm": 0.12127646803855896, "kl": 0.0180511474609375, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 68200136.0, "reward": 1.1852679252624512, "reward_std": 0.19039402902126312, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548, "rewards/curriculum_aware_reward_fn/std": 0.40441393852233887, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 613.4710083007812, "completions/mean_terminated_length": 613.4710083007812, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.51985559566787, "grad_norm": 0.13562914729118347, "kl": 0.0213470458984375, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 68734384.0, "reward": 1.2232143878936768, "reward_std": 0.23532813787460327, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.2477678507566452, "rewards/curriculum_aware_reward_fn/std": 0.4321989119052887, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 627.4910888671875, "completions/mean_terminated_length": 627.4910888671875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5239814337287262, "grad_norm": 0.12564301490783691, "kl": 0.01911163330078125, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 69288906.0, "reward": 1.1741071939468384, "reward_std": 0.2296229898929596, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.4088349938392639, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1849.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 636.8839721679688, "completions/mean_terminated_length": 636.8839721679688, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5281072717895823, "grad_norm": 0.09942315518856049, "kl": 0.0200653076171875, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 69840881.0, "reward": 1.15625, "reward_std": 0.14822185039520264, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 667.1986694335938, "completions/mean_terminated_length": 659.5279541015625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5322331098504384, "grad_norm": 0.1194535493850708, "kl": 0.02094268798828125, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 70437755.0, "reward": 1.165178656578064, "reward_std": 0.23538923263549805, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 606.2076416015625, "completions/mean_terminated_length": 606.2076416015625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5363589479112945, "grad_norm": 0.12562017142772675, "kl": 0.0233001708984375, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 70975980.0, "reward": 1.2544643878936768, "reward_std": 0.2233707308769226, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2611607015132904, "rewards/curriculum_aware_reward_fn/std": 0.46928995847702026, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 633.4085083007812, "completions/mean_terminated_length": 633.4085083007812, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5404847859721505, "grad_norm": 0.11832914501428604, "kl": 0.0189666748046875, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 71537752.0, "reward": 1.1741071939468384, "reward_std": 0.19277827441692352, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.38894903659820557, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 642.388427734375, "completions/mean_terminated_length": 642.388427734375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5446106240330068, "grad_norm": 0.11375118792057037, "kl": 0.02691650390625, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 72108069.0, "reward": 1.1696429252624512, "reward_std": 0.18575815856456757, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2311.0, "completions/max_terminated_length": 2311.0, "completions/mean_length": 632.935302734375, "completions/mean_terminated_length": 632.935302734375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5487364620938628, "grad_norm": 0.09970323741436005, "kl": 0.0205230712890625, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 72670091.0, "reward": 1.1093751192092896, "reward_std": 0.14794285595417023, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.125, "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 674.279052734375, "completions/mean_terminated_length": 674.279052734375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.5528623001547189, "grad_norm": 0.09895305335521698, "kl": 0.0227508544921875, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 73242676.0, "reward": 1.133928656578064, "reward_std": 0.1548776626586914, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2877.0, "completions/max_terminated_length": 2877.0, "completions/mean_length": 681.5111694335938, "completions/mean_terminated_length": 681.5111694335938, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.556988138215575, "grad_norm": 0.08888301998376846, "kl": 0.0201263427734375, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 73831661.0, "reward": 1.1428571939468384, "reward_std": 0.13796453177928925, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 660.6272583007812, "completions/mean_terminated_length": 652.9418334960938, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5611139762764311, "grad_norm": 0.1272604912519455, "kl": 0.020172119140625, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 74389457.0, "reward": 1.1852679252624512, "reward_std": 0.22296936810016632, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 670.9308471679688, "completions/mean_terminated_length": 663.2684326171875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5652398143372873, "grad_norm": 0.11799541860818863, "kl": 0.0193634033203125, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 74955544.0, "reward": 1.203125, "reward_std": 0.2177913635969162, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2142857164144516, "rewards/curriculum_aware_reward_fn/std": 0.4107845723628998, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 684.8861694335938, "completions/mean_terminated_length": 677.2550659179688, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.5693656523981434, "grad_norm": 0.11375576257705688, "kl": 0.0201263427734375, "learning_rate": 1e-06, "loss": 0.026, "num_tokens": 75538306.0, "reward": 1.1361607313156128, "reward_std": 0.2069287896156311, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, "rewards/curriculum_aware_reward_fn/std": 0.379651814699173, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 650.4464721679688, "completions/mean_terminated_length": 642.73828125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.5734914904589995, "grad_norm": 0.1262710690498352, "kl": 0.02080535888671875, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 76080467.0, "reward": 1.1852679252624512, "reward_std": 0.22098958492279053, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 662.0736694335938, "completions/mean_terminated_length": 662.0736694335938, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.5776173285198556, "grad_norm": 0.10197340697050095, "kl": 0.0188140869140625, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 76652428.0, "reward": 1.1785714626312256, "reward_std": 0.16689015924930573, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.38894903659820557, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2022.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 655.3170166015625, "completions/mean_terminated_length": 655.3170166015625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.5817431665807117, "grad_norm": 0.12121007591485977, "kl": 0.0174407958984375, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 77216728.0, "reward": 1.243303656578064, "reward_std": 0.22300545871257782, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2455357164144516, "rewards/curriculum_aware_reward_fn/std": 0.43088552355766296, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 722.2879638671875, "completions/mean_terminated_length": 714.740478515625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5858690046415678, "grad_norm": 0.09955620020627975, "kl": 0.01959228515625, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 77830071.0, "reward": 1.1316965818405151, "reward_std": 0.1848302185535431, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844924926758, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.37113162875175476, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 697.4063110351562, "completions/mean_terminated_length": 697.4063110351562, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.589994842702424, "grad_norm": 0.10823310911655426, "kl": 0.017974853515625, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 78423972.0, "reward": 1.1674107313156128, "reward_std": 0.1770811378955841, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 681.8504638671875, "completions/mean_terminated_length": 681.8504638671875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5941206807632801, "grad_norm": 0.11343028396368027, "kl": 0.020416259765625, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 78988504.0, "reward": 1.1272321939468384, "reward_std": 0.2015698403120041, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1361607164144516, "rewards/curriculum_aware_reward_fn/std": 0.34334251284599304, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 684.2567138671875, "completions/mean_terminated_length": 684.2567138671875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.5982465188241362, "grad_norm": 0.10634089261293411, "kl": 0.0197601318359375, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 79564390.0, "reward": 1.1919643878936768, "reward_std": 0.19861890375614166, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3032.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 667.3817138671875, "completions/mean_terminated_length": 667.3817138671875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.6023723568849922, "grad_norm": 0.1292986273765564, "kl": 0.01715850830078125, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 80119059.0, "reward": 1.2165179252624512, "reward_std": 0.24052199721336365, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.44790980219841003, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 737.810302734375, "completions/mean_terminated_length": 730.2975463867188, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6064981949458483, "grad_norm": 0.10353605449199677, "kl": 0.02104949951171875, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 80715402.0, "reward": 1.1830357313156128, "reward_std": 0.23996861279010773, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.4088349938392639, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 702.0357666015625, "completions/mean_terminated_length": 694.4429321289062, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6106240330067045, "grad_norm": 0.11491792649030685, "kl": 0.0184326171875, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 81301514.0, "reward": 1.165178656578064, "reward_std": 0.19101710617542267, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349845170975, "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, "rewards/curriculum_aware_reward_fn/std": 0.37962549924850464, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 683.513427734375, "completions/mean_terminated_length": 683.513427734375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.6147498710675606, "grad_norm": 0.10107113420963287, "kl": 0.0191650390625, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 81896163.0, "reward": 1.1852679252624512, "reward_std": 0.15074901282787323, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3832.0, "completions/max_terminated_length": 3832.0, "completions/mean_length": 752.7254638671875, "completions/mean_terminated_length": 752.7254638671875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.6188757091284167, "grad_norm": 0.11351041495800018, "kl": 0.01995849609375, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 82519075.0, "reward": 1.1116071939468384, "reward_std": 0.21597826480865479, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 670.0424194335938, "completions/mean_terminated_length": 654.6793823242188, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6230015471892728, "grad_norm": 0.12200962752103806, "kl": 0.0219268798828125, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 83082342.0, "reward": 1.1897321939468384, "reward_std": 0.21293501555919647, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, "rewards/curriculum_aware_reward_fn/std": 0.4011160135269165, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 664.5223388671875, "completions/mean_terminated_length": 664.5223388671875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.6271273852501289, "grad_norm": 0.08827964961528778, "kl": 0.01688385009765625, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 83636990.0, "reward": 1.2366071939468384, "reward_std": 0.11630566418170929, "rewards/code_format_reward/mean": 1.0, "rewards/code_format_reward/std": 0.0, "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 699.8951416015625, "completions/mean_terminated_length": 699.8951416015625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.631253223310985, "grad_norm": 0.12167635560035706, "kl": 0.021514892578125, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 84224880.0, "reward": 1.265625, "reward_std": 0.2635366916656494, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2745535671710968, "rewards/curriculum_aware_reward_fn/std": 0.4467879831790924, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 731.013427734375, "completions/mean_terminated_length": 731.013427734375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6353790613718412, "grad_norm": 0.10003366321325302, "kl": 0.01879119873046875, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 84822315.0, "reward": 1.1361607313156128, "reward_std": 0.16879120469093323, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.3525845408439636, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 788.5736694335938, "completions/mean_terminated_length": 781.1744995117188, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6395048994326973, "grad_norm": 0.09420903027057648, "kl": 0.01959228515625, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 85458179.0, "reward": 1.1584821939468384, "reward_std": 0.16418080031871796, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 721.4844360351562, "completions/mean_terminated_length": 721.4844360351562, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6436307374935534, "grad_norm": 0.10942105203866959, "kl": 0.017852783203125, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 86056840.0, "reward": 1.2008929252624512, "reward_std": 0.20982369780540466, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968, "rewards/curriculum_aware_reward_fn/std": 0.40921956300735474, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 763.9330444335938, "completions/mean_terminated_length": 748.9910888671875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6477565755544095, "grad_norm": 0.11640376597642899, "kl": 0.0218353271484375, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 86676695.0, "reward": 1.234375, "reward_std": 0.22038111090660095, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.2566964328289032, "rewards/curriculum_aware_reward_fn/std": 0.4717521667480469, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1877.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 763.904052734375, "completions/mean_terminated_length": 763.904052734375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6518824136152656, "grad_norm": 0.1016639918088913, "kl": 0.01653289794921875, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 87263147.0, "reward": 1.2142857313156128, "reward_std": 0.19103483855724335, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2254464328289032, "rewards/curriculum_aware_reward_fn/std": 0.41834309697151184, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1834.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 765.2902221679688, "completions/mean_terminated_length": 765.2902221679688, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6560082516761218, "grad_norm": 0.1068812683224678, "kl": 0.0192413330078125, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 87886410.0, "reward": 1.1830357313156128, "reward_std": 0.2021593153476715, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2763.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 819.7835083007812, "completions/mean_terminated_length": 819.7835083007812, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6601340897369778, "grad_norm": 0.11231759190559387, "kl": 0.01705169677734375, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 88553539.0, "reward": 1.180803656578064, "reward_std": 0.2456439733505249, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, "rewards/curriculum_aware_reward_fn/std": 0.3994380533695221, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2596.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 789.435302734375, "completions/mean_terminated_length": 789.435302734375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.6642599277978339, "grad_norm": 0.10527466982603073, "kl": 0.0230255126953125, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 89179842.0, "reward": 1.171875, "reward_std": 0.22573941946029663, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1875, "rewards/curriculum_aware_reward_fn/std": 0.3964326083660126, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1926.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 758.3013916015625, "completions/mean_terminated_length": 758.3013916015625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.66838576585869, "grad_norm": 0.11272504925727844, "kl": 0.0263824462890625, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 89788452.0, "reward": 1.1517857313156128, "reward_std": 0.2092604786157608, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 799.6719360351562, "completions/mean_terminated_length": 792.2975463867188, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6725116039195461, "grad_norm": 0.10686300694942474, "kl": 0.01806640625, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 90413545.0, "reward": 1.2388393878936768, "reward_std": 0.2416548728942871, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2477678507566452, "rewards/curriculum_aware_reward_fn/std": 0.4321989119052887, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3462.0, "completions/max_terminated_length": 3462.0, "completions/mean_length": 768.435302734375, "completions/mean_terminated_length": 768.435302734375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.6766374419804023, "grad_norm": 0.11534981429576874, "kl": 0.01709747314453125, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 91028403.0, "reward": 1.1897321939468384, "reward_std": 0.21289893984794617, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, "rewards/curriculum_aware_reward_fn/std": 0.4011160135269165, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 799.2813110351562, "completions/mean_terminated_length": 799.2813110351562, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.6807632800412584, "grad_norm": 0.10446373373270035, "kl": 0.017852783203125, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 91666194.0, "reward": 1.1852679252624512, "reward_std": 0.22161205112934113, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, "rewards/curriculum_aware_reward_fn/std": 0.3977404832839966, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 829.2656860351562, "completions/mean_terminated_length": 829.2656860351562, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.6848891181021145, "grad_norm": 0.09481582790613174, "kl": 0.0187835693359375, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 92322920.0, "reward": 1.1763393878936768, "reward_std": 0.19797870516777039, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39818593859672546, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 739.9464721679688, "completions/mean_terminated_length": 739.9464721679688, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6890149561629706, "grad_norm": 0.11167096346616745, "kl": 0.01824951171875, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 92920794.0, "reward": 1.21875, "reward_std": 0.22270405292510986, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2254464328289032, "rewards/curriculum_aware_reward_fn/std": 0.41834309697151184, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 865.0111694335938, "completions/mean_terminated_length": 865.0111694335938, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6931407942238267, "grad_norm": 0.07918214797973633, "kl": 0.01930999755859375, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 93598987.0, "reward": 1.0959821939468384, "reward_std": 0.13547813892364502, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 815.7589721679688, "completions/mean_terminated_length": 815.7589721679688, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6972666322846828, "grad_norm": 0.10182922333478928, "kl": 0.0192413330078125, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 94249900.0, "reward": 1.2075893878936768, "reward_std": 0.19486020505428314, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517054051160812, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2445.0, "completions/max_terminated_length": 2445.0, "completions/mean_length": 820.9017944335938, "completions/mean_terminated_length": 820.9017944335938, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.701392470345539, "grad_norm": 0.09352380782365799, "kl": 0.01873016357421875, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 94909858.0, "reward": 1.1584821939468384, "reward_std": 0.17088989913463593, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 862.466552734375, "completions/mean_terminated_length": 862.466552734375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.7055183084063951, "grad_norm": 0.09764091670513153, "kl": 0.017181396484375, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 95570062.0, "reward": 1.1517857313156128, "reward_std": 0.21354569494724274, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3199.0, "completions/mean_length": 842.435302734375, "completions/mean_terminated_length": 835.1566162109375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7096441464672512, "grad_norm": 0.10323174297809601, "kl": 0.0164642333984375, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 96207915.0, "reward": 1.2008929252624512, "reward_std": 0.20108754932880402, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.2142857164144516, "rewards/curriculum_aware_reward_fn/std": 0.41619497537612915, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 845.0424194335938, "completions/mean_terminated_length": 837.7695922851562, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7137699845281072, "grad_norm": 0.10463208705186844, "kl": 0.01827239990234375, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 96843415.0, "reward": 1.2366071939468384, "reward_std": 0.23014923930168152, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.2522321343421936, "rewards/curriculum_aware_reward_fn/std": 0.4347792863845825, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3954.0, "completions/max_terminated_length": 3954.0, "completions/mean_length": 805.4866333007812, "completions/mean_terminated_length": 805.4866333007812, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.7178958225889633, "grad_norm": 0.12585392594337463, "kl": 0.0217132568359375, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 97470840.0, "reward": 1.2857143878936768, "reward_std": 0.29375454783439636, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.3013392984867096, "rewards/curriculum_aware_reward_fn/std": 0.47843703627586365, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1918.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 860.9777221679688, "completions/mean_terminated_length": 860.9777221679688, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7220216606498195, "grad_norm": 0.10259881615638733, "kl": 0.0202789306640625, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 98155122.0, "reward": 1.1785714626312256, "reward_std": 0.20454701781272888, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 866.0379638671875, "completions/mean_terminated_length": 858.8120727539062, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.7261474987106756, "grad_norm": 0.10293217748403549, "kl": 0.019378662109375, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 98811984.0, "reward": 1.21875, "reward_std": 0.2203586846590042, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032, "rewards/curriculum_aware_reward_fn/std": 0.42821168899536133, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2003.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 870.7969360351562, "completions/mean_terminated_length": 870.7969360351562, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.7302733367715317, "grad_norm": 0.10950500518083572, "kl": 0.0195465087890625, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 99470481.0, "reward": 1.1941964626312256, "reward_std": 0.24999091029167175, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548, "rewards/curriculum_aware_reward_fn/std": 0.43122729659080505, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2099.0, "completions/max_terminated_length": 2099.0, "completions/mean_length": 858.9777221679688, "completions/mean_terminated_length": 858.9777221679688, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7343991748323878, "grad_norm": 0.09902805835008621, "kl": 0.0228271484375, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 100135695.0, "reward": 1.2098214626312256, "reward_std": 0.22280491888523102, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516, "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 862.7076416015625, "completions/mean_terminated_length": 862.7076416015625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.7385250128932439, "grad_norm": 0.09314218163490295, "kl": 0.021453857421875, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 100771269.0, "reward": 1.2388393878936768, "reward_std": 0.19304141402244568, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 995.138427734375, "completions/mean_terminated_length": 995.138427734375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.7426508509541001, "grad_norm": 0.08944858610630035, "kl": 0.01837921142578125, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 101476777.0, "reward": 1.1361608505249023, "reward_std": 0.22203704714775085, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, "rewards/curriculum_aware_reward_fn/std": 0.3632439076900482, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2357.0, "completions/mean_length": 981.1763916015625, "completions/mean_terminated_length": 974.2080688476562, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.7467766890149562, "grad_norm": 0.09441213309764862, "kl": 0.0189971923828125, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 102197898.0, "reward": 1.2142857313156128, "reward_std": 0.22620178759098053, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.234375, "rewards/curriculum_aware_reward_fn/std": 0.42408111691474915, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 975.138427734375, "completions/mean_terminated_length": 975.138427734375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.7509025270758123, "grad_norm": 0.08850647509098053, "kl": 0.0180511474609375, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 102889924.0, "reward": 1.125, "reward_std": 0.2066715806722641, "rewards/code_format_reward/mean": 0.9754464030265808, "rewards/code_format_reward/std": 0.1549331247806549, "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, "rewards/curriculum_aware_reward_fn/std": 0.3632439076900482, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 925.591552734375, "completions/mean_terminated_length": 925.591552734375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.7550283651366684, "grad_norm": 0.08986890316009521, "kl": 0.02154541015625, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 103564016.0, "reward": 1.1316964626312256, "reward_std": 0.19241398572921753, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3099.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 981.9420166015625, "completions/mean_terminated_length": 981.9420166015625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.7591542031975245, "grad_norm": 0.09173586964607239, "kl": 0.02154541015625, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 104288609.0, "reward": 1.1383929252624512, "reward_std": 0.19453509151935577, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 994.1763916015625, "completions/mean_terminated_length": 994.1763916015625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.7632800412583806, "grad_norm": 0.10288364440202713, "kl": 0.021636962890625, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 105003829.0, "reward": 1.2321429252624512, "reward_std": 0.2604503035545349, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.2544642984867096, "rewards/curriculum_aware_reward_fn/std": 0.4360465705394745, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2455.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 970.47998046875, "completions/mean_terminated_length": 970.47998046875, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.7674058793192368, "grad_norm": 0.09114697575569153, "kl": 0.020721435546875, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 105698626.0, "reward": 1.2254464626312256, "reward_std": 0.207960307598114, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.2455357164144516, "rewards/curriculum_aware_reward_fn/std": 0.44618961215019226, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1019.5982666015625, "completions/mean_terminated_length": 1019.5982666015625, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.7715317173800929, "grad_norm": 0.08832815289497375, "kl": 0.0213623046875, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 106422967.0, "reward": 1.1004464626312256, "reward_std": 0.21086002886295319, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.1227678582072258, "rewards/curriculum_aware_reward_fn/std": 0.3285374045372009, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 983.5402221679688, "completions/mean_terminated_length": 983.5402221679688, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.7756575554409489, "grad_norm": 0.08387215435504913, "kl": 0.0205841064453125, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 107126422.0, "reward": 1.133928656578064, "reward_std": 0.17117545008659363, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.3525845408439636, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 967.9464721679688, "completions/mean_terminated_length": 967.9464721679688, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.779783393501805, "grad_norm": 0.10670884698629379, "kl": 0.0246124267578125, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 107818152.0, "reward": 1.212053656578064, "reward_std": 0.2627289295196533, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484, "rewards/curriculum_aware_reward_fn/std": 0.4268510043621063, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 989.60498046875, "completions/mean_terminated_length": 968.6629028320312, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.7839092315626611, "grad_norm": 0.08669859915971756, "kl": 0.0212860107421875, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 108514230.0, "reward": 1.2120537757873535, "reward_std": 0.2092031091451645, "rewards/code_format_reward/mean": 0.9732142686843872, "rewards/code_format_reward/std": 0.1616371124982834, "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484, "rewards/curriculum_aware_reward_fn/std": 0.43206024169921875, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2899.0, "completions/max_terminated_length": 2899.0, "completions/mean_length": 1001.8147583007812, "completions/mean_terminated_length": 1001.8147583007812, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.7880350696235173, "grad_norm": 0.09591568261384964, "kl": 0.021392822265625, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 109212173.0, "reward": 1.3013393878936768, "reward_std": 0.251747727394104, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.3147321343421936, "rewards/curriculum_aware_reward_fn/std": 0.47445425391197205, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 1040.8460693359375, "completions/mean_terminated_length": 1040.8460693359375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.7921609076843734, "grad_norm": 0.09632934629917145, "kl": 0.023712158203125, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 109971485.0, "reward": 1.1607143878936768, "reward_std": 0.2267763912677765, "rewards/code_format_reward/mean": 0.9665178656578064, "rewards/code_format_reward/std": 0.1800929754972458, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 1001.200927734375, "completions/mean_terminated_length": 1001.200927734375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.7962867457452295, "grad_norm": 0.1007714718580246, "kl": 0.02294921875, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 110693408.0, "reward": 1.149553656578064, "reward_std": 0.23755519092082977, "rewards/code_format_reward/mean": 0.9620535969734192, "rewards/code_format_reward/std": 0.191280335187912, "rewards/curriculum_aware_reward_fn/mean": 0.1875, "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2629.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 1031.904052734375, "completions/mean_terminated_length": 1031.904052734375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.8004125838060856, "grad_norm": 0.08654844015836716, "kl": 0.0239715576171875, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 111420091.0, "reward": 1.1785714626312256, "reward_std": 0.19378496706485748, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, "rewards/curriculum_aware_reward_fn/std": 0.4011159837245941, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2569.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 995.4063110351562, "completions/mean_terminated_length": 995.4063110351562, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.8045384218669417, "grad_norm": 0.07940182089805603, "kl": 0.02203369140625, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 112124816.0, "reward": 1.2008929252624512, "reward_std": 0.17429085075855255, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 1040.5535888671875, "completions/mean_terminated_length": 1040.5535888671875, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.8086642599277978, "grad_norm": 0.0882866159081459, "kl": 0.021759033203125, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 112862472.0, "reward": 1.2008929252624512, "reward_std": 0.2306138277053833, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968, "rewards/curriculum_aware_reward_fn/std": 0.4146503508090973, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1044.10498046875, "completions/mean_terminated_length": 1044.10498046875, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.812790097988654, "grad_norm": 0.07740884274244308, "kl": 0.022308349609375, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 113597260.0, "reward": 1.1227679252624512, "reward_std": 0.15906482934951782, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1026.8348388671875, "completions/mean_terminated_length": 1026.8348388671875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.8169159360495101, "grad_norm": 0.07976430654525757, "kl": 0.0210418701171875, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 114311151.0, "reward": 1.196428656578064, "reward_std": 0.17916743457317352, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2091.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 998.3035888671875, "completions/mean_terminated_length": 998.3035888671875, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.8210417741103662, "grad_norm": 0.09024214744567871, "kl": 0.02362060546875, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 115017531.0, "reward": 1.203125, "reward_std": 0.20551542937755585, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, "rewards/curriculum_aware_reward_fn/std": 0.4230436384677887, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 1039.9241943359375, "completions/mean_terminated_length": 1033.0872802734375, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.8251676121712223, "grad_norm": 0.11066131293773651, "kl": 0.020751953125, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 115765377.0, "reward": 1.171875, "reward_std": 0.19552592933177948, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1022.7567138671875, "completions/mean_terminated_length": 1015.8814086914062, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.8292934502320783, "grad_norm": 0.09393317997455597, "kl": 0.0224609375, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 116492274.0, "reward": 1.2790179252624512, "reward_std": 0.2514621615409851, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.2924107015132904, "rewards/curriculum_aware_reward_fn/std": 0.4553784430027008, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1020.19873046875, "completions/mean_terminated_length": 1020.19873046875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.8334192882929345, "grad_norm": 0.09607246518135071, "kl": 0.0228271484375, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 117208540.0, "reward": 1.1852679252624512, "reward_std": 0.23009294271469116, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3299.0, "completions/max_terminated_length": 3299.0, "completions/mean_length": 1075.765625, "completions/mean_terminated_length": 1075.765625, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.8375451263537906, "grad_norm": 0.08126305043697357, "kl": 0.021484375, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 117972485.0, "reward": 1.0758929252624512, "reward_std": 0.17059066891670227, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.09375, "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 1024.625, "completions/mean_terminated_length": 1024.625, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.8416709644146467, "grad_norm": 0.07800742983818054, "kl": 0.02099609375, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 118696348.0, "reward": 1.133928656578064, "reward_std": 0.16076093912124634, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 1029.7366943359375, "completions/mean_terminated_length": 1029.7366943359375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.8457968024755028, "grad_norm": 0.09388712793588638, "kl": 0.0228729248046875, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 119422452.0, "reward": 1.2522321939468384, "reward_std": 0.25209179520606995, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, "rewards/curriculum_aware_reward_fn/std": 0.4509984850883484, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 1000.4308471679688, "completions/mean_terminated_length": 993.505615234375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.8499226405363589, "grad_norm": 0.09919248521327972, "kl": 0.0224609375, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 120131759.0, "reward": 1.1852679252624512, "reward_std": 0.28246626257896423, "rewards/code_format_reward/mean": 0.9821428656578064, "rewards/code_format_reward/std": 0.13258016109466553, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3939.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 973.294677734375, "completions/mean_terminated_length": 973.294677734375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.8540484785972151, "grad_norm": 0.08878267556428909, "kl": 0.021728515625, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 120824513.0, "reward": 1.265625, "reward_std": 0.1982576996088028, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2767857015132904, "rewards/curriculum_aware_reward_fn/std": 0.44790980219841003, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3076.0, "completions/max_terminated_length": 3076.0, "completions/mean_length": 998.2098388671875, "completions/mean_terminated_length": 998.2098388671875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.8581743166580712, "grad_norm": 0.09275484085083008, "kl": 0.023162841796875, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 121532923.0, "reward": 1.2165179252624512, "reward_std": 0.21220733225345612, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2254464328289032, "rewards/curriculum_aware_reward_fn/std": 0.41834309697151184, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 1017.2410888671875, "completions/mean_terminated_length": 1017.2410888671875, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.8623001547189273, "grad_norm": 0.08458858728408813, "kl": 0.0215301513671875, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 122267527.0, "reward": 1.2254464626312256, "reward_std": 0.16818052530288696, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2620.0, "completions/mean_length": 1007.96435546875, "completions/mean_terminated_length": 1001.055908203125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.8664259927797834, "grad_norm": 0.08145608752965927, "kl": 0.0200653076171875, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 122985788.0, "reward": 1.212053656578064, "reward_std": 0.1650066077709198, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 963.2254638671875, "completions/mean_terminated_length": 963.2254638671875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.8705518308406395, "grad_norm": 0.09574118256568909, "kl": 0.02252197265625, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 123672043.0, "reward": 1.2410714626312256, "reward_std": 0.23384374380111694, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349845170975, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.4585747718811035, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2170.0, "completions/max_terminated_length": 2170.0, "completions/mean_length": 962.5469360351562, "completions/mean_terminated_length": 962.5469360351562, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.8746776689014956, "grad_norm": 0.09219788014888763, "kl": 0.0257720947265625, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 124366779.0, "reward": 1.234375, "reward_std": 0.22962519526481628, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.25, "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2332.0, "completions/max_terminated_length": 2332.0, "completions/mean_length": 1023.37060546875, "completions/mean_terminated_length": 1023.37060546875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.8788035069623518, "grad_norm": 0.09224989265203476, "kl": 0.020538330078125, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 125097150.0, "reward": 1.2455357313156128, "reward_std": 0.24242521822452545, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2522321343421936, "rewards/curriculum_aware_reward_fn/std": 0.4347793161869049, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 1034.680908203125, "completions/mean_terminated_length": 1034.680908203125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.8829293450232079, "grad_norm": 0.08268304169178009, "kl": 0.0210113525390625, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 125840792.0, "reward": 1.2299107313156128, "reward_std": 0.2035113424062729, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 1026.953125, "completions/mean_terminated_length": 1020.0872802734375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.887055183084064, "grad_norm": 0.08131767809391022, "kl": 0.0207977294921875, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 126560049.0, "reward": 1.1294643878936768, "reward_std": 0.1556638777256012, "rewards/code_format_reward/mean": 0.984375, "rewards/code_format_reward/std": 0.12415824085474014, "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, "rewards/curriculum_aware_reward_fn/std": 0.3525845408439636, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 1032.1785888671875, "completions/mean_terminated_length": 1032.1785888671875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.89118102114492, "grad_norm": 0.09227032214403152, "kl": 0.0215301513671875, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 127291629.0, "reward": 1.234375, "reward_std": 0.2220146358013153, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507843434810638, "rewards/curriculum_aware_reward_fn/mean": 0.2477678507566452, "rewards/curriculum_aware_reward_fn/std": 0.4321989119052887, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1020.9844360351562, "completions/mean_terminated_length": 1014.1051635742188, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.8953068592057761, "grad_norm": 0.07654750347137451, "kl": 0.020233154296875, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 128011152.0, "reward": 1.1741071939468384, "reward_std": 0.16328808665275574, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.1808035671710968, "rewards/curriculum_aware_reward_fn/std": 0.3852856159210205, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2088.0, "completions/max_terminated_length": 2088.0, "completions/mean_length": 942.0826416015625, "completions/mean_terminated_length": 942.0826416015625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.8994326972666323, "grad_norm": 0.10224290937185287, "kl": 0.0222015380859375, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 128690602.0, "reward": 1.2633929252624512, "reward_std": 0.23926715552806854, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2700892984867096, "rewards/curriculum_aware_reward_fn/std": 0.4593527019023895, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3910.0, "completions/mean_length": 1007.5670166015625, "completions/mean_terminated_length": 1000.65771484375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9035585353274884, "grad_norm": 0.08801200240850449, "kl": 0.0219879150390625, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 129412339.0, "reward": 1.196428656578064, "reward_std": 0.19058862328529358, "rewards/code_format_reward/mean": 0.9776785969734192, "rewards/code_format_reward/std": 0.1478918492794037, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4245342016220093, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3068.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 956.02685546875, "completions/mean_terminated_length": 956.02685546875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.9076843733883445, "grad_norm": 0.08594941347837448, "kl": 0.024871826171875, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 130095292.0, "reward": 1.2388393878936768, "reward_std": 0.18124587833881378, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2455357164144516, "rewards/curriculum_aware_reward_fn/std": 0.4308854937553406, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3954.0, "completions/mean_length": 965.7098388671875, "completions/mean_terminated_length": 958.7069091796875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.9118102114492006, "grad_norm": 0.10265861451625824, "kl": 0.0210723876953125, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 130779650.0, "reward": 1.2165179252624512, "reward_std": 0.22863872349262238, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, "rewards/curriculum_aware_reward_fn/std": 0.41980284452438354, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 959.5067138671875, "completions/mean_terminated_length": 959.5067138671875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9159360495100567, "grad_norm": 0.08753044903278351, "kl": 0.0197601318359375, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 131472989.0, "reward": 1.1897321939468384, "reward_std": 0.19170650839805603, "rewards/code_format_reward/mean": 0.9866071343421936, "rewards/code_format_reward/std": 0.11507844179868698, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1048.279052734375, "completions/mean_terminated_length": 1041.4608154296875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.9200618875709129, "grad_norm": 0.07725991308689117, "kl": 0.01824951171875, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 132207813.0, "reward": 1.1897321939468384, "reward_std": 0.1292732208967209, "rewards/code_format_reward/mean": 0.9799107313156128, "rewards/code_format_reward/std": 0.14046260714530945, "rewards/curriculum_aware_reward_fn/mean": 0.2098214328289032, "rewards/curriculum_aware_reward_fn/std": 0.4443953037261963, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 1044.83935546875, "completions/mean_terminated_length": 1044.83935546875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.924187725631769, "grad_norm": 0.08536474406719208, "kl": 0.0180511474609375, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 132953787.0, "reward": 1.2075893878936768, "reward_std": 0.2070293128490448, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2098214328289032, "rewards/curriculum_aware_reward_fn/std": 0.40763622522354126, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 923.24560546875, "completions/mean_terminated_length": 923.24560546875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.9283135636926251, "grad_norm": 0.08723261952400208, "kl": 0.0198822021484375, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 133636019.0, "reward": 1.1830357313156128, "reward_std": 0.1872720867395401, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, "rewards/curriculum_aware_reward_fn/std": 0.39252740144729614, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 956.4888916015625, "completions/mean_terminated_length": 956.4888916015625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.9324394017534812, "grad_norm": 0.0859052836894989, "kl": 0.0199737548828125, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 134328111.0, "reward": 1.180803656578064, "reward_std": 0.17333610355854034, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1852678507566452, "rewards/curriculum_aware_reward_fn/std": 0.3889490067958832, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 933.4420166015625, "completions/mean_terminated_length": 933.4420166015625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.9365652398143373, "grad_norm": 0.08171730488538742, "kl": 0.020416259765625, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 135008926.0, "reward": 1.212053656578064, "reward_std": 0.17479148507118225, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 960.0938110351562, "completions/mean_terminated_length": 953.0783081054688, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.9406910778751933, "grad_norm": 0.08944277465343475, "kl": 0.0196075439453125, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 135704131.0, "reward": 1.2142857313156128, "reward_std": 0.21784988045692444, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548, "rewards/curriculum_aware_reward_fn/std": 0.43122729659080505, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2404.0, "completions/max_terminated_length": 2404.0, "completions/mean_length": 966.7076416015625, "completions/mean_terminated_length": 966.7076416015625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.9448169159360496, "grad_norm": 0.0876958817243576, "kl": 0.019561767578125, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 136418315.0, "reward": 1.1941964626312256, "reward_std": 0.19966824352741241, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, "rewards/curriculum_aware_reward_fn/std": 0.3994380831718445, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 932.6295166015625, "completions/mean_terminated_length": 925.5525512695312, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.9489427539969056, "grad_norm": 0.08093264698982239, "kl": 0.0186920166015625, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 137105535.0, "reward": 1.1629464626312256, "reward_std": 0.14830277860164642, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 937.8192138671875, "completions/mean_terminated_length": 937.8192138671875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.9530685920577617, "grad_norm": 0.09406352043151855, "kl": 0.020416259765625, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 137787310.0, "reward": 1.2611607313156128, "reward_std": 0.2289111316204071, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843171834946, "rewards/curriculum_aware_reward_fn/mean": 0.2678571343421936, "rewards/curriculum_aware_reward_fn/std": 0.45331767201423645, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2223.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 884.19873046875, "completions/mean_terminated_length": 884.19873046875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9571944301186178, "grad_norm": 0.10777918994426727, "kl": 0.021209716796875, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 138450138.0, "reward": 1.2410714626312256, "reward_std": 0.23166537284851074, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.2455357164144516, "rewards/curriculum_aware_reward_fn/std": 0.43088552355766296, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 902.7879638671875, "completions/mean_terminated_length": 902.7879638671875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.9613202681794739, "grad_norm": 0.09343870729207993, "kl": 0.019561767578125, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 139122386.0, "reward": 1.2098214626312256, "reward_std": 0.2065286785364151, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2054.0, "completions/max_terminated_length": 2054.0, "completions/mean_length": 896.2120971679688, "completions/mean_terminated_length": 896.2120971679688, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.9654461062403301, "grad_norm": 0.09210006892681122, "kl": 0.02313232421875, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 139795846.0, "reward": 1.1919643878936768, "reward_std": 0.1857805997133255, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.203125, "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 928.8326416015625, "completions/mean_terminated_length": 928.8326416015625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.9695719443011862, "grad_norm": 0.10233625769615173, "kl": 0.0190887451171875, "learning_rate": 1e-06, "loss": -0.006, "num_tokens": 140482933.0, "reward": 1.2566964626312256, "reward_std": 0.22570116817951202, "rewards/code_format_reward/mean": 0.9933035969734192, "rewards/code_format_reward/std": 0.08164843916893005, "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, "rewards/curriculum_aware_reward_fn/std": 0.46564197540283203, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 850.4844360351562, "completions/mean_terminated_length": 850.4844360351562, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.9736977823620423, "grad_norm": 0.09629914909601212, "kl": 0.02154541015625, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 141138406.0, "reward": 1.2254464626312256, "reward_std": 0.17291639745235443, "rewards/code_format_reward/mean": 0.9977678656578064, "rewards/code_format_reward/std": 0.047245558351278305, "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, "rewards/curriculum_aware_reward_fn/std": 0.41980284452438354, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 869.8236694335938, "completions/mean_terminated_length": 862.6062622070312, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.9778236204228984, "grad_norm": 0.10232575237751007, "kl": 0.021240234375, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 141784371.0, "reward": 1.25, "reward_std": 0.23957639932632446, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.2611607015132904, "rewards/curriculum_aware_reward_fn/std": 0.43975839018821716, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 873.8035888671875, "completions/mean_terminated_length": 873.8035888671875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.9819494584837545, "grad_norm": 0.09762946516275406, "kl": 0.021453857421875, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 142451789.0, "reward": 1.1875, "reward_std": 0.20032154023647308, "rewards/code_format_reward/mean": 0.9888392686843872, "rewards/code_format_reward/std": 0.10517053306102753, "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, "rewards/curriculum_aware_reward_fn/std": 0.3994380831718445, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2776.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 884.4866333007812, "completions/mean_terminated_length": 884.4866333007812, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.9860752965446106, "grad_norm": 0.09725674241781235, "kl": 0.01922607421875, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 143115292.0, "reward": 1.2098214626312256, "reward_std": 0.20011693239212036, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.21875, "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 896.3928833007812, "completions/mean_terminated_length": 889.2349243164062, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.9902011346054668, "grad_norm": 0.09036494791507721, "kl": 0.0186920166015625, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 143789631.0, "reward": 1.1763393878936768, "reward_std": 0.19829724729061127, "rewards/code_format_reward/mean": 0.9955357313156128, "rewards/code_format_reward/std": 0.06674052774906158, "rewards/curriculum_aware_reward_fn/mean": 0.1808035671710968, "rewards/curriculum_aware_reward_fn/std": 0.3852856159210205, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 872.5781860351562, "completions/mean_terminated_length": 872.5781860351562, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.9943269726663229, "grad_norm": 0.10011576861143112, "kl": 0.021697998046875, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 144456836.0, "reward": 1.2142857313156128, "reward_std": 0.22189761698246002, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 865.1642456054688, "completions/mean_terminated_length": 865.1642456054688, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.998452810727179, "grad_norm": 0.09739121049642563, "kl": 0.01898193359375, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 145125734.0, "reward": 1.165178656578064, "reward_std": 0.2198539525270462, "rewards/code_format_reward/mean": 0.9910714030265808, "rewards/code_format_reward/std": 0.09417349100112915, "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, "rewards/curriculum_aware_reward_fn/std": 0.37962549924850464, "step": 242 }, { "epoch": 0.998452810727179, "step": 242, "total_flos": 0.0, "train_loss": 0.01044145032583514, "train_runtime": 40223.769, "train_samples_per_second": 0.385, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 242, "num_input_tokens_seen": 145125734, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }