{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9302325581395348, "eval_steps": 50, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10416666666666663, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 580.8333740234375, "completions/mean_terminated_length": 529.3023071289062, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0, "eval_loss": 9.23273983062245e-05, "eval_runtime": 30.5641, "eval_samples_per_second": 0.654, "eval_steps_per_second": 0.033, "kl": 0.000934600830078125, "num_tokens": 80224.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23307291666666663, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.5, "completions/mean_length": 645.7864685058594, "completions/mean_terminated_length": 531.2034301757812, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.37209302325581395, "grad_norm": 0.0, "kl": 0.00021028518676757812, "learning_rate": 3.3333333333333333e-06, "loss": 0.0, "num_tokens": 1440148.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2981770833333333, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 660.4414367675781, "completions/mean_terminated_length": 506.07061767578125, "completions/min_length": 204.5, "completions/min_terminated_length": 204.5, "epoch": 0.7441860465116279, "grad_norm": 397.1085205078125, "kl": 0.9753599166870117, "learning_rate": 1e-05, "loss": 0.0975, "num_tokens": 2834391.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 646.3483276367188, "completions/mean_terminated_length": 519.0995330810547, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 1.1860465116279069, "grad_norm": 46054105088.0, "kl": 43881017.763025284, "learning_rate": 9.777864028930705e-06, "loss": 4398960.0, "num_tokens": 4201695.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.27213541666666663, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.5, "completions/mean_length": 673.9479370117188, "completions/mean_terminated_length": 543.1653747558594, "completions/min_length": 254.5, "completions/min_terminated_length": 254.5, "epoch": 1.558139534883721, "grad_norm": 0.28854840993881226, "kl": 20.63982391357422, "learning_rate": 9.131193871579975e-06, "loss": 2.0676, "num_tokens": 5603183.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2740885416666667, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.5, "completions/mean_length": 673.3385620117188, "completions/mean_terminated_length": 540.8829956054688, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 1.9302325581395348, "grad_norm": 0.31550031900405884, "kl": 0.0182647705078125, "learning_rate": 8.117449009293668e-06, "loss": 0.0018, "num_tokens": 6971974.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.24609375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.5, "completions/mean_length": 648.3164367675781, "completions/mean_terminated_length": 526.1214752197266, "completions/min_length": 184.5, "completions/min_terminated_length": 184.5, "epoch": 2.3720930232558137, "grad_norm": 0.32607021927833557, "kl": 0.021625518798828125, "learning_rate": 6.8267051218319766e-06, "loss": 0.0022, "num_tokens": 8358137.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22395833333333331, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.5, "completions/mean_length": 630.4388122558594, "completions/mean_terminated_length": 516.7716064453125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 2.744186046511628, "grad_norm": 0.12816570699214935, "kl": 0.0286407470703125, "learning_rate": 5.373650467932122e-06, "loss": 0.0029, "num_tokens": 9705290.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.5, "completions/mean_length": 662.1341247558594, "completions/mean_terminated_length": 526.5307464599609, "completions/min_length": 158.5, "completions/min_terminated_length": 158.5, "epoch": 3.186046511627907, "grad_norm": 2.3328659534454346, "kl": 0.038421630859375, "learning_rate": 3.887395330218429e-06, "loss": 0.0038, "num_tokens": 11084995.0, "reward": 0.5, "reward_std": 0.0, "rewards/compiled_reward_inst/mean": 0.5, "rewards/compiled_reward_inst/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.18359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.5, "completions/mean_length": 614.7044677734375, "completions/mean_terminated_length": 522.2958679199219, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 3.558139534883721, "grad_norm": 0.08445177972316742, "kl": 0.03482818603515625, "learning_rate": 2.5000000000000015e-06, "loss": 0.007, "num_tokens": 12414672.0, "reward": 0.5013020932674408, "reward_std": 0.003682847833260894, "rewards/compiled_reward_inst/mean": 0.5013020932674408, "rewards/compiled_reward_inst/std": 0.025515519082546234, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.24088541666666669, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 658.873046875, "completions/mean_terminated_length": 541.7471008300781, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 3.9302325581395348, "grad_norm": 0.2177199274301529, "kl": 0.0333709716796875, "learning_rate": 1.3347406408508695e-06, "loss": 0.0033, "num_tokens": 13810302.0, "reward": 0.5013020932674408, "reward_std": 0.003682847833260894, "rewards/compiled_reward_inst/mean": 0.5013020932674408, "rewards/compiled_reward_inst/std": 0.025515519082546234, "step": 20 } ], "logging_steps": 2, "max_steps": 24, "num_input_tokens_seen": 13810302, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }