{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9302325581395348,
  "eval_steps": 50,
  "global_step": 20,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.10416666666666663,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 580.8333740234375,
      "completions/mean_terminated_length": 529.3023071289062,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 0,
      "eval_loss": 9.23273983062245e-05,
      "eval_runtime": 30.5641,
      "eval_samples_per_second": 0.654,
      "eval_steps_per_second": 0.033,
      "kl": 0.000934600830078125,
      "num_tokens": 80224.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 0
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.23307291666666663,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.5,
      "completions/mean_length": 645.7864685058594,
      "completions/mean_terminated_length": 531.2034301757812,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "epoch": 0.37209302325581395,
      "grad_norm": 0.0,
      "kl": 0.00021028518676757812,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.0,
      "num_tokens": 1440148.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2981770833333333,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 660.4414367675781,
      "completions/mean_terminated_length": 506.07061767578125,
      "completions/min_length": 204.5,
      "completions/min_terminated_length": 204.5,
      "epoch": 0.7441860465116279,
      "grad_norm": 397.1085205078125,
      "kl": 0.9753599166870117,
      "learning_rate": 1e-05,
      "loss": 0.0975,
      "num_tokens": 2834391.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.251953125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 646.3483276367188,
      "completions/mean_terminated_length": 519.0995330810547,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "epoch": 1.1860465116279069,
      "grad_norm": 46054105088.0,
      "kl": 43881017.763025284,
      "learning_rate": 9.777864028930705e-06,
      "loss": 4398960.0,
      "num_tokens": 4201695.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.27213541666666663,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.5,
      "completions/mean_length": 673.9479370117188,
      "completions/mean_terminated_length": 543.1653747558594,
      "completions/min_length": 254.5,
      "completions/min_terminated_length": 254.5,
      "epoch": 1.558139534883721,
      "grad_norm": 0.28854840993881226,
      "kl": 20.63982391357422,
      "learning_rate": 9.131193871579975e-06,
      "loss": 2.0676,
      "num_tokens": 5603183.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2740885416666667,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.5,
      "completions/mean_length": 673.3385620117188,
      "completions/mean_terminated_length": 540.8829956054688,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 1.9302325581395348,
      "grad_norm": 0.31550031900405884,
      "kl": 0.0182647705078125,
      "learning_rate": 8.117449009293668e-06,
      "loss": 0.0018,
      "num_tokens": 6971974.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.24609375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.5,
      "completions/mean_length": 648.3164367675781,
      "completions/mean_terminated_length": 526.1214752197266,
      "completions/min_length": 184.5,
      "completions/min_terminated_length": 184.5,
      "epoch": 2.3720930232558137,
      "grad_norm": 0.32607021927833557,
      "kl": 0.021625518798828125,
      "learning_rate": 6.8267051218319766e-06,
      "loss": 0.0022,
      "num_tokens": 8358137.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.22395833333333331,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.5,
      "completions/mean_length": 630.4388122558594,
      "completions/mean_terminated_length": 516.7716064453125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 2.744186046511628,
      "grad_norm": 0.12816570699214935,
      "kl": 0.0286407470703125,
      "learning_rate": 5.373650467932122e-06,
      "loss": 0.0029,
      "num_tokens": 9705290.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2734375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.5,
      "completions/mean_length": 662.1341247558594,
      "completions/mean_terminated_length": 526.5307464599609,
      "completions/min_length": 158.5,
      "completions/min_terminated_length": 158.5,
      "epoch": 3.186046511627907,
      "grad_norm": 2.3328659534454346,
      "kl": 0.038421630859375,
      "learning_rate": 3.887395330218429e-06,
      "loss": 0.0038,
      "num_tokens": 11084995.0,
      "reward": 0.5,
      "reward_std": 0.0,
      "rewards/compiled_reward_inst/mean": 0.5,
      "rewards/compiled_reward_inst/std": 0.0,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.18359375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.5,
      "completions/mean_length": 614.7044677734375,
      "completions/mean_terminated_length": 522.2958679199219,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 3.558139534883721,
      "grad_norm": 0.08445177972316742,
      "kl": 0.03482818603515625,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.007,
      "num_tokens": 12414672.0,
      "reward": 0.5013020932674408,
      "reward_std": 0.003682847833260894,
      "rewards/compiled_reward_inst/mean": 0.5013020932674408,
      "rewards/compiled_reward_inst/std": 0.025515519082546234,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.24088541666666669,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 658.873046875,
      "completions/mean_terminated_length": 541.7471008300781,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 3.9302325581395348,
      "grad_norm": 0.2177199274301529,
      "kl": 0.0333709716796875,
      "learning_rate": 1.3347406408508695e-06,
      "loss": 0.0033,
      "num_tokens": 13810302.0,
      "reward": 0.5013020932674408,
      "reward_std": 0.003682847833260894,
      "rewards/compiled_reward_inst/mean": 0.5013020932674408,
      "rewards/compiled_reward_inst/std": 0.025515519082546234,
      "step": 20
    }
  ],
  "logging_steps": 2,
  "max_steps": 24,
  "num_input_tokens_seen": 13810302,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}