{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4995836802664446, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012489592006661115, "grad_norm": 402.1712646484375, "learning_rate": 2.0833333333333333e-07, "logits/chosen": 1.3385416269302368, "logits/rejected": 0.6893351078033447, "logps/chosen": -2177.333251953125, "logps/rejected": -1575.3333740234375, "loss": 0.712, "rewards/accuracies": 0.12708333134651184, "rewards/chosen": 0.02794392965734005, "rewards/margins": 0.02732747420668602, "rewards/rejected": 0.0006296793580986559, "step": 5 }, { "epoch": 0.02497918401332223, "grad_norm": 389.4602966308594, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 1.1759765148162842, "logits/rejected": 0.5095987915992737, "logps/chosen": -2215.7333984375, "logps/rejected": -1587.4666748046875, "loss": 0.6586, "rewards/accuracies": 0.2708333432674408, "rewards/chosen": 0.16729532182216644, "rewards/margins": 0.20574747025966644, "rewards/rejected": -0.03834431990981102, "step": 10 }, { "epoch": 0.03746877601998335, "grad_norm": 343.98779296875, "learning_rate": 6.25e-07, "logits/chosen": 1.3039062023162842, "logits/rejected": 0.625195324420929, "logps/chosen": -2254.666748046875, "logps/rejected": -1640.5333251953125, "loss": 0.5826, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": 0.39796143770217896, "rewards/margins": 0.3895019590854645, "rewards/rejected": 0.008770751766860485, "step": 15 }, { "epoch": 0.04995836802664446, "grad_norm": 82.75628662109375, "learning_rate": 8.333333333333333e-07, "logits/chosen": 1.5281250476837158, "logits/rejected": 0.8776692748069763, "logps/chosen": -2298.13330078125, "logps/rejected": -1688.5333251953125, "loss": 0.2254, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 1.8971353769302368, "rewards/margins": 1.9559895992279053, "rewards/rejected": -0.05841471254825592, "step": 20 }, { "epoch": 0.06244796003330558, "grad_norm": 15.137582778930664, "learning_rate": 1.0416666666666667e-06, "logits/chosen": 1.7648437023162842, "logits/rejected": 1.1162760257720947, "logps/chosen": -2257.066650390625, "logps/rejected": -1692.2667236328125, "loss": 0.0733, "rewards/accuracies": 0.9770833253860474, "rewards/chosen": 5.196354389190674, "rewards/margins": 5.3125, "rewards/rejected": -0.11479899287223816, "step": 25 }, { "epoch": 0.0749375520399667, "grad_norm": 1.8799092769622803, "learning_rate": 1.25e-06, "logits/chosen": 1.9369791746139526, "logits/rejected": 1.3065755367279053, "logps/chosen": -2104.2666015625, "logps/rejected": -1579.4666748046875, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 7.6072916984558105, "rewards/margins": 7.9510416984558105, "rewards/rejected": -0.34428709745407104, "step": 30 }, { "epoch": 0.08742714404662781, "grad_norm": 0.47296273708343506, "learning_rate": 1.4583333333333335e-06, "logits/chosen": 2.621875047683716, "logits/rejected": 2.1070313453674316, "logps/chosen": -2109.86669921875, "logps/rejected": -1642.4000244140625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 12.877083778381348, "rewards/margins": 13.658333778381348, "rewards/rejected": -0.7810343503952026, "step": 35 }, { "epoch": 0.09991673605328892, "grad_norm": 0.28285741806030273, "learning_rate": 1.6666666666666667e-06, "logits/chosen": 3.0453124046325684, "logits/rejected": 2.5919270515441895, "logps/chosen": -2224.2666015625, "logps/rejected": -1753.5999755859375, "loss": 0.0048, "rewards/accuracies": 0.9958333373069763, "rewards/chosen": 14.229166984558105, "rewards/margins": 15.608333587646484, "rewards/rejected": -1.367578148841858, "step": 40 }, { "epoch": 0.11240632805995004, "grad_norm": 0.037709660828113556, "learning_rate": 1.8750000000000003e-06, "logits/chosen": 3.230729103088379, "logits/rejected": 2.8765625953674316, "logps/chosen": -2051.466552734375, "logps/rejected": -1633.3333740234375, "loss": 0.0018, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 15.229166984558105, "rewards/margins": 17.397916793823242, "rewards/rejected": -2.1578125953674316, "step": 45 }, { "epoch": 0.12489592006661115, "grad_norm": 0.07219494879245758, "learning_rate": 2.0833333333333334e-06, "logits/chosen": 3.6119792461395264, "logits/rejected": 3.425520896911621, "logps/chosen": -2038.6666259765625, "logps/rejected": -1636.13330078125, "loss": 0.002, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 17.63541603088379, "rewards/margins": 20.879167556762695, "rewards/rejected": -3.2317707538604736, "step": 50 }, { "epoch": 0.12489592006661115, "eval_logits/chosen": 3.69328236579895, "eval_logits/rejected": 3.562869071960449, "eval_logps/chosen": -1998.5511474609375, "eval_logps/rejected": -1615.590576171875, "eval_loss": 0.0010278059635311365, "eval_rewards/accuracies": 0.999015748500824, "eval_rewards/chosen": 17.693897247314453, "eval_rewards/margins": 21.503936767578125, "eval_rewards/rejected": -3.8153913021087646, "eval_runtime": 354.0236, "eval_samples_per_second": 5.714, "eval_steps_per_second": 0.359, "step": 50 }, { "epoch": 0.13738551207327226, "grad_norm": 0.10287120193243027, "learning_rate": 2.2916666666666666e-06, "logits/chosen": 3.7432291507720947, "logits/rejected": 3.621875047683716, "logps/chosen": -2011.4666748046875, "logps/rejected": -1638.4000244140625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 17.558332443237305, "rewards/margins": 21.45833396911621, "rewards/rejected": -3.894270896911621, "step": 55 }, { "epoch": 0.1498751040799334, "grad_norm": 0.04395420849323273, "learning_rate": 2.5e-06, "logits/chosen": 3.898958444595337, "logits/rejected": 3.788541555404663, "logps/chosen": -2004.800048828125, "logps/rejected": -1642.13330078125, "loss": 0.0017, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 17.62708282470703, "rewards/margins": 22.004167556762695, "rewards/rejected": -4.3848958015441895, "step": 60 }, { "epoch": 0.1623646960865945, "grad_norm": 0.03243907541036606, "learning_rate": 2.7083333333333334e-06, "logits/chosen": 3.807812452316284, "logits/rejected": 3.750520944595337, "logps/chosen": -1999.199951171875, "logps/rejected": -1647.7332763671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 17.706249237060547, "rewards/margins": 22.52083396911621, "rewards/rejected": -4.805729389190674, "step": 65 }, { "epoch": 0.17485428809325562, "grad_norm": 0.03527356684207916, "learning_rate": 2.916666666666667e-06, "logits/chosen": 3.929166555404663, "logits/rejected": 3.938020944595337, "logps/chosen": -1937.5999755859375, "logps/rejected": -1582.4000244140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 18.258333206176758, "rewards/margins": 23.65833282470703, "rewards/rejected": -5.389062404632568, "step": 70 }, { "epoch": 0.18734388009991673, "grad_norm": 0.024034911766648293, "learning_rate": 3.125e-06, "logits/chosen": 3.778125047683716, "logits/rejected": 3.7671875953674316, "logps/chosen": -2009.86669921875, "logps/rejected": -1654.4000244140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 19.09583282470703, "rewards/margins": 25.28333282470703, "rewards/rejected": -6.188541889190674, "step": 75 }, { "epoch": 0.19983347210657784, "grad_norm": 0.025018172338604927, "learning_rate": 3.3333333333333333e-06, "logits/chosen": 3.8463542461395264, "logits/rejected": 3.8359375, "logps/chosen": -2092.2666015625, "logps/rejected": -1736.5333251953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 19.879167556762695, "rewards/margins": 26.391666412353516, "rewards/rejected": -6.508333206176758, "step": 80 }, { "epoch": 0.21232306411323898, "grad_norm": 0.1799844652414322, "learning_rate": 3.5416666666666673e-06, "logits/chosen": 3.660937547683716, "logits/rejected": 3.6328125, "logps/chosen": -2101.333251953125, "logps/rejected": -1755.199951171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 20.102083206176758, "rewards/margins": 26.983333587646484, "rewards/rejected": -6.862500190734863, "step": 85 }, { "epoch": 0.2248126561199001, "grad_norm": 0.09295323491096497, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 3.882291555404663, "logits/rejected": 3.8661458492279053, "logps/chosen": -2164.533447265625, "logps/rejected": -1826.5333251953125, "loss": 0.0015, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 19.837499618530273, "rewards/margins": 27.637500762939453, "rewards/rejected": -7.805208206176758, "step": 90 }, { "epoch": 0.2373022481265612, "grad_norm": 0.004204944707453251, "learning_rate": 3.958333333333333e-06, "logits/chosen": 3.894270896911621, "logits/rejected": 3.875520944595337, "logps/chosen": -1963.199951171875, "logps/rejected": -1655.199951171875, "loss": 0.0017, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 18.54166603088379, "rewards/margins": 26.837499618530273, "rewards/rejected": -8.288541793823242, "step": 95 }, { "epoch": 0.2497918401332223, "grad_norm": 0.048603788018226624, "learning_rate": 4.166666666666667e-06, "logits/chosen": 3.921354055404663, "logits/rejected": 3.925520896911621, "logps/chosen": -2031.199951171875, "logps/rejected": -1718.933349609375, "loss": 0.0016, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 18.587499618530273, "rewards/margins": 27.987499237060547, "rewards/rejected": -9.398958206176758, "step": 100 }, { "epoch": 0.2497918401332223, "eval_logits/chosen": 3.922736167907715, "eval_logits/rejected": 4.058809280395508, "eval_logps/chosen": -1992.755859375, "eval_logps/rejected": -1681.7637939453125, "eval_loss": 0.0007825501379556954, "eval_rewards/accuracies": 0.999015748500824, "eval_rewards/chosen": 18.322341918945312, "eval_rewards/margins": 28.72933006286621, "eval_rewards/rejected": -10.410186767578125, "eval_runtime": 354.0873, "eval_samples_per_second": 5.713, "eval_steps_per_second": 0.359, "step": 100 }, { "epoch": 0.2622814321398834, "grad_norm": 0.0369204506278038, "learning_rate": 4.3750000000000005e-06, "logits/chosen": 3.8265624046325684, "logits/rejected": 3.9385416507720947, "logps/chosen": -2018.4000244140625, "logps/rejected": -1706.933349609375, "loss": 0.0016, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 19.227083206176758, "rewards/margins": 29.808332443237305, "rewards/rejected": -10.591666221618652, "step": 105 }, { "epoch": 0.27477102414654453, "grad_norm": 0.004119668155908585, "learning_rate": 4.583333333333333e-06, "logits/chosen": 3.9609375, "logits/rejected": 4.120312690734863, "logps/chosen": -2136.800048828125, "logps/rejected": -1816.5333251953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.649999618530273, "rewards/margins": 31.487499237060547, "rewards/rejected": -11.845833778381348, "step": 110 }, { "epoch": 0.28726061615320564, "grad_norm": 0.004336973652243614, "learning_rate": 4.791666666666668e-06, "logits/chosen": 3.9557292461395264, "logits/rejected": 4.090624809265137, "logps/chosen": -2115.466552734375, "logps/rejected": -1824.800048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.47083282470703, "rewards/margins": 31.47083282470703, "rewards/rejected": -13.002083778381348, "step": 115 }, { "epoch": 0.2997502081598668, "grad_norm": 0.011746798641979694, "learning_rate": 5e-06, "logits/chosen": 3.9713542461395264, "logits/rejected": 4.204166889190674, "logps/chosen": -2078.933349609375, "logps/rejected": -1798.4000244140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 17.797916412353516, "rewards/margins": 31.070833206176758, "rewards/rejected": -13.283333778381348, "step": 120 }, { "epoch": 0.3122398001665279, "grad_norm": 0.019313249737024307, "learning_rate": 4.999735579817769e-06, "logits/chosen": 3.953125, "logits/rejected": 4.2479166984558105, "logps/chosen": -1961.3333740234375, "logps/rejected": -1672.0, "loss": 0.0029, "rewards/accuracies": 0.9958333373069763, "rewards/chosen": 18.149999618530273, "rewards/margins": 31.941667556762695, "rewards/rejected": -13.795833587646484, "step": 125 }, { "epoch": 0.324729392173189, "grad_norm": 0.07502448558807373, "learning_rate": 4.998942375205502e-06, "logits/chosen": 4.006770610809326, "logits/rejected": 4.3182291984558105, "logps/chosen": -2085.333251953125, "logps/rejected": -1804.2667236328125, "loss": 0.0016, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 16.993749618530273, "rewards/margins": 31.887500762939453, "rewards/rejected": -14.893750190734863, "step": 130 }, { "epoch": 0.33721898417985013, "grad_norm": 0.030766427516937256, "learning_rate": 4.997620553954645e-06, "logits/chosen": 3.9973957538604736, "logits/rejected": 4.283854007720947, "logps/chosen": -2090.13330078125, "logps/rejected": -1820.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 17.149999618530273, "rewards/margins": 33.13750076293945, "rewards/rejected": -16.006250381469727, "step": 135 }, { "epoch": 0.34970857618651124, "grad_norm": 0.00906953401863575, "learning_rate": 4.995770395678171e-06, "logits/chosen": 3.9937500953674316, "logits/rejected": 4.267187595367432, "logps/chosen": -2218.13330078125, "logps/rejected": -1936.800048828125, "loss": 0.0015, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 17.758333206176758, "rewards/margins": 34.358333587646484, "rewards/rejected": -16.606250762939453, "step": 140 }, { "epoch": 0.36219816819317235, "grad_norm": 0.05252711847424507, "learning_rate": 4.993392291751431e-06, "logits/chosen": 3.9697916507720947, "logits/rejected": 4.3770833015441895, "logps/chosen": -2011.199951171875, "logps/rejected": -1744.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 16.858333587646484, "rewards/margins": 33.07083511352539, "rewards/rejected": -16.225000381469727, "step": 145 }, { "epoch": 0.37468776019983346, "grad_norm": 0.0009563255007378757, "learning_rate": 4.990486745229364e-06, "logits/chosen": 3.9588541984558105, "logits/rejected": 4.2421875, "logps/chosen": -2060.2666015625, "logps/rejected": -1802.933349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.51041603088379, "rewards/margins": 34.93333435058594, "rewards/rejected": -17.433332443237305, "step": 150 }, { "epoch": 0.37468776019983346, "eval_logits/chosen": 3.966412305831909, "eval_logits/rejected": 4.413877964019775, "eval_logps/chosen": -2014.6141357421875, "eval_logps/rejected": -1753.8897705078125, "eval_loss": 0.0007276261458173394, "eval_rewards/accuracies": 0.999015748500824, "eval_rewards/chosen": 16.152067184448242, "eval_rewards/margins": 33.754920959472656, "eval_rewards/rejected": -17.605806350708008, "eval_runtime": 353.8504, "eval_samples_per_second": 5.717, "eval_steps_per_second": 0.359, "step": 150 }, { "epoch": 0.3871773522064946, "grad_norm": 0.00035138276871293783, "learning_rate": 4.9870543707400835e-06, "logits/chosen": 4.015625, "logits/rejected": 4.403645992279053, "logps/chosen": -2007.7332763671875, "logps/rejected": -1773.066650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 16.077083587646484, "rewards/margins": 33.63750076293945, "rewards/rejected": -17.568750381469727, "step": 155 }, { "epoch": 0.3996669442131557, "grad_norm": 0.01966533623635769, "learning_rate": 4.983095894354858e-06, "logits/chosen": 3.9979166984558105, "logits/rejected": 4.403124809265137, "logps/chosen": -2036.2667236328125, "logps/rejected": -1789.3333740234375, "loss": 0.0016, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 15.366666793823242, "rewards/margins": 33.329166412353516, "rewards/rejected": -17.950000762939453, "step": 160 }, { "epoch": 0.4121565362198168, "grad_norm": 0.022654341533780098, "learning_rate": 4.978612153434527e-06, "logits/chosen": 3.992708444595337, "logits/rejected": 4.473437309265137, "logps/chosen": -1962.6666259765625, "logps/rejected": -1722.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.25, "rewards/margins": 32.91666793823242, "rewards/rejected": -17.643749237060547, "step": 165 }, { "epoch": 0.42464612822647796, "grad_norm": 0.01498562190681696, "learning_rate": 4.973604096452361e-06, "logits/chosen": 3.9619791507720947, "logits/rejected": 4.350520610809326, "logps/chosen": -2128.0, "logps/rejected": -1873.199951171875, "loss": 0.0015, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 17.016666412353516, "rewards/margins": 35.84583282470703, "rewards/rejected": -18.84583282470703, "step": 170 }, { "epoch": 0.43713572023313907, "grad_norm": 0.14902526140213013, "learning_rate": 4.968072782793436e-06, "logits/chosen": 3.8828125, "logits/rejected": 4.206250190734863, "logps/chosen": -2172.0, "logps/rejected": -1934.4000244140625, "loss": 0.0015, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 16.26041603088379, "rewards/margins": 35.599998474121094, "rewards/rejected": -19.31458282470703, "step": 175 }, { "epoch": 0.4496253122398002, "grad_norm": 0.021196195855736732, "learning_rate": 4.962019382530521e-06, "logits/chosen": 3.9286458492279053, "logits/rejected": 4.356249809265137, "logps/chosen": -2046.4000244140625, "logps/rejected": -1812.800048828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 16.28125, "rewards/margins": 35.162498474121094, "rewards/rejected": -18.860416412353516, "step": 180 }, { "epoch": 0.4621149042464613, "grad_norm": 0.0006863160524517298, "learning_rate": 4.955445176176577e-06, "logits/chosen": 4.022916793823242, "logits/rejected": 4.585416793823242, "logps/chosen": -2089.333251953125, "logps/rejected": -1849.86669921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 15.956250190734863, "rewards/margins": 36.108333587646484, "rewards/rejected": -20.170833587646484, "step": 185 }, { "epoch": 0.4746044962531224, "grad_norm": 0.060519758611917496, "learning_rate": 4.948351554413879e-06, "logits/chosen": 3.9661457538604736, "logits/rejected": 4.583333492279053, "logps/chosen": -2091.466552734375, "logps/rejected": -1858.4000244140625, "loss": 0.0015, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 16.024999618530273, "rewards/margins": 36.06666564941406, "rewards/rejected": -20.05208396911621, "step": 190 }, { "epoch": 0.4870940882597835, "grad_norm": 0.002069060690701008, "learning_rate": 4.9407400177998335e-06, "logits/chosen": 4.0880208015441895, "logits/rejected": 4.604687690734863, "logps/chosen": -2102.666748046875, "logps/rejected": -1866.13330078125, "loss": 0.0014, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 15.90625, "rewards/margins": 37.00833511352539, "rewards/rejected": -21.112499237060547, "step": 195 }, { "epoch": 0.4995836802664446, "grad_norm": 0.0009484069887548685, "learning_rate": 4.93261217644956e-06, "logits/chosen": 3.917187452316284, "logits/rejected": 4.394270896911621, "logps/chosen": -2110.933349609375, "logps/rejected": -1877.5999755859375, "loss": 0.0015, "rewards/accuracies": 0.9979166388511658, "rewards/chosen": 16.28125, "rewards/margins": 37.204166412353516, "rewards/rejected": -20.943750381469727, "step": 200 }, { "epoch": 0.4995836802664446, "eval_logits/chosen": 3.9514026641845703, "eval_logits/rejected": 4.566190719604492, "eval_logps/chosen": -2021.5433349609375, "eval_logps/rejected": -1789.1654052734375, "eval_loss": 0.0007149713928811252, "eval_rewards/accuracies": 0.999015748500824, "eval_rewards/chosen": 15.391486167907715, "eval_rewards/margins": 36.55905532836914, "eval_rewards/rejected": -21.160432815551758, "eval_runtime": 354.1005, "eval_samples_per_second": 5.713, "eval_steps_per_second": 0.359, "step": 200 } ], "logging_steps": 5, "max_steps": 1200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }