{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9964664310954063, "eval_steps": 100, "global_step": 282, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 917.2587478637695, "epoch": 0.0176678445229682, "grad_norm": 0.05332557484507561, "kl": 0.00013451576232910157, "learning_rate": 3.448275862068966e-06, "loss": 0.0289, "reward": 0.27843193283770235, "reward_std": 0.14517987524159254, "rewards/accuracy_reward": 0.21607143768342213, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.062360493757296354, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 847.1094116210937, "epoch": 0.0353356890459364, "grad_norm": 0.053962819278240204, "kl": 0.001752650737762451, "learning_rate": 6.896551724137932e-06, "loss": 0.0249, "reward": 0.4628627423895523, "reward_std": 0.15404398429673166, "rewards/accuracy_reward": 0.3459821574622765, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.11688058525323868, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 732.8753692626954, "epoch": 0.053003533568904596, "grad_norm": 0.04866955429315567, "kl": 0.010287094116210937, "learning_rate": 1.0344827586206898e-05, "loss": 0.0172, "reward": 0.8053571786731482, "reward_std": 0.18580246912315487, "rewards/accuracy_reward": 0.5641741327941417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2411830423399806, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 706.6745796203613, "epoch": 0.0706713780918728, "grad_norm": 0.051155973225831985, "kl": 0.02039794921875, "learning_rate": 1.3793103448275863e-05, "loss": 0.0149, "reward": 0.86157928109169, "reward_std": 0.1875305467285216, "rewards/accuracy_reward": 0.6122768152505159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24930245764553546, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 710.6496971130371, "epoch": 0.08833922261484099, "grad_norm": 0.05199262499809265, "kl": 0.02178192138671875, "learning_rate": 1.7241379310344828e-05, "loss": 0.0149, "reward": 0.8626953538507223, "reward_std": 0.17808558829128743, "rewards/accuracy_reward": 0.6143973529338836, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2482979955151677, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 714.621907043457, "epoch": 0.10600706713780919, "grad_norm": 0.05970175191760063, "kl": 0.02830352783203125, "learning_rate": 1.999923511388017e-05, "loss": 0.015, "reward": 0.860267897695303, "reward_std": 0.1923450878355652, "rewards/accuracy_reward": 0.6117187786847353, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24854911137372254, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 706.1554992675781, "epoch": 0.12367491166077739, "grad_norm": 0.064034104347229, "kl": 0.03692779541015625, "learning_rate": 1.9972476383747748e-05, "loss": 0.0133, "reward": 0.8735770478844642, "reward_std": 0.19234329210594298, "rewards/accuracy_reward": 0.625111635401845, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24846540708094836, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 695.877042388916, "epoch": 0.1413427561837456, "grad_norm": 0.09493730962276459, "kl": 0.0706024169921875, "learning_rate": 1.9907590277344582e-05, "loss": 0.0091, "reward": 0.854743342846632, "reward_std": 0.21510332841426133, "rewards/accuracy_reward": 0.6077009219676256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24704242013394834, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 704.042106628418, "epoch": 0.15901060070671377, "grad_norm": 0.06780606508255005, "kl": 0.129840087890625, "learning_rate": 1.9804824871166254e-05, "loss": 0.0127, "reward": 0.8354911088943482, "reward_std": 0.2541661562398076, "rewards/accuracy_reward": 0.5992187757045031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23627233169972897, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 723.2655464172364, "epoch": 0.17667844522968199, "grad_norm": 0.13506263494491577, "kl": 0.2400848388671875, "learning_rate": 1.9664573064143604e-05, "loss": 0.0784, "reward": 0.817243343964219, "reward_std": 0.28774973265826703, "rewards/accuracy_reward": 0.5887277062982321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.22851563505828382, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 732.3425590515137, "epoch": 0.19434628975265017, "grad_norm": 0.1647893786430359, "kl": 0.31966552734375, "learning_rate": 1.948737107548771e-05, "loss": 0.1074, "reward": 0.7718192338943481, "reward_std": 0.3275716399773955, "rewards/accuracy_reward": 0.5561384186148643, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21568081323057414, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 713.8921081542969, "epoch": 0.21201413427561838, "grad_norm": 0.14291171729564667, "kl": 0.58829345703125, "learning_rate": 1.9273896394584103e-05, "loss": 0.1926, "reward": 0.6627232447266579, "reward_std": 0.39091945756226776, "rewards/accuracy_reward": 0.47834823541343213, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.18437500866129994, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 610.0054931640625, "epoch": 0.22968197879858657, "grad_norm": 0.10107492655515671, "kl": 0.25198974609375, "learning_rate": 1.9024965190774262e-05, "loss": 0.1113, "reward": 0.8659877650439739, "reward_std": 0.27730538016185163, "rewards/accuracy_reward": 0.6375000283122063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.22848773319274188, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 578.1935543060303, "epoch": 0.24734982332155478, "grad_norm": 0.07951921969652176, "kl": 0.2242431640625, "learning_rate": 1.8741529192927528e-05, "loss": 0.0939, "reward": 0.8772321827709675, "reward_std": 0.26367146205157044, "rewards/accuracy_reward": 0.6401786014437676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23705358244478703, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 606.7115242004395, "epoch": 0.26501766784452296, "grad_norm": 0.09012111276388168, "kl": 0.3244873046875, "learning_rate": 1.8424672050733577e-05, "loss": 0.1685, "reward": 0.8198382072150707, "reward_std": 0.3401181472465396, "rewards/accuracy_reward": 0.5989955652505159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.22084264531731607, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 571.5734634399414, "epoch": 0.2826855123674912, "grad_norm": 0.06397970765829086, "kl": 0.164495849609375, "learning_rate": 1.8075605191627242e-05, "loss": 0.0724, "reward": 0.9197545051574707, "reward_std": 0.22428997224196792, "rewards/accuracy_reward": 0.6797991372644901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23995536621659994, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 606.1346244812012, "epoch": 0.3003533568904594, "grad_norm": 0.07042359560728073, "kl": 0.234246826171875, "learning_rate": 1.7695663189185703e-05, "loss": 0.0965, "reward": 0.8753627657890319, "reward_std": 0.25722263269126416, "rewards/accuracy_reward": 0.6383928876370192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23696987684816123, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 593.6537105560303, "epoch": 0.31802120141342755, "grad_norm": 0.06578419357538223, "kl": 0.224627685546875, "learning_rate": 1.7286298660705877e-05, "loss": 0.0928, "reward": 0.8682199046015739, "reward_std": 0.2509442804381251, "rewards/accuracy_reward": 0.6315848492085934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.236635054461658, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 563.4647586822509, "epoch": 0.33568904593639576, "grad_norm": 0.06993943452835083, "kl": 0.238482666015625, "learning_rate": 1.6849076713469914e-05, "loss": 0.1089, "reward": 0.8941406659781933, "reward_std": 0.2569141635671258, "rewards/accuracy_reward": 0.6537946693599224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24034599158912898, "step": 95 }, { "epoch": 0.35335689045936397, "grad_norm": 0.06800527125597, "learning_rate": 1.6385668960932143e-05, "loss": 0.0909, "step": 100 }, { "epoch": 0.35335689045936397, "eval_clip_ratio": 0.0, "eval_completion_length": 590.76025390625, "eval_kl": 0.19091796875, "eval_loss": 0.074989914894104, "eval_reward": 0.9082031697034836, "eval_reward_std": 0.2306336797773838, "eval_rewards/accuracy_reward": 0.667410746216774, "eval_rewards/format_reward": 0.0, "eval_rewards/tag_count_reward": 0.2407924234867096, "eval_runtime": 131.5321, "eval_samples_per_second": 0.753, "eval_steps_per_second": 0.008, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 586.0422695159912, "epoch": 0.3710247349823322, "grad_norm": 0.07303127646446228, "kl": 0.2162933349609375, "learning_rate": 1.5897847131705194e-05, "loss": 0.1027, "reward": 0.8979353092610836, "reward_std": 0.25196963488124313, "rewards/accuracy_reward": 0.6575335115194321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2404017962515354, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 577.3520366668702, "epoch": 0.38869257950530034, "grad_norm": 0.07516764849424362, "kl": 0.196807861328125, "learning_rate": 1.5387476295779737e-05, "loss": 0.0907, "reward": 0.9079520504921674, "reward_std": 0.252688442543149, "rewards/accuracy_reward": 0.6669643156230449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24098773431032897, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 578.9473468780518, "epoch": 0.40636042402826855, "grad_norm": 0.0895974189043045, "kl": 0.27685546875, "learning_rate": 1.4856507733875837e-05, "loss": 0.1298, "reward": 0.8826172292232514, "reward_std": 0.28199427481740713, "rewards/accuracy_reward": 0.6393973510712385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24321987628936767, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 564.4585056304932, "epoch": 0.42402826855123676, "grad_norm": 0.07208839058876038, "kl": 0.219805908203125, "learning_rate": 1.4306971477188223e-05, "loss": 0.0953, "reward": 0.900697585940361, "reward_std": 0.25064537590369584, "rewards/accuracy_reward": 0.6555803909897804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2451171986758709, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 619.9358505249023, "epoch": 0.4416961130742049, "grad_norm": 0.08019658923149109, "kl": 0.235772705078125, "learning_rate": 1.3740968546047935e-05, "loss": 0.105, "reward": 0.8835379853844643, "reward_std": 0.2642295270226896, "rewards/accuracy_reward": 0.6447544928640128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23878349252045156, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 602.9432163238525, "epoch": 0.45936395759717313, "grad_norm": 0.07495882362127304, "kl": 0.228643798828125, "learning_rate": 1.3160662917174045e-05, "loss": 0.1076, "reward": 0.9091239273548126, "reward_std": 0.2478904782794416, "rewards/accuracy_reward": 0.6672991372644901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24182478711009026, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 571.6796016693115, "epoch": 0.47703180212014135, "grad_norm": 0.08212456852197647, "kl": 0.197271728515625, "learning_rate": 1.2568273250226681e-05, "loss": 0.11, "reward": 0.9201730333268643, "reward_std": 0.24961013691499828, "rewards/accuracy_reward": 0.6765625298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24361050203442575, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 570.3125244140625, "epoch": 0.49469964664310956, "grad_norm": 0.07115107774734497, "kl": 0.2134033203125, "learning_rate": 1.1966064405292887e-05, "loss": 0.1111, "reward": 0.9121094167232513, "reward_std": 0.25566268572583795, "rewards/accuracy_reward": 0.6696428872644902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24246652834117413, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 599.4550518035888, "epoch": 0.5123674911660777, "grad_norm": 0.07841041684150696, "kl": 0.18359375, "learning_rate": 1.1356338783736256e-05, "loss": 0.0858, "reward": 0.9083984784781933, "reward_std": 0.22585448753088713, "rewards/accuracy_reward": 0.6655134223401546, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24288505520671605, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 589.7945564270019, "epoch": 0.5300353356890459, "grad_norm": 0.07215780019760132, "kl": 0.148455810546875, "learning_rate": 1.0741427525516463e-05, "loss": 0.0668, "reward": 0.9342076353728771, "reward_std": 0.22251177271828054, "rewards/accuracy_reward": 0.6893973518162966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2448102779686451, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 577.0310535430908, "epoch": 0.5477031802120141, "grad_norm": 0.0739278495311737, "kl": 0.190936279296875, "learning_rate": 1.012368159663363e-05, "loss": 0.098, "reward": 0.9136719211935997, "reward_std": 0.2538995613344014, "rewards/accuracy_reward": 0.6722098514437675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2414620641618967, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 565.5535991668701, "epoch": 0.5653710247349824, "grad_norm": 0.08016849309206009, "kl": 0.1893310546875, "learning_rate": 9.505462800772612e-06, "loss": 0.0997, "reward": 0.9155971385538578, "reward_std": 0.2556414810940623, "rewards/accuracy_reward": 0.6753348506987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24026228599250316, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 584.8990207672119, "epoch": 0.5830388692579506, "grad_norm": 0.07024497538805008, "kl": 0.178204345703125, "learning_rate": 8.889134749511956e-06, "loss": 0.0891, "reward": 0.9097935684025288, "reward_std": 0.2374617088586092, "rewards/accuracy_reward": 0.668080385774374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24171318039298056, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 617.6932182312012, "epoch": 0.6007067137809188, "grad_norm": 0.06575129926204681, "kl": 0.177227783203125, "learning_rate": 8.277053825620836e-06, "loss": 0.0803, "reward": 0.8907087415456771, "reward_std": 0.24254968659952283, "rewards/accuracy_reward": 0.6465402048081159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24416853692382573, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 592.3628578186035, "epoch": 0.6183745583038869, "grad_norm": 0.07182252407073975, "kl": 0.195758056640625, "learning_rate": 7.671560173993588e-06, "loss": 0.0894, "reward": 0.9130301713943482, "reward_std": 0.23459300631657243, "rewards/accuracy_reward": 0.665736635401845, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24729353692382575, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 559.4955589294434, "epoch": 0.6360424028268551, "grad_norm": 0.09263365715742111, "kl": 0.205853271484375, "learning_rate": 7.07496875466589e-06, "loss": 0.0981, "reward": 0.9218192398548126, "reward_std": 0.23992226729169489, "rewards/accuracy_reward": 0.6737723559141159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24804688580334186, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 577.3553829193115, "epoch": 0.6537102473498233, "grad_norm": 0.07366114854812622, "kl": 0.200531005859375, "learning_rate": 6.489560492119225e-06, "loss": 0.096, "reward": 0.9261161163449287, "reward_std": 0.2363728086464107, "rewards/accuracy_reward": 0.6770089607685804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2491071542724967, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 593.0000240325928, "epoch": 0.6713780918727915, "grad_norm": 0.06559525430202484, "kl": 0.21243896484375, "learning_rate": 5.9175735547120975e-06, "loss": 0.0995, "reward": 0.9242466956377029, "reward_std": 0.24610999850556253, "rewards/accuracy_reward": 0.6748884223401547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24935827050358056, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 587.670785522461, "epoch": 0.6890459363957597, "grad_norm": 0.09491094201803207, "kl": 0.19505615234375, "learning_rate": 5.361194797579108e-06, "loss": 0.0807, "reward": 0.9143694624304771, "reward_std": 0.2263300753198564, "rewards/accuracy_reward": 0.667410746961832, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24695871509611605, "step": 195 }, { "epoch": 0.7067137809187279, "grad_norm": 0.07209863513708115, "learning_rate": 4.8225514017138205e-06, "loss": 0.1019, "step": 200 }, { "epoch": 0.7067137809187279, "eval_clip_ratio": 0.0, "eval_completion_length": 597.6492767333984, "eval_kl": 0.2119140625, "eval_loss": 0.09467914700508118, "eval_reward": 0.9132254868745804, "eval_reward_std": 0.22664131596684456, "eval_rewards/accuracy_reward": 0.6685268133878708, "eval_rewards/format_reward": 0.0, "eval_rewards/tag_count_reward": 0.2446986697614193, "eval_runtime": 131.7381, "eval_samples_per_second": 0.751, "eval_steps_per_second": 0.008, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 593.5689434051513, "epoch": 0.7243816254416962, "grad_norm": 0.06886138767004013, "kl": 0.2204437255859375, "learning_rate": 4.303702741201431e-06, "loss": 0.0962, "reward": 0.9143136594444513, "reward_std": 0.2504740643315017, "rewards/accuracy_reward": 0.6694754760712385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24483818002045155, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 590.6283740997314, "epoch": 0.7420494699646644, "grad_norm": 0.07291878759860992, "kl": 0.195550537109375, "learning_rate": 3.8066325096949153e-06, "loss": 0.0884, "reward": 0.914955398440361, "reward_std": 0.23502108193933963, "rewards/accuracy_reward": 0.666517884656787, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2484375100582838, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 573.4894218444824, "epoch": 0.7597173144876325, "grad_norm": 0.07844824343919754, "kl": 0.20074462890625, "learning_rate": 3.3332411362372063e-06, "loss": 0.0823, "reward": 0.9080915614962578, "reward_std": 0.22488524662330747, "rewards/accuracy_reward": 0.658370564877987, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24972099326550962, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 572.3277076721191, "epoch": 0.7773851590106007, "grad_norm": 0.06696169078350067, "kl": 0.189166259765625, "learning_rate": 2.8853385194256677e-06, "loss": 0.0805, "reward": 0.9161830775439739, "reward_std": 0.22096735332161188, "rewards/accuracy_reward": 0.6659598544239997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25022322554141285, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 591.3379741668701, "epoch": 0.7950530035335689, "grad_norm": 0.1020839512348175, "kl": 0.213531494140625, "learning_rate": 2.464637107698046e-06, "loss": 0.0864, "reward": 0.9009207993745804, "reward_std": 0.23703113198280334, "rewards/accuracy_reward": 0.6501116339117289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25080916434526446, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 579.4515911102295, "epoch": 0.8127208480565371, "grad_norm": 0.08135361969470978, "kl": 0.2225341796875, "learning_rate": 2.072745352195794e-06, "loss": 0.0975, "reward": 0.9147600896656514, "reward_std": 0.23770583430305123, "rewards/accuracy_reward": 0.6649553872644901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24980470035225152, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 573.1327266693115, "epoch": 0.8303886925795053, "grad_norm": 0.07584529370069504, "kl": 0.2205078125, "learning_rate": 1.7111615572361628e-06, "loss": 0.1078, "reward": 0.9006138816475868, "reward_std": 0.24386549470946192, "rewards/accuracy_reward": 0.6527902077883482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24782367032021285, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 574.2280414581298, "epoch": 0.8480565371024735, "grad_norm": 0.07971746474504471, "kl": 0.22808837890625, "learning_rate": 1.381268151904298e-06, "loss": 0.1048, "reward": 0.889174147695303, "reward_std": 0.24307329086586832, "rewards/accuracy_reward": 0.641852705180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24732143860310316, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 580.1797130584716, "epoch": 0.8657243816254417, "grad_norm": 0.0819220319390297, "kl": 0.2184326171875, "learning_rate": 1.0843264046665558e-06, "loss": 0.1038, "reward": 0.9068638809025288, "reward_std": 0.24303168477490544, "rewards/accuracy_reward": 0.6598214615136385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24704242143779992, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 586.1197792053223, "epoch": 0.8833922261484098, "grad_norm": 0.07211937755346298, "kl": 0.21861572265625, "learning_rate": 8.214716012124491e-07, "loss": 0.0972, "reward": 0.8969587460160255, "reward_std": 0.24261261774227022, "rewards/accuracy_reward": 0.6494419921189547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24751675240695475, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 581.7469051361084, "epoch": 0.901060070671378, "grad_norm": 0.07039004564285278, "kl": 0.1945556640625, "learning_rate": 5.937087039615619e-07, "loss": 0.0798, "reward": 0.9000837452709675, "reward_std": 0.22726329311262816, "rewards/accuracy_reward": 0.6511161029338837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24896764568984509, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 568.4496898651123, "epoch": 0.9187279151943463, "grad_norm": 0.07688862830400467, "kl": 0.1930908203125, "learning_rate": 4.019085098303077e-07, "loss": 0.0886, "reward": 0.925837092846632, "reward_std": 0.229900080896914, "rewards/accuracy_reward": 0.6780134219676256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24782367069274186, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 585.3359642028809, "epoch": 0.9363957597173145, "grad_norm": 0.07333354651927948, "kl": 0.199957275390625, "learning_rate": 2.4680432094837394e-07, "loss": 0.0913, "reward": 0.9235212452709675, "reward_std": 0.2362320901826024, "rewards/accuracy_reward": 0.6742187783122062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2493024656549096, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 589.8091770172119, "epoch": 0.9540636042402827, "grad_norm": 0.07898814976215363, "kl": 0.199560546875, "learning_rate": 1.289891410535593e-07, "loss": 0.0874, "reward": 0.9116071857511997, "reward_std": 0.22943277563899755, "rewards/accuracy_reward": 0.6623884223401546, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.249218762293458, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 589.4594024658203, "epoch": 0.9717314487632509, "grad_norm": 0.07111751288175583, "kl": 0.20418701171875, "learning_rate": 4.8913408283934874e-08, "loss": 0.0867, "reward": 0.9023716941475868, "reward_std": 0.23179991263896227, "rewards/accuracy_reward": 0.6522321693599225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25013952106237414, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 583.0817214965821, "epoch": 0.9893992932862191, "grad_norm": 0.07049054652452469, "kl": 0.199383544921875, "learning_rate": 6.883273035447335e-09, "loss": 0.0902, "reward": 0.9261998198926449, "reward_std": 0.22382794730365277, "rewards/accuracy_reward": 0.6766741391271353, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24952568095177413, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 574.5289487838745, "epoch": 0.9964664310954063, "kl": 0.199249267578125, "reward": 0.9313616473227739, "reward_std": 0.22618725849315524, "rewards/accuracy_reward": 0.6813616417348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2500000102445483, "step": 282, "total_flos": 0.0, "train_loss": 0.08527955120918176, "train_runtime": 119365.0462, "train_samples_per_second": 0.607, "train_steps_per_second": 0.002 } ], "logging_steps": 5, "max_steps": 283, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }