{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5479962808649566, "eval_steps": 500, "global_step": 34000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3893316108733416, "epoch": 0.0003747239338518576, "grad_norm": 3.2205183506011963, "learning_rate": 0.0002, "loss": 2.481, "mean_token_accuracy": 0.6302581503987312, "num_tokens": 1455574.0, "step": 5 }, { "entropy": 2.3604774422943593, "epoch": 0.0007494478677037152, "grad_norm": 1.0569955110549927, "learning_rate": 0.0002, "loss": 1.594, "mean_token_accuracy": 0.6757766727358103, "num_tokens": 2853982.0, "step": 10 }, { "entropy": 2.596837738901377, "epoch": 0.0011241718015555728, "grad_norm": 0.45325878262519836, "learning_rate": 0.0002, "loss": 1.3825, "mean_token_accuracy": 0.6996057916432619, "num_tokens": 4359458.0, "step": 15 }, { "entropy": 2.603879726678133, "epoch": 0.0014988957354074304, "grad_norm": 0.38502994179725647, "learning_rate": 0.0002, "loss": 1.2134, "mean_token_accuracy": 0.723450992628932, "num_tokens": 5822270.0, "step": 20 }, { "entropy": 2.411510304361582, "epoch": 0.001873619669259288, "grad_norm": 0.3255424201488495, "learning_rate": 0.0002, "loss": 1.1449, "mean_token_accuracy": 0.7311726346611976, "num_tokens": 7270846.0, "step": 25 }, { "entropy": 1.7609096560627222, "epoch": 0.0022483436031111455, "grad_norm": 0.34361711144447327, "learning_rate": 0.0002, "loss": 1.0786, "mean_token_accuracy": 0.7423146508634091, "num_tokens": 8736337.0, "step": 30 }, { "entropy": 1.2919814597815276, "epoch": 0.002623067536963003, "grad_norm": 0.2850675880908966, "learning_rate": 0.0002, "loss": 1.0947, "mean_token_accuracy": 0.738439654558897, "num_tokens": 10208555.0, "step": 35 }, { "entropy": 1.080675508826971, "epoch": 0.0029977914708148607, "grad_norm": 0.36652302742004395, "learning_rate": 0.0002, "loss": 1.0391, "mean_token_accuracy": 0.7495698932558298, "num_tokens": 11678333.0, "step": 40 }, { "entropy": 1.0725676577538252, "epoch": 0.003372515404666718, "grad_norm": 0.31147828698158264, "learning_rate": 0.0002, "loss": 1.0367, "mean_token_accuracy": 0.7505599021911621, "num_tokens": 13086047.0, "step": 45 }, { "entropy": 1.0891018833965063, "epoch": 0.003747239338518576, "grad_norm": 0.3285379409790039, "learning_rate": 0.0002, "loss": 1.0466, "mean_token_accuracy": 0.7446779947727918, "num_tokens": 14601873.0, "step": 50 }, { "entropy": 1.1071042962372304, "epoch": 0.004121963272370434, "grad_norm": 0.3067648112773895, "learning_rate": 0.0002, "loss": 1.0267, "mean_token_accuracy": 0.7491683695465327, "num_tokens": 16031697.0, "step": 55 }, { "entropy": 1.1180668529123068, "epoch": 0.004496687206222291, "grad_norm": 0.2837771475315094, "learning_rate": 0.0002, "loss": 1.0209, "mean_token_accuracy": 0.7530738033354283, "num_tokens": 17524575.0, "step": 60 }, { "entropy": 1.1108239632099868, "epoch": 0.004871411140074148, "grad_norm": 0.2912568151950836, "learning_rate": 0.0002, "loss": 1.0145, "mean_token_accuracy": 0.7516566962003708, "num_tokens": 19027392.0, "step": 65 }, { "entropy": 1.085039308667183, "epoch": 0.005246135073926006, "grad_norm": 0.33241885900497437, "learning_rate": 0.0002, "loss": 1.0218, "mean_token_accuracy": 0.752640512958169, "num_tokens": 20520111.0, "step": 70 }, { "entropy": 1.0589611357077957, "epoch": 0.005620859007777864, "grad_norm": 0.3116655945777893, "learning_rate": 0.0002, "loss": 0.9944, "mean_token_accuracy": 0.75811633951962, "num_tokens": 22018800.0, "step": 75 }, { "entropy": 1.0268964029848575, "epoch": 0.005995582941629721, "grad_norm": 0.25950562953948975, "learning_rate": 0.0002, "loss": 0.9679, "mean_token_accuracy": 0.7602423522621393, "num_tokens": 23509569.0, "step": 80 }, { "entropy": 0.9822440635412931, "epoch": 0.006370306875481579, "grad_norm": 0.34652596712112427, "learning_rate": 0.0002, "loss": 0.9767, "mean_token_accuracy": 0.7599799651652575, "num_tokens": 25010706.0, "step": 85 }, { "entropy": 0.9954451229423285, "epoch": 0.006745030809333436, "grad_norm": 0.27451908588409424, "learning_rate": 0.0002, "loss": 1.0145, "mean_token_accuracy": 0.7515401769429445, "num_tokens": 26499792.0, "step": 90 }, { "entropy": 0.9626753572374582, "epoch": 0.0071197547431852935, "grad_norm": 0.4042084813117981, "learning_rate": 0.0002, "loss": 1.0017, "mean_token_accuracy": 0.7537775810807943, "num_tokens": 28030320.0, "step": 95 }, { "entropy": 0.9011292217299343, "epoch": 0.007494478677037152, "grad_norm": 0.27968496084213257, "learning_rate": 0.0002, "loss": 0.9438, "mean_token_accuracy": 0.7628859847784042, "num_tokens": 29474672.0, "step": 100 }, { "entropy": 0.8972558995708824, "epoch": 0.007869202610889008, "grad_norm": 0.259904682636261, "learning_rate": 0.0002, "loss": 0.961, "mean_token_accuracy": 0.75872584246099, "num_tokens": 30942973.0, "step": 105 }, { "entropy": 0.8857051214203239, "epoch": 0.008243926544740867, "grad_norm": 0.24059340357780457, "learning_rate": 0.0002, "loss": 0.9563, "mean_token_accuracy": 0.7612129136919975, "num_tokens": 32396363.0, "step": 110 }, { "entropy": 0.8968787910416722, "epoch": 0.008618650478592725, "grad_norm": 0.24934181571006775, "learning_rate": 0.0002, "loss": 0.9655, "mean_token_accuracy": 0.759117328748107, "num_tokens": 33845111.0, "step": 115 }, { "entropy": 0.9285211838781834, "epoch": 0.008993374412444582, "grad_norm": 0.2618580162525177, "learning_rate": 0.0002, "loss": 0.9779, "mean_token_accuracy": 0.7587591409683228, "num_tokens": 35291631.0, "step": 120 }, { "entropy": 0.940931336954236, "epoch": 0.00936809834629644, "grad_norm": 0.2406407743692398, "learning_rate": 0.0002, "loss": 0.9878, "mean_token_accuracy": 0.7533375348895788, "num_tokens": 36759743.0, "step": 125 }, { "entropy": 0.914332109503448, "epoch": 0.009742822280148297, "grad_norm": 0.2866421043872833, "learning_rate": 0.0002, "loss": 0.966, "mean_token_accuracy": 0.7578140415251255, "num_tokens": 38220066.0, "step": 130 }, { "entropy": 0.8984072390943766, "epoch": 0.010117546214000154, "grad_norm": 0.2523578405380249, "learning_rate": 0.0002, "loss": 0.9669, "mean_token_accuracy": 0.759901924803853, "num_tokens": 39684248.0, "step": 135 }, { "entropy": 0.8794408267363906, "epoch": 0.010492270147852012, "grad_norm": 0.24808499217033386, "learning_rate": 0.0002, "loss": 0.9598, "mean_token_accuracy": 0.7619973722845316, "num_tokens": 41132884.0, "step": 140 }, { "entropy": 0.8870571108534933, "epoch": 0.010866994081703869, "grad_norm": 0.27936556935310364, "learning_rate": 0.0002, "loss": 0.9576, "mean_token_accuracy": 0.7627144370228052, "num_tokens": 42542591.0, "step": 145 }, { "entropy": 0.8856921583414078, "epoch": 0.011241718015555728, "grad_norm": 0.2550513744354248, "learning_rate": 0.0002, "loss": 0.9518, "mean_token_accuracy": 0.7632503002882004, "num_tokens": 44000647.0, "step": 150 }, { "entropy": 0.909747464209795, "epoch": 0.011616441949407585, "grad_norm": 0.2570670247077942, "learning_rate": 0.0002, "loss": 0.9569, "mean_token_accuracy": 0.7590698093175888, "num_tokens": 45465394.0, "step": 155 }, { "entropy": 0.9125672746449709, "epoch": 0.011991165883259443, "grad_norm": 0.227640300989151, "learning_rate": 0.0002, "loss": 0.972, "mean_token_accuracy": 0.7582723300904035, "num_tokens": 46932975.0, "step": 160 }, { "entropy": 0.9326373111456633, "epoch": 0.0123658898171113, "grad_norm": 0.2532043159008026, "learning_rate": 0.0002, "loss": 0.9781, "mean_token_accuracy": 0.7567741934210062, "num_tokens": 48429614.0, "step": 165 }, { "entropy": 0.8952477537095547, "epoch": 0.012740613750963158, "grad_norm": 0.31653285026550293, "learning_rate": 0.0002, "loss": 0.9588, "mean_token_accuracy": 0.7601167216897011, "num_tokens": 49890270.0, "step": 170 }, { "entropy": 0.8711420681327582, "epoch": 0.013115337684815015, "grad_norm": 0.24041110277175903, "learning_rate": 0.0002, "loss": 0.9572, "mean_token_accuracy": 0.758920069038868, "num_tokens": 51381948.0, "step": 175 }, { "entropy": 0.8636878512799739, "epoch": 0.013490061618666872, "grad_norm": 0.22928392887115479, "learning_rate": 0.0002, "loss": 0.9814, "mean_token_accuracy": 0.7587152201682329, "num_tokens": 52855237.0, "step": 180 }, { "entropy": 0.8045509492978453, "epoch": 0.01386478555251873, "grad_norm": 0.21791502833366394, "learning_rate": 0.0002, "loss": 0.9239, "mean_token_accuracy": 0.7704681549221277, "num_tokens": 54310019.0, "step": 185 }, { "entropy": 0.8124296953901649, "epoch": 0.014239509486370587, "grad_norm": 0.2366018146276474, "learning_rate": 0.0002, "loss": 0.9339, "mean_token_accuracy": 0.7643769532442093, "num_tokens": 55777192.0, "step": 190 }, { "entropy": 0.8066996375098825, "epoch": 0.014614233420222446, "grad_norm": 0.24702267348766327, "learning_rate": 0.0002, "loss": 0.9412, "mean_token_accuracy": 0.7606352269649506, "num_tokens": 57230016.0, "step": 195 }, { "entropy": 0.8075136210769415, "epoch": 0.014988957354074304, "grad_norm": 0.2174111306667328, "learning_rate": 0.0002, "loss": 0.929, "mean_token_accuracy": 0.7674362812191248, "num_tokens": 58662424.0, "step": 200 }, { "entropy": 0.8647322719916701, "epoch": 0.015363681287926161, "grad_norm": 0.22126944363117218, "learning_rate": 0.0002, "loss": 0.9723, "mean_token_accuracy": 0.7580762289464473, "num_tokens": 60171871.0, "step": 205 }, { "entropy": 0.8225537849590182, "epoch": 0.015738405221778017, "grad_norm": 0.24800258874893188, "learning_rate": 0.0002, "loss": 0.9242, "mean_token_accuracy": 0.7680390790104866, "num_tokens": 61619877.0, "step": 210 }, { "entropy": 0.8443031895905733, "epoch": 0.016113129155629877, "grad_norm": 0.25594648718833923, "learning_rate": 0.0002, "loss": 0.9491, "mean_token_accuracy": 0.76283875182271, "num_tokens": 63088784.0, "step": 215 }, { "entropy": 0.8095832044258714, "epoch": 0.016487853089481735, "grad_norm": 0.2542487680912018, "learning_rate": 0.0002, "loss": 0.9187, "mean_token_accuracy": 0.7669479124248028, "num_tokens": 64534089.0, "step": 220 }, { "entropy": 0.817254213988781, "epoch": 0.016862577023333592, "grad_norm": 0.23247107863426208, "learning_rate": 0.0002, "loss": 0.9336, "mean_token_accuracy": 0.7631811764091253, "num_tokens": 65977483.0, "step": 225 }, { "entropy": 0.8123004641383886, "epoch": 0.01723730095718545, "grad_norm": 0.25194257497787476, "learning_rate": 0.0002, "loss": 0.9279, "mean_token_accuracy": 0.7662997681647539, "num_tokens": 67454337.0, "step": 230 }, { "entropy": 0.837943134456873, "epoch": 0.017612024891037307, "grad_norm": 0.2762316167354584, "learning_rate": 0.0002, "loss": 0.946, "mean_token_accuracy": 0.7626963317394256, "num_tokens": 68912499.0, "step": 235 }, { "entropy": 0.8126986065879465, "epoch": 0.017986748824889164, "grad_norm": 0.2535557746887207, "learning_rate": 0.0002, "loss": 0.9232, "mean_token_accuracy": 0.767506480962038, "num_tokens": 70378729.0, "step": 240 }, { "entropy": 0.7933407505974174, "epoch": 0.01836147275874102, "grad_norm": 0.2693343758583069, "learning_rate": 0.0002, "loss": 0.9239, "mean_token_accuracy": 0.7657601397484541, "num_tokens": 71800625.0, "step": 245 }, { "entropy": 0.8099238703027367, "epoch": 0.01873619669259288, "grad_norm": 0.22669032216072083, "learning_rate": 0.0002, "loss": 0.9355, "mean_token_accuracy": 0.7668153390288353, "num_tokens": 73258519.0, "step": 250 }, { "entropy": 0.7861788280308246, "epoch": 0.019110920626444736, "grad_norm": 0.22872091829776764, "learning_rate": 0.0002, "loss": 0.9219, "mean_token_accuracy": 0.7650986984372139, "num_tokens": 74712520.0, "step": 255 }, { "entropy": 0.7878014603629708, "epoch": 0.019485644560296594, "grad_norm": 0.24544259905815125, "learning_rate": 0.0002, "loss": 0.9223, "mean_token_accuracy": 0.7632117461413145, "num_tokens": 76172031.0, "step": 260 }, { "entropy": 0.8003201503306627, "epoch": 0.01986036849414845, "grad_norm": 0.26402008533477783, "learning_rate": 0.0002, "loss": 0.9252, "mean_token_accuracy": 0.7675170581787825, "num_tokens": 77675436.0, "step": 265 }, { "entropy": 0.7843868507072329, "epoch": 0.02023509242800031, "grad_norm": 0.24159960448741913, "learning_rate": 0.0002, "loss": 0.9028, "mean_token_accuracy": 0.7698289453983307, "num_tokens": 79145046.0, "step": 270 }, { "entropy": 0.8070292085409164, "epoch": 0.020609816361852166, "grad_norm": 0.2255016565322876, "learning_rate": 0.0002, "loss": 0.9389, "mean_token_accuracy": 0.7676650535315275, "num_tokens": 80580749.0, "step": 275 }, { "entropy": 0.8299927279353142, "epoch": 0.020984540295704023, "grad_norm": 0.23554107546806335, "learning_rate": 0.0002, "loss": 0.9434, "mean_token_accuracy": 0.7627578850835561, "num_tokens": 82065650.0, "step": 280 }, { "entropy": 0.8075728833675384, "epoch": 0.02135926422955588, "grad_norm": 0.25705021619796753, "learning_rate": 0.0002, "loss": 0.9419, "mean_token_accuracy": 0.7653288055211306, "num_tokens": 83531161.0, "step": 285 }, { "entropy": 0.8051820514723659, "epoch": 0.021733988163407738, "grad_norm": 0.25123974680900574, "learning_rate": 0.0002, "loss": 0.9251, "mean_token_accuracy": 0.7668902698904276, "num_tokens": 85010086.0, "step": 290 }, { "entropy": 0.8034495690837502, "epoch": 0.022108712097259595, "grad_norm": 0.2486058622598648, "learning_rate": 0.0002, "loss": 0.9356, "mean_token_accuracy": 0.7635506249964237, "num_tokens": 86445918.0, "step": 295 }, { "entropy": 0.7715813240036369, "epoch": 0.022483436031111456, "grad_norm": 0.24203544855117798, "learning_rate": 0.0002, "loss": 0.9137, "mean_token_accuracy": 0.7650374840945006, "num_tokens": 87889087.0, "step": 300 }, { "entropy": 0.7730180829763412, "epoch": 0.022858159964963314, "grad_norm": 0.24872373044490814, "learning_rate": 0.0002, "loss": 0.9113, "mean_token_accuracy": 0.76662175655365, "num_tokens": 89387021.0, "step": 305 }, { "entropy": 0.7637225350365042, "epoch": 0.02323288389881517, "grad_norm": 0.2170420140028, "learning_rate": 0.0002, "loss": 0.8968, "mean_token_accuracy": 0.7699498858302831, "num_tokens": 90853965.0, "step": 310 }, { "entropy": 0.7987107900902629, "epoch": 0.02360760783266703, "grad_norm": 0.22875702381134033, "learning_rate": 0.0002, "loss": 0.9342, "mean_token_accuracy": 0.7632717840373516, "num_tokens": 92347787.0, "step": 315 }, { "entropy": 0.7454060694202781, "epoch": 0.023982331766518886, "grad_norm": 0.20397421717643738, "learning_rate": 0.0002, "loss": 0.8935, "mean_token_accuracy": 0.7717408258467913, "num_tokens": 93786114.0, "step": 320 }, { "entropy": 0.7863677745684982, "epoch": 0.024357055700370743, "grad_norm": 0.20915848016738892, "learning_rate": 0.0002, "loss": 0.9351, "mean_token_accuracy": 0.7642316803336143, "num_tokens": 95292650.0, "step": 325 }, { "entropy": 0.7310505252331495, "epoch": 0.0247317796342226, "grad_norm": 0.2310132384300232, "learning_rate": 0.0002, "loss": 0.8723, "mean_token_accuracy": 0.7747792974114418, "num_tokens": 96746931.0, "step": 330 }, { "entropy": 0.7534694505855442, "epoch": 0.025106503568074458, "grad_norm": 0.23855732381343842, "learning_rate": 0.0002, "loss": 0.9093, "mean_token_accuracy": 0.7689717803150415, "num_tokens": 98215363.0, "step": 335 }, { "entropy": 0.7878096560016274, "epoch": 0.025481227501926315, "grad_norm": 0.23160603642463684, "learning_rate": 0.0002, "loss": 0.9361, "mean_token_accuracy": 0.7637045536190271, "num_tokens": 99710645.0, "step": 340 }, { "entropy": 0.7732148427516222, "epoch": 0.025855951435778173, "grad_norm": 0.21337415277957916, "learning_rate": 0.0002, "loss": 0.9183, "mean_token_accuracy": 0.7673635438084603, "num_tokens": 101155088.0, "step": 345 }, { "entropy": 0.7719780545681715, "epoch": 0.02623067536963003, "grad_norm": 0.21319876611232758, "learning_rate": 0.0002, "loss": 0.9188, "mean_token_accuracy": 0.7667896158993244, "num_tokens": 102653977.0, "step": 350 }, { "entropy": 0.7450987994670868, "epoch": 0.026605399303481887, "grad_norm": 0.2299962043762207, "learning_rate": 0.0002, "loss": 0.8936, "mean_token_accuracy": 0.7700511202216148, "num_tokens": 104074890.0, "step": 355 }, { "entropy": 0.7500674979761243, "epoch": 0.026980123237333745, "grad_norm": 0.22948752343654633, "learning_rate": 0.0002, "loss": 0.9003, "mean_token_accuracy": 0.7707898557186127, "num_tokens": 105527930.0, "step": 360 }, { "entropy": 0.7375289656221866, "epoch": 0.027354847171185602, "grad_norm": 0.242097407579422, "learning_rate": 0.0002, "loss": 0.8854, "mean_token_accuracy": 0.7708391886204481, "num_tokens": 106965728.0, "step": 365 }, { "entropy": 0.7519961211830377, "epoch": 0.02772957110503746, "grad_norm": 0.23257307708263397, "learning_rate": 0.0002, "loss": 0.8989, "mean_token_accuracy": 0.7705086842179298, "num_tokens": 108429970.0, "step": 370 }, { "entropy": 0.776968813687563, "epoch": 0.028104295038889317, "grad_norm": 0.21215219795703888, "learning_rate": 0.0002, "loss": 0.9272, "mean_token_accuracy": 0.7689751889556646, "num_tokens": 109893983.0, "step": 375 }, { "entropy": 0.7904937205836177, "epoch": 0.028479018972741174, "grad_norm": 0.21407072246074677, "learning_rate": 0.0002, "loss": 0.9394, "mean_token_accuracy": 0.7640385791659355, "num_tokens": 111347880.0, "step": 380 }, { "entropy": 0.7696556933224201, "epoch": 0.028853742906593035, "grad_norm": 0.24298420548439026, "learning_rate": 0.0002, "loss": 0.8852, "mean_token_accuracy": 0.7729216422885656, "num_tokens": 112824512.0, "step": 385 }, { "entropy": 0.7817372400313616, "epoch": 0.029228466840444892, "grad_norm": 0.22030627727508545, "learning_rate": 0.0002, "loss": 0.8976, "mean_token_accuracy": 0.7697529125958681, "num_tokens": 114283405.0, "step": 390 }, { "entropy": 0.8215504337102175, "epoch": 0.02960319077429675, "grad_norm": 0.2028440535068512, "learning_rate": 0.0002, "loss": 0.9281, "mean_token_accuracy": 0.765995754674077, "num_tokens": 115822164.0, "step": 395 }, { "entropy": 0.7863073702901602, "epoch": 0.029977914708148607, "grad_norm": 0.20542506873607635, "learning_rate": 0.0002, "loss": 0.9002, "mean_token_accuracy": 0.7686510033905506, "num_tokens": 117311256.0, "step": 400 }, { "entropy": 0.7819267375394702, "epoch": 0.030352638642000464, "grad_norm": 0.2163015604019165, "learning_rate": 0.0002, "loss": 0.8978, "mean_token_accuracy": 0.7730205796658993, "num_tokens": 118836558.0, "step": 405 }, { "entropy": 0.7991739869117737, "epoch": 0.030727362575852322, "grad_norm": 0.19979199767112732, "learning_rate": 0.0002, "loss": 0.9172, "mean_token_accuracy": 0.7657362539321184, "num_tokens": 120324779.0, "step": 410 }, { "entropy": 0.7927406456321477, "epoch": 0.03110208650970418, "grad_norm": 0.23146109282970428, "learning_rate": 0.0002, "loss": 0.9001, "mean_token_accuracy": 0.768084653839469, "num_tokens": 121779957.0, "step": 415 }, { "entropy": 0.7987086821347475, "epoch": 0.03147681044355603, "grad_norm": 0.2177598923444748, "learning_rate": 0.0002, "loss": 0.9212, "mean_token_accuracy": 0.7666705925017595, "num_tokens": 123242641.0, "step": 420 }, { "entropy": 0.7785284670069814, "epoch": 0.0318515343774079, "grad_norm": 0.2276596575975418, "learning_rate": 0.0002, "loss": 0.9111, "mean_token_accuracy": 0.7690066646784544, "num_tokens": 124745321.0, "step": 425 }, { "entropy": 0.761037464812398, "epoch": 0.032226258311259755, "grad_norm": 0.23254360258579254, "learning_rate": 0.0002, "loss": 0.8971, "mean_token_accuracy": 0.7715179987251759, "num_tokens": 126168828.0, "step": 430 }, { "entropy": 0.7799057886004448, "epoch": 0.03260098224511161, "grad_norm": 0.2148168832063675, "learning_rate": 0.0002, "loss": 0.9108, "mean_token_accuracy": 0.7676570221781731, "num_tokens": 127616908.0, "step": 435 }, { "entropy": 0.7559140540659428, "epoch": 0.03297570617896347, "grad_norm": 0.2181471437215805, "learning_rate": 0.0002, "loss": 0.8749, "mean_token_accuracy": 0.7740662027150392, "num_tokens": 129041591.0, "step": 440 }, { "entropy": 0.7671527091413737, "epoch": 0.03335043011281533, "grad_norm": 0.20608696341514587, "learning_rate": 0.0002, "loss": 0.8919, "mean_token_accuracy": 0.7713141970336437, "num_tokens": 130532895.0, "step": 445 }, { "entropy": 0.7486231025308371, "epoch": 0.033725154046667184, "grad_norm": 0.24095776677131653, "learning_rate": 0.0002, "loss": 0.8876, "mean_token_accuracy": 0.7713767770677805, "num_tokens": 132000597.0, "step": 450 }, { "entropy": 0.7443445902317762, "epoch": 0.03409987798051904, "grad_norm": 0.21533212065696716, "learning_rate": 0.0002, "loss": 0.8886, "mean_token_accuracy": 0.7731007803231478, "num_tokens": 133451969.0, "step": 455 }, { "entropy": 0.7302283631637693, "epoch": 0.0344746019143709, "grad_norm": 0.2585500180721283, "learning_rate": 0.0002, "loss": 0.883, "mean_token_accuracy": 0.7716005589812994, "num_tokens": 134926579.0, "step": 460 }, { "entropy": 0.7434947615489363, "epoch": 0.034849325848222756, "grad_norm": 0.24547934532165527, "learning_rate": 0.0002, "loss": 0.8971, "mean_token_accuracy": 0.7693880494683981, "num_tokens": 136412821.0, "step": 465 }, { "entropy": 0.7300087234005332, "epoch": 0.035224049782074614, "grad_norm": 0.2360820472240448, "learning_rate": 0.0002, "loss": 0.902, "mean_token_accuracy": 0.7703893728554249, "num_tokens": 137920738.0, "step": 470 }, { "entropy": 0.7132256627082825, "epoch": 0.03559877371592647, "grad_norm": 0.24533914029598236, "learning_rate": 0.0002, "loss": 0.8874, "mean_token_accuracy": 0.7742379616945982, "num_tokens": 139356763.0, "step": 475 }, { "entropy": 0.7218451015651226, "epoch": 0.03597349764977833, "grad_norm": 0.27322521805763245, "learning_rate": 0.0002, "loss": 0.8778, "mean_token_accuracy": 0.7715528409928083, "num_tokens": 140799652.0, "step": 480 }, { "entropy": 0.733222302235663, "epoch": 0.036348221583630186, "grad_norm": 0.2224287986755371, "learning_rate": 0.0002, "loss": 0.895, "mean_token_accuracy": 0.7712752800434828, "num_tokens": 142270173.0, "step": 485 }, { "entropy": 0.737508637085557, "epoch": 0.03672294551748204, "grad_norm": 0.23592694103717804, "learning_rate": 0.0002, "loss": 0.9076, "mean_token_accuracy": 0.7698614407330752, "num_tokens": 143770240.0, "step": 490 }, { "entropy": 0.7146318901330233, "epoch": 0.0370976694513339, "grad_norm": 0.2367502748966217, "learning_rate": 0.0002, "loss": 0.8806, "mean_token_accuracy": 0.7748886536806822, "num_tokens": 145211900.0, "step": 495 }, { "entropy": 0.7360311279073357, "epoch": 0.03747239338518576, "grad_norm": 0.23575659096240997, "learning_rate": 0.0002, "loss": 0.904, "mean_token_accuracy": 0.7713186226785183, "num_tokens": 146695588.0, "step": 500 }, { "entropy": 0.7119140999391675, "epoch": 0.037847117319037615, "grad_norm": 0.2242777794599533, "learning_rate": 0.0002, "loss": 0.8813, "mean_token_accuracy": 0.7713157676160336, "num_tokens": 148135803.0, "step": 505 }, { "entropy": 0.7251281870529056, "epoch": 0.03822184125288947, "grad_norm": 0.22642415761947632, "learning_rate": 0.0002, "loss": 0.8972, "mean_token_accuracy": 0.7698167450726032, "num_tokens": 149558867.0, "step": 510 }, { "entropy": 0.714796955883503, "epoch": 0.03859656518674133, "grad_norm": 0.19430266320705414, "learning_rate": 0.0002, "loss": 0.8813, "mean_token_accuracy": 0.7734818965196609, "num_tokens": 151019789.0, "step": 515 }, { "entropy": 0.7298636768944562, "epoch": 0.03897128912059319, "grad_norm": 0.19353872537612915, "learning_rate": 0.0002, "loss": 0.8876, "mean_token_accuracy": 0.7728677496314049, "num_tokens": 152507649.0, "step": 520 }, { "entropy": 0.7415406823158264, "epoch": 0.039346013054445045, "grad_norm": 0.20516586303710938, "learning_rate": 0.0002, "loss": 0.9082, "mean_token_accuracy": 0.7717824660241603, "num_tokens": 153982315.0, "step": 525 }, { "entropy": 0.7210252761840821, "epoch": 0.0397207369882969, "grad_norm": 0.20642779767513275, "learning_rate": 0.0002, "loss": 0.885, "mean_token_accuracy": 0.7722584448754788, "num_tokens": 155441084.0, "step": 530 }, { "entropy": 0.7390391340479254, "epoch": 0.04009546092214876, "grad_norm": 0.2166910320520401, "learning_rate": 0.0002, "loss": 0.9077, "mean_token_accuracy": 0.7675858929753303, "num_tokens": 156911952.0, "step": 535 }, { "entropy": 0.7110726023092866, "epoch": 0.04047018485600062, "grad_norm": 0.2234870046377182, "learning_rate": 0.0002, "loss": 0.8903, "mean_token_accuracy": 0.7708638031035662, "num_tokens": 158371547.0, "step": 540 }, { "entropy": 0.7103780636563897, "epoch": 0.040844908789852474, "grad_norm": 0.2209707647562027, "learning_rate": 0.0002, "loss": 0.8825, "mean_token_accuracy": 0.7690335776656866, "num_tokens": 159824320.0, "step": 545 }, { "entropy": 0.7025527600198984, "epoch": 0.04121963272370433, "grad_norm": 0.2132430225610733, "learning_rate": 0.0002, "loss": 0.8731, "mean_token_accuracy": 0.7729729507118464, "num_tokens": 161311619.0, "step": 550 }, { "entropy": 0.7067951329052449, "epoch": 0.04159435665755619, "grad_norm": 0.1869579702615738, "learning_rate": 0.0002, "loss": 0.8826, "mean_token_accuracy": 0.7738885249942541, "num_tokens": 162802561.0, "step": 555 }, { "entropy": 0.6977386493235826, "epoch": 0.041969080591408046, "grad_norm": 0.20643305778503418, "learning_rate": 0.0002, "loss": 0.8649, "mean_token_accuracy": 0.7745762251317501, "num_tokens": 164279431.0, "step": 560 }, { "entropy": 0.7232646439224482, "epoch": 0.042343804525259904, "grad_norm": 0.20992213487625122, "learning_rate": 0.0002, "loss": 0.8922, "mean_token_accuracy": 0.7713353052735329, "num_tokens": 165747085.0, "step": 565 }, { "entropy": 0.7139923570677638, "epoch": 0.04271852845911176, "grad_norm": 0.2291632741689682, "learning_rate": 0.0002, "loss": 0.8777, "mean_token_accuracy": 0.7735786702483892, "num_tokens": 167226310.0, "step": 570 }, { "entropy": 0.7205363191664219, "epoch": 0.04309325239296362, "grad_norm": 0.20945483446121216, "learning_rate": 0.0002, "loss": 0.8739, "mean_token_accuracy": 0.7701063767075539, "num_tokens": 168715430.0, "step": 575 }, { "entropy": 0.7168231669813394, "epoch": 0.043467976326815476, "grad_norm": 0.21111442148685455, "learning_rate": 0.0002, "loss": 0.8857, "mean_token_accuracy": 0.7723576217889786, "num_tokens": 170145914.0, "step": 580 }, { "entropy": 0.726085440069437, "epoch": 0.04384270026066733, "grad_norm": 0.20902438461780548, "learning_rate": 0.0002, "loss": 0.8856, "mean_token_accuracy": 0.7722388681024313, "num_tokens": 171666115.0, "step": 585 }, { "entropy": 0.7008619353175163, "epoch": 0.04421742419451919, "grad_norm": 0.19922180473804474, "learning_rate": 0.0002, "loss": 0.8757, "mean_token_accuracy": 0.7728334214538336, "num_tokens": 173120236.0, "step": 590 }, { "entropy": 0.7050014838576317, "epoch": 0.044592148128371055, "grad_norm": 0.22303976118564606, "learning_rate": 0.0002, "loss": 0.8974, "mean_token_accuracy": 0.7693502131849528, "num_tokens": 174620178.0, "step": 595 }, { "entropy": 0.6934247054159641, "epoch": 0.04496687206222291, "grad_norm": 0.2166004627943039, "learning_rate": 0.0002, "loss": 0.8851, "mean_token_accuracy": 0.7703736647963524, "num_tokens": 176083963.0, "step": 600 }, { "entropy": 0.7178489742800593, "epoch": 0.04534159599607477, "grad_norm": 0.2152416706085205, "learning_rate": 0.0002, "loss": 0.9134, "mean_token_accuracy": 0.7682455915957689, "num_tokens": 177579608.0, "step": 605 }, { "entropy": 0.6918890262022614, "epoch": 0.04571631992992663, "grad_norm": 0.24158701300621033, "learning_rate": 0.0002, "loss": 0.875, "mean_token_accuracy": 0.7723512142896652, "num_tokens": 179046225.0, "step": 610 }, { "entropy": 0.6885810962878167, "epoch": 0.046091043863778484, "grad_norm": 0.24410876631736755, "learning_rate": 0.0002, "loss": 0.8677, "mean_token_accuracy": 0.7768646977841854, "num_tokens": 180528018.0, "step": 615 }, { "entropy": 0.6850621856749057, "epoch": 0.04646576779763034, "grad_norm": 0.21322393417358398, "learning_rate": 0.0002, "loss": 0.874, "mean_token_accuracy": 0.7721446417272091, "num_tokens": 181964424.0, "step": 620 }, { "entropy": 0.6861705645918846, "epoch": 0.0468404917314822, "grad_norm": 0.2641318440437317, "learning_rate": 0.0002, "loss": 0.8682, "mean_token_accuracy": 0.7762427926063538, "num_tokens": 183399384.0, "step": 625 }, { "entropy": 0.7056332128122449, "epoch": 0.04721521566533406, "grad_norm": 0.20881137251853943, "learning_rate": 0.0002, "loss": 0.8859, "mean_token_accuracy": 0.7710943330079317, "num_tokens": 184886294.0, "step": 630 }, { "entropy": 0.7147496521472931, "epoch": 0.047589939599185914, "grad_norm": 0.2147122174501419, "learning_rate": 0.0002, "loss": 0.8956, "mean_token_accuracy": 0.7710015404969454, "num_tokens": 186340748.0, "step": 635 }, { "entropy": 0.6941389926709235, "epoch": 0.04796466353303777, "grad_norm": 0.22968168556690216, "learning_rate": 0.0002, "loss": 0.881, "mean_token_accuracy": 0.7758374307304621, "num_tokens": 187790619.0, "step": 640 }, { "entropy": 0.7193040534853935, "epoch": 0.04833938746688963, "grad_norm": 0.2201451063156128, "learning_rate": 0.0002, "loss": 0.9057, "mean_token_accuracy": 0.7697021324187517, "num_tokens": 189291601.0, "step": 645 }, { "entropy": 0.6908259502612054, "epoch": 0.048714111400741486, "grad_norm": 0.20757362246513367, "learning_rate": 0.0002, "loss": 0.8722, "mean_token_accuracy": 0.7732685264199972, "num_tokens": 190758367.0, "step": 650 }, { "entropy": 0.6848335884511471, "epoch": 0.04908883533459334, "grad_norm": 0.19443821907043457, "learning_rate": 0.0002, "loss": 0.8702, "mean_token_accuracy": 0.7799169398844242, "num_tokens": 192160893.0, "step": 655 }, { "entropy": 0.6851829366758466, "epoch": 0.0494635592684452, "grad_norm": 0.2394486516714096, "learning_rate": 0.0002, "loss": 0.8692, "mean_token_accuracy": 0.7741584632545709, "num_tokens": 193611081.0, "step": 660 }, { "entropy": 0.7278399795293808, "epoch": 0.04983828320229706, "grad_norm": 0.2010699212551117, "learning_rate": 0.0002, "loss": 0.9122, "mean_token_accuracy": 0.7679913002997637, "num_tokens": 195082366.0, "step": 665 }, { "entropy": 0.6963616453111172, "epoch": 0.050213007136148916, "grad_norm": 0.21445848047733307, "learning_rate": 0.0002, "loss": 0.8712, "mean_token_accuracy": 0.7762143149971962, "num_tokens": 196508789.0, "step": 670 }, { "entropy": 0.6803658617660403, "epoch": 0.05058773107000077, "grad_norm": 0.20769071578979492, "learning_rate": 0.0002, "loss": 0.8568, "mean_token_accuracy": 0.7759375236928463, "num_tokens": 197962547.0, "step": 675 }, { "entropy": 0.7079053482040762, "epoch": 0.05096245500385263, "grad_norm": 0.2032313197851181, "learning_rate": 0.0002, "loss": 0.8952, "mean_token_accuracy": 0.7733294732868672, "num_tokens": 199477361.0, "step": 680 }, { "entropy": 0.6784100523218513, "epoch": 0.05133717893770449, "grad_norm": 0.20772534608840942, "learning_rate": 0.0002, "loss": 0.8559, "mean_token_accuracy": 0.7757477320730686, "num_tokens": 200935734.0, "step": 685 }, { "entropy": 0.6895376440137625, "epoch": 0.051711902871556345, "grad_norm": 0.21266022324562073, "learning_rate": 0.0002, "loss": 0.8695, "mean_token_accuracy": 0.7771495819091797, "num_tokens": 202397484.0, "step": 690 }, { "entropy": 0.6961139403283596, "epoch": 0.0520866268054082, "grad_norm": 0.20519869029521942, "learning_rate": 0.0002, "loss": 0.8778, "mean_token_accuracy": 0.7741450317203998, "num_tokens": 203876117.0, "step": 695 }, { "entropy": 0.7274865910410881, "epoch": 0.05246135073926006, "grad_norm": 0.2186785191297531, "learning_rate": 0.0002, "loss": 0.9031, "mean_token_accuracy": 0.7700905483216047, "num_tokens": 205349904.0, "step": 700 }, { "entropy": 0.690010579302907, "epoch": 0.05283607467311192, "grad_norm": 0.19929195940494537, "learning_rate": 0.0002, "loss": 0.8606, "mean_token_accuracy": 0.7779366973787546, "num_tokens": 206856417.0, "step": 705 }, { "entropy": 0.6926077701151371, "epoch": 0.053210798606963775, "grad_norm": 0.19931712746620178, "learning_rate": 0.0002, "loss": 0.8687, "mean_token_accuracy": 0.773920338973403, "num_tokens": 208313473.0, "step": 710 }, { "entropy": 0.711066496744752, "epoch": 0.05358552254081563, "grad_norm": 0.2225644290447235, "learning_rate": 0.0002, "loss": 0.8815, "mean_token_accuracy": 0.7715433772653342, "num_tokens": 209819531.0, "step": 715 }, { "entropy": 0.7044996408745646, "epoch": 0.05396024647466749, "grad_norm": 0.20278345048427582, "learning_rate": 0.0002, "loss": 0.8904, "mean_token_accuracy": 0.7711338501423597, "num_tokens": 211267031.0, "step": 720 }, { "entropy": 0.7006971979513764, "epoch": 0.05433497040851935, "grad_norm": 0.2017652541399002, "learning_rate": 0.0002, "loss": 0.8745, "mean_token_accuracy": 0.7722296223044396, "num_tokens": 212708904.0, "step": 725 }, { "entropy": 0.7121895130723714, "epoch": 0.054709694342371204, "grad_norm": 0.19882673025131226, "learning_rate": 0.0002, "loss": 0.8871, "mean_token_accuracy": 0.7714706443250179, "num_tokens": 214189763.0, "step": 730 }, { "entropy": 0.6886648120358586, "epoch": 0.05508441827622306, "grad_norm": 0.18076416850090027, "learning_rate": 0.0002, "loss": 0.8738, "mean_token_accuracy": 0.7738342527300119, "num_tokens": 215632876.0, "step": 735 }, { "entropy": 0.683577973023057, "epoch": 0.05545914221007492, "grad_norm": 0.21768447756767273, "learning_rate": 0.0002, "loss": 0.8596, "mean_token_accuracy": 0.7771030701696873, "num_tokens": 217110095.0, "step": 740 }, { "entropy": 0.6838728362694383, "epoch": 0.055833866143926776, "grad_norm": 0.1977786421775818, "learning_rate": 0.0002, "loss": 0.86, "mean_token_accuracy": 0.7758116945624352, "num_tokens": 218582408.0, "step": 745 }, { "entropy": 0.6947007779031992, "epoch": 0.056208590077778633, "grad_norm": 0.18655768036842346, "learning_rate": 0.0002, "loss": 0.866, "mean_token_accuracy": 0.7756775271147489, "num_tokens": 220077303.0, "step": 750 }, { "entropy": 0.6739026002585888, "epoch": 0.05658331401163049, "grad_norm": 0.19813179969787598, "learning_rate": 0.0002, "loss": 0.8591, "mean_token_accuracy": 0.774714444950223, "num_tokens": 221503769.0, "step": 755 }, { "entropy": 0.6873208973556757, "epoch": 0.05695803794548235, "grad_norm": 0.19957342743873596, "learning_rate": 0.0002, "loss": 0.8703, "mean_token_accuracy": 0.7743039771914482, "num_tokens": 222971417.0, "step": 760 }, { "entropy": 0.7134957130998373, "epoch": 0.05733276187933421, "grad_norm": 0.19308705627918243, "learning_rate": 0.0002, "loss": 0.8979, "mean_token_accuracy": 0.7692199505865573, "num_tokens": 224477556.0, "step": 765 }, { "entropy": 0.6562654061242938, "epoch": 0.05770748581318607, "grad_norm": 0.20176191627979279, "learning_rate": 0.0002, "loss": 0.8418, "mean_token_accuracy": 0.7796271409839391, "num_tokens": 225902505.0, "step": 770 }, { "entropy": 0.6876827047206462, "epoch": 0.05808220974703793, "grad_norm": 0.19288098812103271, "learning_rate": 0.0002, "loss": 0.8756, "mean_token_accuracy": 0.7763391472399235, "num_tokens": 227383252.0, "step": 775 }, { "entropy": 0.6894372427836061, "epoch": 0.058456933680889785, "grad_norm": 0.1818469762802124, "learning_rate": 0.0002, "loss": 0.8801, "mean_token_accuracy": 0.772885761782527, "num_tokens": 228847392.0, "step": 780 }, { "entropy": 0.7062537401914597, "epoch": 0.05883165761474164, "grad_norm": 0.19710654020309448, "learning_rate": 0.0002, "loss": 0.8909, "mean_token_accuracy": 0.7688345372676849, "num_tokens": 230346789.0, "step": 785 }, { "entropy": 0.6769068429246545, "epoch": 0.0592063815485935, "grad_norm": 0.1942002922296524, "learning_rate": 0.0002, "loss": 0.8597, "mean_token_accuracy": 0.7767196271568537, "num_tokens": 231860410.0, "step": 790 }, { "entropy": 0.6870667021721601, "epoch": 0.05958110548244536, "grad_norm": 0.19050264358520508, "learning_rate": 0.0002, "loss": 0.87, "mean_token_accuracy": 0.7713828928768635, "num_tokens": 233328266.0, "step": 795 }, { "entropy": 0.7072715302929282, "epoch": 0.059955829416297214, "grad_norm": 0.19950442016124725, "learning_rate": 0.0002, "loss": 0.9044, "mean_token_accuracy": 0.7726562019437552, "num_tokens": 234785510.0, "step": 800 }, { "entropy": 0.6776481417939066, "epoch": 0.06033055335014907, "grad_norm": 0.201198011636734, "learning_rate": 0.0002, "loss": 0.8621, "mean_token_accuracy": 0.7758769664913416, "num_tokens": 236244640.0, "step": 805 }, { "entropy": 0.6861417319625616, "epoch": 0.06070527728400093, "grad_norm": 0.22792856395244598, "learning_rate": 0.0002, "loss": 0.8589, "mean_token_accuracy": 0.775053346157074, "num_tokens": 237773692.0, "step": 810 }, { "entropy": 0.68076346013695, "epoch": 0.061080001217852786, "grad_norm": 0.22050045430660248, "learning_rate": 0.0002, "loss": 0.8654, "mean_token_accuracy": 0.7771227553486824, "num_tokens": 239225688.0, "step": 815 }, { "entropy": 0.6755342699587346, "epoch": 0.061454725151704644, "grad_norm": 0.1917007714509964, "learning_rate": 0.0002, "loss": 0.8503, "mean_token_accuracy": 0.7776442654430866, "num_tokens": 240657394.0, "step": 820 }, { "entropy": 0.6699436271563173, "epoch": 0.0618294490855565, "grad_norm": 0.18601155281066895, "learning_rate": 0.0002, "loss": 0.8402, "mean_token_accuracy": 0.7791504394263029, "num_tokens": 242114543.0, "step": 825 }, { "entropy": 0.6880132792517543, "epoch": 0.06220417301940836, "grad_norm": 0.22835999727249146, "learning_rate": 0.0002, "loss": 0.8625, "mean_token_accuracy": 0.7802407775074244, "num_tokens": 243578575.0, "step": 830 }, { "entropy": 0.7017028031870722, "epoch": 0.06257889695326022, "grad_norm": 0.19939732551574707, "learning_rate": 0.0002, "loss": 0.8715, "mean_token_accuracy": 0.7743933714926243, "num_tokens": 245015760.0, "step": 835 }, { "entropy": 0.6857368623837828, "epoch": 0.06295362088711207, "grad_norm": 0.1992313116788864, "learning_rate": 0.0002, "loss": 0.8703, "mean_token_accuracy": 0.7779597554355859, "num_tokens": 246456915.0, "step": 840 }, { "entropy": 0.6946071574464441, "epoch": 0.06332834482096393, "grad_norm": 0.23346717655658722, "learning_rate": 0.0002, "loss": 0.8698, "mean_token_accuracy": 0.7753937870264054, "num_tokens": 247902241.0, "step": 845 }, { "entropy": 0.6857899194583297, "epoch": 0.0637030687548158, "grad_norm": 0.21772658824920654, "learning_rate": 0.0002, "loss": 0.8594, "mean_token_accuracy": 0.7752189792692661, "num_tokens": 249341298.0, "step": 850 }, { "entropy": 0.6836088748648763, "epoch": 0.06407779268866765, "grad_norm": 0.19519197940826416, "learning_rate": 0.0002, "loss": 0.8525, "mean_token_accuracy": 0.781422246992588, "num_tokens": 250784257.0, "step": 855 }, { "entropy": 0.6805694276466966, "epoch": 0.06445251662251951, "grad_norm": 0.2008732557296753, "learning_rate": 0.0002, "loss": 0.8585, "mean_token_accuracy": 0.7783581424504519, "num_tokens": 252246296.0, "step": 860 }, { "entropy": 0.6764410545118154, "epoch": 0.06482724055637136, "grad_norm": 0.20467473566532135, "learning_rate": 0.0002, "loss": 0.8503, "mean_token_accuracy": 0.7786156255751848, "num_tokens": 253684937.0, "step": 865 }, { "entropy": 0.6939568700268864, "epoch": 0.06520196449022322, "grad_norm": 0.18921157717704773, "learning_rate": 0.0002, "loss": 0.8697, "mean_token_accuracy": 0.7754145566374063, "num_tokens": 255164011.0, "step": 870 }, { "entropy": 0.6695347841829061, "epoch": 0.06557668842407507, "grad_norm": 0.20197342336177826, "learning_rate": 0.0002, "loss": 0.8405, "mean_token_accuracy": 0.7792784947901964, "num_tokens": 256649140.0, "step": 875 }, { "entropy": 0.682520210929215, "epoch": 0.06595141235792694, "grad_norm": 0.20108264684677124, "learning_rate": 0.0002, "loss": 0.8491, "mean_token_accuracy": 0.781222765147686, "num_tokens": 258102532.0, "step": 880 }, { "entropy": 0.6916093762964011, "epoch": 0.06632613629177879, "grad_norm": 0.17815600335597992, "learning_rate": 0.0002, "loss": 0.8604, "mean_token_accuracy": 0.7766068521887064, "num_tokens": 259607546.0, "step": 885 }, { "entropy": 0.6830752532929182, "epoch": 0.06670086022563065, "grad_norm": 0.2494674175977707, "learning_rate": 0.0002, "loss": 0.8491, "mean_token_accuracy": 0.7779087942093611, "num_tokens": 261108245.0, "step": 890 }, { "entropy": 0.7085453571751714, "epoch": 0.0670755841594825, "grad_norm": 0.20687998831272125, "learning_rate": 0.0002, "loss": 0.8859, "mean_token_accuracy": 0.7732801467180253, "num_tokens": 262580362.0, "step": 895 }, { "entropy": 0.6944962004199624, "epoch": 0.06745030809333437, "grad_norm": 0.19854652881622314, "learning_rate": 0.0002, "loss": 0.8713, "mean_token_accuracy": 0.7718857411295176, "num_tokens": 264041054.0, "step": 900 }, { "entropy": 0.6846056109294295, "epoch": 0.06782503202718622, "grad_norm": 0.19521760940551758, "learning_rate": 0.0002, "loss": 0.8593, "mean_token_accuracy": 0.7752500303089619, "num_tokens": 265539640.0, "step": 905 }, { "entropy": 0.6749046182259917, "epoch": 0.06819975596103808, "grad_norm": 0.20171040296554565, "learning_rate": 0.0002, "loss": 0.8556, "mean_token_accuracy": 0.7779517650604248, "num_tokens": 266989850.0, "step": 910 }, { "entropy": 0.6533212670125067, "epoch": 0.06857447989488993, "grad_norm": 0.2029609978199005, "learning_rate": 0.0002, "loss": 0.8449, "mean_token_accuracy": 0.779866735637188, "num_tokens": 268421875.0, "step": 915 }, { "entropy": 0.6943401716649532, "epoch": 0.0689492038287418, "grad_norm": 0.19357433915138245, "learning_rate": 0.0002, "loss": 0.8721, "mean_token_accuracy": 0.7736887849867344, "num_tokens": 269924166.0, "step": 920 }, { "entropy": 0.6802269952371717, "epoch": 0.06932392776259365, "grad_norm": 0.18982748687267303, "learning_rate": 0.0002, "loss": 0.8557, "mean_token_accuracy": 0.7754610732197762, "num_tokens": 271395931.0, "step": 925 }, { "entropy": 0.6847355164587497, "epoch": 0.06969865169644551, "grad_norm": 0.1949714720249176, "learning_rate": 0.0002, "loss": 0.8687, "mean_token_accuracy": 0.775904505699873, "num_tokens": 272885358.0, "step": 930 }, { "entropy": 0.6649012049660087, "epoch": 0.07007337563029736, "grad_norm": 0.1849607527256012, "learning_rate": 0.0002, "loss": 0.8349, "mean_token_accuracy": 0.7810276415199041, "num_tokens": 274373403.0, "step": 935 }, { "entropy": 0.6779296549037099, "epoch": 0.07044809956414923, "grad_norm": 0.19169479608535767, "learning_rate": 0.0002, "loss": 0.8586, "mean_token_accuracy": 0.7768583863973617, "num_tokens": 275829982.0, "step": 940 }, { "entropy": 0.6908345017582178, "epoch": 0.07082282349800108, "grad_norm": 0.2071664184331894, "learning_rate": 0.0002, "loss": 0.865, "mean_token_accuracy": 0.7778472147881985, "num_tokens": 277320037.0, "step": 945 }, { "entropy": 0.6606402156874538, "epoch": 0.07119754743185294, "grad_norm": 0.18321937322616577, "learning_rate": 0.0002, "loss": 0.8302, "mean_token_accuracy": 0.7801937073469162, "num_tokens": 278760138.0, "step": 950 }, { "entropy": 0.6868944946676493, "epoch": 0.07157227136570479, "grad_norm": 0.19641046226024628, "learning_rate": 0.0002, "loss": 0.8655, "mean_token_accuracy": 0.7724831733852625, "num_tokens": 280248410.0, "step": 955 }, { "entropy": 0.6831979069858789, "epoch": 0.07194699529955666, "grad_norm": 0.20043067634105682, "learning_rate": 0.0002, "loss": 0.8627, "mean_token_accuracy": 0.7735623929649591, "num_tokens": 281708935.0, "step": 960 }, { "entropy": 0.670716879889369, "epoch": 0.07232171923340851, "grad_norm": 0.18648770451545715, "learning_rate": 0.0002, "loss": 0.8535, "mean_token_accuracy": 0.7770292516797781, "num_tokens": 283180269.0, "step": 965 }, { "entropy": 0.681360381282866, "epoch": 0.07269644316726037, "grad_norm": 0.19211436808109283, "learning_rate": 0.0002, "loss": 0.8656, "mean_token_accuracy": 0.7732577074319125, "num_tokens": 284625155.0, "step": 970 }, { "entropy": 0.676706700399518, "epoch": 0.07307116710111222, "grad_norm": 0.18134737014770508, "learning_rate": 0.0002, "loss": 0.8584, "mean_token_accuracy": 0.7761218708008528, "num_tokens": 286069195.0, "step": 975 }, { "entropy": 0.6653478540480137, "epoch": 0.07344589103496409, "grad_norm": 0.22254501283168793, "learning_rate": 0.0002, "loss": 0.8554, "mean_token_accuracy": 0.7773523040115833, "num_tokens": 287558858.0, "step": 980 }, { "entropy": 0.688911640830338, "epoch": 0.07382061496881594, "grad_norm": 0.19165067374706268, "learning_rate": 0.0002, "loss": 0.8855, "mean_token_accuracy": 0.7753227040171623, "num_tokens": 289010300.0, "step": 985 }, { "entropy": 0.6670272344723344, "epoch": 0.0741953389026678, "grad_norm": 0.1862744241952896, "learning_rate": 0.0002, "loss": 0.8469, "mean_token_accuracy": 0.776986138522625, "num_tokens": 290469232.0, "step": 990 }, { "entropy": 0.6664619704708457, "epoch": 0.07457006283651965, "grad_norm": 0.24014201760292053, "learning_rate": 0.0002, "loss": 0.8559, "mean_token_accuracy": 0.7771924637258053, "num_tokens": 291926041.0, "step": 995 }, { "entropy": 0.686285094358027, "epoch": 0.07494478677037152, "grad_norm": 0.22935780882835388, "learning_rate": 0.0002, "loss": 0.8646, "mean_token_accuracy": 0.7731781758368015, "num_tokens": 293457871.0, "step": 1000 }, { "entropy": 0.684523194655776, "epoch": 0.07531951070422337, "grad_norm": 0.2078138291835785, "learning_rate": 0.0002, "loss": 0.8696, "mean_token_accuracy": 0.7765862341970206, "num_tokens": 294925881.0, "step": 1005 }, { "entropy": 0.6893799670040608, "epoch": 0.07569423463807523, "grad_norm": 0.1953161209821701, "learning_rate": 0.0002, "loss": 0.874, "mean_token_accuracy": 0.7743509285151958, "num_tokens": 296408524.0, "step": 1010 }, { "entropy": 0.6680609015747905, "epoch": 0.0760689585719271, "grad_norm": 0.1929452270269394, "learning_rate": 0.0002, "loss": 0.8496, "mean_token_accuracy": 0.7760592613369226, "num_tokens": 297850638.0, "step": 1015 }, { "entropy": 0.6814554180949927, "epoch": 0.07644368250577895, "grad_norm": 0.20585598051548004, "learning_rate": 0.0002, "loss": 0.8764, "mean_token_accuracy": 0.7769545618444681, "num_tokens": 299315100.0, "step": 1020 }, { "entropy": 0.6575755996629595, "epoch": 0.07681840643963081, "grad_norm": 0.19543606042861938, "learning_rate": 0.0002, "loss": 0.8592, "mean_token_accuracy": 0.7744812175631524, "num_tokens": 300743425.0, "step": 1025 }, { "entropy": 0.6526883619837462, "epoch": 0.07719313037348266, "grad_norm": 0.19866237044334412, "learning_rate": 0.0002, "loss": 0.8513, "mean_token_accuracy": 0.7773861423134804, "num_tokens": 302167356.0, "step": 1030 }, { "entropy": 0.6510671962052583, "epoch": 0.07756785430733452, "grad_norm": 0.18750518560409546, "learning_rate": 0.0002, "loss": 0.8409, "mean_token_accuracy": 0.7803856890648604, "num_tokens": 303647962.0, "step": 1035 }, { "entropy": 0.6546841079369188, "epoch": 0.07794257824118637, "grad_norm": 0.19254551827907562, "learning_rate": 0.0002, "loss": 0.8322, "mean_token_accuracy": 0.7780230231583118, "num_tokens": 305157720.0, "step": 1040 }, { "entropy": 0.6615381244570017, "epoch": 0.07831730217503824, "grad_norm": 0.18707098066806793, "learning_rate": 0.0002, "loss": 0.8524, "mean_token_accuracy": 0.7760605555027723, "num_tokens": 306658577.0, "step": 1045 }, { "entropy": 0.6438515406101942, "epoch": 0.07869202610889009, "grad_norm": 0.19375741481781006, "learning_rate": 0.0002, "loss": 0.84, "mean_token_accuracy": 0.779602475464344, "num_tokens": 308107274.0, "step": 1050 }, { "entropy": 0.6825897028669715, "epoch": 0.07906675004274195, "grad_norm": 0.18541692197322845, "learning_rate": 0.0002, "loss": 0.8782, "mean_token_accuracy": 0.7745042938739062, "num_tokens": 309601676.0, "step": 1055 }, { "entropy": 0.6551225475966931, "epoch": 0.0794414739765938, "grad_norm": 0.21006803214550018, "learning_rate": 0.0002, "loss": 0.8609, "mean_token_accuracy": 0.7764386601746083, "num_tokens": 311052142.0, "step": 1060 }, { "entropy": 0.6475762304849922, "epoch": 0.07981619791044567, "grad_norm": 0.22043506801128387, "learning_rate": 0.0002, "loss": 0.8473, "mean_token_accuracy": 0.7792323872447013, "num_tokens": 312475746.0, "step": 1065 }, { "entropy": 0.6674028528854251, "epoch": 0.08019092184429752, "grad_norm": 0.20016296207904816, "learning_rate": 0.0002, "loss": 0.8695, "mean_token_accuracy": 0.7754522394388914, "num_tokens": 313938116.0, "step": 1070 }, { "entropy": 0.6497701728716493, "epoch": 0.08056564577814938, "grad_norm": 0.20610012114048004, "learning_rate": 0.0002, "loss": 0.8462, "mean_token_accuracy": 0.779331823065877, "num_tokens": 315391993.0, "step": 1075 }, { "entropy": 0.6672687197104097, "epoch": 0.08094036971200123, "grad_norm": 0.17773714661598206, "learning_rate": 0.0002, "loss": 0.8639, "mean_token_accuracy": 0.7767653875052929, "num_tokens": 316835141.0, "step": 1080 }, { "entropy": 0.6412888536229729, "epoch": 0.0813150936458531, "grad_norm": 0.2003755271434784, "learning_rate": 0.0002, "loss": 0.8316, "mean_token_accuracy": 0.7822141777724028, "num_tokens": 318292866.0, "step": 1085 }, { "entropy": 0.6642941897735, "epoch": 0.08168981757970495, "grad_norm": 0.18399682641029358, "learning_rate": 0.0002, "loss": 0.857, "mean_token_accuracy": 0.7762846518307924, "num_tokens": 319755006.0, "step": 1090 }, { "entropy": 0.6477929381653666, "epoch": 0.08206454151355681, "grad_norm": 0.18477346003055573, "learning_rate": 0.0002, "loss": 0.8384, "mean_token_accuracy": 0.779123081639409, "num_tokens": 321210844.0, "step": 1095 }, { "entropy": 0.6658795705065131, "epoch": 0.08243926544740866, "grad_norm": 0.18577106297016144, "learning_rate": 0.0002, "loss": 0.8601, "mean_token_accuracy": 0.7774687699973584, "num_tokens": 322681917.0, "step": 1100 }, { "entropy": 0.6509121298789978, "epoch": 0.08281398938126053, "grad_norm": 0.1968340277671814, "learning_rate": 0.0002, "loss": 0.8431, "mean_token_accuracy": 0.7810686975717545, "num_tokens": 324159157.0, "step": 1105 }, { "entropy": 0.6406434698961675, "epoch": 0.08318871331511238, "grad_norm": 0.18768852949142456, "learning_rate": 0.0002, "loss": 0.8349, "mean_token_accuracy": 0.7803263753652573, "num_tokens": 325587299.0, "step": 1110 }, { "entropy": 0.647580723464489, "epoch": 0.08356343724896424, "grad_norm": 0.19148941338062286, "learning_rate": 0.0002, "loss": 0.8368, "mean_token_accuracy": 0.7814983889460564, "num_tokens": 327016449.0, "step": 1115 }, { "entropy": 0.6620581578463316, "epoch": 0.08393816118281609, "grad_norm": 0.1818246990442276, "learning_rate": 0.0002, "loss": 0.8466, "mean_token_accuracy": 0.7797580111771822, "num_tokens": 328505880.0, "step": 1120 }, { "entropy": 0.6553780306130648, "epoch": 0.08431288511666796, "grad_norm": 0.1920444369316101, "learning_rate": 0.0002, "loss": 0.8457, "mean_token_accuracy": 0.7790684841573239, "num_tokens": 329959482.0, "step": 1125 }, { "entropy": 0.6620829770341515, "epoch": 0.08468760905051981, "grad_norm": 0.20259416103363037, "learning_rate": 0.0002, "loss": 0.8417, "mean_token_accuracy": 0.7774847902357578, "num_tokens": 331430819.0, "step": 1130 }, { "entropy": 0.6590219909325242, "epoch": 0.08506233298437167, "grad_norm": 0.1860360950231552, "learning_rate": 0.0002, "loss": 0.8447, "mean_token_accuracy": 0.7786302298307419, "num_tokens": 332869609.0, "step": 1135 }, { "entropy": 0.6571066471748054, "epoch": 0.08543705691822352, "grad_norm": 0.17534145712852478, "learning_rate": 0.0002, "loss": 0.8376, "mean_token_accuracy": 0.7794949088245631, "num_tokens": 334353166.0, "step": 1140 }, { "entropy": 0.6643397537991405, "epoch": 0.08581178085207539, "grad_norm": 0.2781204581260681, "learning_rate": 0.0002, "loss": 0.8477, "mean_token_accuracy": 0.7799445364624262, "num_tokens": 335837505.0, "step": 1145 }, { "entropy": 0.6510635744780302, "epoch": 0.08618650478592724, "grad_norm": 0.1873469054698944, "learning_rate": 0.0002, "loss": 0.8425, "mean_token_accuracy": 0.7806868970394134, "num_tokens": 337290856.0, "step": 1150 }, { "entropy": 0.6517510069534183, "epoch": 0.0865612287197791, "grad_norm": 0.19847722351551056, "learning_rate": 0.0002, "loss": 0.852, "mean_token_accuracy": 0.7784461423754692, "num_tokens": 338778007.0, "step": 1155 }, { "entropy": 0.6403329519554972, "epoch": 0.08693595265363095, "grad_norm": 0.2034250944852829, "learning_rate": 0.0002, "loss": 0.846, "mean_token_accuracy": 0.7778051283210516, "num_tokens": 340240095.0, "step": 1160 }, { "entropy": 0.6589386174455285, "epoch": 0.08731067658748282, "grad_norm": 0.1931041181087494, "learning_rate": 0.0002, "loss": 0.8664, "mean_token_accuracy": 0.773771469667554, "num_tokens": 341698042.0, "step": 1165 }, { "entropy": 0.6565620510838925, "epoch": 0.08768540052133467, "grad_norm": 0.19156764447689056, "learning_rate": 0.0002, "loss": 0.8589, "mean_token_accuracy": 0.774412814900279, "num_tokens": 343203543.0, "step": 1170 }, { "entropy": 0.6460859108716249, "epoch": 0.08806012445518653, "grad_norm": 0.21564167737960815, "learning_rate": 0.0002, "loss": 0.8632, "mean_token_accuracy": 0.7768410194665194, "num_tokens": 344649644.0, "step": 1175 }, { "entropy": 0.6524973612278699, "epoch": 0.08843484838903838, "grad_norm": 0.1812422275543213, "learning_rate": 0.0002, "loss": 0.8427, "mean_token_accuracy": 0.7785371549427509, "num_tokens": 346133788.0, "step": 1180 }, { "entropy": 0.6539872374385596, "epoch": 0.08880957232289025, "grad_norm": 0.21075302362442017, "learning_rate": 0.0002, "loss": 0.8484, "mean_token_accuracy": 0.7807331141084433, "num_tokens": 347611108.0, "step": 1185 }, { "entropy": 0.6465396398678422, "epoch": 0.08918429625674211, "grad_norm": 0.18271389603614807, "learning_rate": 0.0002, "loss": 0.8446, "mean_token_accuracy": 0.779352480173111, "num_tokens": 349081717.0, "step": 1190 }, { "entropy": 0.6554296005517244, "epoch": 0.08955902019059396, "grad_norm": 0.21928569674491882, "learning_rate": 0.0002, "loss": 0.8404, "mean_token_accuracy": 0.7784810218960047, "num_tokens": 350553200.0, "step": 1195 }, { "entropy": 0.6527797542512417, "epoch": 0.08993374412444582, "grad_norm": 0.188278466463089, "learning_rate": 0.0002, "loss": 0.8383, "mean_token_accuracy": 0.7782312709838152, "num_tokens": 351993503.0, "step": 1200 }, { "entropy": 0.6742814019322395, "epoch": 0.09030846805829768, "grad_norm": 0.17520470917224884, "learning_rate": 0.0002, "loss": 0.8616, "mean_token_accuracy": 0.7779228270053864, "num_tokens": 353461035.0, "step": 1205 }, { "entropy": 0.660521568544209, "epoch": 0.09068319199214954, "grad_norm": 0.18751035630702972, "learning_rate": 0.0002, "loss": 0.8455, "mean_token_accuracy": 0.7765742085874081, "num_tokens": 354930947.0, "step": 1210 }, { "entropy": 0.6453920468688011, "epoch": 0.09105791592600139, "grad_norm": 0.19343531131744385, "learning_rate": 0.0002, "loss": 0.8409, "mean_token_accuracy": 0.7784354973584413, "num_tokens": 356333326.0, "step": 1215 }, { "entropy": 0.6522661423310637, "epoch": 0.09143263985985325, "grad_norm": 0.20133203268051147, "learning_rate": 0.0002, "loss": 0.8598, "mean_token_accuracy": 0.7777920637279749, "num_tokens": 357785719.0, "step": 1220 }, { "entropy": 0.6527669411152601, "epoch": 0.0918073637937051, "grad_norm": 0.2689473032951355, "learning_rate": 0.0002, "loss": 0.8541, "mean_token_accuracy": 0.779644264280796, "num_tokens": 359267944.0, "step": 1225 }, { "entropy": 0.6446255991235376, "epoch": 0.09218208772755697, "grad_norm": 0.19355711340904236, "learning_rate": 0.0002, "loss": 0.8428, "mean_token_accuracy": 0.7801071256399155, "num_tokens": 360762239.0, "step": 1230 }, { "entropy": 0.6497489780187606, "epoch": 0.09255681166140882, "grad_norm": 0.2204262912273407, "learning_rate": 0.0002, "loss": 0.8502, "mean_token_accuracy": 0.7791404161602259, "num_tokens": 362238393.0, "step": 1235 }, { "entropy": 0.6518970590084792, "epoch": 0.09293153559526068, "grad_norm": 0.19767022132873535, "learning_rate": 0.0002, "loss": 0.8495, "mean_token_accuracy": 0.7780205421149731, "num_tokens": 363739789.0, "step": 1240 }, { "entropy": 0.6470884993672371, "epoch": 0.09330625952911253, "grad_norm": 0.19597597420215607, "learning_rate": 0.0002, "loss": 0.8388, "mean_token_accuracy": 0.7788534894585609, "num_tokens": 365196380.0, "step": 1245 }, { "entropy": 0.6465545725077391, "epoch": 0.0936809834629644, "grad_norm": 0.19215765595436096, "learning_rate": 0.0002, "loss": 0.8408, "mean_token_accuracy": 0.7797310184687376, "num_tokens": 366658397.0, "step": 1250 }, { "entropy": 0.6303374163806439, "epoch": 0.09405570739681625, "grad_norm": 0.1778433918952942, "learning_rate": 0.0002, "loss": 0.8248, "mean_token_accuracy": 0.7821567326784133, "num_tokens": 368102798.0, "step": 1255 }, { "entropy": 0.6509063921868801, "epoch": 0.09443043133066811, "grad_norm": 0.19593441486358643, "learning_rate": 0.0002, "loss": 0.8403, "mean_token_accuracy": 0.7811719138175249, "num_tokens": 369551657.0, "step": 1260 }, { "entropy": 0.6529487075284124, "epoch": 0.09480515526451996, "grad_norm": 0.19191808998584747, "learning_rate": 0.0002, "loss": 0.8356, "mean_token_accuracy": 0.7802269969135522, "num_tokens": 371022264.0, "step": 1265 }, { "entropy": 0.6491042528301477, "epoch": 0.09517987919837183, "grad_norm": 0.23719540238380432, "learning_rate": 0.0002, "loss": 0.835, "mean_token_accuracy": 0.7801361583173275, "num_tokens": 372518638.0, "step": 1270 }, { "entropy": 0.6609306506812572, "epoch": 0.09555460313222368, "grad_norm": 0.19188262522220612, "learning_rate": 0.0002, "loss": 0.8595, "mean_token_accuracy": 0.7766454931348562, "num_tokens": 373994675.0, "step": 1275 }, { "entropy": 0.6228219280019403, "epoch": 0.09592932706607554, "grad_norm": 0.20073696970939636, "learning_rate": 0.0002, "loss": 0.8201, "mean_token_accuracy": 0.7829756170511246, "num_tokens": 375432679.0, "step": 1280 }, { "entropy": 0.6678185943514109, "epoch": 0.09630405099992739, "grad_norm": 0.18283146619796753, "learning_rate": 0.0002, "loss": 0.8612, "mean_token_accuracy": 0.7778335515409708, "num_tokens": 376895842.0, "step": 1285 }, { "entropy": 0.6380815952084958, "epoch": 0.09667877493377926, "grad_norm": 0.20238468050956726, "learning_rate": 0.0002, "loss": 0.8254, "mean_token_accuracy": 0.7808939162641764, "num_tokens": 378363230.0, "step": 1290 }, { "entropy": 0.6504175407812, "epoch": 0.09705349886763111, "grad_norm": 0.18905462324619293, "learning_rate": 0.0002, "loss": 0.8439, "mean_token_accuracy": 0.7779222674667835, "num_tokens": 379851777.0, "step": 1295 }, { "entropy": 0.6324932791292668, "epoch": 0.09742822280148297, "grad_norm": 0.5517951250076294, "learning_rate": 0.0002, "loss": 0.8268, "mean_token_accuracy": 0.7825745131820441, "num_tokens": 381289997.0, "step": 1300 }, { "entropy": 0.6239147560670972, "epoch": 0.09780294673533482, "grad_norm": 0.20370782911777496, "learning_rate": 0.0002, "loss": 0.8232, "mean_token_accuracy": 0.7831059787422419, "num_tokens": 382702284.0, "step": 1305 }, { "entropy": 0.6557770747691393, "epoch": 0.09817767066918669, "grad_norm": 0.1927860975265503, "learning_rate": 0.0002, "loss": 0.852, "mean_token_accuracy": 0.7782856073230505, "num_tokens": 384162822.0, "step": 1310 }, { "entropy": 0.6365593209862709, "epoch": 0.09855239460303854, "grad_norm": 0.20641757547855377, "learning_rate": 0.0002, "loss": 0.8312, "mean_token_accuracy": 0.781517020240426, "num_tokens": 385639410.0, "step": 1315 }, { "entropy": 0.6627272021025419, "epoch": 0.0989271185368904, "grad_norm": 0.2135564386844635, "learning_rate": 0.0002, "loss": 0.8557, "mean_token_accuracy": 0.7751013241708279, "num_tokens": 387163460.0, "step": 1320 }, { "entropy": 0.6501585507765413, "epoch": 0.09930184247074225, "grad_norm": 0.18609334528446198, "learning_rate": 0.0002, "loss": 0.845, "mean_token_accuracy": 0.7773637939244509, "num_tokens": 388670431.0, "step": 1325 }, { "entropy": 0.6478286979719996, "epoch": 0.09967656640459412, "grad_norm": 0.19592301547527313, "learning_rate": 0.0002, "loss": 0.8426, "mean_token_accuracy": 0.7783489838242531, "num_tokens": 390165385.0, "step": 1330 }, { "entropy": 0.6222256946377456, "epoch": 0.10005129033844597, "grad_norm": 0.175072580575943, "learning_rate": 0.0002, "loss": 0.8221, "mean_token_accuracy": 0.7823193661868573, "num_tokens": 391590972.0, "step": 1335 }, { "entropy": 0.6330616317689419, "epoch": 0.10042601427229783, "grad_norm": 0.4179931879043579, "learning_rate": 0.0002, "loss": 0.8299, "mean_token_accuracy": 0.7815587311983109, "num_tokens": 393018342.0, "step": 1340 }, { "entropy": 0.6405655488371849, "epoch": 0.10080073820614968, "grad_norm": 0.200890451669693, "learning_rate": 0.0002, "loss": 0.8331, "mean_token_accuracy": 0.781229143589735, "num_tokens": 394503876.0, "step": 1345 }, { "entropy": 0.6378223046660423, "epoch": 0.10117546214000155, "grad_norm": 0.19800686836242676, "learning_rate": 0.0002, "loss": 0.8157, "mean_token_accuracy": 0.7855598047375679, "num_tokens": 395941756.0, "step": 1350 }, { "entropy": 0.657622453942895, "epoch": 0.10155018607385341, "grad_norm": 0.20294563472270966, "learning_rate": 0.0002, "loss": 0.8552, "mean_token_accuracy": 0.7777816239744425, "num_tokens": 397389015.0, "step": 1355 }, { "entropy": 0.6344848966225982, "epoch": 0.10192491000770526, "grad_norm": 0.19377164542675018, "learning_rate": 0.0002, "loss": 0.8133, "mean_token_accuracy": 0.7840403310954571, "num_tokens": 398848928.0, "step": 1360 }, { "entropy": 0.6749243509024382, "epoch": 0.10229963394155712, "grad_norm": 0.19257383048534393, "learning_rate": 0.0002, "loss": 0.8637, "mean_token_accuracy": 0.7744420301169157, "num_tokens": 400355763.0, "step": 1365 }, { "entropy": 0.6387437300756573, "epoch": 0.10267435787540898, "grad_norm": 0.20750398933887482, "learning_rate": 0.0002, "loss": 0.8268, "mean_token_accuracy": 0.7815543256700039, "num_tokens": 401799998.0, "step": 1370 }, { "entropy": 0.6270101465284824, "epoch": 0.10304908180926084, "grad_norm": 0.1943477988243103, "learning_rate": 0.0002, "loss": 0.8181, "mean_token_accuracy": 0.7833234902471304, "num_tokens": 403234341.0, "step": 1375 }, { "entropy": 0.6438026694580913, "epoch": 0.10342380574311269, "grad_norm": 0.2138630747795105, "learning_rate": 0.0002, "loss": 0.8401, "mean_token_accuracy": 0.78072714433074, "num_tokens": 404713215.0, "step": 1380 }, { "entropy": 0.6316709099337459, "epoch": 0.10379852967696455, "grad_norm": 0.18970222771167755, "learning_rate": 0.0002, "loss": 0.8247, "mean_token_accuracy": 0.7822585377842188, "num_tokens": 406152281.0, "step": 1385 }, { "entropy": 0.6595128836110234, "epoch": 0.1041732536108164, "grad_norm": 0.18954254686832428, "learning_rate": 0.0002, "loss": 0.8519, "mean_token_accuracy": 0.7773042719811201, "num_tokens": 407651945.0, "step": 1390 }, { "entropy": 0.6625305041670799, "epoch": 0.10454797754466827, "grad_norm": 0.17895330488681793, "learning_rate": 0.0002, "loss": 0.8508, "mean_token_accuracy": 0.7795351572334767, "num_tokens": 409139391.0, "step": 1395 }, { "entropy": 0.6442565290257335, "epoch": 0.10492270147852012, "grad_norm": 0.19764970242977142, "learning_rate": 0.0002, "loss": 0.8328, "mean_token_accuracy": 0.7807065177708864, "num_tokens": 410596324.0, "step": 1400 }, { "entropy": 0.6312514532357454, "epoch": 0.10529742541237198, "grad_norm": 0.19283929467201233, "learning_rate": 0.0002, "loss": 0.8295, "mean_token_accuracy": 0.7822819974273443, "num_tokens": 412036822.0, "step": 1405 }, { "entropy": 0.6429750138893724, "epoch": 0.10567214934622383, "grad_norm": 0.18040485680103302, "learning_rate": 0.0002, "loss": 0.8394, "mean_token_accuracy": 0.7794837426394224, "num_tokens": 413519932.0, "step": 1410 }, { "entropy": 0.657593216188252, "epoch": 0.1060468732800757, "grad_norm": 0.19345133006572723, "learning_rate": 0.0002, "loss": 0.8557, "mean_token_accuracy": 0.7759938091039658, "num_tokens": 415041156.0, "step": 1415 }, { "entropy": 0.6334192112088204, "epoch": 0.10642159721392755, "grad_norm": 0.18228760361671448, "learning_rate": 0.0002, "loss": 0.8381, "mean_token_accuracy": 0.7775046180933713, "num_tokens": 416495860.0, "step": 1420 }, { "entropy": 0.6224536437541246, "epoch": 0.10679632114777941, "grad_norm": 0.1789640337228775, "learning_rate": 0.0002, "loss": 0.8159, "mean_token_accuracy": 0.782342467457056, "num_tokens": 417960840.0, "step": 1425 }, { "entropy": 0.6241625620983541, "epoch": 0.10717104508163126, "grad_norm": 0.17715175449848175, "learning_rate": 0.0002, "loss": 0.8205, "mean_token_accuracy": 0.7823353253304959, "num_tokens": 419461378.0, "step": 1430 }, { "entropy": 0.6176043318584561, "epoch": 0.10754576901548313, "grad_norm": 0.177711620926857, "learning_rate": 0.0002, "loss": 0.8149, "mean_token_accuracy": 0.7814407598227262, "num_tokens": 420893239.0, "step": 1435 }, { "entropy": 0.6305481757968664, "epoch": 0.10792049294933498, "grad_norm": 0.18190404772758484, "learning_rate": 0.0002, "loss": 0.8285, "mean_token_accuracy": 0.7822105191648007, "num_tokens": 422371779.0, "step": 1440 }, { "entropy": 0.6308273015543818, "epoch": 0.10829521688318684, "grad_norm": 0.18108534812927246, "learning_rate": 0.0002, "loss": 0.8276, "mean_token_accuracy": 0.782382570952177, "num_tokens": 423837294.0, "step": 1445 }, { "entropy": 0.6448040628805757, "epoch": 0.1086699408170387, "grad_norm": 0.19534040987491608, "learning_rate": 0.0002, "loss": 0.8406, "mean_token_accuracy": 0.7788636200129986, "num_tokens": 425343443.0, "step": 1450 }, { "entropy": 0.6314654000103473, "epoch": 0.10904466475089056, "grad_norm": 0.18594981729984283, "learning_rate": 0.0002, "loss": 0.8271, "mean_token_accuracy": 0.7800248835235835, "num_tokens": 426787838.0, "step": 1455 }, { "entropy": 0.629124347679317, "epoch": 0.10941938868474241, "grad_norm": 0.1930442750453949, "learning_rate": 0.0002, "loss": 0.8191, "mean_token_accuracy": 0.78207939080894, "num_tokens": 428289538.0, "step": 1460 }, { "entropy": 0.6513354748487472, "epoch": 0.10979411261859427, "grad_norm": 0.22483757138252258, "learning_rate": 0.0002, "loss": 0.8413, "mean_token_accuracy": 0.7771540846675634, "num_tokens": 429764459.0, "step": 1465 }, { "entropy": 0.6444356065243483, "epoch": 0.11016883655244612, "grad_norm": 0.1978091597557068, "learning_rate": 0.0002, "loss": 0.8419, "mean_token_accuracy": 0.7809195566922427, "num_tokens": 431212256.0, "step": 1470 }, { "entropy": 0.642676770593971, "epoch": 0.11054356048629799, "grad_norm": 0.20882736146450043, "learning_rate": 0.0002, "loss": 0.8404, "mean_token_accuracy": 0.7812424622476101, "num_tokens": 432689287.0, "step": 1475 }, { "entropy": 0.6376683812588453, "epoch": 0.11091828442014984, "grad_norm": 0.19313231110572815, "learning_rate": 0.0002, "loss": 0.8222, "mean_token_accuracy": 0.779304251074791, "num_tokens": 434194747.0, "step": 1480 }, { "entropy": 0.6269765416160226, "epoch": 0.1112930083540017, "grad_norm": 0.20342648029327393, "learning_rate": 0.0002, "loss": 0.8107, "mean_token_accuracy": 0.7852847930043936, "num_tokens": 435675956.0, "step": 1485 }, { "entropy": 0.6489633288234472, "epoch": 0.11166773228785355, "grad_norm": 0.19542594254016876, "learning_rate": 0.0002, "loss": 0.854, "mean_token_accuracy": 0.7791711963713169, "num_tokens": 437153920.0, "step": 1490 }, { "entropy": 0.6326759969815612, "epoch": 0.11204245622170542, "grad_norm": 0.21418365836143494, "learning_rate": 0.0002, "loss": 0.8192, "mean_token_accuracy": 0.7829687397927045, "num_tokens": 438637131.0, "step": 1495 }, { "entropy": 0.6352060239762067, "epoch": 0.11241718015555727, "grad_norm": 0.2036706656217575, "learning_rate": 0.0002, "loss": 0.8207, "mean_token_accuracy": 0.7816400893032551, "num_tokens": 440107703.0, "step": 1500 }, { "entropy": 0.623235106933862, "epoch": 0.11279190408940913, "grad_norm": 0.18213726580142975, "learning_rate": 0.0002, "loss": 0.8045, "mean_token_accuracy": 0.7863257631659508, "num_tokens": 441605457.0, "step": 1505 }, { "entropy": 0.6457588752731681, "epoch": 0.11316662802326098, "grad_norm": 0.18502438068389893, "learning_rate": 0.0002, "loss": 0.829, "mean_token_accuracy": 0.7797689534723758, "num_tokens": 443107775.0, "step": 1510 }, { "entropy": 0.6309646831825375, "epoch": 0.11354135195711285, "grad_norm": 0.1928846538066864, "learning_rate": 0.0002, "loss": 0.8176, "mean_token_accuracy": 0.7827642522752285, "num_tokens": 444554435.0, "step": 1515 }, { "entropy": 0.6508087219670415, "epoch": 0.1139160758909647, "grad_norm": 0.1925126314163208, "learning_rate": 0.0002, "loss": 0.8362, "mean_token_accuracy": 0.7793129488825798, "num_tokens": 445982296.0, "step": 1520 }, { "entropy": 0.6367783956229687, "epoch": 0.11429079982481656, "grad_norm": 0.17420123517513275, "learning_rate": 0.0002, "loss": 0.8151, "mean_token_accuracy": 0.7844093803316354, "num_tokens": 447461511.0, "step": 1525 }, { "entropy": 0.6366791883483529, "epoch": 0.11466552375866843, "grad_norm": 0.18183572590351105, "learning_rate": 0.0002, "loss": 0.8221, "mean_token_accuracy": 0.7864723682403565, "num_tokens": 448940345.0, "step": 1530 }, { "entropy": 0.6401759672909975, "epoch": 0.11504024769252028, "grad_norm": 0.17584463953971863, "learning_rate": 0.0002, "loss": 0.8038, "mean_token_accuracy": 0.785202507674694, "num_tokens": 450413101.0, "step": 1535 }, { "entropy": 0.6310706317424775, "epoch": 0.11541497162637214, "grad_norm": 0.17037333548069, "learning_rate": 0.0002, "loss": 0.8052, "mean_token_accuracy": 0.7894041709601879, "num_tokens": 451912951.0, "step": 1540 }, { "entropy": 0.6401767961680889, "epoch": 0.11578969556022399, "grad_norm": 0.20026451349258423, "learning_rate": 0.0002, "loss": 0.8197, "mean_token_accuracy": 0.7830154221504927, "num_tokens": 453384211.0, "step": 1545 }, { "entropy": 0.657573894970119, "epoch": 0.11616441949407585, "grad_norm": 0.1850256323814392, "learning_rate": 0.0002, "loss": 0.8307, "mean_token_accuracy": 0.7795117247849703, "num_tokens": 454841768.0, "step": 1550 }, { "entropy": 0.6502273692749441, "epoch": 0.1165391434279277, "grad_norm": 0.16529785096645355, "learning_rate": 0.0002, "loss": 0.8342, "mean_token_accuracy": 0.7823585733771324, "num_tokens": 456304518.0, "step": 1555 }, { "entropy": 0.6567935801111162, "epoch": 0.11691386736177957, "grad_norm": 0.19096967577934265, "learning_rate": 0.0002, "loss": 0.8228, "mean_token_accuracy": 0.7814198397099972, "num_tokens": 457814388.0, "step": 1560 }, { "entropy": 0.6574173988774419, "epoch": 0.11728859129563142, "grad_norm": 0.1925215721130371, "learning_rate": 0.0002, "loss": 0.8345, "mean_token_accuracy": 0.7814335562288761, "num_tokens": 459292949.0, "step": 1565 }, { "entropy": 0.6570137158036232, "epoch": 0.11766331522948328, "grad_norm": 0.19550134241580963, "learning_rate": 0.0002, "loss": 0.8298, "mean_token_accuracy": 0.7848746094852685, "num_tokens": 460804713.0, "step": 1570 }, { "entropy": 0.6398474534973502, "epoch": 0.11803803916333513, "grad_norm": 0.1769096553325653, "learning_rate": 0.0002, "loss": 0.8159, "mean_token_accuracy": 0.7845467649400234, "num_tokens": 462231197.0, "step": 1575 }, { "entropy": 0.6492908170446754, "epoch": 0.118412763097187, "grad_norm": 0.16787941753864288, "learning_rate": 0.0002, "loss": 0.8204, "mean_token_accuracy": 0.7832032844424248, "num_tokens": 463677706.0, "step": 1580 }, { "entropy": 0.6500168214552104, "epoch": 0.11878748703103885, "grad_norm": 0.19225063920021057, "learning_rate": 0.0002, "loss": 0.8249, "mean_token_accuracy": 0.7828476637601852, "num_tokens": 465158580.0, "step": 1585 }, { "entropy": 0.6341028127819299, "epoch": 0.11916221096489071, "grad_norm": 0.1919771134853363, "learning_rate": 0.0002, "loss": 0.8082, "mean_token_accuracy": 0.7863995518535376, "num_tokens": 466614186.0, "step": 1590 }, { "entropy": 0.6619253367185592, "epoch": 0.11953693489874256, "grad_norm": 0.19398124516010284, "learning_rate": 0.0002, "loss": 0.8426, "mean_token_accuracy": 0.7782408274710179, "num_tokens": 468065439.0, "step": 1595 }, { "entropy": 0.6597399214282632, "epoch": 0.11991165883259443, "grad_norm": 0.20868563652038574, "learning_rate": 0.0002, "loss": 0.8338, "mean_token_accuracy": 0.7823300905525684, "num_tokens": 469531115.0, "step": 1600 }, { "entropy": 0.640760999545455, "epoch": 0.12028638276644628, "grad_norm": 0.19020946323871613, "learning_rate": 0.0002, "loss": 0.8056, "mean_token_accuracy": 0.7851228531450033, "num_tokens": 471024368.0, "step": 1605 }, { "entropy": 0.6557803027331829, "epoch": 0.12066110670029814, "grad_norm": 0.18471430242061615, "learning_rate": 0.0002, "loss": 0.8321, "mean_token_accuracy": 0.7798160724341869, "num_tokens": 472497449.0, "step": 1610 }, { "entropy": 0.6805735655128956, "epoch": 0.12103583063415, "grad_norm": 0.17671038210391998, "learning_rate": 0.0002, "loss": 0.8551, "mean_token_accuracy": 0.7755267873406411, "num_tokens": 473992384.0, "step": 1615 }, { "entropy": 0.6513863643631339, "epoch": 0.12141055456800186, "grad_norm": 0.1878633350133896, "learning_rate": 0.0002, "loss": 0.8195, "mean_token_accuracy": 0.7815994102507829, "num_tokens": 475483709.0, "step": 1620 }, { "entropy": 0.6443220686167479, "epoch": 0.12178527850185371, "grad_norm": 0.1731729805469513, "learning_rate": 0.0002, "loss": 0.8224, "mean_token_accuracy": 0.7852519396692514, "num_tokens": 476929607.0, "step": 1625 }, { "entropy": 0.6492387168109417, "epoch": 0.12216000243570557, "grad_norm": 0.1867334246635437, "learning_rate": 0.0002, "loss": 0.8139, "mean_token_accuracy": 0.7844989258795977, "num_tokens": 478390508.0, "step": 1630 }, { "entropy": 0.6368599327281117, "epoch": 0.12253472636955742, "grad_norm": 0.18804235756397247, "learning_rate": 0.0002, "loss": 0.8038, "mean_token_accuracy": 0.7857808455824852, "num_tokens": 479845085.0, "step": 1635 }, { "entropy": 0.6702014181762934, "epoch": 0.12290945030340929, "grad_norm": 0.1854729950428009, "learning_rate": 0.0002, "loss": 0.8431, "mean_token_accuracy": 0.7824995428323746, "num_tokens": 481304970.0, "step": 1640 }, { "entropy": 0.6459416594356299, "epoch": 0.12328417423726114, "grad_norm": 0.20026013255119324, "learning_rate": 0.0002, "loss": 0.8137, "mean_token_accuracy": 0.7834785848855972, "num_tokens": 482781203.0, "step": 1645 }, { "entropy": 0.617957841604948, "epoch": 0.123658898171113, "grad_norm": 0.18037894368171692, "learning_rate": 0.0002, "loss": 0.7878, "mean_token_accuracy": 0.7902416285127402, "num_tokens": 484254525.0, "step": 1650 }, { "entropy": 0.6346460292115808, "epoch": 0.12403362210496485, "grad_norm": 0.17374536395072937, "learning_rate": 0.0002, "loss": 0.831, "mean_token_accuracy": 0.7803115911781788, "num_tokens": 485692845.0, "step": 1655 }, { "entropy": 0.6257511867210269, "epoch": 0.12440834603881672, "grad_norm": 0.1810159534215927, "learning_rate": 0.0002, "loss": 0.8107, "mean_token_accuracy": 0.7860979612916708, "num_tokens": 487154658.0, "step": 1660 }, { "entropy": 0.6431899634189904, "epoch": 0.12478306997266857, "grad_norm": 0.18413160741329193, "learning_rate": 0.0002, "loss": 0.83, "mean_token_accuracy": 0.7827605728060008, "num_tokens": 488672361.0, "step": 1665 }, { "entropy": 0.633536656666547, "epoch": 0.12515779390652043, "grad_norm": 0.16822056472301483, "learning_rate": 0.0002, "loss": 0.8278, "mean_token_accuracy": 0.7824280451983213, "num_tokens": 490165483.0, "step": 1670 }, { "entropy": 0.6435291208326817, "epoch": 0.12553251784037228, "grad_norm": 0.19557581841945648, "learning_rate": 0.0002, "loss": 0.8342, "mean_token_accuracy": 0.7841637082397938, "num_tokens": 491638847.0, "step": 1675 }, { "entropy": 0.6308700069785118, "epoch": 0.12590724177422413, "grad_norm": 0.18068793416023254, "learning_rate": 0.0002, "loss": 0.8251, "mean_token_accuracy": 0.7793952107429505, "num_tokens": 493083602.0, "step": 1680 }, { "entropy": 0.6292005917057395, "epoch": 0.126281965708076, "grad_norm": 0.20607395470142365, "learning_rate": 0.0002, "loss": 0.8263, "mean_token_accuracy": 0.782640753686428, "num_tokens": 494557496.0, "step": 1685 }, { "entropy": 0.6162070151418447, "epoch": 0.12665668964192786, "grad_norm": 0.19063003361225128, "learning_rate": 0.0002, "loss": 0.8067, "mean_token_accuracy": 0.7889676634222269, "num_tokens": 496049609.0, "step": 1690 }, { "entropy": 0.6296820843592286, "epoch": 0.1270314135757797, "grad_norm": 0.16259321570396423, "learning_rate": 0.0002, "loss": 0.8267, "mean_token_accuracy": 0.7825254421681166, "num_tokens": 497522191.0, "step": 1695 }, { "entropy": 0.6523891519755125, "epoch": 0.1274061375096316, "grad_norm": 0.20356962084770203, "learning_rate": 0.0002, "loss": 0.8642, "mean_token_accuracy": 0.7745334561914206, "num_tokens": 499000160.0, "step": 1700 }, { "entropy": 0.6293125778436661, "epoch": 0.12778086144348344, "grad_norm": 0.1783589869737625, "learning_rate": 0.0002, "loss": 0.8289, "mean_token_accuracy": 0.7837120857089758, "num_tokens": 500455967.0, "step": 1705 }, { "entropy": 0.623191099241376, "epoch": 0.1281555853773353, "grad_norm": 0.1985974758863449, "learning_rate": 0.0002, "loss": 0.8142, "mean_token_accuracy": 0.7839359745383263, "num_tokens": 501929454.0, "step": 1710 }, { "entropy": 0.6361714601516724, "epoch": 0.12853030931118714, "grad_norm": 0.18165786564350128, "learning_rate": 0.0002, "loss": 0.8288, "mean_token_accuracy": 0.7800959911197424, "num_tokens": 503420073.0, "step": 1715 }, { "entropy": 0.6393444337882102, "epoch": 0.12890503324503902, "grad_norm": 0.20377711951732635, "learning_rate": 0.0002, "loss": 0.8252, "mean_token_accuracy": 0.7796891387552023, "num_tokens": 504885287.0, "step": 1720 }, { "entropy": 0.6338514726608991, "epoch": 0.12927975717889087, "grad_norm": 0.17600296437740326, "learning_rate": 0.0002, "loss": 0.8236, "mean_token_accuracy": 0.7823041111230851, "num_tokens": 506376516.0, "step": 1725 }, { "entropy": 0.6417007051408291, "epoch": 0.12965448111274272, "grad_norm": 0.1791848987340927, "learning_rate": 0.0002, "loss": 0.8356, "mean_token_accuracy": 0.7835383791476488, "num_tokens": 507800684.0, "step": 1730 }, { "entropy": 0.5996557333506644, "epoch": 0.13002920504659457, "grad_norm": 0.20563314855098724, "learning_rate": 0.0002, "loss": 0.7878, "mean_token_accuracy": 0.7889736741781235, "num_tokens": 509247318.0, "step": 1735 }, { "entropy": 0.6178161969408393, "epoch": 0.13040392898044645, "grad_norm": 0.21251806616783142, "learning_rate": 0.0002, "loss": 0.8126, "mean_token_accuracy": 0.786270621418953, "num_tokens": 510716969.0, "step": 1740 }, { "entropy": 0.6091602237895131, "epoch": 0.1307786529142983, "grad_norm": 0.1793888509273529, "learning_rate": 0.0002, "loss": 0.8076, "mean_token_accuracy": 0.7851733796298503, "num_tokens": 512130684.0, "step": 1745 }, { "entropy": 0.6206506266258657, "epoch": 0.13115337684815015, "grad_norm": 0.20197632908821106, "learning_rate": 0.0002, "loss": 0.829, "mean_token_accuracy": 0.7814985428005456, "num_tokens": 513585748.0, "step": 1750 }, { "entropy": 0.609588623791933, "epoch": 0.131528100782002, "grad_norm": 0.18139299750328064, "learning_rate": 0.0002, "loss": 0.7947, "mean_token_accuracy": 0.7880592904984951, "num_tokens": 515045987.0, "step": 1755 }, { "entropy": 0.618846413679421, "epoch": 0.13190282471585388, "grad_norm": 0.18567700684070587, "learning_rate": 0.0002, "loss": 0.809, "mean_token_accuracy": 0.783166266977787, "num_tokens": 516519468.0, "step": 1760 }, { "entropy": 0.6185420343652368, "epoch": 0.13227754864970573, "grad_norm": 0.1868051141500473, "learning_rate": 0.0002, "loss": 0.814, "mean_token_accuracy": 0.7835904143750667, "num_tokens": 517954719.0, "step": 1765 }, { "entropy": 0.6081602826714516, "epoch": 0.13265227258355758, "grad_norm": 0.171245276927948, "learning_rate": 0.0002, "loss": 0.7991, "mean_token_accuracy": 0.7857162784785032, "num_tokens": 519383528.0, "step": 1770 }, { "entropy": 0.6207474311813712, "epoch": 0.13302699651740943, "grad_norm": 0.19233457744121552, "learning_rate": 0.0002, "loss": 0.8193, "mean_token_accuracy": 0.7845215741544962, "num_tokens": 520840977.0, "step": 1775 }, { "entropy": 0.6208784136921167, "epoch": 0.1334017204512613, "grad_norm": 0.1783570498228073, "learning_rate": 0.0002, "loss": 0.8122, "mean_token_accuracy": 0.7845445185899734, "num_tokens": 522279602.0, "step": 1780 }, { "entropy": 0.618262165505439, "epoch": 0.13377644438511316, "grad_norm": 0.17906376719474792, "learning_rate": 0.0002, "loss": 0.8039, "mean_token_accuracy": 0.7868785079568624, "num_tokens": 523752456.0, "step": 1785 }, { "entropy": 0.6474976431578398, "epoch": 0.134151168318965, "grad_norm": 0.24100901186466217, "learning_rate": 0.0002, "loss": 0.8333, "mean_token_accuracy": 0.7796257983893156, "num_tokens": 525240593.0, "step": 1790 }, { "entropy": 0.6233639700338245, "epoch": 0.13452589225281686, "grad_norm": 0.17775972187519073, "learning_rate": 0.0002, "loss": 0.8118, "mean_token_accuracy": 0.7847069930285215, "num_tokens": 526711609.0, "step": 1795 }, { "entropy": 0.6202631808817387, "epoch": 0.13490061618666874, "grad_norm": 0.17530876398086548, "learning_rate": 0.0002, "loss": 0.8146, "mean_token_accuracy": 0.7836083110421896, "num_tokens": 528192279.0, "step": 1800 }, { "entropy": 0.6147802710533142, "epoch": 0.1352753401205206, "grad_norm": 0.17938204109668732, "learning_rate": 0.0002, "loss": 0.8112, "mean_token_accuracy": 0.78585069142282, "num_tokens": 529634025.0, "step": 1805 }, { "entropy": 0.620032390486449, "epoch": 0.13565006405437244, "grad_norm": 0.20578625798225403, "learning_rate": 0.0002, "loss": 0.8076, "mean_token_accuracy": 0.7851048417389392, "num_tokens": 531103242.0, "step": 1810 }, { "entropy": 0.6212565668858587, "epoch": 0.1360247879882243, "grad_norm": 0.20457911491394043, "learning_rate": 0.0002, "loss": 0.8087, "mean_token_accuracy": 0.7848767276853323, "num_tokens": 532576668.0, "step": 1815 }, { "entropy": 0.6173491543158889, "epoch": 0.13639951192207617, "grad_norm": 0.18724916875362396, "learning_rate": 0.0002, "loss": 0.7881, "mean_token_accuracy": 0.7871451061218977, "num_tokens": 534064567.0, "step": 1820 }, { "entropy": 0.6328080480918288, "epoch": 0.13677423585592802, "grad_norm": 0.20665758848190308, "learning_rate": 0.0002, "loss": 0.8138, "mean_token_accuracy": 0.7836977083235979, "num_tokens": 535541050.0, "step": 1825 }, { "entropy": 0.6441036683507264, "epoch": 0.13714895978977987, "grad_norm": 0.1852053850889206, "learning_rate": 0.0002, "loss": 0.8305, "mean_token_accuracy": 0.7817787528038025, "num_tokens": 536997240.0, "step": 1830 }, { "entropy": 0.6311615947633982, "epoch": 0.13752368372363172, "grad_norm": 0.1927780956029892, "learning_rate": 0.0002, "loss": 0.8136, "mean_token_accuracy": 0.78698292337358, "num_tokens": 538491025.0, "step": 1835 }, { "entropy": 0.6299569799564779, "epoch": 0.1378984076574836, "grad_norm": 0.18220816552639008, "learning_rate": 0.0002, "loss": 0.8117, "mean_token_accuracy": 0.7854545559734106, "num_tokens": 539958330.0, "step": 1840 }, { "entropy": 0.6312386387959122, "epoch": 0.13827313159133545, "grad_norm": 0.18083232641220093, "learning_rate": 0.0002, "loss": 0.8238, "mean_token_accuracy": 0.7832896322011947, "num_tokens": 541411954.0, "step": 1845 }, { "entropy": 0.6404116407968103, "epoch": 0.1386478555251873, "grad_norm": 0.17270754277706146, "learning_rate": 0.0002, "loss": 0.8194, "mean_token_accuracy": 0.7851656049489975, "num_tokens": 542899050.0, "step": 1850 }, { "entropy": 0.6163149713538587, "epoch": 0.13902257945903915, "grad_norm": 0.17206385731697083, "learning_rate": 0.0002, "loss": 0.7986, "mean_token_accuracy": 0.7877317287027836, "num_tokens": 544364195.0, "step": 1855 }, { "entropy": 0.6124714547768235, "epoch": 0.13939730339289103, "grad_norm": 0.17256370186805725, "learning_rate": 0.0002, "loss": 0.8088, "mean_token_accuracy": 0.7854775629937649, "num_tokens": 545816639.0, "step": 1860 }, { "entropy": 0.6236966420896352, "epoch": 0.13977202732674288, "grad_norm": 0.17555341124534607, "learning_rate": 0.0002, "loss": 0.8152, "mean_token_accuracy": 0.7861062832176685, "num_tokens": 547288499.0, "step": 1865 }, { "entropy": 0.6439672035165132, "epoch": 0.14014675126059473, "grad_norm": 0.17522072792053223, "learning_rate": 0.0002, "loss": 0.8306, "mean_token_accuracy": 0.7799076527357102, "num_tokens": 548797160.0, "step": 1870 }, { "entropy": 0.6152849234640598, "epoch": 0.1405214751944466, "grad_norm": 0.17938008904457092, "learning_rate": 0.0002, "loss": 0.8029, "mean_token_accuracy": 0.7845146499574185, "num_tokens": 550269147.0, "step": 1875 }, { "entropy": 0.6235533682629466, "epoch": 0.14089619912829845, "grad_norm": 0.24197359383106232, "learning_rate": 0.0002, "loss": 0.8047, "mean_token_accuracy": 0.7858578421175479, "num_tokens": 551704620.0, "step": 1880 }, { "entropy": 0.6029068449512124, "epoch": 0.1412709230621503, "grad_norm": 0.26450932025909424, "learning_rate": 0.0002, "loss": 0.7939, "mean_token_accuracy": 0.7931190576404333, "num_tokens": 553170056.0, "step": 1885 }, { "entropy": 0.6203575391322375, "epoch": 0.14164564699600216, "grad_norm": 0.1725843846797943, "learning_rate": 0.0002, "loss": 0.8054, "mean_token_accuracy": 0.7856256704777479, "num_tokens": 554634440.0, "step": 1890 }, { "entropy": 0.6258201254531741, "epoch": 0.14202037092985403, "grad_norm": 0.203072190284729, "learning_rate": 0.0002, "loss": 0.8212, "mean_token_accuracy": 0.7818147998303175, "num_tokens": 556088975.0, "step": 1895 }, { "entropy": 0.6337720619514584, "epoch": 0.14239509486370588, "grad_norm": 0.18463082611560822, "learning_rate": 0.0002, "loss": 0.821, "mean_token_accuracy": 0.7827408816665411, "num_tokens": 557580527.0, "step": 1900 }, { "entropy": 0.63241111272946, "epoch": 0.14276981879755773, "grad_norm": 0.18178461492061615, "learning_rate": 0.0002, "loss": 0.8337, "mean_token_accuracy": 0.7806367076933384, "num_tokens": 559023642.0, "step": 1905 }, { "entropy": 0.6344677823595702, "epoch": 0.14314454273140959, "grad_norm": 0.2196112424135208, "learning_rate": 0.0002, "loss": 0.827, "mean_token_accuracy": 0.7831140752881766, "num_tokens": 560508217.0, "step": 1910 }, { "entropy": 0.6410200625658036, "epoch": 0.14351926666526146, "grad_norm": 0.19154831767082214, "learning_rate": 0.0002, "loss": 0.8227, "mean_token_accuracy": 0.7859092570841313, "num_tokens": 562004448.0, "step": 1915 }, { "entropy": 0.6219524689018726, "epoch": 0.14389399059911331, "grad_norm": 0.17339716851711273, "learning_rate": 0.0002, "loss": 0.8021, "mean_token_accuracy": 0.7883383993059396, "num_tokens": 563450862.0, "step": 1920 }, { "entropy": 0.6143129765987396, "epoch": 0.14426871453296516, "grad_norm": 0.18110442161560059, "learning_rate": 0.0002, "loss": 0.8081, "mean_token_accuracy": 0.7860922928899526, "num_tokens": 564883581.0, "step": 1925 }, { "entropy": 0.6139122614637017, "epoch": 0.14464343846681701, "grad_norm": 0.17997603118419647, "learning_rate": 0.0002, "loss": 0.7915, "mean_token_accuracy": 0.7846145492047072, "num_tokens": 566351792.0, "step": 1930 }, { "entropy": 0.6343079641461372, "epoch": 0.1450181624006689, "grad_norm": 0.17053461074829102, "learning_rate": 0.0002, "loss": 0.8195, "mean_token_accuracy": 0.784117216616869, "num_tokens": 567789933.0, "step": 1935 }, { "entropy": 0.6474445162340998, "epoch": 0.14539288633452074, "grad_norm": 0.20981234312057495, "learning_rate": 0.0002, "loss": 0.8441, "mean_token_accuracy": 0.7785210259258747, "num_tokens": 569244127.0, "step": 1940 }, { "entropy": 0.6277735093608499, "epoch": 0.1457676102683726, "grad_norm": 0.18128719925880432, "learning_rate": 0.0002, "loss": 0.8094, "mean_token_accuracy": 0.7871556654572487, "num_tokens": 570689335.0, "step": 1945 }, { "entropy": 0.6197189100086689, "epoch": 0.14614233420222444, "grad_norm": 0.18222935497760773, "learning_rate": 0.0002, "loss": 0.8089, "mean_token_accuracy": 0.784514256939292, "num_tokens": 572135823.0, "step": 1950 }, { "entropy": 0.6246771916747094, "epoch": 0.14651705813607632, "grad_norm": 0.17531156539916992, "learning_rate": 0.0002, "loss": 0.8082, "mean_token_accuracy": 0.7866884879767895, "num_tokens": 573614579.0, "step": 1955 }, { "entropy": 0.5874809358268976, "epoch": 0.14689178206992817, "grad_norm": 0.18838605284690857, "learning_rate": 0.0002, "loss": 0.7715, "mean_token_accuracy": 0.7929830502718687, "num_tokens": 575032862.0, "step": 1960 }, { "entropy": 0.6314543502405285, "epoch": 0.14726650600378002, "grad_norm": 0.1977023184299469, "learning_rate": 0.0002, "loss": 0.8252, "mean_token_accuracy": 0.7840785045176745, "num_tokens": 576500077.0, "step": 1965 }, { "entropy": 0.606737223919481, "epoch": 0.14764122993763187, "grad_norm": 0.1908135712146759, "learning_rate": 0.0002, "loss": 0.789, "mean_token_accuracy": 0.7873634554445743, "num_tokens": 577957213.0, "step": 1970 }, { "entropy": 0.6252687303349376, "epoch": 0.14801595387148375, "grad_norm": 0.19899038970470428, "learning_rate": 0.0002, "loss": 0.8193, "mean_token_accuracy": 0.7822138667106628, "num_tokens": 579396857.0, "step": 1975 }, { "entropy": 0.6354222187772394, "epoch": 0.1483906778053356, "grad_norm": 0.21318168938159943, "learning_rate": 0.0002, "loss": 0.8171, "mean_token_accuracy": 0.7835757710039616, "num_tokens": 580918357.0, "step": 1980 }, { "entropy": 0.5938525709323585, "epoch": 0.14876540173918745, "grad_norm": 0.1769288331270218, "learning_rate": 0.0002, "loss": 0.7703, "mean_token_accuracy": 0.7911068603396416, "num_tokens": 582370906.0, "step": 1985 }, { "entropy": 0.6183813020586968, "epoch": 0.1491401256730393, "grad_norm": 0.22243306040763855, "learning_rate": 0.0002, "loss": 0.8022, "mean_token_accuracy": 0.7885041285306216, "num_tokens": 583811234.0, "step": 1990 }, { "entropy": 0.6287448238581419, "epoch": 0.14951484960689118, "grad_norm": 0.17485681176185608, "learning_rate": 0.0002, "loss": 0.8054, "mean_token_accuracy": 0.7850345086306334, "num_tokens": 585333369.0, "step": 1995 }, { "entropy": 0.6265132948756218, "epoch": 0.14988957354074303, "grad_norm": 0.17577014863491058, "learning_rate": 0.0002, "loss": 0.8151, "mean_token_accuracy": 0.7863041218370199, "num_tokens": 586780183.0, "step": 2000 }, { "entropy": 0.6398693408817053, "epoch": 0.15026429747459488, "grad_norm": 0.18524107336997986, "learning_rate": 0.0002, "loss": 0.8398, "mean_token_accuracy": 0.7815801482647657, "num_tokens": 588225197.0, "step": 2005 }, { "entropy": 0.6157224349677562, "epoch": 0.15063902140844673, "grad_norm": 0.17321732640266418, "learning_rate": 0.0002, "loss": 0.7964, "mean_token_accuracy": 0.7878490202128887, "num_tokens": 589676578.0, "step": 2010 }, { "entropy": 0.6192301977425814, "epoch": 0.1510137453422986, "grad_norm": 0.18356382846832275, "learning_rate": 0.0002, "loss": 0.8129, "mean_token_accuracy": 0.7867310479283333, "num_tokens": 591117172.0, "step": 2015 }, { "entropy": 0.606128491088748, "epoch": 0.15138846927615046, "grad_norm": 0.1761026680469513, "learning_rate": 0.0002, "loss": 0.792, "mean_token_accuracy": 0.7854276549071073, "num_tokens": 592554634.0, "step": 2020 }, { "entropy": 0.6361926771700382, "epoch": 0.1517631932100023, "grad_norm": 0.1737743318080902, "learning_rate": 0.0002, "loss": 0.8241, "mean_token_accuracy": 0.7836410161107779, "num_tokens": 594050203.0, "step": 2025 }, { "entropy": 0.5961550158448518, "epoch": 0.1521379171438542, "grad_norm": 0.17735545337200165, "learning_rate": 0.0002, "loss": 0.7947, "mean_token_accuracy": 0.787772286310792, "num_tokens": 595506930.0, "step": 2030 }, { "entropy": 0.6283499024808407, "epoch": 0.15251264107770604, "grad_norm": 0.20012183487415314, "learning_rate": 0.0002, "loss": 0.8227, "mean_token_accuracy": 0.7816881105303765, "num_tokens": 596948966.0, "step": 2035 }, { "entropy": 0.64318259768188, "epoch": 0.1528873650115579, "grad_norm": 0.21286864578723907, "learning_rate": 0.0002, "loss": 0.8353, "mean_token_accuracy": 0.7808368105441332, "num_tokens": 598411854.0, "step": 2040 }, { "entropy": 0.6183641526848078, "epoch": 0.15326208894540974, "grad_norm": 0.21077996492385864, "learning_rate": 0.0002, "loss": 0.8215, "mean_token_accuracy": 0.7862260054796935, "num_tokens": 599873257.0, "step": 2045 }, { "entropy": 0.5941176300868392, "epoch": 0.15363681287926162, "grad_norm": 0.18024812638759613, "learning_rate": 0.0002, "loss": 0.7851, "mean_token_accuracy": 0.7884294196963311, "num_tokens": 601323521.0, "step": 2050 }, { "entropy": 0.6211720215156674, "epoch": 0.15401153681311347, "grad_norm": 0.1926286220550537, "learning_rate": 0.0002, "loss": 0.8231, "mean_token_accuracy": 0.7840686626732349, "num_tokens": 602850266.0, "step": 2055 }, { "entropy": 0.5908981263637543, "epoch": 0.15438626074696532, "grad_norm": 0.17295804619789124, "learning_rate": 0.0002, "loss": 0.793, "mean_token_accuracy": 0.787461456656456, "num_tokens": 604309576.0, "step": 2060 }, { "entropy": 0.5990890568122268, "epoch": 0.15476098468081717, "grad_norm": 0.21123728156089783, "learning_rate": 0.0002, "loss": 0.8025, "mean_token_accuracy": 0.7846048094332219, "num_tokens": 605804453.0, "step": 2065 }, { "entropy": 0.6244153896346688, "epoch": 0.15513570861466905, "grad_norm": 0.24762798845767975, "learning_rate": 0.0002, "loss": 0.8328, "mean_token_accuracy": 0.782168199121952, "num_tokens": 607286997.0, "step": 2070 }, { "entropy": 0.5949526596814394, "epoch": 0.1555104325485209, "grad_norm": 0.1878950446844101, "learning_rate": 0.0002, "loss": 0.7866, "mean_token_accuracy": 0.7875169686973095, "num_tokens": 608743395.0, "step": 2075 }, { "entropy": 0.5908074013888835, "epoch": 0.15588515648237275, "grad_norm": 0.45085155963897705, "learning_rate": 0.0002, "loss": 0.7926, "mean_token_accuracy": 0.7864012680947781, "num_tokens": 610197149.0, "step": 2080 }, { "entropy": 0.5946511570364237, "epoch": 0.1562598804162246, "grad_norm": 0.18598204851150513, "learning_rate": 0.0002, "loss": 0.7956, "mean_token_accuracy": 0.7892676431685686, "num_tokens": 611651754.0, "step": 2085 }, { "entropy": 0.609775306750089, "epoch": 0.15663460435007648, "grad_norm": 0.16992828249931335, "learning_rate": 0.0002, "loss": 0.8066, "mean_token_accuracy": 0.7863094978034496, "num_tokens": 613145110.0, "step": 2090 }, { "entropy": 0.5782581530511379, "epoch": 0.15700932828392833, "grad_norm": 0.18417759239673615, "learning_rate": 0.0002, "loss": 0.7783, "mean_token_accuracy": 0.7919834949076175, "num_tokens": 614607238.0, "step": 2095 }, { "entropy": 0.6106474783271552, "epoch": 0.15738405221778018, "grad_norm": 0.20821793377399445, "learning_rate": 0.0002, "loss": 0.8012, "mean_token_accuracy": 0.787000660225749, "num_tokens": 616089223.0, "step": 2100 }, { "entropy": 0.6090683812275529, "epoch": 0.15775877615163203, "grad_norm": 0.16859573125839233, "learning_rate": 0.0002, "loss": 0.8059, "mean_token_accuracy": 0.7867076024413109, "num_tokens": 617587883.0, "step": 2105 }, { "entropy": 0.611951632425189, "epoch": 0.1581335000854839, "grad_norm": 0.17813149094581604, "learning_rate": 0.0002, "loss": 0.8108, "mean_token_accuracy": 0.785032381862402, "num_tokens": 619077377.0, "step": 2110 }, { "entropy": 0.6028899470344186, "epoch": 0.15850822401933576, "grad_norm": 0.18737171590328217, "learning_rate": 0.0002, "loss": 0.7963, "mean_token_accuracy": 0.7891909435391427, "num_tokens": 620566156.0, "step": 2115 }, { "entropy": 0.5902818626724183, "epoch": 0.1588829479531876, "grad_norm": 0.1986258625984192, "learning_rate": 0.0002, "loss": 0.7825, "mean_token_accuracy": 0.7912440408021212, "num_tokens": 621999245.0, "step": 2120 }, { "entropy": 0.5925171723589301, "epoch": 0.15925767188703946, "grad_norm": 0.18814513087272644, "learning_rate": 0.0002, "loss": 0.7877, "mean_token_accuracy": 0.7890162356197834, "num_tokens": 623433642.0, "step": 2125 }, { "entropy": 0.6284636536613106, "epoch": 0.15963239582089134, "grad_norm": 0.1962500959634781, "learning_rate": 0.0002, "loss": 0.8173, "mean_token_accuracy": 0.7818551268428564, "num_tokens": 624935061.0, "step": 2130 }, { "entropy": 0.605744460131973, "epoch": 0.1600071197547432, "grad_norm": 0.182819202542305, "learning_rate": 0.0002, "loss": 0.8035, "mean_token_accuracy": 0.7862702585756779, "num_tokens": 626385671.0, "step": 2135 }, { "entropy": 0.6282671516761184, "epoch": 0.16038184368859504, "grad_norm": 0.17543447017669678, "learning_rate": 0.0002, "loss": 0.8131, "mean_token_accuracy": 0.7853460356593132, "num_tokens": 627850748.0, "step": 2140 }, { "entropy": 0.6257581887766719, "epoch": 0.1607565676224469, "grad_norm": 0.18405720591545105, "learning_rate": 0.0002, "loss": 0.8203, "mean_token_accuracy": 0.7807066898792983, "num_tokens": 629309801.0, "step": 2145 }, { "entropy": 0.6203157160431146, "epoch": 0.16113129155629877, "grad_norm": 0.1751137524843216, "learning_rate": 0.0002, "loss": 0.8031, "mean_token_accuracy": 0.7842825409024954, "num_tokens": 630768459.0, "step": 2150 }, { "entropy": 0.6249939261004329, "epoch": 0.16150601549015062, "grad_norm": 0.18319366872310638, "learning_rate": 0.0002, "loss": 0.823, "mean_token_accuracy": 0.7817492049187422, "num_tokens": 632259553.0, "step": 2155 }, { "entropy": 0.6077609388157725, "epoch": 0.16188073942400247, "grad_norm": 0.18884330987930298, "learning_rate": 0.0002, "loss": 0.8033, "mean_token_accuracy": 0.7851300992071628, "num_tokens": 633694042.0, "step": 2160 }, { "entropy": 0.6114727141335606, "epoch": 0.16225546335785432, "grad_norm": 0.18598489463329315, "learning_rate": 0.0002, "loss": 0.8103, "mean_token_accuracy": 0.7834366019815207, "num_tokens": 635189459.0, "step": 2165 }, { "entropy": 0.5980860583484173, "epoch": 0.1626301872917062, "grad_norm": 0.18226277828216553, "learning_rate": 0.0002, "loss": 0.7887, "mean_token_accuracy": 0.7884750418365002, "num_tokens": 636646523.0, "step": 2170 }, { "entropy": 0.6224906457588076, "epoch": 0.16300491122555805, "grad_norm": 0.18794497847557068, "learning_rate": 0.0002, "loss": 0.8207, "mean_token_accuracy": 0.7814871646463871, "num_tokens": 638144724.0, "step": 2175 }, { "entropy": 0.6033993881195784, "epoch": 0.1633796351594099, "grad_norm": 0.16932767629623413, "learning_rate": 0.0002, "loss": 0.7928, "mean_token_accuracy": 0.7875210471451283, "num_tokens": 639627880.0, "step": 2180 }, { "entropy": 0.6017533193342388, "epoch": 0.16375435909326175, "grad_norm": 0.17281045019626617, "learning_rate": 0.0002, "loss": 0.7912, "mean_token_accuracy": 0.792485586181283, "num_tokens": 641125328.0, "step": 2185 }, { "entropy": 0.606758998055011, "epoch": 0.16412908302711363, "grad_norm": 0.17200632393360138, "learning_rate": 0.0002, "loss": 0.8051, "mean_token_accuracy": 0.7853546734899283, "num_tokens": 642593482.0, "step": 2190 }, { "entropy": 0.6203546144068242, "epoch": 0.16450380696096548, "grad_norm": 0.16839708387851715, "learning_rate": 0.0002, "loss": 0.8161, "mean_token_accuracy": 0.7872797790914774, "num_tokens": 644042837.0, "step": 2195 }, { "entropy": 0.6211318824440241, "epoch": 0.16487853089481733, "grad_norm": 0.18486961722373962, "learning_rate": 0.0002, "loss": 0.8206, "mean_token_accuracy": 0.7823463566601276, "num_tokens": 645479555.0, "step": 2200 }, { "entropy": 0.6269730249419808, "epoch": 0.1652532548286692, "grad_norm": 0.18047453463077545, "learning_rate": 0.0002, "loss": 0.8216, "mean_token_accuracy": 0.7829699575901031, "num_tokens": 646967196.0, "step": 2205 }, { "entropy": 0.5922915680333972, "epoch": 0.16562797876252106, "grad_norm": 0.18557429313659668, "learning_rate": 0.0002, "loss": 0.7856, "mean_token_accuracy": 0.7903221972286701, "num_tokens": 648413084.0, "step": 2210 }, { "entropy": 0.6091880628839135, "epoch": 0.1660027026963729, "grad_norm": 0.18479838967323303, "learning_rate": 0.0002, "loss": 0.8043, "mean_token_accuracy": 0.785714577883482, "num_tokens": 649888289.0, "step": 2215 }, { "entropy": 0.6108442628756166, "epoch": 0.16637742663022476, "grad_norm": 0.20475894212722778, "learning_rate": 0.0002, "loss": 0.7968, "mean_token_accuracy": 0.78904725946486, "num_tokens": 651383847.0, "step": 2220 }, { "entropy": 0.6168661942705512, "epoch": 0.16675215056407663, "grad_norm": 0.20748576521873474, "learning_rate": 0.0002, "loss": 0.8033, "mean_token_accuracy": 0.7854213282465935, "num_tokens": 652876736.0, "step": 2225 }, { "entropy": 0.6190294930711389, "epoch": 0.16712687449792848, "grad_norm": 0.2014569342136383, "learning_rate": 0.0002, "loss": 0.8057, "mean_token_accuracy": 0.7863823093473912, "num_tokens": 654344067.0, "step": 2230 }, { "entropy": 0.600414308346808, "epoch": 0.16750159843178034, "grad_norm": 0.18003125488758087, "learning_rate": 0.0002, "loss": 0.7783, "mean_token_accuracy": 0.7869856584817171, "num_tokens": 655768344.0, "step": 2235 }, { "entropy": 0.6285524751991034, "epoch": 0.16787632236563219, "grad_norm": 0.1823740005493164, "learning_rate": 0.0002, "loss": 0.8186, "mean_token_accuracy": 0.7869383323937654, "num_tokens": 657242256.0, "step": 2240 }, { "entropy": 0.5882493598386646, "epoch": 0.16825104629948406, "grad_norm": 0.1734876036643982, "learning_rate": 0.0002, "loss": 0.7788, "mean_token_accuracy": 0.7889539286494255, "num_tokens": 658700839.0, "step": 2245 }, { "entropy": 0.6086031991988421, "epoch": 0.16862577023333591, "grad_norm": 0.17379982769489288, "learning_rate": 0.0002, "loss": 0.7931, "mean_token_accuracy": 0.7862867891788483, "num_tokens": 660191855.0, "step": 2250 }, { "entropy": 0.605179401114583, "epoch": 0.16900049416718776, "grad_norm": 0.17497168481349945, "learning_rate": 0.0002, "loss": 0.7988, "mean_token_accuracy": 0.789503776282072, "num_tokens": 661682791.0, "step": 2255 }, { "entropy": 0.6139368636533618, "epoch": 0.16937521810103962, "grad_norm": 0.18127556145191193, "learning_rate": 0.0002, "loss": 0.812, "mean_token_accuracy": 0.78596535846591, "num_tokens": 663127823.0, "step": 2260 }, { "entropy": 0.6257609152235091, "epoch": 0.1697499420348915, "grad_norm": 0.1940447986125946, "learning_rate": 0.0002, "loss": 0.8072, "mean_token_accuracy": 0.7853500317782164, "num_tokens": 664627526.0, "step": 2265 }, { "entropy": 0.6266929244622588, "epoch": 0.17012466596874334, "grad_norm": 0.17783193290233612, "learning_rate": 0.0002, "loss": 0.8134, "mean_token_accuracy": 0.7847793128341436, "num_tokens": 666139980.0, "step": 2270 }, { "entropy": 0.5934259125962853, "epoch": 0.1704993899025952, "grad_norm": 0.2076859474182129, "learning_rate": 0.0002, "loss": 0.7866, "mean_token_accuracy": 0.7906119737774133, "num_tokens": 667647624.0, "step": 2275 }, { "entropy": 0.5981077512726187, "epoch": 0.17087411383644704, "grad_norm": 0.18115019798278809, "learning_rate": 0.0002, "loss": 0.7886, "mean_token_accuracy": 0.7880510043352842, "num_tokens": 669120873.0, "step": 2280 }, { "entropy": 0.6020470436662435, "epoch": 0.17124883777029892, "grad_norm": 0.6403230428695679, "learning_rate": 0.0002, "loss": 0.7996, "mean_token_accuracy": 0.7863847937434911, "num_tokens": 670573331.0, "step": 2285 }, { "entropy": 0.5928613636642694, "epoch": 0.17162356170415077, "grad_norm": 0.18821920454502106, "learning_rate": 0.0002, "loss": 0.7726, "mean_token_accuracy": 0.7934564486145973, "num_tokens": 672040154.0, "step": 2290 }, { "entropy": 0.5989745660685003, "epoch": 0.17199828563800262, "grad_norm": 0.17454999685287476, "learning_rate": 0.0002, "loss": 0.794, "mean_token_accuracy": 0.788464804366231, "num_tokens": 673513429.0, "step": 2295 }, { "entropy": 0.5985276572406292, "epoch": 0.17237300957185447, "grad_norm": 0.18502481281757355, "learning_rate": 0.0002, "loss": 0.7976, "mean_token_accuracy": 0.7872604690492153, "num_tokens": 674933238.0, "step": 2300 }, { "entropy": 0.6175102325156331, "epoch": 0.17274773350570635, "grad_norm": 0.2009778469800949, "learning_rate": 0.0002, "loss": 0.8146, "mean_token_accuracy": 0.7836092263460159, "num_tokens": 676407955.0, "step": 2305 }, { "entropy": 0.6291124112904072, "epoch": 0.1731224574395582, "grad_norm": 0.18718038499355316, "learning_rate": 0.0002, "loss": 0.828, "mean_token_accuracy": 0.7833633087575436, "num_tokens": 677914722.0, "step": 2310 }, { "entropy": 0.5855541061609983, "epoch": 0.17349718137341005, "grad_norm": 0.1839580088853836, "learning_rate": 0.0002, "loss": 0.7695, "mean_token_accuracy": 0.7906408738344908, "num_tokens": 679382240.0, "step": 2315 }, { "entropy": 0.5981345439329744, "epoch": 0.1738719053072619, "grad_norm": 0.1702301800251007, "learning_rate": 0.0002, "loss": 0.7752, "mean_token_accuracy": 0.7884683355689048, "num_tokens": 680842238.0, "step": 2320 }, { "entropy": 0.6227511813864112, "epoch": 0.17424662924111378, "grad_norm": 0.1766570806503296, "learning_rate": 0.0002, "loss": 0.819, "mean_token_accuracy": 0.7862730104476213, "num_tokens": 682330133.0, "step": 2325 }, { "entropy": 0.617927101161331, "epoch": 0.17462135317496563, "grad_norm": 0.2794263958930969, "learning_rate": 0.0002, "loss": 0.8091, "mean_token_accuracy": 0.7881725862622261, "num_tokens": 683818212.0, "step": 2330 }, { "entropy": 0.6100722098723054, "epoch": 0.17499607710881748, "grad_norm": 0.17343035340309143, "learning_rate": 0.0002, "loss": 0.8043, "mean_token_accuracy": 0.7878972236067057, "num_tokens": 685308304.0, "step": 2335 }, { "entropy": 0.5906741645187139, "epoch": 0.17537080104266933, "grad_norm": 0.17975738644599915, "learning_rate": 0.0002, "loss": 0.7707, "mean_token_accuracy": 0.7930621638894081, "num_tokens": 686789776.0, "step": 2340 }, { "entropy": 0.6115793246775866, "epoch": 0.1757455249765212, "grad_norm": 0.18127599358558655, "learning_rate": 0.0002, "loss": 0.7905, "mean_token_accuracy": 0.790056373551488, "num_tokens": 688268196.0, "step": 2345 }, { "entropy": 0.6175913737155497, "epoch": 0.17612024891037306, "grad_norm": 0.2504035234451294, "learning_rate": 0.0002, "loss": 0.8002, "mean_token_accuracy": 0.788684468716383, "num_tokens": 689756983.0, "step": 2350 }, { "entropy": 0.6101310702972114, "epoch": 0.1764949728442249, "grad_norm": 0.18765224516391754, "learning_rate": 0.0002, "loss": 0.7856, "mean_token_accuracy": 0.7884681593626738, "num_tokens": 691225668.0, "step": 2355 }, { "entropy": 0.6147379427216947, "epoch": 0.17686969677807676, "grad_norm": 0.18152476847171783, "learning_rate": 0.0002, "loss": 0.7898, "mean_token_accuracy": 0.787495118752122, "num_tokens": 692714791.0, "step": 2360 }, { "entropy": 0.6265628801658749, "epoch": 0.17724442071192864, "grad_norm": 0.1858488917350769, "learning_rate": 0.0002, "loss": 0.8001, "mean_token_accuracy": 0.7849145531654358, "num_tokens": 694201039.0, "step": 2365 }, { "entropy": 0.6241427404806018, "epoch": 0.1776191446457805, "grad_norm": 0.18484807014465332, "learning_rate": 0.0002, "loss": 0.8123, "mean_token_accuracy": 0.7868451442569494, "num_tokens": 695664005.0, "step": 2370 }, { "entropy": 0.6152596173807978, "epoch": 0.17799386857963234, "grad_norm": 0.19196289777755737, "learning_rate": 0.0002, "loss": 0.7909, "mean_token_accuracy": 0.7886902574449778, "num_tokens": 697148427.0, "step": 2375 }, { "entropy": 0.5861590525135398, "epoch": 0.17836859251348422, "grad_norm": 0.20011544227600098, "learning_rate": 0.0002, "loss": 0.7748, "mean_token_accuracy": 0.789220017567277, "num_tokens": 698580787.0, "step": 2380 }, { "entropy": 0.6097015801817178, "epoch": 0.17874331644733607, "grad_norm": 0.1861138790845871, "learning_rate": 0.0002, "loss": 0.7936, "mean_token_accuracy": 0.7883909195661545, "num_tokens": 700056194.0, "step": 2385 }, { "entropy": 0.6083336467854679, "epoch": 0.17911804038118792, "grad_norm": 0.20533917844295502, "learning_rate": 0.0002, "loss": 0.8008, "mean_token_accuracy": 0.7879138384014368, "num_tokens": 701521185.0, "step": 2390 }, { "entropy": 0.6223215434700251, "epoch": 0.17949276431503977, "grad_norm": 0.19430561363697052, "learning_rate": 0.0002, "loss": 0.8073, "mean_token_accuracy": 0.7858055762946605, "num_tokens": 702992683.0, "step": 2395 }, { "entropy": 0.6210349515080452, "epoch": 0.17986748824889165, "grad_norm": 0.1901136338710785, "learning_rate": 0.0002, "loss": 0.794, "mean_token_accuracy": 0.7879847846925259, "num_tokens": 704504921.0, "step": 2400 }, { "entropy": 0.6013050388544798, "epoch": 0.1802422121827435, "grad_norm": 0.2094990313053131, "learning_rate": 0.0002, "loss": 0.7967, "mean_token_accuracy": 0.7898013614118099, "num_tokens": 705971736.0, "step": 2405 }, { "entropy": 0.6158100626431405, "epoch": 0.18061693611659535, "grad_norm": 0.2128753960132599, "learning_rate": 0.0002, "loss": 0.8045, "mean_token_accuracy": 0.7903809923678636, "num_tokens": 707458080.0, "step": 2410 }, { "entropy": 0.5991538882255554, "epoch": 0.1809916600504472, "grad_norm": 0.18988759815692902, "learning_rate": 0.0002, "loss": 0.7799, "mean_token_accuracy": 0.788377096131444, "num_tokens": 708926177.0, "step": 2415 }, { "entropy": 0.6101385859772563, "epoch": 0.18136638398429908, "grad_norm": 0.17940951883792877, "learning_rate": 0.0002, "loss": 0.7965, "mean_token_accuracy": 0.7876051302999258, "num_tokens": 710371354.0, "step": 2420 }, { "entropy": 0.6021702472120524, "epoch": 0.18174110791815093, "grad_norm": 0.17562425136566162, "learning_rate": 0.0002, "loss": 0.7842, "mean_token_accuracy": 0.7912558287382125, "num_tokens": 711834628.0, "step": 2425 }, { "entropy": 0.6204814890399575, "epoch": 0.18211583185200278, "grad_norm": 0.17122507095336914, "learning_rate": 0.0002, "loss": 0.8023, "mean_token_accuracy": 0.7853690691292285, "num_tokens": 713376093.0, "step": 2430 }, { "entropy": 0.6015372175723315, "epoch": 0.18249055578585463, "grad_norm": 0.19471311569213867, "learning_rate": 0.0002, "loss": 0.7905, "mean_token_accuracy": 0.7898991134017705, "num_tokens": 714779920.0, "step": 2435 }, { "entropy": 0.5914392463862896, "epoch": 0.1828652797197065, "grad_norm": 0.19396290183067322, "learning_rate": 0.0002, "loss": 0.7851, "mean_token_accuracy": 0.7904362447559834, "num_tokens": 716194424.0, "step": 2440 }, { "entropy": 0.6123619806952775, "epoch": 0.18324000365355836, "grad_norm": 0.20699858665466309, "learning_rate": 0.0002, "loss": 0.7957, "mean_token_accuracy": 0.7895799655467272, "num_tokens": 717656871.0, "step": 2445 }, { "entropy": 0.6121074752882123, "epoch": 0.1836147275874102, "grad_norm": 0.17560189962387085, "learning_rate": 0.0002, "loss": 0.7882, "mean_token_accuracy": 0.7869485691189766, "num_tokens": 719150293.0, "step": 2450 }, { "entropy": 0.6110768154263496, "epoch": 0.18398945152126206, "grad_norm": 0.1762045919895172, "learning_rate": 0.0002, "loss": 0.7937, "mean_token_accuracy": 0.790903753042221, "num_tokens": 720603198.0, "step": 2455 }, { "entropy": 0.595826662518084, "epoch": 0.18436417545511394, "grad_norm": 0.18920017778873444, "learning_rate": 0.0002, "loss": 0.7836, "mean_token_accuracy": 0.7894156023859977, "num_tokens": 722053385.0, "step": 2460 }, { "entropy": 0.5998823028057814, "epoch": 0.1847388993889658, "grad_norm": 0.21495449542999268, "learning_rate": 0.0002, "loss": 0.7818, "mean_token_accuracy": 0.7919099375605583, "num_tokens": 723479007.0, "step": 2465 }, { "entropy": 0.60638448279351, "epoch": 0.18511362332281764, "grad_norm": 0.17206978797912598, "learning_rate": 0.0002, "loss": 0.7828, "mean_token_accuracy": 0.7873579211533069, "num_tokens": 724953762.0, "step": 2470 }, { "entropy": 0.6347057750448585, "epoch": 0.1854883472566695, "grad_norm": 0.18416330218315125, "learning_rate": 0.0002, "loss": 0.8149, "mean_token_accuracy": 0.7848142243921756, "num_tokens": 726441528.0, "step": 2475 }, { "entropy": 0.6096488101407885, "epoch": 0.18586307119052137, "grad_norm": 0.20825284719467163, "learning_rate": 0.0002, "loss": 0.7947, "mean_token_accuracy": 0.7862904842942953, "num_tokens": 727894229.0, "step": 2480 }, { "entropy": 0.6225566241890192, "epoch": 0.18623779512437322, "grad_norm": 0.20046336948871613, "learning_rate": 0.0002, "loss": 0.8172, "mean_token_accuracy": 0.7859689518809319, "num_tokens": 729345034.0, "step": 2485 }, { "entropy": 0.6276885188184679, "epoch": 0.18661251905822507, "grad_norm": 0.20170053839683533, "learning_rate": 0.0002, "loss": 0.8074, "mean_token_accuracy": 0.7844698514789343, "num_tokens": 730845195.0, "step": 2490 }, { "entropy": 0.6244707813486456, "epoch": 0.18698724299207692, "grad_norm": 0.17910750210285187, "learning_rate": 0.0002, "loss": 0.8061, "mean_token_accuracy": 0.7864898584783078, "num_tokens": 732342433.0, "step": 2495 }, { "entropy": 0.6151678692549467, "epoch": 0.1873619669259288, "grad_norm": 0.17974688112735748, "learning_rate": 0.0002, "loss": 0.7974, "mean_token_accuracy": 0.7875491309911012, "num_tokens": 733806942.0, "step": 2500 }, { "entropy": 0.6054765827953815, "epoch": 0.18773669085978065, "grad_norm": 0.20793481171131134, "learning_rate": 0.0002, "loss": 0.7968, "mean_token_accuracy": 0.7894412249326705, "num_tokens": 735276377.0, "step": 2505 }, { "entropy": 0.5989274078980088, "epoch": 0.1881114147936325, "grad_norm": 0.17363592982292175, "learning_rate": 0.0002, "loss": 0.7896, "mean_token_accuracy": 0.7884077817201615, "num_tokens": 736729518.0, "step": 2510 }, { "entropy": 0.5882355907931924, "epoch": 0.18848613872748435, "grad_norm": 0.16924186050891876, "learning_rate": 0.0002, "loss": 0.7708, "mean_token_accuracy": 0.7886406034231186, "num_tokens": 738186183.0, "step": 2515 }, { "entropy": 0.5891388170421124, "epoch": 0.18886086266133623, "grad_norm": 0.1870952695608139, "learning_rate": 0.0002, "loss": 0.7721, "mean_token_accuracy": 0.7929255183786154, "num_tokens": 739651394.0, "step": 2520 }, { "entropy": 0.6105662947520614, "epoch": 0.18923558659518808, "grad_norm": 0.16879162192344666, "learning_rate": 0.0002, "loss": 0.7939, "mean_token_accuracy": 0.784249759465456, "num_tokens": 741127991.0, "step": 2525 }, { "entropy": 0.5926085026934743, "epoch": 0.18961031052903993, "grad_norm": 0.1694159209728241, "learning_rate": 0.0002, "loss": 0.791, "mean_token_accuracy": 0.7897600445896387, "num_tokens": 742534052.0, "step": 2530 }, { "entropy": 0.578111047577113, "epoch": 0.1899850344628918, "grad_norm": 0.1716638058423996, "learning_rate": 0.0002, "loss": 0.76, "mean_token_accuracy": 0.7942444946616888, "num_tokens": 743979453.0, "step": 2535 }, { "entropy": 0.6048824125900865, "epoch": 0.19035975839674366, "grad_norm": 0.1762598156929016, "learning_rate": 0.0002, "loss": 0.7892, "mean_token_accuracy": 0.7897901546210051, "num_tokens": 745435230.0, "step": 2540 }, { "entropy": 0.598692586645484, "epoch": 0.1907344823305955, "grad_norm": 0.20164677500724792, "learning_rate": 0.0002, "loss": 0.7811, "mean_token_accuracy": 0.7877854872494936, "num_tokens": 746905965.0, "step": 2545 }, { "entropy": 0.6279929176904261, "epoch": 0.19110920626444736, "grad_norm": 0.21723723411560059, "learning_rate": 0.0002, "loss": 0.8237, "mean_token_accuracy": 0.7870237711817026, "num_tokens": 748370182.0, "step": 2550 }, { "entropy": 0.6177911364473403, "epoch": 0.19148393019829923, "grad_norm": 0.179407000541687, "learning_rate": 0.0002, "loss": 0.7966, "mean_token_accuracy": 0.7894405163824558, "num_tokens": 749873808.0, "step": 2555 }, { "entropy": 0.5824682550504804, "epoch": 0.19185865413215109, "grad_norm": 0.1857885718345642, "learning_rate": 0.0002, "loss": 0.7597, "mean_token_accuracy": 0.7930919826030731, "num_tokens": 751367575.0, "step": 2560 }, { "entropy": 0.6034259998239577, "epoch": 0.19223337806600294, "grad_norm": 0.24231888353824615, "learning_rate": 0.0002, "loss": 0.7799, "mean_token_accuracy": 0.7898766431957484, "num_tokens": 752830630.0, "step": 2565 }, { "entropy": 0.6047795241698622, "epoch": 0.19260810199985479, "grad_norm": 0.18353267014026642, "learning_rate": 0.0002, "loss": 0.7835, "mean_token_accuracy": 0.7887903783470392, "num_tokens": 754288647.0, "step": 2570 }, { "entropy": 0.6144316362217068, "epoch": 0.19298282593370666, "grad_norm": 0.17732740938663483, "learning_rate": 0.0002, "loss": 0.7934, "mean_token_accuracy": 0.7882490396499634, "num_tokens": 755750374.0, "step": 2575 }, { "entropy": 0.6049374390393496, "epoch": 0.19335754986755851, "grad_norm": 0.17246204614639282, "learning_rate": 0.0002, "loss": 0.8, "mean_token_accuracy": 0.7885756149888039, "num_tokens": 757186106.0, "step": 2580 }, { "entropy": 0.607648341730237, "epoch": 0.19373227380141037, "grad_norm": 0.1821683794260025, "learning_rate": 0.0002, "loss": 0.7943, "mean_token_accuracy": 0.7904865764081478, "num_tokens": 758645202.0, "step": 2585 }, { "entropy": 0.6115696442313492, "epoch": 0.19410699773526222, "grad_norm": 0.1625790297985077, "learning_rate": 0.0002, "loss": 0.7932, "mean_token_accuracy": 0.7904490385204553, "num_tokens": 760096443.0, "step": 2590 }, { "entropy": 0.6060734571889043, "epoch": 0.1944817216691141, "grad_norm": 0.17711307108402252, "learning_rate": 0.0002, "loss": 0.7923, "mean_token_accuracy": 0.787765871733427, "num_tokens": 761585391.0, "step": 2595 }, { "entropy": 0.6033030249178409, "epoch": 0.19485644560296594, "grad_norm": 0.1835392415523529, "learning_rate": 0.0002, "loss": 0.7771, "mean_token_accuracy": 0.7920665927231312, "num_tokens": 763066520.0, "step": 2600 }, { "entropy": 0.610666786134243, "epoch": 0.1952311695368178, "grad_norm": 0.1827099472284317, "learning_rate": 0.0002, "loss": 0.7786, "mean_token_accuracy": 0.7932992525398731, "num_tokens": 764520996.0, "step": 2605 }, { "entropy": 0.6224947217851877, "epoch": 0.19560589347066965, "grad_norm": 0.19031643867492676, "learning_rate": 0.0002, "loss": 0.8085, "mean_token_accuracy": 0.7856072951108217, "num_tokens": 765984256.0, "step": 2610 }, { "entropy": 0.604100895486772, "epoch": 0.19598061740452152, "grad_norm": 0.18280814588069916, "learning_rate": 0.0002, "loss": 0.7814, "mean_token_accuracy": 0.7916664961725474, "num_tokens": 767458163.0, "step": 2615 }, { "entropy": 0.5948310643434525, "epoch": 0.19635534133837337, "grad_norm": 0.7753672003746033, "learning_rate": 0.0002, "loss": 0.778, "mean_token_accuracy": 0.7913050178438426, "num_tokens": 768924697.0, "step": 2620 }, { "entropy": 0.629897852614522, "epoch": 0.19673006527222522, "grad_norm": 0.18097127974033356, "learning_rate": 0.0002, "loss": 0.8165, "mean_token_accuracy": 0.7836997304111719, "num_tokens": 770408282.0, "step": 2625 }, { "entropy": 0.5963857406750321, "epoch": 0.19710478920607707, "grad_norm": 0.2120867818593979, "learning_rate": 0.0002, "loss": 0.7758, "mean_token_accuracy": 0.7892747286707162, "num_tokens": 771884614.0, "step": 2630 }, { "entropy": 0.5956146495416761, "epoch": 0.19747951313992895, "grad_norm": 0.17755529284477234, "learning_rate": 0.0002, "loss": 0.7875, "mean_token_accuracy": 0.7905724324285984, "num_tokens": 773332653.0, "step": 2635 }, { "entropy": 0.6113113263621927, "epoch": 0.1978542370737808, "grad_norm": 0.17270447313785553, "learning_rate": 0.0002, "loss": 0.7984, "mean_token_accuracy": 0.789347442984581, "num_tokens": 774794479.0, "step": 2640 }, { "entropy": 0.6187123715877533, "epoch": 0.19822896100763265, "grad_norm": 0.18184180557727814, "learning_rate": 0.0002, "loss": 0.8045, "mean_token_accuracy": 0.7860436458140612, "num_tokens": 776249421.0, "step": 2645 }, { "entropy": 0.5960615056566894, "epoch": 0.1986036849414845, "grad_norm": 0.18621350824832916, "learning_rate": 0.0002, "loss": 0.7785, "mean_token_accuracy": 0.7918926022946835, "num_tokens": 777692261.0, "step": 2650 }, { "entropy": 0.6043345905840397, "epoch": 0.19897840887533638, "grad_norm": 0.19183208048343658, "learning_rate": 0.0002, "loss": 0.7892, "mean_token_accuracy": 0.7882481809705496, "num_tokens": 779156530.0, "step": 2655 }, { "entropy": 0.6334025768563152, "epoch": 0.19935313280918823, "grad_norm": 0.18271343410015106, "learning_rate": 0.0002, "loss": 0.8231, "mean_token_accuracy": 0.7824711982160807, "num_tokens": 780650061.0, "step": 2660 }, { "entropy": 0.6211708852089941, "epoch": 0.19972785674304008, "grad_norm": 0.21401765942573547, "learning_rate": 0.0002, "loss": 0.8057, "mean_token_accuracy": 0.7879332605749368, "num_tokens": 782135290.0, "step": 2665 }, { "entropy": 0.6083402017131447, "epoch": 0.20010258067689193, "grad_norm": 0.20461492240428925, "learning_rate": 0.0002, "loss": 0.7953, "mean_token_accuracy": 0.7874959386885166, "num_tokens": 783584191.0, "step": 2670 }, { "entropy": 0.6071524232625961, "epoch": 0.2004773046107438, "grad_norm": 0.2050897479057312, "learning_rate": 0.0002, "loss": 0.7748, "mean_token_accuracy": 0.792029419541359, "num_tokens": 785073936.0, "step": 2675 }, { "entropy": 0.5922674821689725, "epoch": 0.20085202854459566, "grad_norm": 0.1674569845199585, "learning_rate": 0.0002, "loss": 0.7563, "mean_token_accuracy": 0.7951609812676906, "num_tokens": 786612471.0, "step": 2680 }, { "entropy": 0.5905024342238903, "epoch": 0.2012267524784475, "grad_norm": 0.20633292198181152, "learning_rate": 0.0002, "loss": 0.7616, "mean_token_accuracy": 0.7942201644182205, "num_tokens": 788064006.0, "step": 2685 }, { "entropy": 0.6256845213472844, "epoch": 0.20160147641229936, "grad_norm": 0.18148267269134521, "learning_rate": 0.0002, "loss": 0.8052, "mean_token_accuracy": 0.7880542896687984, "num_tokens": 789531680.0, "step": 2690 }, { "entropy": 0.62670062687248, "epoch": 0.20197620034615124, "grad_norm": 0.2254752367734909, "learning_rate": 0.0002, "loss": 0.8043, "mean_token_accuracy": 0.7850899051874876, "num_tokens": 791002171.0, "step": 2695 }, { "entropy": 0.6193706039339304, "epoch": 0.2023509242800031, "grad_norm": 0.1853032112121582, "learning_rate": 0.0002, "loss": 0.7939, "mean_token_accuracy": 0.788734445720911, "num_tokens": 792463397.0, "step": 2700 }, { "entropy": 0.6197291631251574, "epoch": 0.20272564821385494, "grad_norm": 0.1850012242794037, "learning_rate": 0.0002, "loss": 0.7741, "mean_token_accuracy": 0.7917637724429369, "num_tokens": 793986610.0, "step": 2705 }, { "entropy": 0.6321620935574174, "epoch": 0.20310037214770682, "grad_norm": 0.17727600038051605, "learning_rate": 0.0002, "loss": 0.8055, "mean_token_accuracy": 0.7880735639482737, "num_tokens": 795465689.0, "step": 2710 }, { "entropy": 0.6273479524999857, "epoch": 0.20347509608155867, "grad_norm": 0.1902616173028946, "learning_rate": 0.0002, "loss": 0.7958, "mean_token_accuracy": 0.7898019336163997, "num_tokens": 796921970.0, "step": 2715 }, { "entropy": 0.6130238527432084, "epoch": 0.20384982001541052, "grad_norm": 0.18217138946056366, "learning_rate": 0.0002, "loss": 0.7743, "mean_token_accuracy": 0.7912269692867995, "num_tokens": 798421853.0, "step": 2720 }, { "entropy": 0.6210615945979953, "epoch": 0.20422454394926237, "grad_norm": 0.1950245201587677, "learning_rate": 0.0002, "loss": 0.7991, "mean_token_accuracy": 0.7856516223400831, "num_tokens": 799928214.0, "step": 2725 }, { "entropy": 0.607552258297801, "epoch": 0.20459926788311425, "grad_norm": 0.18298666179180145, "learning_rate": 0.0002, "loss": 0.7826, "mean_token_accuracy": 0.7907232098281384, "num_tokens": 801417653.0, "step": 2730 }, { "entropy": 0.6104102081619203, "epoch": 0.2049739918169661, "grad_norm": 0.1920616179704666, "learning_rate": 0.0002, "loss": 0.774, "mean_token_accuracy": 0.7907057214528322, "num_tokens": 802971614.0, "step": 2735 }, { "entropy": 0.606289454549551, "epoch": 0.20534871575081795, "grad_norm": 0.1806163489818573, "learning_rate": 0.0002, "loss": 0.7973, "mean_token_accuracy": 0.788843871280551, "num_tokens": 804442267.0, "step": 2740 }, { "entropy": 0.6019480282440781, "epoch": 0.2057234396846698, "grad_norm": 0.1957215815782547, "learning_rate": 0.0002, "loss": 0.7744, "mean_token_accuracy": 0.7921865537762642, "num_tokens": 805903325.0, "step": 2745 }, { "entropy": 0.6009451208636165, "epoch": 0.20609816361852168, "grad_norm": 0.17835405468940735, "learning_rate": 0.0002, "loss": 0.7797, "mean_token_accuracy": 0.7914812348783016, "num_tokens": 807376519.0, "step": 2750 }, { "entropy": 0.6194185825064779, "epoch": 0.20647288755237353, "grad_norm": 0.1961940973997116, "learning_rate": 0.0002, "loss": 0.7942, "mean_token_accuracy": 0.7863513570278883, "num_tokens": 808866907.0, "step": 2755 }, { "entropy": 0.5981897107325495, "epoch": 0.20684761148622538, "grad_norm": 0.20404107868671417, "learning_rate": 0.0002, "loss": 0.7789, "mean_token_accuracy": 0.7912492860108614, "num_tokens": 810342406.0, "step": 2760 }, { "entropy": 0.6057866841554642, "epoch": 0.20722233542007723, "grad_norm": 0.24581818282604218, "learning_rate": 0.0002, "loss": 0.7906, "mean_token_accuracy": 0.7883641928434372, "num_tokens": 811767525.0, "step": 2765 }, { "entropy": 0.6017561301589012, "epoch": 0.2075970593539291, "grad_norm": 0.2070930451154709, "learning_rate": 0.0002, "loss": 0.7682, "mean_token_accuracy": 0.7925706565380096, "num_tokens": 813226027.0, "step": 2770 }, { "entropy": 0.5970782518386841, "epoch": 0.20797178328778096, "grad_norm": 0.1914699375629425, "learning_rate": 0.0002, "loss": 0.7707, "mean_token_accuracy": 0.7921871233731508, "num_tokens": 814699657.0, "step": 2775 }, { "entropy": 0.6036464035511017, "epoch": 0.2083465072216328, "grad_norm": 0.18876062333583832, "learning_rate": 0.0002, "loss": 0.7801, "mean_token_accuracy": 0.7885274790227413, "num_tokens": 816141062.0, "step": 2780 }, { "entropy": 0.6224133441224694, "epoch": 0.20872123115548466, "grad_norm": 0.19061732292175293, "learning_rate": 0.0002, "loss": 0.7971, "mean_token_accuracy": 0.788865114748478, "num_tokens": 817634756.0, "step": 2785 }, { "entropy": 0.6126434590667487, "epoch": 0.20909595508933654, "grad_norm": 0.18213514983654022, "learning_rate": 0.0002, "loss": 0.7844, "mean_token_accuracy": 0.7892961777746678, "num_tokens": 819126154.0, "step": 2790 }, { "entropy": 0.5940558838658034, "epoch": 0.2094706790231884, "grad_norm": 0.19679783284664154, "learning_rate": 0.0002, "loss": 0.7725, "mean_token_accuracy": 0.7935563955456019, "num_tokens": 820578818.0, "step": 2795 }, { "entropy": 0.6088366130366921, "epoch": 0.20984540295704024, "grad_norm": 0.17023132741451263, "learning_rate": 0.0002, "loss": 0.7882, "mean_token_accuracy": 0.7916604653000832, "num_tokens": 822081442.0, "step": 2800 }, { "entropy": 0.6142249125987291, "epoch": 0.2102201268908921, "grad_norm": 0.19342753291130066, "learning_rate": 0.0002, "loss": 0.7923, "mean_token_accuracy": 0.7899951037019491, "num_tokens": 823540909.0, "step": 2805 }, { "entropy": 0.6092501781880856, "epoch": 0.21059485082474397, "grad_norm": 0.18400532007217407, "learning_rate": 0.0002, "loss": 0.7788, "mean_token_accuracy": 0.7922325726598501, "num_tokens": 825020260.0, "step": 2810 }, { "entropy": 0.6388037256896496, "epoch": 0.21096957475859582, "grad_norm": 0.19629128277301788, "learning_rate": 0.0002, "loss": 0.7985, "mean_token_accuracy": 0.7868939451873302, "num_tokens": 826549032.0, "step": 2815 }, { "entropy": 0.6036311758682131, "epoch": 0.21134429869244767, "grad_norm": 0.18593575060367584, "learning_rate": 0.0002, "loss": 0.7736, "mean_token_accuracy": 0.792702828720212, "num_tokens": 828018427.0, "step": 2820 }, { "entropy": 0.6028555830009281, "epoch": 0.21171902262629952, "grad_norm": 0.1813669353723526, "learning_rate": 0.0002, "loss": 0.7799, "mean_token_accuracy": 0.7907445352524519, "num_tokens": 829449768.0, "step": 2825 }, { "entropy": 0.6104486616328358, "epoch": 0.2120937465601514, "grad_norm": 0.18718929588794708, "learning_rate": 0.0002, "loss": 0.7816, "mean_token_accuracy": 0.7911163344979286, "num_tokens": 830909288.0, "step": 2830 }, { "entropy": 0.5999830112792551, "epoch": 0.21246847049400325, "grad_norm": 0.18361009657382965, "learning_rate": 0.0002, "loss": 0.7679, "mean_token_accuracy": 0.7908868905156851, "num_tokens": 832402947.0, "step": 2835 }, { "entropy": 0.5939900986850262, "epoch": 0.2128431944278551, "grad_norm": 0.1723877191543579, "learning_rate": 0.0002, "loss": 0.7705, "mean_token_accuracy": 0.7918451078236103, "num_tokens": 833865249.0, "step": 2840 }, { "entropy": 0.5859808555804193, "epoch": 0.21321791836170695, "grad_norm": 0.19567057490348816, "learning_rate": 0.0002, "loss": 0.7606, "mean_token_accuracy": 0.7938916247338057, "num_tokens": 835307896.0, "step": 2845 }, { "entropy": 0.5901813838630915, "epoch": 0.21359264229555883, "grad_norm": 0.1776261329650879, "learning_rate": 0.0002, "loss": 0.7669, "mean_token_accuracy": 0.7954722870141268, "num_tokens": 836768971.0, "step": 2850 }, { "entropy": 0.6044940795749426, "epoch": 0.21396736622941068, "grad_norm": 0.19765175879001617, "learning_rate": 0.0002, "loss": 0.7752, "mean_token_accuracy": 0.7924645636230707, "num_tokens": 838261383.0, "step": 2855 }, { "entropy": 0.5979951788671315, "epoch": 0.21434209016326253, "grad_norm": 0.16093911230564117, "learning_rate": 0.0002, "loss": 0.7639, "mean_token_accuracy": 0.7957977890968323, "num_tokens": 839761092.0, "step": 2860 }, { "entropy": 0.5872519429773092, "epoch": 0.21471681409711438, "grad_norm": 0.18506231904029846, "learning_rate": 0.0002, "loss": 0.7635, "mean_token_accuracy": 0.7951505053788424, "num_tokens": 841192108.0, "step": 2865 }, { "entropy": 0.5992196457460522, "epoch": 0.21509153803096626, "grad_norm": 0.21644768118858337, "learning_rate": 0.0002, "loss": 0.7692, "mean_token_accuracy": 0.7912162743508816, "num_tokens": 842697959.0, "step": 2870 }, { "entropy": 0.6085485180839896, "epoch": 0.2154662619648181, "grad_norm": 0.18148860335350037, "learning_rate": 0.0002, "loss": 0.7845, "mean_token_accuracy": 0.7884397853165865, "num_tokens": 844216305.0, "step": 2875 }, { "entropy": 0.5849363293498755, "epoch": 0.21584098589866996, "grad_norm": 0.17322057485580444, "learning_rate": 0.0002, "loss": 0.7666, "mean_token_accuracy": 0.7895379915833474, "num_tokens": 845686724.0, "step": 2880 }, { "entropy": 0.5952237248420715, "epoch": 0.21621570983252184, "grad_norm": 0.20046848058700562, "learning_rate": 0.0002, "loss": 0.781, "mean_token_accuracy": 0.7907855413854122, "num_tokens": 847178168.0, "step": 2885 }, { "entropy": 0.5924846460111439, "epoch": 0.21659043376637369, "grad_norm": 0.1998337060213089, "learning_rate": 0.0002, "loss": 0.7724, "mean_token_accuracy": 0.7920611575245857, "num_tokens": 848633341.0, "step": 2890 }, { "entropy": 0.6005576085299253, "epoch": 0.21696515770022554, "grad_norm": 0.1894308626651764, "learning_rate": 0.0002, "loss": 0.7711, "mean_token_accuracy": 0.7905118342489004, "num_tokens": 850108984.0, "step": 2895 }, { "entropy": 0.6163029311224818, "epoch": 0.2173398816340774, "grad_norm": 0.19529661536216736, "learning_rate": 0.0002, "loss": 0.7996, "mean_token_accuracy": 0.7893692750483752, "num_tokens": 851602361.0, "step": 2900 }, { "entropy": 0.6016871736384928, "epoch": 0.21771460556792926, "grad_norm": 0.17662760615348816, "learning_rate": 0.0002, "loss": 0.7738, "mean_token_accuracy": 0.7908546458929777, "num_tokens": 853021666.0, "step": 2905 }, { "entropy": 0.620117592997849, "epoch": 0.21808932950178112, "grad_norm": 0.18486721813678741, "learning_rate": 0.0002, "loss": 0.7875, "mean_token_accuracy": 0.7891411367803812, "num_tokens": 854490651.0, "step": 2910 }, { "entropy": 0.6034224388189614, "epoch": 0.21846405343563297, "grad_norm": 0.16472120583057404, "learning_rate": 0.0002, "loss": 0.7712, "mean_token_accuracy": 0.79190241843462, "num_tokens": 855947082.0, "step": 2915 }, { "entropy": 0.6326739527285099, "epoch": 0.21883877736948482, "grad_norm": 0.1842227727174759, "learning_rate": 0.0002, "loss": 0.7934, "mean_token_accuracy": 0.7872454207390547, "num_tokens": 857448888.0, "step": 2920 }, { "entropy": 0.6265815269201994, "epoch": 0.2192135013033367, "grad_norm": 0.17184653878211975, "learning_rate": 0.0002, "loss": 0.7877, "mean_token_accuracy": 0.7888232197612524, "num_tokens": 858933022.0, "step": 2925 }, { "entropy": 0.6147498056292534, "epoch": 0.21958822523718854, "grad_norm": 0.17626118659973145, "learning_rate": 0.0002, "loss": 0.7814, "mean_token_accuracy": 0.7900721199810505, "num_tokens": 860375764.0, "step": 2930 }, { "entropy": 0.6013707297854125, "epoch": 0.2199629491710404, "grad_norm": 0.1772213578224182, "learning_rate": 0.0002, "loss": 0.7685, "mean_token_accuracy": 0.7928800012916326, "num_tokens": 861806009.0, "step": 2935 }, { "entropy": 0.6132688777521252, "epoch": 0.22033767310489225, "grad_norm": 0.18683673441410065, "learning_rate": 0.0002, "loss": 0.7747, "mean_token_accuracy": 0.790662682428956, "num_tokens": 863274710.0, "step": 2940 }, { "entropy": 0.6143709937110543, "epoch": 0.22071239703874412, "grad_norm": 0.19702856242656708, "learning_rate": 0.0002, "loss": 0.7696, "mean_token_accuracy": 0.7951595935970545, "num_tokens": 864759676.0, "step": 2945 }, { "entropy": 0.6078088474459946, "epoch": 0.22108712097259597, "grad_norm": 0.18063944578170776, "learning_rate": 0.0002, "loss": 0.7843, "mean_token_accuracy": 0.7927134070545435, "num_tokens": 866212548.0, "step": 2950 }, { "entropy": 0.5848479934036732, "epoch": 0.22146184490644782, "grad_norm": 0.1970263123512268, "learning_rate": 0.0002, "loss": 0.7528, "mean_token_accuracy": 0.7953515410423279, "num_tokens": 867684267.0, "step": 2955 }, { "entropy": 0.5742570541799068, "epoch": 0.22183656884029967, "grad_norm": 0.22783194482326508, "learning_rate": 0.0002, "loss": 0.7499, "mean_token_accuracy": 0.7980666745454073, "num_tokens": 869107397.0, "step": 2960 }, { "entropy": 0.6192237246781588, "epoch": 0.22221129277415155, "grad_norm": 0.2176225185394287, "learning_rate": 0.0002, "loss": 0.7912, "mean_token_accuracy": 0.7896652769297361, "num_tokens": 870548432.0, "step": 2965 }, { "entropy": 0.6122456667944789, "epoch": 0.2225860167080034, "grad_norm": 0.2068229615688324, "learning_rate": 0.0002, "loss": 0.7857, "mean_token_accuracy": 0.7895904559642076, "num_tokens": 872018077.0, "step": 2970 }, { "entropy": 0.6097775716334581, "epoch": 0.22296074064185525, "grad_norm": 0.17791098356246948, "learning_rate": 0.0002, "loss": 0.7763, "mean_token_accuracy": 0.7923383433371782, "num_tokens": 873488308.0, "step": 2975 }, { "entropy": 0.60122777428478, "epoch": 0.2233354645757071, "grad_norm": 0.1928730607032776, "learning_rate": 0.0002, "loss": 0.767, "mean_token_accuracy": 0.7900618508458137, "num_tokens": 874939893.0, "step": 2980 }, { "entropy": 0.6211571775376796, "epoch": 0.22371018850955898, "grad_norm": 0.20651677250862122, "learning_rate": 0.0002, "loss": 0.7764, "mean_token_accuracy": 0.7917639080435037, "num_tokens": 876390445.0, "step": 2985 }, { "entropy": 0.6165442254394293, "epoch": 0.22408491244341083, "grad_norm": 0.19095340371131897, "learning_rate": 0.0002, "loss": 0.7631, "mean_token_accuracy": 0.7913955107331276, "num_tokens": 877850389.0, "step": 2990 }, { "entropy": 0.6280237125232816, "epoch": 0.22445963637726268, "grad_norm": 0.18613387644290924, "learning_rate": 0.0002, "loss": 0.7924, "mean_token_accuracy": 0.7889679301530123, "num_tokens": 879338339.0, "step": 2995 }, { "entropy": 0.6176994573324919, "epoch": 0.22483436031111453, "grad_norm": 0.2034207284450531, "learning_rate": 0.0002, "loss": 0.7762, "mean_token_accuracy": 0.79308289475739, "num_tokens": 880827646.0, "step": 3000 }, { "entropy": 0.6120400831103325, "epoch": 0.2252090842449664, "grad_norm": 0.18888041377067566, "learning_rate": 0.0002, "loss": 0.7699, "mean_token_accuracy": 0.7920909240841866, "num_tokens": 882317248.0, "step": 3005 }, { "entropy": 0.6014716439880431, "epoch": 0.22558380817881826, "grad_norm": 0.18220680952072144, "learning_rate": 0.0002, "loss": 0.756, "mean_token_accuracy": 0.7962518826127052, "num_tokens": 883812879.0, "step": 3010 }, { "entropy": 0.5928925178945065, "epoch": 0.2259585321126701, "grad_norm": 0.1714429259300232, "learning_rate": 0.0002, "loss": 0.7572, "mean_token_accuracy": 0.7955896612256765, "num_tokens": 885255152.0, "step": 3015 }, { "entropy": 0.6081295993179083, "epoch": 0.22633325604652196, "grad_norm": 0.1863308548927307, "learning_rate": 0.0002, "loss": 0.7734, "mean_token_accuracy": 0.7903869476169347, "num_tokens": 886715187.0, "step": 3020 }, { "entropy": 0.6125404221937061, "epoch": 0.22670797998037384, "grad_norm": 0.20721282064914703, "learning_rate": 0.0002, "loss": 0.7801, "mean_token_accuracy": 0.7900226388126612, "num_tokens": 888166761.0, "step": 3025 }, { "entropy": 0.6066348949447274, "epoch": 0.2270827039142257, "grad_norm": 0.18798832595348358, "learning_rate": 0.0002, "loss": 0.7707, "mean_token_accuracy": 0.7933323182165622, "num_tokens": 889631575.0, "step": 3030 }, { "entropy": 0.594732791185379, "epoch": 0.22745742784807754, "grad_norm": 0.1800381988286972, "learning_rate": 0.0002, "loss": 0.7646, "mean_token_accuracy": 0.7944474682211876, "num_tokens": 891072954.0, "step": 3035 }, { "entropy": 0.6030937067233026, "epoch": 0.2278321517819294, "grad_norm": 0.16873566806316376, "learning_rate": 0.0002, "loss": 0.7645, "mean_token_accuracy": 0.7935793794691562, "num_tokens": 892565702.0, "step": 3040 }, { "entropy": 0.604484181664884, "epoch": 0.22820687571578127, "grad_norm": 0.17453648149967194, "learning_rate": 0.0002, "loss": 0.7823, "mean_token_accuracy": 0.7931980516761541, "num_tokens": 894033348.0, "step": 3045 }, { "entropy": 0.5850357495248317, "epoch": 0.22858159964963312, "grad_norm": 0.1975393146276474, "learning_rate": 0.0002, "loss": 0.7589, "mean_token_accuracy": 0.79242212921381, "num_tokens": 895462750.0, "step": 3050 }, { "entropy": 0.6130742050707341, "epoch": 0.22895632358348497, "grad_norm": 0.2044544517993927, "learning_rate": 0.0002, "loss": 0.7825, "mean_token_accuracy": 0.7886365078389644, "num_tokens": 896942268.0, "step": 3055 }, { "entropy": 0.607714077271521, "epoch": 0.22933104751733685, "grad_norm": 0.34942471981048584, "learning_rate": 0.0002, "loss": 0.7805, "mean_token_accuracy": 0.7912286546081304, "num_tokens": 898441673.0, "step": 3060 }, { "entropy": 0.5814354110509157, "epoch": 0.2297057714511887, "grad_norm": 0.1859629899263382, "learning_rate": 0.0002, "loss": 0.7661, "mean_token_accuracy": 0.7946258999407292, "num_tokens": 899905740.0, "step": 3065 }, { "entropy": 0.6177827170118689, "epoch": 0.23008049538504055, "grad_norm": 0.3119899034500122, "learning_rate": 0.0002, "loss": 0.7899, "mean_token_accuracy": 0.7903837848454713, "num_tokens": 901400563.0, "step": 3070 }, { "entropy": 0.5968695192597806, "epoch": 0.2304552193188924, "grad_norm": 0.16633917391300201, "learning_rate": 0.0002, "loss": 0.7807, "mean_token_accuracy": 0.7917581487447023, "num_tokens": 902855182.0, "step": 3075 }, { "entropy": 0.5961834775283933, "epoch": 0.23082994325274428, "grad_norm": 0.1790451556444168, "learning_rate": 0.0002, "loss": 0.7767, "mean_token_accuracy": 0.7921489655971528, "num_tokens": 904348895.0, "step": 3080 }, { "entropy": 0.6021447038277984, "epoch": 0.23120466718659613, "grad_norm": 0.17224012315273285, "learning_rate": 0.0002, "loss": 0.7828, "mean_token_accuracy": 0.7918225079774857, "num_tokens": 905820786.0, "step": 3085 }, { "entropy": 0.5922387264668941, "epoch": 0.23157939112044798, "grad_norm": 0.17574802041053772, "learning_rate": 0.0002, "loss": 0.7663, "mean_token_accuracy": 0.79227932728827, "num_tokens": 907276550.0, "step": 3090 }, { "entropy": 0.6020781299099326, "epoch": 0.23195411505429983, "grad_norm": 0.20596148073673248, "learning_rate": 0.0002, "loss": 0.783, "mean_token_accuracy": 0.7921137340366841, "num_tokens": 908738876.0, "step": 3095 }, { "entropy": 0.6005534796044231, "epoch": 0.2323288389881517, "grad_norm": 0.32544365525245667, "learning_rate": 0.0002, "loss": 0.7893, "mean_token_accuracy": 0.7917782008647919, "num_tokens": 910219932.0, "step": 3100 }, { "entropy": 0.575971554312855, "epoch": 0.23270356292200356, "grad_norm": 0.20600625872612, "learning_rate": 0.0002, "loss": 0.7599, "mean_token_accuracy": 0.7939216367900371, "num_tokens": 911708480.0, "step": 3105 }, { "entropy": 0.5956843079999089, "epoch": 0.2330782868558554, "grad_norm": 0.384930819272995, "learning_rate": 0.0002, "loss": 0.791, "mean_token_accuracy": 0.7894523601979018, "num_tokens": 913208396.0, "step": 3110 }, { "entropy": 0.5905843143351376, "epoch": 0.23345301078970726, "grad_norm": 0.1932404786348343, "learning_rate": 0.0002, "loss": 0.7802, "mean_token_accuracy": 0.7921688210219144, "num_tokens": 914676849.0, "step": 3115 }, { "entropy": 0.5921662215143442, "epoch": 0.23382773472355914, "grad_norm": 0.18472546339035034, "learning_rate": 0.0002, "loss": 0.7804, "mean_token_accuracy": 0.7913658030331134, "num_tokens": 916157425.0, "step": 3120 }, { "entropy": 0.5886248469352722, "epoch": 0.234202458657411, "grad_norm": 0.18258540332317352, "learning_rate": 0.0002, "loss": 0.7781, "mean_token_accuracy": 0.7916328597813844, "num_tokens": 917634601.0, "step": 3125 }, { "entropy": 0.602498197555542, "epoch": 0.23457718259126284, "grad_norm": 0.18758630752563477, "learning_rate": 0.0002, "loss": 0.7867, "mean_token_accuracy": 0.7863902099430561, "num_tokens": 919131266.0, "step": 3130 }, { "entropy": 0.6006685465574264, "epoch": 0.2349519065251147, "grad_norm": 0.18530689179897308, "learning_rate": 0.0002, "loss": 0.7791, "mean_token_accuracy": 0.7883331336081028, "num_tokens": 920663473.0, "step": 3135 }, { "entropy": 0.5690535187721253, "epoch": 0.23532663045896657, "grad_norm": 0.173971489071846, "learning_rate": 0.0002, "loss": 0.7532, "mean_token_accuracy": 0.7953459672629833, "num_tokens": 922106596.0, "step": 3140 }, { "entropy": 0.5911414446309209, "epoch": 0.23570135439281842, "grad_norm": 0.1885146200656891, "learning_rate": 0.0002, "loss": 0.7756, "mean_token_accuracy": 0.7909908458590508, "num_tokens": 923595453.0, "step": 3145 }, { "entropy": 0.594905898347497, "epoch": 0.23607607832667027, "grad_norm": 0.18323823809623718, "learning_rate": 0.0002, "loss": 0.7819, "mean_token_accuracy": 0.7880465116351842, "num_tokens": 925053213.0, "step": 3150 }, { "entropy": 0.5760542242787778, "epoch": 0.23645080226052212, "grad_norm": 0.18570762872695923, "learning_rate": 0.0002, "loss": 0.7487, "mean_token_accuracy": 0.7977918013930321, "num_tokens": 926548609.0, "step": 3155 }, { "entropy": 0.5974040046334267, "epoch": 0.236825526194374, "grad_norm": 0.17445509135723114, "learning_rate": 0.0002, "loss": 0.7784, "mean_token_accuracy": 0.7913479849696159, "num_tokens": 928051323.0, "step": 3160 }, { "entropy": 0.602661244943738, "epoch": 0.23720025012822585, "grad_norm": 0.16877879202365875, "learning_rate": 0.0002, "loss": 0.7781, "mean_token_accuracy": 0.7875406693667173, "num_tokens": 929525452.0, "step": 3165 }, { "entropy": 0.5801331653259695, "epoch": 0.2375749740620777, "grad_norm": 0.17203545570373535, "learning_rate": 0.0002, "loss": 0.7512, "mean_token_accuracy": 0.7966703176498413, "num_tokens": 930974301.0, "step": 3170 }, { "entropy": 0.5909808944910765, "epoch": 0.23794969799592955, "grad_norm": 0.18197855353355408, "learning_rate": 0.0002, "loss": 0.7793, "mean_token_accuracy": 0.7925734013319016, "num_tokens": 932432044.0, "step": 3175 }, { "entropy": 0.5937771830707789, "epoch": 0.23832442192978143, "grad_norm": 0.1830630749464035, "learning_rate": 0.0002, "loss": 0.7656, "mean_token_accuracy": 0.7921238649636507, "num_tokens": 933918399.0, "step": 3180 }, { "entropy": 0.5923420252278447, "epoch": 0.23869914586363328, "grad_norm": 0.18631024658679962, "learning_rate": 0.0002, "loss": 0.7686, "mean_token_accuracy": 0.7900075186043978, "num_tokens": 935399250.0, "step": 3185 }, { "entropy": 0.5687323113903403, "epoch": 0.23907386979748513, "grad_norm": 0.17834340035915375, "learning_rate": 0.0002, "loss": 0.7415, "mean_token_accuracy": 0.7972087629139424, "num_tokens": 936852502.0, "step": 3190 }, { "entropy": 0.5894895501434803, "epoch": 0.23944859373133698, "grad_norm": 0.19092167913913727, "learning_rate": 0.0002, "loss": 0.7663, "mean_token_accuracy": 0.7931097961962223, "num_tokens": 938306942.0, "step": 3195 }, { "entropy": 0.6095428977161645, "epoch": 0.23982331766518886, "grad_norm": 0.17559018731117249, "learning_rate": 0.0002, "loss": 0.7984, "mean_token_accuracy": 0.7878601267933846, "num_tokens": 939754791.0, "step": 3200 }, { "entropy": 0.5900081584230066, "epoch": 0.2401980415990407, "grad_norm": 0.1968281865119934, "learning_rate": 0.0002, "loss": 0.783, "mean_token_accuracy": 0.791095332801342, "num_tokens": 941194618.0, "step": 3205 }, { "entropy": 0.5985764656215906, "epoch": 0.24057276553289256, "grad_norm": 0.184096559882164, "learning_rate": 0.0002, "loss": 0.7766, "mean_token_accuracy": 0.7883451756089925, "num_tokens": 942650984.0, "step": 3210 }, { "entropy": 0.5926110571250319, "epoch": 0.24094748946674444, "grad_norm": 0.17031511664390564, "learning_rate": 0.0002, "loss": 0.7639, "mean_token_accuracy": 0.7952513925731182, "num_tokens": 944156488.0, "step": 3215 }, { "entropy": 0.6054661465808749, "epoch": 0.24132221340059629, "grad_norm": 0.17488786578178406, "learning_rate": 0.0002, "loss": 0.777, "mean_token_accuracy": 0.7885928194969892, "num_tokens": 945653712.0, "step": 3220 }, { "entropy": 0.5872668992727995, "epoch": 0.24169693733444814, "grad_norm": 0.1892264187335968, "learning_rate": 0.0002, "loss": 0.7567, "mean_token_accuracy": 0.7933645397424698, "num_tokens": 947101185.0, "step": 3225 }, { "entropy": 0.5807681653648615, "epoch": 0.2420716612683, "grad_norm": 0.19674065709114075, "learning_rate": 0.0002, "loss": 0.7599, "mean_token_accuracy": 0.7953812491148711, "num_tokens": 948548531.0, "step": 3230 }, { "entropy": 0.5847845610231162, "epoch": 0.24244638520215187, "grad_norm": 0.19050508737564087, "learning_rate": 0.0002, "loss": 0.7693, "mean_token_accuracy": 0.7926752306520939, "num_tokens": 950001455.0, "step": 3235 }, { "entropy": 0.5933756682090461, "epoch": 0.24282110913600372, "grad_norm": 0.17491255700588226, "learning_rate": 0.0002, "loss": 0.7781, "mean_token_accuracy": 0.7924311645328999, "num_tokens": 951456977.0, "step": 3240 }, { "entropy": 0.5958244953304529, "epoch": 0.24319583306985557, "grad_norm": 0.1876218020915985, "learning_rate": 0.0002, "loss": 0.7815, "mean_token_accuracy": 0.7904223330318928, "num_tokens": 952916927.0, "step": 3245 }, { "entropy": 0.6157199969515205, "epoch": 0.24357055700370742, "grad_norm": 0.20346054434776306, "learning_rate": 0.0002, "loss": 0.8006, "mean_token_accuracy": 0.7859828535467386, "num_tokens": 954411664.0, "step": 3250 }, { "entropy": 0.6034699751064181, "epoch": 0.2439452809375593, "grad_norm": 0.1771382987499237, "learning_rate": 0.0002, "loss": 0.7723, "mean_token_accuracy": 0.7910667154937983, "num_tokens": 955903013.0, "step": 3255 }, { "entropy": 0.61547646895051, "epoch": 0.24432000487141114, "grad_norm": 0.22670641541481018, "learning_rate": 0.0002, "loss": 0.7775, "mean_token_accuracy": 0.7891286257654428, "num_tokens": 957441370.0, "step": 3260 }, { "entropy": 0.6002426866441966, "epoch": 0.244694728805263, "grad_norm": 0.17327943444252014, "learning_rate": 0.0002, "loss": 0.7689, "mean_token_accuracy": 0.7911980908364058, "num_tokens": 958901892.0, "step": 3265 }, { "entropy": 0.5945069866254926, "epoch": 0.24506945273911485, "grad_norm": 0.19520705938339233, "learning_rate": 0.0002, "loss": 0.751, "mean_token_accuracy": 0.7959384519606829, "num_tokens": 960346214.0, "step": 3270 }, { "entropy": 0.5842633651569485, "epoch": 0.24544417667296672, "grad_norm": 0.2030860185623169, "learning_rate": 0.0002, "loss": 0.747, "mean_token_accuracy": 0.7987332347780466, "num_tokens": 961781571.0, "step": 3275 }, { "entropy": 0.6039242866449058, "epoch": 0.24581890060681857, "grad_norm": 0.186064213514328, "learning_rate": 0.0002, "loss": 0.7589, "mean_token_accuracy": 0.7958866361528635, "num_tokens": 963235214.0, "step": 3280 }, { "entropy": 0.6040273838676512, "epoch": 0.24619362454067042, "grad_norm": 0.2068074345588684, "learning_rate": 0.0002, "loss": 0.7676, "mean_token_accuracy": 0.7937548849731684, "num_tokens": 964716284.0, "step": 3285 }, { "entropy": 0.6022475901991129, "epoch": 0.24656834847452228, "grad_norm": 0.17622481286525726, "learning_rate": 0.0002, "loss": 0.7672, "mean_token_accuracy": 0.7921460211277008, "num_tokens": 966206039.0, "step": 3290 }, { "entropy": 0.6006730562075973, "epoch": 0.24694307240837415, "grad_norm": 0.22721464931964874, "learning_rate": 0.0002, "loss": 0.7734, "mean_token_accuracy": 0.7923626616597176, "num_tokens": 967700905.0, "step": 3295 }, { "entropy": 0.6179472109302878, "epoch": 0.247317796342226, "grad_norm": 0.19082921743392944, "learning_rate": 0.0002, "loss": 0.7914, "mean_token_accuracy": 0.7910350564867258, "num_tokens": 969183633.0, "step": 3300 }, { "entropy": 0.6048224554397166, "epoch": 0.24769252027607785, "grad_norm": 0.19868041574954987, "learning_rate": 0.0002, "loss": 0.7726, "mean_token_accuracy": 0.7928085949271917, "num_tokens": 970622043.0, "step": 3305 }, { "entropy": 0.5998501993715764, "epoch": 0.2480672442099297, "grad_norm": 0.1894310861825943, "learning_rate": 0.0002, "loss": 0.769, "mean_token_accuracy": 0.7920093674212694, "num_tokens": 972124270.0, "step": 3310 }, { "entropy": 0.5988027549348771, "epoch": 0.24844196814378158, "grad_norm": 0.19373099505901337, "learning_rate": 0.0002, "loss": 0.7653, "mean_token_accuracy": 0.7923041392117739, "num_tokens": 973602223.0, "step": 3315 }, { "entropy": 0.5807457817718387, "epoch": 0.24881669207763343, "grad_norm": 0.1850639134645462, "learning_rate": 0.0002, "loss": 0.748, "mean_token_accuracy": 0.7981423027813435, "num_tokens": 975079110.0, "step": 3320 }, { "entropy": 0.5863528411835432, "epoch": 0.24919141601148528, "grad_norm": 0.1962193101644516, "learning_rate": 0.0002, "loss": 0.7523, "mean_token_accuracy": 0.7981250055134297, "num_tokens": 976544095.0, "step": 3325 }, { "entropy": 0.601879983022809, "epoch": 0.24956613994533713, "grad_norm": 0.16936078667640686, "learning_rate": 0.0002, "loss": 0.7648, "mean_token_accuracy": 0.7904386047273875, "num_tokens": 977998960.0, "step": 3330 }, { "entropy": 0.5978780467063188, "epoch": 0.249940863879189, "grad_norm": 0.22908467054367065, "learning_rate": 0.0002, "loss": 0.7715, "mean_token_accuracy": 0.7908818397670985, "num_tokens": 979443973.0, "step": 3335 }, { "entropy": 0.5912923371419311, "epoch": 0.25031558781304086, "grad_norm": 0.22144706547260284, "learning_rate": 0.0002, "loss": 0.7719, "mean_token_accuracy": 0.7947148490697146, "num_tokens": 980942744.0, "step": 3340 }, { "entropy": 0.5914248387329281, "epoch": 0.2506903117468927, "grad_norm": 0.2078184187412262, "learning_rate": 0.0002, "loss": 0.7659, "mean_token_accuracy": 0.794261672347784, "num_tokens": 982371529.0, "step": 3345 }, { "entropy": 0.6095857445150614, "epoch": 0.25106503568074456, "grad_norm": 0.30034127831459045, "learning_rate": 0.0002, "loss": 0.7823, "mean_token_accuracy": 0.7900100003927946, "num_tokens": 983867179.0, "step": 3350 }, { "entropy": 0.6012157524004579, "epoch": 0.2514397596145964, "grad_norm": 0.19297821819782257, "learning_rate": 0.0002, "loss": 0.7658, "mean_token_accuracy": 0.793502501770854, "num_tokens": 985362758.0, "step": 3355 }, { "entropy": 0.6000203797593713, "epoch": 0.25181448354844826, "grad_norm": 0.18240496516227722, "learning_rate": 0.0002, "loss": 0.7831, "mean_token_accuracy": 0.7914968751370907, "num_tokens": 986817585.0, "step": 3360 }, { "entropy": 0.5964129779487848, "epoch": 0.25218920748230017, "grad_norm": 0.20648911595344543, "learning_rate": 0.0002, "loss": 0.7545, "mean_token_accuracy": 0.7944485753774643, "num_tokens": 988286254.0, "step": 3365 }, { "entropy": 0.5880178642459214, "epoch": 0.252563931416152, "grad_norm": 0.2029644399881363, "learning_rate": 0.0002, "loss": 0.7498, "mean_token_accuracy": 0.7978643778711557, "num_tokens": 989752787.0, "step": 3370 }, { "entropy": 0.5838120109401643, "epoch": 0.25293865535000387, "grad_norm": 0.17866076529026031, "learning_rate": 0.0002, "loss": 0.7515, "mean_token_accuracy": 0.7978673372417688, "num_tokens": 991195643.0, "step": 3375 }, { "entropy": 0.5964143015444279, "epoch": 0.2533133792838557, "grad_norm": 0.19206590950489044, "learning_rate": 0.0002, "loss": 0.7595, "mean_token_accuracy": 0.7940529938787222, "num_tokens": 992647741.0, "step": 3380 }, { "entropy": 0.6125034753233194, "epoch": 0.2536881032177076, "grad_norm": 0.1790781021118164, "learning_rate": 0.0002, "loss": 0.7728, "mean_token_accuracy": 0.7936070002615452, "num_tokens": 994089327.0, "step": 3385 }, { "entropy": 0.5904562810435892, "epoch": 0.2540628271515594, "grad_norm": 0.18886712193489075, "learning_rate": 0.0002, "loss": 0.7631, "mean_token_accuracy": 0.7936054103076458, "num_tokens": 995535893.0, "step": 3390 }, { "entropy": 0.5960104243829847, "epoch": 0.2544375510854113, "grad_norm": 0.18336865305900574, "learning_rate": 0.0002, "loss": 0.7704, "mean_token_accuracy": 0.7963327609002591, "num_tokens": 997034609.0, "step": 3395 }, { "entropy": 0.5788840701803565, "epoch": 0.2548122750192632, "grad_norm": 0.19076141715049744, "learning_rate": 0.0002, "loss": 0.757, "mean_token_accuracy": 0.7948989797383547, "num_tokens": 998507850.0, "step": 3400 }, { "entropy": 0.6032233649864793, "epoch": 0.25518699895311503, "grad_norm": 0.1811448037624359, "learning_rate": 0.0002, "loss": 0.784, "mean_token_accuracy": 0.7891838554292917, "num_tokens": 1000002285.0, "step": 3405 }, { "entropy": 0.583606531098485, "epoch": 0.2555617228869669, "grad_norm": 0.1821887195110321, "learning_rate": 0.0002, "loss": 0.7654, "mean_token_accuracy": 0.7946232974529266, "num_tokens": 1001485977.0, "step": 3410 }, { "entropy": 0.5918758489191532, "epoch": 0.25593644682081873, "grad_norm": 0.17130309343338013, "learning_rate": 0.0002, "loss": 0.773, "mean_token_accuracy": 0.7904511842876673, "num_tokens": 1002962459.0, "step": 3415 }, { "entropy": 0.5935383439064026, "epoch": 0.2563111707546706, "grad_norm": 0.1863093376159668, "learning_rate": 0.0002, "loss": 0.7815, "mean_token_accuracy": 0.794016232341528, "num_tokens": 1004426260.0, "step": 3420 }, { "entropy": 0.5807607773225755, "epoch": 0.25668589468852243, "grad_norm": 0.17075879871845245, "learning_rate": 0.0002, "loss": 0.7439, "mean_token_accuracy": 0.7954550661146641, "num_tokens": 1005932088.0, "step": 3425 }, { "entropy": 0.5953171156346798, "epoch": 0.2570606186223743, "grad_norm": 0.16647036373615265, "learning_rate": 0.0002, "loss": 0.7581, "mean_token_accuracy": 0.7962095115333796, "num_tokens": 1007479067.0, "step": 3430 }, { "entropy": 0.5897403264418244, "epoch": 0.25743534255622613, "grad_norm": 0.18616706132888794, "learning_rate": 0.0002, "loss": 0.7694, "mean_token_accuracy": 0.792909674346447, "num_tokens": 1008918291.0, "step": 3435 }, { "entropy": 0.6042361265048385, "epoch": 0.25781006649007804, "grad_norm": 0.20190022885799408, "learning_rate": 0.0002, "loss": 0.7828, "mean_token_accuracy": 0.7914794113487005, "num_tokens": 1010413400.0, "step": 3440 }, { "entropy": 0.5968231849372387, "epoch": 0.2581847904239299, "grad_norm": 0.19030892848968506, "learning_rate": 0.0002, "loss": 0.7714, "mean_token_accuracy": 0.7942539513111114, "num_tokens": 1011857110.0, "step": 3445 }, { "entropy": 0.5822040738537908, "epoch": 0.25855951435778174, "grad_norm": 0.17487646639347076, "learning_rate": 0.0002, "loss": 0.7563, "mean_token_accuracy": 0.7968291647732257, "num_tokens": 1013277713.0, "step": 3450 }, { "entropy": 0.5898692475631833, "epoch": 0.2589342382916336, "grad_norm": 0.2083653062582016, "learning_rate": 0.0002, "loss": 0.7736, "mean_token_accuracy": 0.7947997521609068, "num_tokens": 1014750721.0, "step": 3455 }, { "entropy": 0.6078956128098071, "epoch": 0.25930896222548544, "grad_norm": 0.19440293312072754, "learning_rate": 0.0002, "loss": 0.7717, "mean_token_accuracy": 0.7924240209162235, "num_tokens": 1016266696.0, "step": 3460 }, { "entropy": 0.6032758740708232, "epoch": 0.2596836861593373, "grad_norm": 0.19161345064640045, "learning_rate": 0.0002, "loss": 0.7783, "mean_token_accuracy": 0.7927794314920902, "num_tokens": 1017762763.0, "step": 3465 }, { "entropy": 0.6035493643954396, "epoch": 0.26005841009318914, "grad_norm": 0.1881159543991089, "learning_rate": 0.0002, "loss": 0.7736, "mean_token_accuracy": 0.7924304351210594, "num_tokens": 1019215941.0, "step": 3470 }, { "entropy": 0.5992323034442961, "epoch": 0.260433134027041, "grad_norm": 0.17855720221996307, "learning_rate": 0.0002, "loss": 0.7623, "mean_token_accuracy": 0.7944755211472512, "num_tokens": 1020687909.0, "step": 3475 }, { "entropy": 0.6088975040242076, "epoch": 0.2608078579608929, "grad_norm": 0.31441813707351685, "learning_rate": 0.0002, "loss": 0.7623, "mean_token_accuracy": 0.7900080241262912, "num_tokens": 1022176357.0, "step": 3480 }, { "entropy": 0.6178255111910402, "epoch": 0.26118258189474475, "grad_norm": 0.18566195666790009, "learning_rate": 0.0002, "loss": 0.7866, "mean_token_accuracy": 0.789628129824996, "num_tokens": 1023675197.0, "step": 3485 }, { "entropy": 0.5849117916077375, "epoch": 0.2615573058285966, "grad_norm": 0.17559179663658142, "learning_rate": 0.0002, "loss": 0.7412, "mean_token_accuracy": 0.7972925134003163, "num_tokens": 1025146429.0, "step": 3490 }, { "entropy": 0.5920987529680133, "epoch": 0.26193202976244845, "grad_norm": 0.17406582832336426, "learning_rate": 0.0002, "loss": 0.7533, "mean_token_accuracy": 0.7939569678157568, "num_tokens": 1026606748.0, "step": 3495 }, { "entropy": 0.6204957904294133, "epoch": 0.2623067536963003, "grad_norm": 0.17927072942256927, "learning_rate": 0.0002, "loss": 0.7964, "mean_token_accuracy": 0.7886785835027694, "num_tokens": 1028097177.0, "step": 3500 }, { "entropy": 0.5766547257080674, "epoch": 0.26268147763015215, "grad_norm": 0.18533779680728912, "learning_rate": 0.0002, "loss": 0.7423, "mean_token_accuracy": 0.8006468687206507, "num_tokens": 1029549123.0, "step": 3505 }, { "entropy": 0.604522779583931, "epoch": 0.263056201564004, "grad_norm": 0.16642893850803375, "learning_rate": 0.0002, "loss": 0.7737, "mean_token_accuracy": 0.7922281485050917, "num_tokens": 1031025288.0, "step": 3510 }, { "entropy": 0.5985106758773326, "epoch": 0.26343092549785585, "grad_norm": 0.1880245804786682, "learning_rate": 0.0002, "loss": 0.7612, "mean_token_accuracy": 0.7961158849298954, "num_tokens": 1032443929.0, "step": 3515 }, { "entropy": 0.6097992507740855, "epoch": 0.26380564943170776, "grad_norm": 0.2003479301929474, "learning_rate": 0.0002, "loss": 0.7747, "mean_token_accuracy": 0.792513120546937, "num_tokens": 1033911375.0, "step": 3520 }, { "entropy": 0.5940940045751631, "epoch": 0.2641803733655596, "grad_norm": 0.1820470094680786, "learning_rate": 0.0002, "loss": 0.7539, "mean_token_accuracy": 0.7981526773422957, "num_tokens": 1035319799.0, "step": 3525 }, { "entropy": 0.6172434492968023, "epoch": 0.26455509729941146, "grad_norm": 0.17090807855129242, "learning_rate": 0.0002, "loss": 0.7617, "mean_token_accuracy": 0.7949294984340668, "num_tokens": 1036826407.0, "step": 3530 }, { "entropy": 0.5913195163011551, "epoch": 0.2649298212332633, "grad_norm": 0.19506271183490753, "learning_rate": 0.0002, "loss": 0.747, "mean_token_accuracy": 0.7985684588551522, "num_tokens": 1038291988.0, "step": 3535 }, { "entropy": 0.6099884321913123, "epoch": 0.26530454516711516, "grad_norm": 0.1877293437719345, "learning_rate": 0.0002, "loss": 0.7715, "mean_token_accuracy": 0.7934809625148773, "num_tokens": 1039772497.0, "step": 3540 }, { "entropy": 0.6069047521799803, "epoch": 0.265679269100967, "grad_norm": 0.19753628969192505, "learning_rate": 0.0002, "loss": 0.7661, "mean_token_accuracy": 0.7941189754754305, "num_tokens": 1041266064.0, "step": 3545 }, { "entropy": 0.5958967113867402, "epoch": 0.26605399303481886, "grad_norm": 0.17828026413917542, "learning_rate": 0.0002, "loss": 0.755, "mean_token_accuracy": 0.7977355565875769, "num_tokens": 1042750699.0, "step": 3550 }, { "entropy": 0.6093648741021752, "epoch": 0.26642871696867076, "grad_norm": 0.19836865365505219, "learning_rate": 0.0002, "loss": 0.7752, "mean_token_accuracy": 0.7942829698324203, "num_tokens": 1044202174.0, "step": 3555 }, { "entropy": 0.60506637301296, "epoch": 0.2668034409025226, "grad_norm": 0.17814111709594727, "learning_rate": 0.0002, "loss": 0.7632, "mean_token_accuracy": 0.7939696252346039, "num_tokens": 1045696151.0, "step": 3560 }, { "entropy": 0.5861180092208087, "epoch": 0.26717816483637447, "grad_norm": 0.18165303766727448, "learning_rate": 0.0002, "loss": 0.7544, "mean_token_accuracy": 0.792906592413783, "num_tokens": 1047148069.0, "step": 3565 }, { "entropy": 0.594329347088933, "epoch": 0.2675528887702263, "grad_norm": 0.18132542073726654, "learning_rate": 0.0002, "loss": 0.768, "mean_token_accuracy": 0.7924674712121487, "num_tokens": 1048608405.0, "step": 3570 }, { "entropy": 0.5913009720854461, "epoch": 0.26792761270407817, "grad_norm": 0.18218807876110077, "learning_rate": 0.0002, "loss": 0.7595, "mean_token_accuracy": 0.7958759281784296, "num_tokens": 1050102066.0, "step": 3575 }, { "entropy": 0.5914055973291397, "epoch": 0.26830233663793, "grad_norm": 0.16635237634181976, "learning_rate": 0.0002, "loss": 0.7696, "mean_token_accuracy": 0.7930228639394045, "num_tokens": 1051525853.0, "step": 3580 }, { "entropy": 0.5917995788156987, "epoch": 0.26867706057178187, "grad_norm": 0.19655656814575195, "learning_rate": 0.0002, "loss": 0.7779, "mean_token_accuracy": 0.7912351436913013, "num_tokens": 1052939342.0, "step": 3585 }, { "entropy": 0.580206785351038, "epoch": 0.2690517845056337, "grad_norm": 0.17760209739208221, "learning_rate": 0.0002, "loss": 0.772, "mean_token_accuracy": 0.7964803721755743, "num_tokens": 1054385334.0, "step": 3590 }, { "entropy": 0.5739079128019512, "epoch": 0.2694265084394856, "grad_norm": 0.1852676123380661, "learning_rate": 0.0002, "loss": 0.7521, "mean_token_accuracy": 0.7985167879611254, "num_tokens": 1055814171.0, "step": 3595 }, { "entropy": 0.600291644781828, "epoch": 0.2698012323733375, "grad_norm": 0.18089650571346283, "learning_rate": 0.0002, "loss": 0.7698, "mean_token_accuracy": 0.7941801484674216, "num_tokens": 1057287485.0, "step": 3600 }, { "entropy": 0.5822049813345075, "epoch": 0.2701759563071893, "grad_norm": 0.18153142929077148, "learning_rate": 0.0002, "loss": 0.7515, "mean_token_accuracy": 0.7956582549959421, "num_tokens": 1058756528.0, "step": 3605 }, { "entropy": 0.6202313524670899, "epoch": 0.2705506802410412, "grad_norm": 0.17523814737796783, "learning_rate": 0.0002, "loss": 0.7894, "mean_token_accuracy": 0.7893832955509424, "num_tokens": 1060264233.0, "step": 3610 }, { "entropy": 0.6071001017466188, "epoch": 0.270925404174893, "grad_norm": 0.17604252696037292, "learning_rate": 0.0002, "loss": 0.7668, "mean_token_accuracy": 0.7949936624616385, "num_tokens": 1061801945.0, "step": 3615 }, { "entropy": 0.5811024282127619, "epoch": 0.2713001281087449, "grad_norm": 0.1843315213918686, "learning_rate": 0.0002, "loss": 0.7564, "mean_token_accuracy": 0.7938980408012867, "num_tokens": 1063283020.0, "step": 3620 }, { "entropy": 0.5805819742381573, "epoch": 0.2716748520425967, "grad_norm": 0.16596189141273499, "learning_rate": 0.0002, "loss": 0.744, "mean_token_accuracy": 0.7989357717335224, "num_tokens": 1064758596.0, "step": 3625 }, { "entropy": 0.5838448764756322, "epoch": 0.2720495759764486, "grad_norm": 0.18708626925945282, "learning_rate": 0.0002, "loss": 0.7578, "mean_token_accuracy": 0.7971524957567453, "num_tokens": 1066176404.0, "step": 3630 }, { "entropy": 0.5918695518746973, "epoch": 0.2724242999103005, "grad_norm": 0.18651260435581207, "learning_rate": 0.0002, "loss": 0.7603, "mean_token_accuracy": 0.7945366263389587, "num_tokens": 1067676447.0, "step": 3635 }, { "entropy": 0.5909916624426842, "epoch": 0.27279902384415233, "grad_norm": 0.16948096454143524, "learning_rate": 0.0002, "loss": 0.7685, "mean_token_accuracy": 0.7925908096134663, "num_tokens": 1069134471.0, "step": 3640 }, { "entropy": 0.594531019218266, "epoch": 0.2731737477780042, "grad_norm": 0.1923794150352478, "learning_rate": 0.0002, "loss": 0.7647, "mean_token_accuracy": 0.7925879754126072, "num_tokens": 1070638784.0, "step": 3645 }, { "entropy": 0.5914392530918121, "epoch": 0.27354847171185603, "grad_norm": 0.263683557510376, "learning_rate": 0.0002, "loss": 0.7726, "mean_token_accuracy": 0.792741046473384, "num_tokens": 1072143647.0, "step": 3650 }, { "entropy": 0.5770319237373769, "epoch": 0.2739231956457079, "grad_norm": 0.1736728847026825, "learning_rate": 0.0002, "loss": 0.7475, "mean_token_accuracy": 0.7948320243507624, "num_tokens": 1073645266.0, "step": 3655 }, { "entropy": 0.5854573218151927, "epoch": 0.27429791957955973, "grad_norm": 0.18913981318473816, "learning_rate": 0.0002, "loss": 0.7668, "mean_token_accuracy": 0.7943837370723486, "num_tokens": 1075130498.0, "step": 3660 }, { "entropy": 0.5785916738212109, "epoch": 0.2746726435134116, "grad_norm": 0.9787591695785522, "learning_rate": 0.0002, "loss": 0.76, "mean_token_accuracy": 0.7942673292011022, "num_tokens": 1076572393.0, "step": 3665 }, { "entropy": 0.5793195364065469, "epoch": 0.27504736744726344, "grad_norm": 0.28047582507133484, "learning_rate": 0.0002, "loss": 0.7452, "mean_token_accuracy": 0.7968756891787052, "num_tokens": 1078073386.0, "step": 3670 }, { "entropy": 0.5748851810581982, "epoch": 0.27542209138111534, "grad_norm": 0.17054568231105804, "learning_rate": 0.0002, "loss": 0.7496, "mean_token_accuracy": 0.7946161702275276, "num_tokens": 1079554721.0, "step": 3675 }, { "entropy": 0.5906270237639546, "epoch": 0.2757968153149672, "grad_norm": 0.18592874705791473, "learning_rate": 0.0002, "loss": 0.7729, "mean_token_accuracy": 0.7935394957661629, "num_tokens": 1081050292.0, "step": 3680 }, { "entropy": 0.5906142725609242, "epoch": 0.27617153924881904, "grad_norm": 0.1914171427488327, "learning_rate": 0.0002, "loss": 0.7616, "mean_token_accuracy": 0.7942790210247039, "num_tokens": 1082538389.0, "step": 3685 }, { "entropy": 0.571046962402761, "epoch": 0.2765462631826709, "grad_norm": 0.18487516045570374, "learning_rate": 0.0002, "loss": 0.7502, "mean_token_accuracy": 0.7982507199048996, "num_tokens": 1083979704.0, "step": 3690 }, { "entropy": 0.5877140026539565, "epoch": 0.27692098711652274, "grad_norm": 0.1898277997970581, "learning_rate": 0.0002, "loss": 0.7647, "mean_token_accuracy": 0.7941856760531664, "num_tokens": 1085462193.0, "step": 3695 }, { "entropy": 0.6036896239966154, "epoch": 0.2772957110503746, "grad_norm": 0.19926698505878448, "learning_rate": 0.0002, "loss": 0.7769, "mean_token_accuracy": 0.7938246611505747, "num_tokens": 1086949541.0, "step": 3700 }, { "entropy": 0.6019755465909838, "epoch": 0.27767043498422644, "grad_norm": 0.1830356866121292, "learning_rate": 0.0002, "loss": 0.7715, "mean_token_accuracy": 0.7897591572254896, "num_tokens": 1088466435.0, "step": 3705 }, { "entropy": 0.5821739053353667, "epoch": 0.2780451589180783, "grad_norm": 0.17422476410865784, "learning_rate": 0.0002, "loss": 0.7536, "mean_token_accuracy": 0.7958270203322172, "num_tokens": 1089938717.0, "step": 3710 }, { "entropy": 0.5797704672440886, "epoch": 0.2784198828519302, "grad_norm": 0.19488529860973358, "learning_rate": 0.0002, "loss": 0.7363, "mean_token_accuracy": 0.7971135914325714, "num_tokens": 1091398664.0, "step": 3715 }, { "entropy": 0.5953437337651849, "epoch": 0.27879460678578205, "grad_norm": 0.199569970369339, "learning_rate": 0.0002, "loss": 0.7711, "mean_token_accuracy": 0.7940188456326723, "num_tokens": 1092870468.0, "step": 3720 }, { "entropy": 0.6020960809662939, "epoch": 0.2791693307196339, "grad_norm": 0.17371869087219238, "learning_rate": 0.0002, "loss": 0.7641, "mean_token_accuracy": 0.7953622210770845, "num_tokens": 1094370265.0, "step": 3725 }, { "entropy": 0.5957865772768856, "epoch": 0.27954405465348575, "grad_norm": 0.18947140872478485, "learning_rate": 0.0002, "loss": 0.7639, "mean_token_accuracy": 0.7926796864718199, "num_tokens": 1095830733.0, "step": 3730 }, { "entropy": 0.6030566282570362, "epoch": 0.2799187785873376, "grad_norm": 0.18481799960136414, "learning_rate": 0.0002, "loss": 0.7764, "mean_token_accuracy": 0.7901679381728173, "num_tokens": 1097320340.0, "step": 3735 }, { "entropy": 0.5791268991306424, "epoch": 0.28029350252118945, "grad_norm": 0.21069826185703278, "learning_rate": 0.0002, "loss": 0.7511, "mean_token_accuracy": 0.7958936434239149, "num_tokens": 1098816249.0, "step": 3740 }, { "entropy": 0.5966117840260268, "epoch": 0.2806682264550413, "grad_norm": 0.21962536871433258, "learning_rate": 0.0002, "loss": 0.7678, "mean_token_accuracy": 0.7935097210109234, "num_tokens": 1100305032.0, "step": 3745 }, { "entropy": 0.5966830026358366, "epoch": 0.2810429503888932, "grad_norm": 0.19365186989307404, "learning_rate": 0.0002, "loss": 0.7722, "mean_token_accuracy": 0.7925878763198853, "num_tokens": 1101781409.0, "step": 3750 }, { "entropy": 0.5958879696205258, "epoch": 0.28141767432274506, "grad_norm": 0.18368341028690338, "learning_rate": 0.0002, "loss": 0.7581, "mean_token_accuracy": 0.794626897573471, "num_tokens": 1103283566.0, "step": 3755 }, { "entropy": 0.5912776984274387, "epoch": 0.2817923982565969, "grad_norm": 0.17516572773456573, "learning_rate": 0.0002, "loss": 0.7541, "mean_token_accuracy": 0.7951778016984463, "num_tokens": 1104775467.0, "step": 3760 }, { "entropy": 0.5909770568832755, "epoch": 0.28216712219044876, "grad_norm": 0.18388386070728302, "learning_rate": 0.0002, "loss": 0.7584, "mean_token_accuracy": 0.7954656060785055, "num_tokens": 1106270595.0, "step": 3765 }, { "entropy": 0.5764641601592302, "epoch": 0.2825418461243006, "grad_norm": 0.176491379737854, "learning_rate": 0.0002, "loss": 0.7474, "mean_token_accuracy": 0.7952606290578842, "num_tokens": 1107703267.0, "step": 3770 }, { "entropy": 0.580771891027689, "epoch": 0.28291657005815246, "grad_norm": 0.1818656325340271, "learning_rate": 0.0002, "loss": 0.7351, "mean_token_accuracy": 0.7979366589337588, "num_tokens": 1109178734.0, "step": 3775 }, { "entropy": 0.5976667669601738, "epoch": 0.2832912939920043, "grad_norm": 0.18498311936855316, "learning_rate": 0.0002, "loss": 0.7623, "mean_token_accuracy": 0.7949820652604103, "num_tokens": 1110625280.0, "step": 3780 }, { "entropy": 0.5940342396497726, "epoch": 0.28366601792585616, "grad_norm": 0.17447055876255035, "learning_rate": 0.0002, "loss": 0.7445, "mean_token_accuracy": 0.7963079616427422, "num_tokens": 1112054511.0, "step": 3785 }, { "entropy": 0.589411587268114, "epoch": 0.28404074185970807, "grad_norm": 0.18859738111495972, "learning_rate": 0.0002, "loss": 0.751, "mean_token_accuracy": 0.7977277111262083, "num_tokens": 1113509477.0, "step": 3790 }, { "entropy": 0.5864951342344285, "epoch": 0.2844154657935599, "grad_norm": 0.19083347916603088, "learning_rate": 0.0002, "loss": 0.7342, "mean_token_accuracy": 0.8006857763975859, "num_tokens": 1114988438.0, "step": 3795 }, { "entropy": 0.6080025298520922, "epoch": 0.28479018972741177, "grad_norm": 0.19683346152305603, "learning_rate": 0.0002, "loss": 0.7732, "mean_token_accuracy": 0.7934406898915768, "num_tokens": 1116460352.0, "step": 3800 }, { "entropy": 0.5848989766091108, "epoch": 0.2851649136612636, "grad_norm": 0.2073942869901657, "learning_rate": 0.0002, "loss": 0.7488, "mean_token_accuracy": 0.7959999274462461, "num_tokens": 1117942508.0, "step": 3805 }, { "entropy": 0.5915478216484189, "epoch": 0.28553963759511547, "grad_norm": 0.21190282702445984, "learning_rate": 0.0002, "loss": 0.7569, "mean_token_accuracy": 0.7963630467653274, "num_tokens": 1119422110.0, "step": 3810 }, { "entropy": 0.5814269296824932, "epoch": 0.2859143615289673, "grad_norm": 0.1875295788049698, "learning_rate": 0.0002, "loss": 0.734, "mean_token_accuracy": 0.7991679269820452, "num_tokens": 1120911192.0, "step": 3815 }, { "entropy": 0.5867473101243377, "epoch": 0.28628908546281917, "grad_norm": 0.1948828399181366, "learning_rate": 0.0002, "loss": 0.7463, "mean_token_accuracy": 0.7953331612050534, "num_tokens": 1122353960.0, "step": 3820 }, { "entropy": 0.5968233676627278, "epoch": 0.286663809396671, "grad_norm": 0.2042892873287201, "learning_rate": 0.0002, "loss": 0.7593, "mean_token_accuracy": 0.7971887048333883, "num_tokens": 1123810294.0, "step": 3825 }, { "entropy": 0.6047568095847964, "epoch": 0.2870385333305229, "grad_norm": 0.17764034867286682, "learning_rate": 0.0002, "loss": 0.7634, "mean_token_accuracy": 0.7954793583601714, "num_tokens": 1125314223.0, "step": 3830 }, { "entropy": 0.5886770750395953, "epoch": 0.2874132572643748, "grad_norm": 0.1892738938331604, "learning_rate": 0.0002, "loss": 0.7461, "mean_token_accuracy": 0.7974199261516333, "num_tokens": 1126730020.0, "step": 3835 }, { "entropy": 0.5780029246583581, "epoch": 0.28778798119822663, "grad_norm": 0.19249573349952698, "learning_rate": 0.0002, "loss": 0.7341, "mean_token_accuracy": 0.7995512302964926, "num_tokens": 1128204118.0, "step": 3840 }, { "entropy": 0.5872319115325808, "epoch": 0.2881627051320785, "grad_norm": 0.1837387979030609, "learning_rate": 0.0002, "loss": 0.7564, "mean_token_accuracy": 0.7939633153378963, "num_tokens": 1129671449.0, "step": 3845 }, { "entropy": 0.5985147535800934, "epoch": 0.28853742906593033, "grad_norm": 0.17734719812870026, "learning_rate": 0.0002, "loss": 0.7663, "mean_token_accuracy": 0.7925904791802167, "num_tokens": 1131147289.0, "step": 3850 }, { "entropy": 0.5661965215578675, "epoch": 0.2889121529997822, "grad_norm": 0.19153210520744324, "learning_rate": 0.0002, "loss": 0.7325, "mean_token_accuracy": 0.7997805934399367, "num_tokens": 1132587948.0, "step": 3855 }, { "entropy": 0.5937475304119288, "epoch": 0.28928687693363403, "grad_norm": 0.17040984332561493, "learning_rate": 0.0002, "loss": 0.7632, "mean_token_accuracy": 0.7971526607871056, "num_tokens": 1134058839.0, "step": 3860 }, { "entropy": 0.5953242268413306, "epoch": 0.2896616008674859, "grad_norm": 0.18811440467834473, "learning_rate": 0.0002, "loss": 0.7682, "mean_token_accuracy": 0.7946061491966248, "num_tokens": 1135518689.0, "step": 3865 }, { "entropy": 0.5963866168633103, "epoch": 0.2900363248013378, "grad_norm": 0.19411739706993103, "learning_rate": 0.0002, "loss": 0.7622, "mean_token_accuracy": 0.7973169162869453, "num_tokens": 1137011899.0, "step": 3870 }, { "entropy": 0.5995880637317896, "epoch": 0.29041104873518964, "grad_norm": 0.17939260601997375, "learning_rate": 0.0002, "loss": 0.7687, "mean_token_accuracy": 0.7960042741149664, "num_tokens": 1138447283.0, "step": 3875 }, { "entropy": 0.6145158773288131, "epoch": 0.2907857726690415, "grad_norm": 0.17616793513298035, "learning_rate": 0.0002, "loss": 0.7686, "mean_token_accuracy": 0.7922325756400823, "num_tokens": 1139948417.0, "step": 3880 }, { "entropy": 0.5858472583815455, "epoch": 0.29116049660289334, "grad_norm": 0.18484659492969513, "learning_rate": 0.0002, "loss": 0.7406, "mean_token_accuracy": 0.796822813898325, "num_tokens": 1141443959.0, "step": 3885 }, { "entropy": 0.583527111262083, "epoch": 0.2915352205367452, "grad_norm": 0.1952163577079773, "learning_rate": 0.0002, "loss": 0.7406, "mean_token_accuracy": 0.7993714611977338, "num_tokens": 1142893070.0, "step": 3890 }, { "entropy": 0.5853881964460015, "epoch": 0.29190994447059704, "grad_norm": 0.21433506906032562, "learning_rate": 0.0002, "loss": 0.7517, "mean_token_accuracy": 0.7960381157696247, "num_tokens": 1144353396.0, "step": 3895 }, { "entropy": 0.5987289113923907, "epoch": 0.2922846684044489, "grad_norm": 0.18793492019176483, "learning_rate": 0.0002, "loss": 0.7629, "mean_token_accuracy": 0.7938758168369532, "num_tokens": 1145815244.0, "step": 3900 }, { "entropy": 0.6095782617107034, "epoch": 0.2926593923383008, "grad_norm": 0.18685689568519592, "learning_rate": 0.0002, "loss": 0.7689, "mean_token_accuracy": 0.7904509924352169, "num_tokens": 1147301656.0, "step": 3905 }, { "entropy": 0.5904654378071428, "epoch": 0.29303411627215264, "grad_norm": 0.19341328740119934, "learning_rate": 0.0002, "loss": 0.7505, "mean_token_accuracy": 0.7938127972185611, "num_tokens": 1148747281.0, "step": 3910 }, { "entropy": 0.6056897608563304, "epoch": 0.2934088402060045, "grad_norm": 0.1819075047969818, "learning_rate": 0.0002, "loss": 0.768, "mean_token_accuracy": 0.7931995052844286, "num_tokens": 1150221921.0, "step": 3915 }, { "entropy": 0.5723868653178215, "epoch": 0.29378356413985635, "grad_norm": 0.17827950417995453, "learning_rate": 0.0002, "loss": 0.7452, "mean_token_accuracy": 0.797577828541398, "num_tokens": 1151650855.0, "step": 3920 }, { "entropy": 0.6031199229881168, "epoch": 0.2941582880737082, "grad_norm": 0.20608966052532196, "learning_rate": 0.0002, "loss": 0.7653, "mean_token_accuracy": 0.7940841425210238, "num_tokens": 1153085454.0, "step": 3925 }, { "entropy": 0.5830601828172803, "epoch": 0.29453301200756005, "grad_norm": 0.1823141723871231, "learning_rate": 0.0002, "loss": 0.7478, "mean_token_accuracy": 0.7972083024680614, "num_tokens": 1154540106.0, "step": 3930 }, { "entropy": 0.5862758728675544, "epoch": 0.2949077359414119, "grad_norm": 0.17002210021018982, "learning_rate": 0.0002, "loss": 0.7513, "mean_token_accuracy": 0.7986780986189842, "num_tokens": 1156009577.0, "step": 3935 }, { "entropy": 0.6043861117213964, "epoch": 0.29528245987526375, "grad_norm": 0.2091769427061081, "learning_rate": 0.0002, "loss": 0.763, "mean_token_accuracy": 0.7946979913860559, "num_tokens": 1157465179.0, "step": 3940 }, { "entropy": 0.6133271703496576, "epoch": 0.29565718380911565, "grad_norm": 0.18437084555625916, "learning_rate": 0.0002, "loss": 0.778, "mean_token_accuracy": 0.7904251229017973, "num_tokens": 1158929919.0, "step": 3945 }, { "entropy": 0.6102759868837893, "epoch": 0.2960319077429675, "grad_norm": 0.17994475364685059, "learning_rate": 0.0002, "loss": 0.7758, "mean_token_accuracy": 0.7926817804574966, "num_tokens": 1160392504.0, "step": 3950 }, { "entropy": 0.5821545688435436, "epoch": 0.29640663167681935, "grad_norm": 0.18236200511455536, "learning_rate": 0.0002, "loss": 0.7458, "mean_token_accuracy": 0.7970305930823087, "num_tokens": 1161856048.0, "step": 3955 }, { "entropy": 0.6048814540728926, "epoch": 0.2967813556106712, "grad_norm": 0.18876715004444122, "learning_rate": 0.0002, "loss": 0.7603, "mean_token_accuracy": 0.7941070392727851, "num_tokens": 1163333967.0, "step": 3960 }, { "entropy": 0.595556765049696, "epoch": 0.29715607954452306, "grad_norm": 0.2181604951620102, "learning_rate": 0.0002, "loss": 0.7673, "mean_token_accuracy": 0.7953674469143153, "num_tokens": 1164787339.0, "step": 3965 }, { "entropy": 0.5730951242148876, "epoch": 0.2975308034783749, "grad_norm": 0.19329917430877686, "learning_rate": 0.0002, "loss": 0.7372, "mean_token_accuracy": 0.8003424942493439, "num_tokens": 1166215210.0, "step": 3970 }, { "entropy": 0.5736376494169235, "epoch": 0.29790552741222676, "grad_norm": 0.19843412935733795, "learning_rate": 0.0002, "loss": 0.7384, "mean_token_accuracy": 0.797173286229372, "num_tokens": 1167685645.0, "step": 3975 }, { "entropy": 0.5705607783049345, "epoch": 0.2982802513460786, "grad_norm": 0.19190992414951324, "learning_rate": 0.0002, "loss": 0.7323, "mean_token_accuracy": 0.8016104061156512, "num_tokens": 1169117379.0, "step": 3980 }, { "entropy": 0.6090691594406963, "epoch": 0.2986549752799305, "grad_norm": 0.1885799914598465, "learning_rate": 0.0002, "loss": 0.7765, "mean_token_accuracy": 0.7929979585111141, "num_tokens": 1170597175.0, "step": 3985 }, { "entropy": 0.5964752737432718, "epoch": 0.29902969921378236, "grad_norm": 0.17719292640686035, "learning_rate": 0.0002, "loss": 0.7605, "mean_token_accuracy": 0.7966483928263187, "num_tokens": 1172062327.0, "step": 3990 }, { "entropy": 0.6002643492072821, "epoch": 0.2994044231476342, "grad_norm": 0.20591703057289124, "learning_rate": 0.0002, "loss": 0.7605, "mean_token_accuracy": 0.7947906412184238, "num_tokens": 1173524767.0, "step": 3995 }, { "entropy": 0.5913285125046969, "epoch": 0.29977914708148606, "grad_norm": 0.19465233385562897, "learning_rate": 0.0002, "loss": 0.744, "mean_token_accuracy": 0.8000953551381826, "num_tokens": 1174970519.0, "step": 4000 }, { "entropy": 0.588915232475847, "epoch": 0.3001538710153379, "grad_norm": 0.1725004017353058, "learning_rate": 0.0002, "loss": 0.7383, "mean_token_accuracy": 0.7977715011686086, "num_tokens": 1176433055.0, "step": 4005 }, { "entropy": 0.594618970155716, "epoch": 0.30052859494918976, "grad_norm": 0.17716097831726074, "learning_rate": 0.0002, "loss": 0.7464, "mean_token_accuracy": 0.7973858743906022, "num_tokens": 1177905750.0, "step": 4010 }, { "entropy": 0.5939691379666329, "epoch": 0.3009033188830416, "grad_norm": 0.19119758903980255, "learning_rate": 0.0002, "loss": 0.744, "mean_token_accuracy": 0.7979783650487662, "num_tokens": 1179393579.0, "step": 4015 }, { "entropy": 0.6092382561415434, "epoch": 0.30127804281689347, "grad_norm": 0.2015710175037384, "learning_rate": 0.0002, "loss": 0.7634, "mean_token_accuracy": 0.7938186284154654, "num_tokens": 1180901901.0, "step": 4020 }, { "entropy": 0.5988793212920427, "epoch": 0.30165276675074537, "grad_norm": 0.2795675992965698, "learning_rate": 0.0002, "loss": 0.7567, "mean_token_accuracy": 0.7949708957225085, "num_tokens": 1182400716.0, "step": 4025 }, { "entropy": 0.5890937637537718, "epoch": 0.3020274906845972, "grad_norm": 0.1768081784248352, "learning_rate": 0.0002, "loss": 0.7406, "mean_token_accuracy": 0.7980746611952781, "num_tokens": 1183846173.0, "step": 4030 }, { "entropy": 0.6070587676949799, "epoch": 0.3024022146184491, "grad_norm": 0.20132850110530853, "learning_rate": 0.0002, "loss": 0.7638, "mean_token_accuracy": 0.7953146010637283, "num_tokens": 1185350249.0, "step": 4035 }, { "entropy": 0.6005327467806637, "epoch": 0.3027769385523009, "grad_norm": 0.19193784892559052, "learning_rate": 0.0002, "loss": 0.767, "mean_token_accuracy": 0.7968817740678787, "num_tokens": 1186827639.0, "step": 4040 }, { "entropy": 0.5834723424166441, "epoch": 0.3031516624861528, "grad_norm": 0.17789281904697418, "learning_rate": 0.0002, "loss": 0.737, "mean_token_accuracy": 0.7988991409540176, "num_tokens": 1188296182.0, "step": 4045 }, { "entropy": 0.5940482480451464, "epoch": 0.3035263864200046, "grad_norm": 0.18467698991298676, "learning_rate": 0.0002, "loss": 0.7526, "mean_token_accuracy": 0.7938948422670364, "num_tokens": 1189739463.0, "step": 4050 }, { "entropy": 0.5962785778567194, "epoch": 0.3039011103538565, "grad_norm": 0.1783953160047531, "learning_rate": 0.0002, "loss": 0.7618, "mean_token_accuracy": 0.7935312263667583, "num_tokens": 1191216836.0, "step": 4055 }, { "entropy": 0.60069102011621, "epoch": 0.3042758342877084, "grad_norm": 0.1929769665002823, "learning_rate": 0.0002, "loss": 0.7653, "mean_token_accuracy": 0.7931593935936689, "num_tokens": 1192685161.0, "step": 4060 }, { "entropy": 0.5800295911729336, "epoch": 0.30465055822156023, "grad_norm": 0.17601469159126282, "learning_rate": 0.0002, "loss": 0.7323, "mean_token_accuracy": 0.7984515409916639, "num_tokens": 1194116477.0, "step": 4065 }, { "entropy": 0.6028885073028505, "epoch": 0.3050252821554121, "grad_norm": 0.18306754529476166, "learning_rate": 0.0002, "loss": 0.7592, "mean_token_accuracy": 0.7949360940605402, "num_tokens": 1195594422.0, "step": 4070 }, { "entropy": 0.5980476615950465, "epoch": 0.30540000608926393, "grad_norm": 0.18501035869121552, "learning_rate": 0.0002, "loss": 0.7559, "mean_token_accuracy": 0.7961597748100757, "num_tokens": 1197072750.0, "step": 4075 }, { "entropy": 0.5764782713726163, "epoch": 0.3057747300231158, "grad_norm": 0.19371964037418365, "learning_rate": 0.0002, "loss": 0.7343, "mean_token_accuracy": 0.798349529877305, "num_tokens": 1198508377.0, "step": 4080 }, { "entropy": 0.5951108586043119, "epoch": 0.30614945395696763, "grad_norm": 0.2047787308692932, "learning_rate": 0.0002, "loss": 0.7527, "mean_token_accuracy": 0.7971757922321558, "num_tokens": 1199937055.0, "step": 4085 }, { "entropy": 0.5893726255744696, "epoch": 0.3065241778908195, "grad_norm": 0.20032227039337158, "learning_rate": 0.0002, "loss": 0.7435, "mean_token_accuracy": 0.796142416447401, "num_tokens": 1201397055.0, "step": 4090 }, { "entropy": 0.5796432020142674, "epoch": 0.30689890182467133, "grad_norm": 0.20025895535945892, "learning_rate": 0.0002, "loss": 0.7447, "mean_token_accuracy": 0.796749296784401, "num_tokens": 1202820662.0, "step": 4095 }, { "entropy": 0.5938818152993918, "epoch": 0.30727362575852324, "grad_norm": 0.20131397247314453, "learning_rate": 0.0002, "loss": 0.7492, "mean_token_accuracy": 0.7967523504048586, "num_tokens": 1204298402.0, "step": 4100 }, { "entropy": 0.5819771561771632, "epoch": 0.3076483496923751, "grad_norm": 0.18495695292949677, "learning_rate": 0.0002, "loss": 0.7337, "mean_token_accuracy": 0.798812759667635, "num_tokens": 1205740239.0, "step": 4105 }, { "entropy": 0.587762220390141, "epoch": 0.30802307362622694, "grad_norm": 0.1989593505859375, "learning_rate": 0.0002, "loss": 0.7431, "mean_token_accuracy": 0.7988625418394804, "num_tokens": 1207186616.0, "step": 4110 }, { "entropy": 0.5868752352893353, "epoch": 0.3083977975600788, "grad_norm": 0.17300957441329956, "learning_rate": 0.0002, "loss": 0.7415, "mean_token_accuracy": 0.7992626380175352, "num_tokens": 1208651795.0, "step": 4115 }, { "entropy": 0.5727521490305663, "epoch": 0.30877252149393064, "grad_norm": 0.18440550565719604, "learning_rate": 0.0002, "loss": 0.7285, "mean_token_accuracy": 0.7993430748581887, "num_tokens": 1210060401.0, "step": 4120 }, { "entropy": 0.6071222243830562, "epoch": 0.3091472454277825, "grad_norm": 0.1703031063079834, "learning_rate": 0.0002, "loss": 0.7689, "mean_token_accuracy": 0.7938266888260841, "num_tokens": 1211569736.0, "step": 4125 }, { "entropy": 0.5854591882787645, "epoch": 0.30952196936163434, "grad_norm": 0.22250808775424957, "learning_rate": 0.0002, "loss": 0.747, "mean_token_accuracy": 0.7988794773817063, "num_tokens": 1213051915.0, "step": 4130 }, { "entropy": 0.6133045453578234, "epoch": 0.3098966932954862, "grad_norm": 0.2189883589744568, "learning_rate": 0.0002, "loss": 0.7716, "mean_token_accuracy": 0.7917583353817463, "num_tokens": 1214551063.0, "step": 4135 }, { "entropy": 0.5985005253925919, "epoch": 0.3102714172293381, "grad_norm": 0.1836496740579605, "learning_rate": 0.0002, "loss": 0.7555, "mean_token_accuracy": 0.7958225157111883, "num_tokens": 1215983772.0, "step": 4140 }, { "entropy": 0.5960024408996105, "epoch": 0.31064614116318995, "grad_norm": 0.18800455331802368, "learning_rate": 0.0002, "loss": 0.7599, "mean_token_accuracy": 0.7959621272981167, "num_tokens": 1217463562.0, "step": 4145 }, { "entropy": 0.5919467760249972, "epoch": 0.3110208650970418, "grad_norm": 0.18175968527793884, "learning_rate": 0.0002, "loss": 0.7531, "mean_token_accuracy": 0.7959878135472536, "num_tokens": 1218934660.0, "step": 4150 }, { "entropy": 0.5921079269610345, "epoch": 0.31139558903089365, "grad_norm": 0.18033485114574432, "learning_rate": 0.0002, "loss": 0.73, "mean_token_accuracy": 0.801633107662201, "num_tokens": 1220443831.0, "step": 4155 }, { "entropy": 0.5869416167959571, "epoch": 0.3117703129647455, "grad_norm": 0.19445431232452393, "learning_rate": 0.0002, "loss": 0.7342, "mean_token_accuracy": 0.7986761324107647, "num_tokens": 1221898416.0, "step": 4160 }, { "entropy": 0.6169377397745848, "epoch": 0.31214503689859735, "grad_norm": 0.1834959238767624, "learning_rate": 0.0002, "loss": 0.7647, "mean_token_accuracy": 0.7950588595122099, "num_tokens": 1223359204.0, "step": 4165 }, { "entropy": 0.6151689033955335, "epoch": 0.3125197608324492, "grad_norm": 0.21327655017375946, "learning_rate": 0.0002, "loss": 0.7603, "mean_token_accuracy": 0.7953384470194578, "num_tokens": 1224816206.0, "step": 4170 }, { "entropy": 0.6085467610508204, "epoch": 0.31289448476630105, "grad_norm": 0.18956615030765533, "learning_rate": 0.0002, "loss": 0.7438, "mean_token_accuracy": 0.7959521487355232, "num_tokens": 1226316241.0, "step": 4175 }, { "entropy": 0.618184894695878, "epoch": 0.31326920870015296, "grad_norm": 0.1989566832780838, "learning_rate": 0.0002, "loss": 0.7649, "mean_token_accuracy": 0.7974353585392236, "num_tokens": 1227767012.0, "step": 4180 }, { "entropy": 0.6026652362197638, "epoch": 0.3136439326340048, "grad_norm": 0.18283776938915253, "learning_rate": 0.0002, "loss": 0.7497, "mean_token_accuracy": 0.7993021577596664, "num_tokens": 1229228634.0, "step": 4185 }, { "entropy": 0.5892559413798153, "epoch": 0.31401865656785666, "grad_norm": 0.1886904239654541, "learning_rate": 0.0002, "loss": 0.7295, "mean_token_accuracy": 0.800459448248148, "num_tokens": 1230714739.0, "step": 4190 }, { "entropy": 0.6019833458587527, "epoch": 0.3143933805017085, "grad_norm": 0.18766747415065765, "learning_rate": 0.0002, "loss": 0.7489, "mean_token_accuracy": 0.7965607516467571, "num_tokens": 1232212642.0, "step": 4195 }, { "entropy": 0.5969116274267435, "epoch": 0.31476810443556036, "grad_norm": 0.19490975141525269, "learning_rate": 0.0002, "loss": 0.7475, "mean_token_accuracy": 0.7996454201638699, "num_tokens": 1233694571.0, "step": 4200 }, { "entropy": 0.5926325399428606, "epoch": 0.3151428283694122, "grad_norm": 0.1981535255908966, "learning_rate": 0.0002, "loss": 0.7346, "mean_token_accuracy": 0.796305050700903, "num_tokens": 1235143923.0, "step": 4205 }, { "entropy": 0.6219364490360022, "epoch": 0.31551755230326406, "grad_norm": 0.9369872808456421, "learning_rate": 0.0002, "loss": 0.7736, "mean_token_accuracy": 0.792543912306428, "num_tokens": 1236648634.0, "step": 4210 }, { "entropy": 0.6123530445620418, "epoch": 0.3158922762371159, "grad_norm": 0.20134173333644867, "learning_rate": 0.0002, "loss": 0.7521, "mean_token_accuracy": 0.7959941949695348, "num_tokens": 1238130031.0, "step": 4215 }, { "entropy": 0.5982205793261528, "epoch": 0.3162670001709678, "grad_norm": 0.19891785085201263, "learning_rate": 0.0002, "loss": 0.75, "mean_token_accuracy": 0.7976205609738827, "num_tokens": 1239563810.0, "step": 4220 }, { "entropy": 0.6012900587171316, "epoch": 0.31664172410481967, "grad_norm": 0.17052653431892395, "learning_rate": 0.0002, "loss": 0.7554, "mean_token_accuracy": 0.7980640891939401, "num_tokens": 1241011185.0, "step": 4225 }, { "entropy": 0.5863015509210527, "epoch": 0.3170164480386715, "grad_norm": 0.1804395467042923, "learning_rate": 0.0002, "loss": 0.7476, "mean_token_accuracy": 0.7963964063674212, "num_tokens": 1242440207.0, "step": 4230 }, { "entropy": 0.5800167789682746, "epoch": 0.31739117197252337, "grad_norm": 0.18476103246212006, "learning_rate": 0.0002, "loss": 0.7389, "mean_token_accuracy": 0.7976656053215265, "num_tokens": 1243871642.0, "step": 4235 }, { "entropy": 0.5630110170692205, "epoch": 0.3177658959063752, "grad_norm": 0.17315682768821716, "learning_rate": 0.0002, "loss": 0.7252, "mean_token_accuracy": 0.8002696976065635, "num_tokens": 1245341824.0, "step": 4240 }, { "entropy": 0.5903698574751616, "epoch": 0.31814061984022707, "grad_norm": 0.21537631750106812, "learning_rate": 0.0002, "loss": 0.752, "mean_token_accuracy": 0.7946964461356402, "num_tokens": 1246806237.0, "step": 4245 }, { "entropy": 0.5900678852573037, "epoch": 0.3185153437740789, "grad_norm": 0.18168358504772186, "learning_rate": 0.0002, "loss": 0.7462, "mean_token_accuracy": 0.7994452841579914, "num_tokens": 1248298121.0, "step": 4250 }, { "entropy": 0.5845153380185366, "epoch": 0.3188900677079308, "grad_norm": 0.1984136551618576, "learning_rate": 0.0002, "loss": 0.7516, "mean_token_accuracy": 0.7996589880436659, "num_tokens": 1249729706.0, "step": 4255 }, { "entropy": 0.5871046589687466, "epoch": 0.3192647916417827, "grad_norm": 0.17925018072128296, "learning_rate": 0.0002, "loss": 0.7457, "mean_token_accuracy": 0.7965475898236036, "num_tokens": 1251207468.0, "step": 4260 }, { "entropy": 0.6105845969170332, "epoch": 0.3196395155756345, "grad_norm": 0.19739824533462524, "learning_rate": 0.0002, "loss": 0.776, "mean_token_accuracy": 0.7930166501551866, "num_tokens": 1252695308.0, "step": 4265 }, { "entropy": 0.5652345422655344, "epoch": 0.3200142395094864, "grad_norm": 0.1779516637325287, "learning_rate": 0.0002, "loss": 0.7281, "mean_token_accuracy": 0.8002659309655428, "num_tokens": 1254150435.0, "step": 4270 }, { "entropy": 0.5903418650850654, "epoch": 0.3203889634433382, "grad_norm": 0.18189236521720886, "learning_rate": 0.0002, "loss": 0.7721, "mean_token_accuracy": 0.7940781429409981, "num_tokens": 1255627278.0, "step": 4275 }, { "entropy": 0.5772392850369215, "epoch": 0.3207636873771901, "grad_norm": 0.22363565862178802, "learning_rate": 0.0002, "loss": 0.7454, "mean_token_accuracy": 0.795168299973011, "num_tokens": 1257096482.0, "step": 4280 }, { "entropy": 0.5551470740698278, "epoch": 0.3211384113110419, "grad_norm": 0.178113654255867, "learning_rate": 0.0002, "loss": 0.7305, "mean_token_accuracy": 0.8029332306236029, "num_tokens": 1258549930.0, "step": 4285 }, { "entropy": 0.5800701219588518, "epoch": 0.3215131352448938, "grad_norm": 0.19037465751171112, "learning_rate": 0.0002, "loss": 0.7503, "mean_token_accuracy": 0.7976047646254301, "num_tokens": 1260007155.0, "step": 4290 }, { "entropy": 0.5784346081316472, "epoch": 0.3218878591787457, "grad_norm": 0.27007052302360535, "learning_rate": 0.0002, "loss": 0.7552, "mean_token_accuracy": 0.796234080940485, "num_tokens": 1261448045.0, "step": 4295 }, { "entropy": 0.6008585726842284, "epoch": 0.32226258311259753, "grad_norm": 0.16696324944496155, "learning_rate": 0.0002, "loss": 0.7744, "mean_token_accuracy": 0.7926724925637245, "num_tokens": 1262946230.0, "step": 4300 }, { "entropy": 0.5715092736296356, "epoch": 0.3226373070464494, "grad_norm": 0.1994435340166092, "learning_rate": 0.0002, "loss": 0.7469, "mean_token_accuracy": 0.7975834865123034, "num_tokens": 1264397694.0, "step": 4305 }, { "entropy": 0.586613642424345, "epoch": 0.32301203098030123, "grad_norm": 0.19389083981513977, "learning_rate": 0.0002, "loss": 0.7592, "mean_token_accuracy": 0.7956932727247477, "num_tokens": 1265855927.0, "step": 4310 }, { "entropy": 0.5735210683196783, "epoch": 0.3233867549141531, "grad_norm": 0.20650425553321838, "learning_rate": 0.0002, "loss": 0.7476, "mean_token_accuracy": 0.7983983896672726, "num_tokens": 1267342772.0, "step": 4315 }, { "entropy": 0.5492088231258094, "epoch": 0.32376147884800494, "grad_norm": 0.18016353249549866, "learning_rate": 0.0002, "loss": 0.7175, "mean_token_accuracy": 0.8041205894201994, "num_tokens": 1268757125.0, "step": 4320 }, { "entropy": 0.5672954301349818, "epoch": 0.3241362027818568, "grad_norm": 0.19729115068912506, "learning_rate": 0.0002, "loss": 0.754, "mean_token_accuracy": 0.7976382441818715, "num_tokens": 1270185730.0, "step": 4325 }, { "entropy": 0.573708425834775, "epoch": 0.32451092671570864, "grad_norm": 0.22318118810653687, "learning_rate": 0.0002, "loss": 0.7433, "mean_token_accuracy": 0.7970391292124986, "num_tokens": 1271632160.0, "step": 4330 }, { "entropy": 0.5825185719877481, "epoch": 0.32488565064956054, "grad_norm": 0.2009771317243576, "learning_rate": 0.0002, "loss": 0.7411, "mean_token_accuracy": 0.7975780569016934, "num_tokens": 1273118607.0, "step": 4335 }, { "entropy": 0.5732336455024779, "epoch": 0.3252603745834124, "grad_norm": 0.1992644965648651, "learning_rate": 0.0002, "loss": 0.7395, "mean_token_accuracy": 0.797964533790946, "num_tokens": 1274587572.0, "step": 4340 }, { "entropy": 0.5924339111894369, "epoch": 0.32563509851726424, "grad_norm": 0.1811087429523468, "learning_rate": 0.0002, "loss": 0.7638, "mean_token_accuracy": 0.7928844198584557, "num_tokens": 1276086404.0, "step": 4345 }, { "entropy": 0.5717601019889116, "epoch": 0.3260098224511161, "grad_norm": 0.19335830211639404, "learning_rate": 0.0002, "loss": 0.7424, "mean_token_accuracy": 0.7978363208472729, "num_tokens": 1277553659.0, "step": 4350 }, { "entropy": 0.568881281837821, "epoch": 0.32638454638496794, "grad_norm": 0.17897053062915802, "learning_rate": 0.0002, "loss": 0.7344, "mean_token_accuracy": 0.7972408786416054, "num_tokens": 1278990864.0, "step": 4355 }, { "entropy": 0.5756158145144582, "epoch": 0.3267592703188198, "grad_norm": 0.17390257120132446, "learning_rate": 0.0002, "loss": 0.7543, "mean_token_accuracy": 0.7969175621867179, "num_tokens": 1280441380.0, "step": 4360 }, { "entropy": 0.5818153481930495, "epoch": 0.32713399425267164, "grad_norm": 0.18495650589466095, "learning_rate": 0.0002, "loss": 0.737, "mean_token_accuracy": 0.7981816817075014, "num_tokens": 1281946540.0, "step": 4365 }, { "entropy": 0.5704060893505811, "epoch": 0.3275087181865235, "grad_norm": 0.18139240145683289, "learning_rate": 0.0002, "loss": 0.7358, "mean_token_accuracy": 0.7983148362487554, "num_tokens": 1283431766.0, "step": 4370 }, { "entropy": 0.5752978790551424, "epoch": 0.3278834421203754, "grad_norm": 0.18995633721351624, "learning_rate": 0.0002, "loss": 0.7411, "mean_token_accuracy": 0.7994767487049103, "num_tokens": 1284920065.0, "step": 4375 }, { "entropy": 0.6073280008509755, "epoch": 0.32825816605422725, "grad_norm": 0.19034498929977417, "learning_rate": 0.0002, "loss": 0.7745, "mean_token_accuracy": 0.7925092339515686, "num_tokens": 1286415528.0, "step": 4380 }, { "entropy": 0.6063831532374024, "epoch": 0.3286328899880791, "grad_norm": 0.17085595428943634, "learning_rate": 0.0002, "loss": 0.7573, "mean_token_accuracy": 0.7929708659648895, "num_tokens": 1287903629.0, "step": 4385 }, { "entropy": 0.5874953912571073, "epoch": 0.32900761392193095, "grad_norm": 0.1907806396484375, "learning_rate": 0.0002, "loss": 0.7382, "mean_token_accuracy": 0.7958885114639997, "num_tokens": 1289400843.0, "step": 4390 }, { "entropy": 0.5893725760281086, "epoch": 0.3293823378557828, "grad_norm": 0.18211959302425385, "learning_rate": 0.0002, "loss": 0.743, "mean_token_accuracy": 0.7991102144122124, "num_tokens": 1290838358.0, "step": 4395 }, { "entropy": 0.5889736717566848, "epoch": 0.32975706178963465, "grad_norm": 0.18135878443717957, "learning_rate": 0.0002, "loss": 0.7393, "mean_token_accuracy": 0.8001553863286972, "num_tokens": 1292349505.0, "step": 4400 }, { "entropy": 0.6065046388655901, "epoch": 0.3301317857234865, "grad_norm": 0.1917053461074829, "learning_rate": 0.0002, "loss": 0.7683, "mean_token_accuracy": 0.7934649474918842, "num_tokens": 1293828666.0, "step": 4405 }, { "entropy": 0.592289250716567, "epoch": 0.3305065096573384, "grad_norm": 0.20869827270507812, "learning_rate": 0.0002, "loss": 0.76, "mean_token_accuracy": 0.7977940123528242, "num_tokens": 1295260048.0, "step": 4410 }, { "entropy": 0.5999905502423644, "epoch": 0.33088123359119026, "grad_norm": 0.1997469961643219, "learning_rate": 0.0002, "loss": 0.7493, "mean_token_accuracy": 0.7987882353365421, "num_tokens": 1296718875.0, "step": 4415 }, { "entropy": 0.610616478137672, "epoch": 0.3312559575250421, "grad_norm": 0.1916056126356125, "learning_rate": 0.0002, "loss": 0.7524, "mean_token_accuracy": 0.7929950505495071, "num_tokens": 1298189821.0, "step": 4420 }, { "entropy": 0.5928906991146505, "epoch": 0.33163068145889396, "grad_norm": 0.18314820528030396, "learning_rate": 0.0002, "loss": 0.7456, "mean_token_accuracy": 0.79955795109272, "num_tokens": 1299676371.0, "step": 4425 }, { "entropy": 0.6012357980012893, "epoch": 0.3320054053927458, "grad_norm": 0.18670450150966644, "learning_rate": 0.0002, "loss": 0.7447, "mean_token_accuracy": 0.799390771985054, "num_tokens": 1301122589.0, "step": 4430 }, { "entropy": 0.6136921806260943, "epoch": 0.33238012932659766, "grad_norm": 0.19798871874809265, "learning_rate": 0.0002, "loss": 0.7578, "mean_token_accuracy": 0.7954088162630797, "num_tokens": 1302607013.0, "step": 4435 }, { "entropy": 0.6068981342017651, "epoch": 0.3327548532604495, "grad_norm": 0.2052772045135498, "learning_rate": 0.0002, "loss": 0.75, "mean_token_accuracy": 0.7942487854510546, "num_tokens": 1304060297.0, "step": 4440 }, { "entropy": 0.6085036912932992, "epoch": 0.33312957719430136, "grad_norm": 0.1889418512582779, "learning_rate": 0.0002, "loss": 0.7521, "mean_token_accuracy": 0.7994385924190283, "num_tokens": 1305501690.0, "step": 4445 }, { "entropy": 0.6197216922417284, "epoch": 0.33350430112815327, "grad_norm": 0.19308330118656158, "learning_rate": 0.0002, "loss": 0.7534, "mean_token_accuracy": 0.796280013397336, "num_tokens": 1306948440.0, "step": 4450 }, { "entropy": 0.5947308054193854, "epoch": 0.3338790250620051, "grad_norm": 0.16824638843536377, "learning_rate": 0.0002, "loss": 0.7168, "mean_token_accuracy": 0.8025096178054809, "num_tokens": 1308435276.0, "step": 4455 }, { "entropy": 0.6026596296578646, "epoch": 0.33425374899585697, "grad_norm": 0.19485075771808624, "learning_rate": 0.0002, "loss": 0.7314, "mean_token_accuracy": 0.7983763795346022, "num_tokens": 1309884435.0, "step": 4460 }, { "entropy": 0.6080049378797412, "epoch": 0.3346284729297088, "grad_norm": 0.19074778258800507, "learning_rate": 0.0002, "loss": 0.7437, "mean_token_accuracy": 0.8004644546657801, "num_tokens": 1311351485.0, "step": 4465 }, { "entropy": 0.5967827849090099, "epoch": 0.33500319686356067, "grad_norm": 0.18806493282318115, "learning_rate": 0.0002, "loss": 0.7267, "mean_token_accuracy": 0.7973689246922732, "num_tokens": 1312839186.0, "step": 4470 }, { "entropy": 0.5884976119734346, "epoch": 0.3353779207974125, "grad_norm": 0.18532702326774597, "learning_rate": 0.0002, "loss": 0.7208, "mean_token_accuracy": 0.8008176937699318, "num_tokens": 1314304324.0, "step": 4475 }, { "entropy": 0.5932407798245549, "epoch": 0.33575264473126437, "grad_norm": 0.1888483166694641, "learning_rate": 0.0002, "loss": 0.7417, "mean_token_accuracy": 0.7965114839375019, "num_tokens": 1315780368.0, "step": 4480 }, { "entropy": 0.5672040889039636, "epoch": 0.3361273686651162, "grad_norm": 0.18735750019550323, "learning_rate": 0.0002, "loss": 0.7231, "mean_token_accuracy": 0.8038594949990511, "num_tokens": 1317171532.0, "step": 4485 }, { "entropy": 0.5971724774688483, "epoch": 0.3365020925989681, "grad_norm": 0.2093728482723236, "learning_rate": 0.0002, "loss": 0.7558, "mean_token_accuracy": 0.7953540168702602, "num_tokens": 1318650642.0, "step": 4490 }, { "entropy": 0.6028981760144234, "epoch": 0.33687681653282, "grad_norm": 0.2117159366607666, "learning_rate": 0.0002, "loss": 0.76, "mean_token_accuracy": 0.7939942512661219, "num_tokens": 1320097092.0, "step": 4495 }, { "entropy": 0.6071425011381507, "epoch": 0.33725154046667183, "grad_norm": 0.1754227876663208, "learning_rate": 0.0002, "loss": 0.7558, "mean_token_accuracy": 0.7958189148455859, "num_tokens": 1321567397.0, "step": 4500 }, { "entropy": 0.5860256193205714, "epoch": 0.3376262644005237, "grad_norm": 0.17914941906929016, "learning_rate": 0.0002, "loss": 0.7443, "mean_token_accuracy": 0.7963110972195864, "num_tokens": 1323017034.0, "step": 4505 }, { "entropy": 0.6029907310381531, "epoch": 0.33800098833437553, "grad_norm": 0.18212351202964783, "learning_rate": 0.0002, "loss": 0.7574, "mean_token_accuracy": 0.7951176811009646, "num_tokens": 1324492821.0, "step": 4510 }, { "entropy": 0.6123477714136243, "epoch": 0.3383757122682274, "grad_norm": 0.18680709600448608, "learning_rate": 0.0002, "loss": 0.7744, "mean_token_accuracy": 0.7941988348960877, "num_tokens": 1326028368.0, "step": 4515 }, { "entropy": 0.5793088492937386, "epoch": 0.33875043620207923, "grad_norm": 0.20924927294254303, "learning_rate": 0.0002, "loss": 0.7402, "mean_token_accuracy": 0.8037777245044708, "num_tokens": 1327482478.0, "step": 4520 }, { "entropy": 0.5749708184972405, "epoch": 0.3391251601359311, "grad_norm": 0.18482470512390137, "learning_rate": 0.0002, "loss": 0.7396, "mean_token_accuracy": 0.7996509231626987, "num_tokens": 1328944314.0, "step": 4525 }, { "entropy": 0.5797097146511078, "epoch": 0.339499884069783, "grad_norm": 0.20702248811721802, "learning_rate": 0.0002, "loss": 0.7436, "mean_token_accuracy": 0.7984757360070944, "num_tokens": 1330369600.0, "step": 4530 }, { "entropy": 0.6064786098897457, "epoch": 0.33987460800363484, "grad_norm": 0.201396182179451, "learning_rate": 0.0002, "loss": 0.7518, "mean_token_accuracy": 0.7961844820529222, "num_tokens": 1331849378.0, "step": 4535 }, { "entropy": 0.6184732314199209, "epoch": 0.3402493319374867, "grad_norm": 0.1805904060602188, "learning_rate": 0.0002, "loss": 0.7495, "mean_token_accuracy": 0.7956168461591006, "num_tokens": 1333367384.0, "step": 4540 }, { "entropy": 0.6098783003166318, "epoch": 0.34062405587133854, "grad_norm": 0.17677423357963562, "learning_rate": 0.0002, "loss": 0.7256, "mean_token_accuracy": 0.7994213379919529, "num_tokens": 1334883900.0, "step": 4545 }, { "entropy": 0.6054682653397322, "epoch": 0.3409987798051904, "grad_norm": 0.21986840665340424, "learning_rate": 0.0002, "loss": 0.7211, "mean_token_accuracy": 0.803442444652319, "num_tokens": 1336338902.0, "step": 4550 }, { "entropy": 0.5981726782396436, "epoch": 0.34137350373904224, "grad_norm": 0.19187182188034058, "learning_rate": 0.0002, "loss": 0.7176, "mean_token_accuracy": 0.803050572425127, "num_tokens": 1337758548.0, "step": 4555 }, { "entropy": 0.6134155025705695, "epoch": 0.3417482276728941, "grad_norm": 0.1837368905544281, "learning_rate": 0.0002, "loss": 0.7451, "mean_token_accuracy": 0.7961516816169023, "num_tokens": 1339256623.0, "step": 4560 }, { "entropy": 0.589125045388937, "epoch": 0.342122951606746, "grad_norm": 0.2867242991924286, "learning_rate": 0.0002, "loss": 0.7284, "mean_token_accuracy": 0.7990252546966076, "num_tokens": 1340726526.0, "step": 4565 }, { "entropy": 0.5737242180854082, "epoch": 0.34249767554059785, "grad_norm": 0.17912383377552032, "learning_rate": 0.0002, "loss": 0.7285, "mean_token_accuracy": 0.8009428594261407, "num_tokens": 1342180288.0, "step": 4570 }, { "entropy": 0.565463831089437, "epoch": 0.3428723994744497, "grad_norm": 0.18032892048358917, "learning_rate": 0.0002, "loss": 0.7127, "mean_token_accuracy": 0.8045461744070053, "num_tokens": 1343606062.0, "step": 4575 }, { "entropy": 0.6029457021504641, "epoch": 0.34324712340830155, "grad_norm": 0.2165166735649109, "learning_rate": 0.0002, "loss": 0.758, "mean_token_accuracy": 0.7929500985890627, "num_tokens": 1345091910.0, "step": 4580 }, { "entropy": 0.6003938623704016, "epoch": 0.3436218473421534, "grad_norm": 0.18798141181468964, "learning_rate": 0.0002, "loss": 0.7428, "mean_token_accuracy": 0.7980141773819923, "num_tokens": 1346592111.0, "step": 4585 }, { "entropy": 0.5784593317657709, "epoch": 0.34399657127600525, "grad_norm": 0.22192838788032532, "learning_rate": 0.0002, "loss": 0.7162, "mean_token_accuracy": 0.8033805008977651, "num_tokens": 1348109783.0, "step": 4590 }, { "entropy": 0.5952477874234319, "epoch": 0.3443712952098571, "grad_norm": 0.1816307157278061, "learning_rate": 0.0002, "loss": 0.7437, "mean_token_accuracy": 0.7983831901103258, "num_tokens": 1349594454.0, "step": 4595 }, { "entropy": 0.6002910321578383, "epoch": 0.34474601914370895, "grad_norm": 0.27875080704689026, "learning_rate": 0.0002, "loss": 0.7475, "mean_token_accuracy": 0.7970802791416645, "num_tokens": 1351047320.0, "step": 4600 }, { "entropy": 0.5997304426506161, "epoch": 0.34512074307756085, "grad_norm": 0.17645806074142456, "learning_rate": 0.0002, "loss": 0.7557, "mean_token_accuracy": 0.7974685095250607, "num_tokens": 1352532162.0, "step": 4605 }, { "entropy": 0.5949242856353522, "epoch": 0.3454954670114127, "grad_norm": 0.18899789452552795, "learning_rate": 0.0002, "loss": 0.7595, "mean_token_accuracy": 0.7963003344833851, "num_tokens": 1353978872.0, "step": 4610 }, { "entropy": 0.61357164029032, "epoch": 0.34587019094526456, "grad_norm": 0.20339226722717285, "learning_rate": 0.0002, "loss": 0.7664, "mean_token_accuracy": 0.7962271049618721, "num_tokens": 1355457010.0, "step": 4615 }, { "entropy": 0.6023844674229621, "epoch": 0.3462449148791164, "grad_norm": 0.22493068873882294, "learning_rate": 0.0002, "loss": 0.7616, "mean_token_accuracy": 0.7968821126967669, "num_tokens": 1356931811.0, "step": 4620 }, { "entropy": 0.5848802100867033, "epoch": 0.34661963881296826, "grad_norm": 0.20864161849021912, "learning_rate": 0.0002, "loss": 0.7416, "mean_token_accuracy": 0.7972682666033506, "num_tokens": 1358426624.0, "step": 4625 }, { "entropy": 0.5914139445871115, "epoch": 0.3469943627468201, "grad_norm": 0.19173218309879303, "learning_rate": 0.0002, "loss": 0.7494, "mean_token_accuracy": 0.7965084105730057, "num_tokens": 1359914145.0, "step": 4630 }, { "entropy": 0.5891448893584311, "epoch": 0.34736908668067196, "grad_norm": 0.17477940022945404, "learning_rate": 0.0002, "loss": 0.7361, "mean_token_accuracy": 0.7983337722718715, "num_tokens": 1361421696.0, "step": 4635 }, { "entropy": 0.5725218346342444, "epoch": 0.3477438106145238, "grad_norm": 0.20570595562458038, "learning_rate": 0.0002, "loss": 0.7331, "mean_token_accuracy": 0.796998705342412, "num_tokens": 1362889748.0, "step": 4640 }, { "entropy": 0.580723943002522, "epoch": 0.3481185345483757, "grad_norm": 0.20207048952579498, "learning_rate": 0.0002, "loss": 0.7392, "mean_token_accuracy": 0.8000086706131697, "num_tokens": 1364317219.0, "step": 4645 }, { "entropy": 0.5839482078328728, "epoch": 0.34849325848222756, "grad_norm": 0.1872325837612152, "learning_rate": 0.0002, "loss": 0.7317, "mean_token_accuracy": 0.7976406637579203, "num_tokens": 1365793417.0, "step": 4650 }, { "entropy": 0.5817147634923459, "epoch": 0.3488679824160794, "grad_norm": 0.1976315975189209, "learning_rate": 0.0002, "loss": 0.7521, "mean_token_accuracy": 0.7974647268652916, "num_tokens": 1367265404.0, "step": 4655 }, { "entropy": 0.5771442328579723, "epoch": 0.34924270634993126, "grad_norm": 0.18965168297290802, "learning_rate": 0.0002, "loss": 0.7346, "mean_token_accuracy": 0.7986477218568325, "num_tokens": 1368718697.0, "step": 4660 }, { "entropy": 0.612562607601285, "epoch": 0.3496174302837831, "grad_norm": 0.1789223700761795, "learning_rate": 0.0002, "loss": 0.7776, "mean_token_accuracy": 0.7944931771606207, "num_tokens": 1370230627.0, "step": 4665 }, { "entropy": 0.5841511222533882, "epoch": 0.34999215421763497, "grad_norm": 0.1855054497718811, "learning_rate": 0.0002, "loss": 0.744, "mean_token_accuracy": 0.7980222284793854, "num_tokens": 1371663391.0, "step": 4670 }, { "entropy": 0.5726575301028788, "epoch": 0.3503668781514868, "grad_norm": 0.18287332355976105, "learning_rate": 0.0002, "loss": 0.7349, "mean_token_accuracy": 0.7989390462636947, "num_tokens": 1373136091.0, "step": 4675 }, { "entropy": 0.5842721712775528, "epoch": 0.35074160208533867, "grad_norm": 0.19508624076843262, "learning_rate": 0.0002, "loss": 0.7399, "mean_token_accuracy": 0.7991010017693043, "num_tokens": 1374606996.0, "step": 4680 }, { "entropy": 0.5865584466606378, "epoch": 0.35111632601919057, "grad_norm": 0.1839231252670288, "learning_rate": 0.0002, "loss": 0.748, "mean_token_accuracy": 0.7984711226075888, "num_tokens": 1376102094.0, "step": 4685 }, { "entropy": 0.5804344142787159, "epoch": 0.3514910499530424, "grad_norm": 0.19885554909706116, "learning_rate": 0.0002, "loss": 0.7344, "mean_token_accuracy": 0.798745895922184, "num_tokens": 1377586137.0, "step": 4690 }, { "entropy": 0.5839382311329245, "epoch": 0.3518657738868943, "grad_norm": 0.1926424503326416, "learning_rate": 0.0002, "loss": 0.749, "mean_token_accuracy": 0.796558178588748, "num_tokens": 1379016701.0, "step": 4695 }, { "entropy": 0.558655359596014, "epoch": 0.3522404978207461, "grad_norm": 0.17590387165546417, "learning_rate": 0.0002, "loss": 0.716, "mean_token_accuracy": 0.8008059382438659, "num_tokens": 1380472003.0, "step": 4700 }, { "entropy": 0.5992295948788524, "epoch": 0.352615221754598, "grad_norm": 0.18730570375919342, "learning_rate": 0.0002, "loss": 0.764, "mean_token_accuracy": 0.792586499080062, "num_tokens": 1381958999.0, "step": 4705 }, { "entropy": 0.5916466902941465, "epoch": 0.3529899456884498, "grad_norm": 0.19733095169067383, "learning_rate": 0.0002, "loss": 0.7497, "mean_token_accuracy": 0.7953646160662174, "num_tokens": 1383444431.0, "step": 4710 }, { "entropy": 0.5825668888166546, "epoch": 0.3533646696223017, "grad_norm": 0.17837853729724884, "learning_rate": 0.0002, "loss": 0.7354, "mean_token_accuracy": 0.8022403687238693, "num_tokens": 1384898539.0, "step": 4715 }, { "entropy": 0.5751688392832875, "epoch": 0.3537393935561535, "grad_norm": 0.17588138580322266, "learning_rate": 0.0002, "loss": 0.7388, "mean_token_accuracy": 0.7963860865682364, "num_tokens": 1386334157.0, "step": 4720 }, { "entropy": 0.5703924555331469, "epoch": 0.35411411749000543, "grad_norm": 0.17731118202209473, "learning_rate": 0.0002, "loss": 0.7346, "mean_token_accuracy": 0.7987803544849157, "num_tokens": 1387769720.0, "step": 4725 }, { "entropy": 0.5928223986178637, "epoch": 0.3544888414238573, "grad_norm": 0.1906772404909134, "learning_rate": 0.0002, "loss": 0.7624, "mean_token_accuracy": 0.7969958391040564, "num_tokens": 1389228801.0, "step": 4730 }, { "entropy": 0.573495902121067, "epoch": 0.35486356535770913, "grad_norm": 0.18404625356197357, "learning_rate": 0.0002, "loss": 0.7332, "mean_token_accuracy": 0.8017934776842595, "num_tokens": 1390760195.0, "step": 4735 }, { "entropy": 0.5674853023141623, "epoch": 0.355238289291561, "grad_norm": 0.20313434302806854, "learning_rate": 0.0002, "loss": 0.7256, "mean_token_accuracy": 0.8005526948720216, "num_tokens": 1392229737.0, "step": 4740 }, { "entropy": 0.5782244767993688, "epoch": 0.35561301322541283, "grad_norm": 0.201667919754982, "learning_rate": 0.0002, "loss": 0.7477, "mean_token_accuracy": 0.7997404854744673, "num_tokens": 1393684907.0, "step": 4745 }, { "entropy": 0.5829410107806325, "epoch": 0.3559877371592647, "grad_norm": 0.1786106824874878, "learning_rate": 0.0002, "loss": 0.7412, "mean_token_accuracy": 0.7970421176403761, "num_tokens": 1395192935.0, "step": 4750 }, { "entropy": 0.5818075874820352, "epoch": 0.35636246109311653, "grad_norm": 0.18894323706626892, "learning_rate": 0.0002, "loss": 0.7383, "mean_token_accuracy": 0.7995944909751416, "num_tokens": 1396691899.0, "step": 4755 }, { "entropy": 0.570878697745502, "epoch": 0.35673718502696844, "grad_norm": 0.18714845180511475, "learning_rate": 0.0002, "loss": 0.72, "mean_token_accuracy": 0.8008377820253372, "num_tokens": 1398172482.0, "step": 4760 }, { "entropy": 0.5680818172171712, "epoch": 0.3571119089608203, "grad_norm": 0.1884465217590332, "learning_rate": 0.0002, "loss": 0.7162, "mean_token_accuracy": 0.8019432451575994, "num_tokens": 1399630120.0, "step": 4765 }, { "entropy": 0.59914560476318, "epoch": 0.35748663289467214, "grad_norm": 0.18132026493549347, "learning_rate": 0.0002, "loss": 0.7727, "mean_token_accuracy": 0.7968050118535757, "num_tokens": 1401112972.0, "step": 4770 }, { "entropy": 0.5737534126266837, "epoch": 0.357861356828524, "grad_norm": 0.1956413835287094, "learning_rate": 0.0002, "loss": 0.7299, "mean_token_accuracy": 0.8009427558630705, "num_tokens": 1402596653.0, "step": 4775 }, { "entropy": 0.5739362601190805, "epoch": 0.35823608076237584, "grad_norm": 0.17732462286949158, "learning_rate": 0.0002, "loss": 0.7338, "mean_token_accuracy": 0.7965256422758102, "num_tokens": 1404077702.0, "step": 4780 }, { "entropy": 0.572279292345047, "epoch": 0.3586108046962277, "grad_norm": 0.18786025047302246, "learning_rate": 0.0002, "loss": 0.7313, "mean_token_accuracy": 0.8000641610473395, "num_tokens": 1405543562.0, "step": 4785 }, { "entropy": 0.5803645297884941, "epoch": 0.35898552863007954, "grad_norm": 0.18222348392009735, "learning_rate": 0.0002, "loss": 0.7409, "mean_token_accuracy": 0.8010318212211132, "num_tokens": 1406948752.0, "step": 4790 }, { "entropy": 0.575404942035675, "epoch": 0.3593602525639314, "grad_norm": 0.20444773137569427, "learning_rate": 0.0002, "loss": 0.7337, "mean_token_accuracy": 0.7999761961400509, "num_tokens": 1408441071.0, "step": 4795 }, { "entropy": 0.5685036795213818, "epoch": 0.3597349764977833, "grad_norm": 0.19874393939971924, "learning_rate": 0.0002, "loss": 0.7229, "mean_token_accuracy": 0.8006416901946067, "num_tokens": 1409899876.0, "step": 4800 }, { "entropy": 0.5921065416187048, "epoch": 0.36010970043163515, "grad_norm": 0.18367169797420502, "learning_rate": 0.0002, "loss": 0.7517, "mean_token_accuracy": 0.7948074229061604, "num_tokens": 1411332338.0, "step": 4805 }, { "entropy": 0.5593248918652535, "epoch": 0.360484424365487, "grad_norm": 0.20880846679210663, "learning_rate": 0.0002, "loss": 0.7049, "mean_token_accuracy": 0.8035412963479758, "num_tokens": 1412782619.0, "step": 4810 }, { "entropy": 0.5972999509423971, "epoch": 0.36085914829933885, "grad_norm": 0.20289364457130432, "learning_rate": 0.0002, "loss": 0.7535, "mean_token_accuracy": 0.7945328272879124, "num_tokens": 1414257429.0, "step": 4815 }, { "entropy": 0.5892018659040332, "epoch": 0.3612338722331907, "grad_norm": 0.18444085121154785, "learning_rate": 0.0002, "loss": 0.737, "mean_token_accuracy": 0.7992063321173191, "num_tokens": 1415740315.0, "step": 4820 }, { "entropy": 0.5872085569426417, "epoch": 0.36160859616704255, "grad_norm": 0.1720702350139618, "learning_rate": 0.0002, "loss": 0.7297, "mean_token_accuracy": 0.796893660724163, "num_tokens": 1417233450.0, "step": 4825 }, { "entropy": 0.597268870100379, "epoch": 0.3619833201008944, "grad_norm": 0.2075827568769455, "learning_rate": 0.0002, "loss": 0.7421, "mean_token_accuracy": 0.7982594925910235, "num_tokens": 1418756972.0, "step": 4830 }, { "entropy": 0.5882808804512024, "epoch": 0.36235804403474625, "grad_norm": 0.18972039222717285, "learning_rate": 0.0002, "loss": 0.7373, "mean_token_accuracy": 0.798435515165329, "num_tokens": 1420230069.0, "step": 4835 }, { "entropy": 0.5692582068964839, "epoch": 0.36273276796859816, "grad_norm": 0.20226328074932098, "learning_rate": 0.0002, "loss": 0.7155, "mean_token_accuracy": 0.8017075173556805, "num_tokens": 1421694126.0, "step": 4840 }, { "entropy": 0.5915076319128275, "epoch": 0.36310749190245, "grad_norm": 0.20646041631698608, "learning_rate": 0.0002, "loss": 0.7507, "mean_token_accuracy": 0.7994481381028891, "num_tokens": 1423193852.0, "step": 4845 }, { "entropy": 0.6007507666945457, "epoch": 0.36348221583630186, "grad_norm": 0.1875486522912979, "learning_rate": 0.0002, "loss": 0.7522, "mean_token_accuracy": 0.7966476358473301, "num_tokens": 1424712581.0, "step": 4850 }, { "entropy": 0.5853548977524042, "epoch": 0.3638569397701537, "grad_norm": 0.25774314999580383, "learning_rate": 0.0002, "loss": 0.7439, "mean_token_accuracy": 0.8021819230169058, "num_tokens": 1426221560.0, "step": 4855 }, { "entropy": 0.5785012442618609, "epoch": 0.36423166370400556, "grad_norm": 0.19124172627925873, "learning_rate": 0.0002, "loss": 0.7344, "mean_token_accuracy": 0.800578735768795, "num_tokens": 1427654589.0, "step": 4860 }, { "entropy": 0.5687680108472705, "epoch": 0.3646063876378574, "grad_norm": 0.1915317326784134, "learning_rate": 0.0002, "loss": 0.7146, "mean_token_accuracy": 0.8024926792830229, "num_tokens": 1429100892.0, "step": 4865 }, { "entropy": 0.5734780834987759, "epoch": 0.36498111157170926, "grad_norm": 0.18938209116458893, "learning_rate": 0.0002, "loss": 0.7345, "mean_token_accuracy": 0.801308885961771, "num_tokens": 1430553712.0, "step": 4870 }, { "entropy": 0.5794782632961869, "epoch": 0.3653558355055611, "grad_norm": 0.2205447107553482, "learning_rate": 0.0002, "loss": 0.7367, "mean_token_accuracy": 0.798992995172739, "num_tokens": 1432005007.0, "step": 4875 }, { "entropy": 0.5975294193252921, "epoch": 0.365730559439413, "grad_norm": 0.20155152678489685, "learning_rate": 0.0002, "loss": 0.7542, "mean_token_accuracy": 0.7957376640290021, "num_tokens": 1433490471.0, "step": 4880 }, { "entropy": 0.57965356297791, "epoch": 0.36610528337326487, "grad_norm": 0.18504579365253448, "learning_rate": 0.0002, "loss": 0.7378, "mean_token_accuracy": 0.8013306871056557, "num_tokens": 1434905688.0, "step": 4885 }, { "entropy": 0.5739920726045966, "epoch": 0.3664800073071167, "grad_norm": 0.1943066418170929, "learning_rate": 0.0002, "loss": 0.7313, "mean_token_accuracy": 0.799153184890747, "num_tokens": 1436350514.0, "step": 4890 }, { "entropy": 0.5926010633818806, "epoch": 0.36685473124096857, "grad_norm": 0.18582886457443237, "learning_rate": 0.0002, "loss": 0.7487, "mean_token_accuracy": 0.7969273757189512, "num_tokens": 1437836879.0, "step": 4895 }, { "entropy": 0.5646464450284838, "epoch": 0.3672294551748204, "grad_norm": 0.1848573386669159, "learning_rate": 0.0002, "loss": 0.7314, "mean_token_accuracy": 0.8000107020139694, "num_tokens": 1439290807.0, "step": 4900 }, { "entropy": 0.572533049993217, "epoch": 0.36760417910867227, "grad_norm": 0.19309037923812866, "learning_rate": 0.0002, "loss": 0.7304, "mean_token_accuracy": 0.7976596385240555, "num_tokens": 1440736907.0, "step": 4905 }, { "entropy": 0.5962202951312066, "epoch": 0.3679789030425241, "grad_norm": 0.20670510828495026, "learning_rate": 0.0002, "loss": 0.7446, "mean_token_accuracy": 0.7965790387243032, "num_tokens": 1442195041.0, "step": 4910 }, { "entropy": 0.5854447474703193, "epoch": 0.368353626976376, "grad_norm": 0.18230955302715302, "learning_rate": 0.0002, "loss": 0.7377, "mean_token_accuracy": 0.7971730679273605, "num_tokens": 1443720617.0, "step": 4915 }, { "entropy": 0.5684984810650349, "epoch": 0.3687283509102279, "grad_norm": 0.18805012106895447, "learning_rate": 0.0002, "loss": 0.7347, "mean_token_accuracy": 0.7987113554030657, "num_tokens": 1445206714.0, "step": 4920 }, { "entropy": 0.5613784501329064, "epoch": 0.3691030748440797, "grad_norm": 0.19778957962989807, "learning_rate": 0.0002, "loss": 0.7329, "mean_token_accuracy": 0.7998289946466685, "num_tokens": 1446624190.0, "step": 4925 }, { "entropy": 0.5539934336207807, "epoch": 0.3694777987779316, "grad_norm": 0.17687921226024628, "learning_rate": 0.0002, "loss": 0.7206, "mean_token_accuracy": 0.8034255649894476, "num_tokens": 1448107761.0, "step": 4930 }, { "entropy": 0.5546913259662688, "epoch": 0.3698525227117834, "grad_norm": 0.18196271359920502, "learning_rate": 0.0002, "loss": 0.7393, "mean_token_accuracy": 0.801861084252596, "num_tokens": 1449609928.0, "step": 4935 }, { "entropy": 0.5449927221983671, "epoch": 0.3702272466456353, "grad_norm": 0.19996878504753113, "learning_rate": 0.0002, "loss": 0.7153, "mean_token_accuracy": 0.8033001575618982, "num_tokens": 1451072627.0, "step": 4940 }, { "entropy": 0.5653955684043467, "epoch": 0.3706019705794871, "grad_norm": 0.24704083800315857, "learning_rate": 0.0002, "loss": 0.7353, "mean_token_accuracy": 0.8010582830756903, "num_tokens": 1452560841.0, "step": 4945 }, { "entropy": 0.5563784051686526, "epoch": 0.370976694513339, "grad_norm": 0.21859131753444672, "learning_rate": 0.0002, "loss": 0.7229, "mean_token_accuracy": 0.8001338131725788, "num_tokens": 1454048895.0, "step": 4950 }, { "entropy": 0.5709991950541735, "epoch": 0.3713514184471909, "grad_norm": 0.23880638182163239, "learning_rate": 0.0002, "loss": 0.7467, "mean_token_accuracy": 0.8003082532435656, "num_tokens": 1455500583.0, "step": 4955 }, { "entropy": 0.5789124708622694, "epoch": 0.37172614238104273, "grad_norm": 0.20555686950683594, "learning_rate": 0.0002, "loss": 0.7476, "mean_token_accuracy": 0.7952955711632967, "num_tokens": 1456932989.0, "step": 4960 }, { "entropy": 0.5677011242136359, "epoch": 0.3721008663148946, "grad_norm": 0.19110745191574097, "learning_rate": 0.0002, "loss": 0.7313, "mean_token_accuracy": 0.7995030850172042, "num_tokens": 1458414498.0, "step": 4965 }, { "entropy": 0.5636548589915037, "epoch": 0.37247559024874644, "grad_norm": 0.18925726413726807, "learning_rate": 0.0002, "loss": 0.7364, "mean_token_accuracy": 0.7989683624356985, "num_tokens": 1459874918.0, "step": 4970 }, { "entropy": 0.5596172729507088, "epoch": 0.3728503141825983, "grad_norm": 0.1993098109960556, "learning_rate": 0.0002, "loss": 0.7321, "mean_token_accuracy": 0.7985488656908274, "num_tokens": 1461354136.0, "step": 4975 }, { "entropy": 0.5765096904709935, "epoch": 0.37322503811645014, "grad_norm": 0.1803503930568695, "learning_rate": 0.0002, "loss": 0.7504, "mean_token_accuracy": 0.7947907693684101, "num_tokens": 1462828958.0, "step": 4980 }, { "entropy": 0.5739358423277736, "epoch": 0.373599762050302, "grad_norm": 0.1679273247718811, "learning_rate": 0.0002, "loss": 0.7464, "mean_token_accuracy": 0.8001529254019261, "num_tokens": 1464320492.0, "step": 4985 }, { "entropy": 0.5723758673295378, "epoch": 0.37397448598415384, "grad_norm": 0.1820175051689148, "learning_rate": 0.0002, "loss": 0.7385, "mean_token_accuracy": 0.8004694510251283, "num_tokens": 1465771901.0, "step": 4990 }, { "entropy": 0.5435754101723432, "epoch": 0.37434920991800574, "grad_norm": 0.19182303547859192, "learning_rate": 0.0002, "loss": 0.7078, "mean_token_accuracy": 0.8040282633155584, "num_tokens": 1467233455.0, "step": 4995 }, { "entropy": 0.5684096917510033, "epoch": 0.3747239338518576, "grad_norm": 0.19479069113731384, "learning_rate": 0.0002, "loss": 0.7345, "mean_token_accuracy": 0.801713178306818, "num_tokens": 1468693513.0, "step": 5000 }, { "entropy": 0.5703719332814217, "epoch": 0.37509865778570944, "grad_norm": 0.17551757395267487, "learning_rate": 0.0002, "loss": 0.7239, "mean_token_accuracy": 0.8007403638213872, "num_tokens": 1470181483.0, "step": 5005 }, { "entropy": 0.5656170880421996, "epoch": 0.3754733817195613, "grad_norm": 0.17958620190620422, "learning_rate": 0.0002, "loss": 0.7278, "mean_token_accuracy": 0.8019810147583485, "num_tokens": 1471648582.0, "step": 5010 }, { "entropy": 0.5731626104563474, "epoch": 0.37584810565341314, "grad_norm": 0.17565488815307617, "learning_rate": 0.0002, "loss": 0.7257, "mean_token_accuracy": 0.8004314102232456, "num_tokens": 1473163948.0, "step": 5015 }, { "entropy": 0.5648169742897153, "epoch": 0.376222829587265, "grad_norm": 0.17908954620361328, "learning_rate": 0.0002, "loss": 0.7277, "mean_token_accuracy": 0.801800524070859, "num_tokens": 1474642529.0, "step": 5020 }, { "entropy": 0.5855788966175168, "epoch": 0.37659755352111685, "grad_norm": 0.180165097117424, "learning_rate": 0.0002, "loss": 0.7563, "mean_token_accuracy": 0.7978784408420324, "num_tokens": 1476104462.0, "step": 5025 }, { "entropy": 0.5695291867479682, "epoch": 0.3769722774549687, "grad_norm": 0.16956767439842224, "learning_rate": 0.0002, "loss": 0.7281, "mean_token_accuracy": 0.8034883543848992, "num_tokens": 1477600235.0, "step": 5030 }, { "entropy": 0.5736737043596805, "epoch": 0.3773470013888206, "grad_norm": 0.2070835530757904, "learning_rate": 0.0002, "loss": 0.7369, "mean_token_accuracy": 0.8018581014126539, "num_tokens": 1479066733.0, "step": 5035 }, { "entropy": 0.5800370465964079, "epoch": 0.37772172532267245, "grad_norm": 0.19637635350227356, "learning_rate": 0.0002, "loss": 0.7404, "mean_token_accuracy": 0.7979629546403885, "num_tokens": 1480552419.0, "step": 5040 }, { "entropy": 0.5867718117311597, "epoch": 0.3780964492565243, "grad_norm": 0.1925646960735321, "learning_rate": 0.0002, "loss": 0.7584, "mean_token_accuracy": 0.7968352410942316, "num_tokens": 1482026886.0, "step": 5045 }, { "entropy": 0.568880901671946, "epoch": 0.37847117319037615, "grad_norm": 0.17983466386795044, "learning_rate": 0.0002, "loss": 0.7258, "mean_token_accuracy": 0.7997580699622631, "num_tokens": 1483525575.0, "step": 5050 }, { "entropy": 0.5638772539794445, "epoch": 0.378845897124228, "grad_norm": 0.18375860154628754, "learning_rate": 0.0002, "loss": 0.7205, "mean_token_accuracy": 0.7993284437805415, "num_tokens": 1485003764.0, "step": 5055 }, { "entropy": 0.5721071918495, "epoch": 0.37922062105807985, "grad_norm": 0.19429004192352295, "learning_rate": 0.0002, "loss": 0.7433, "mean_token_accuracy": 0.7977370385080575, "num_tokens": 1486451676.0, "step": 5060 }, { "entropy": 0.5781319473870099, "epoch": 0.3795953449919317, "grad_norm": 0.19047339260578156, "learning_rate": 0.0002, "loss": 0.7381, "mean_token_accuracy": 0.799719012528658, "num_tokens": 1487953880.0, "step": 5065 }, { "entropy": 0.587481596507132, "epoch": 0.3799700689257836, "grad_norm": 0.18584567308425903, "learning_rate": 0.0002, "loss": 0.747, "mean_token_accuracy": 0.7960651759058237, "num_tokens": 1489469821.0, "step": 5070 }, { "entropy": 0.5702677667140961, "epoch": 0.38034479285963546, "grad_norm": 0.20146271586418152, "learning_rate": 0.0002, "loss": 0.7309, "mean_token_accuracy": 0.8020607594400644, "num_tokens": 1490952282.0, "step": 5075 }, { "entropy": 0.5766153612174094, "epoch": 0.3807195167934873, "grad_norm": 0.18442951142787933, "learning_rate": 0.0002, "loss": 0.7264, "mean_token_accuracy": 0.8015585362911224, "num_tokens": 1492435977.0, "step": 5080 }, { "entropy": 0.582772177271545, "epoch": 0.38109424072733916, "grad_norm": 0.18297362327575684, "learning_rate": 0.0002, "loss": 0.7436, "mean_token_accuracy": 0.7973436448723078, "num_tokens": 1493883549.0, "step": 5085 }, { "entropy": 0.5742241537198425, "epoch": 0.381468964661191, "grad_norm": 0.19170425832271576, "learning_rate": 0.0002, "loss": 0.7304, "mean_token_accuracy": 0.7964871529489755, "num_tokens": 1495354726.0, "step": 5090 }, { "entropy": 0.5808211350813508, "epoch": 0.38184368859504286, "grad_norm": 0.18031062185764313, "learning_rate": 0.0002, "loss": 0.7421, "mean_token_accuracy": 0.8006569024175405, "num_tokens": 1496827572.0, "step": 5095 }, { "entropy": 0.5716703718528151, "epoch": 0.3822184125288947, "grad_norm": 0.1863376498222351, "learning_rate": 0.0002, "loss": 0.738, "mean_token_accuracy": 0.7998899929225445, "num_tokens": 1498315573.0, "step": 5100 }, { "entropy": 0.5592017529532314, "epoch": 0.38259313646274656, "grad_norm": 0.1865544319152832, "learning_rate": 0.0002, "loss": 0.7114, "mean_token_accuracy": 0.8031068157404662, "num_tokens": 1499802300.0, "step": 5105 }, { "entropy": 0.5641374764963984, "epoch": 0.38296786039659847, "grad_norm": 0.1842261701822281, "learning_rate": 0.0002, "loss": 0.7335, "mean_token_accuracy": 0.8013762585818768, "num_tokens": 1501253374.0, "step": 5110 }, { "entropy": 0.5671856408938766, "epoch": 0.3833425843304503, "grad_norm": 0.1927785575389862, "learning_rate": 0.0002, "loss": 0.7339, "mean_token_accuracy": 0.8003052443265914, "num_tokens": 1502723293.0, "step": 5115 }, { "entropy": 0.5868684457615018, "epoch": 0.38371730826430217, "grad_norm": 0.2126309871673584, "learning_rate": 0.0002, "loss": 0.7522, "mean_token_accuracy": 0.7961276993155479, "num_tokens": 1504166714.0, "step": 5120 }, { "entropy": 0.5766763465479017, "epoch": 0.384092032198154, "grad_norm": 0.20629918575286865, "learning_rate": 0.0002, "loss": 0.7295, "mean_token_accuracy": 0.8019895635545253, "num_tokens": 1505648164.0, "step": 5125 }, { "entropy": 0.5921165078878403, "epoch": 0.38446675613200587, "grad_norm": 0.19410806894302368, "learning_rate": 0.0002, "loss": 0.7451, "mean_token_accuracy": 0.799370127171278, "num_tokens": 1507136518.0, "step": 5130 }, { "entropy": 0.5781697290018201, "epoch": 0.3848414800658577, "grad_norm": 0.1666482537984848, "learning_rate": 0.0002, "loss": 0.7273, "mean_token_accuracy": 0.800388566032052, "num_tokens": 1508597984.0, "step": 5135 }, { "entropy": 0.5830591260455549, "epoch": 0.38521620399970957, "grad_norm": 0.1799800992012024, "learning_rate": 0.0002, "loss": 0.7286, "mean_token_accuracy": 0.8017721801996232, "num_tokens": 1510075772.0, "step": 5140 }, { "entropy": 0.5973584350198508, "epoch": 0.3855909279335614, "grad_norm": 0.20169442892074585, "learning_rate": 0.0002, "loss": 0.7373, "mean_token_accuracy": 0.798420288041234, "num_tokens": 1511590832.0, "step": 5145 }, { "entropy": 0.5908402375876903, "epoch": 0.38596565186741333, "grad_norm": 0.18746985495090485, "learning_rate": 0.0002, "loss": 0.7436, "mean_token_accuracy": 0.7972298804670572, "num_tokens": 1513046727.0, "step": 5150 }, { "entropy": 0.604563714005053, "epoch": 0.3863403758012652, "grad_norm": 0.21399599313735962, "learning_rate": 0.0002, "loss": 0.7516, "mean_token_accuracy": 0.8003412757068873, "num_tokens": 1514568190.0, "step": 5155 }, { "entropy": 0.5811024773865938, "epoch": 0.38671509973511703, "grad_norm": 0.18944421410560608, "learning_rate": 0.0002, "loss": 0.7248, "mean_token_accuracy": 0.8020495038479567, "num_tokens": 1516013825.0, "step": 5160 }, { "entropy": 0.5846484939567744, "epoch": 0.3870898236689689, "grad_norm": 0.18137767910957336, "learning_rate": 0.0002, "loss": 0.7452, "mean_token_accuracy": 0.796928359568119, "num_tokens": 1517494674.0, "step": 5165 }, { "entropy": 0.5590893810614943, "epoch": 0.38746454760282073, "grad_norm": 0.19370593130588531, "learning_rate": 0.0002, "loss": 0.7311, "mean_token_accuracy": 0.8019484881311655, "num_tokens": 1518941594.0, "step": 5170 }, { "entropy": 0.550470735039562, "epoch": 0.3878392715366726, "grad_norm": 0.18742471933364868, "learning_rate": 0.0002, "loss": 0.727, "mean_token_accuracy": 0.8022238194942475, "num_tokens": 1520382272.0, "step": 5175 }, { "entropy": 0.5570182142779231, "epoch": 0.38821399547052443, "grad_norm": 0.18567098677158356, "learning_rate": 0.0002, "loss": 0.7419, "mean_token_accuracy": 0.7993196979165077, "num_tokens": 1521840433.0, "step": 5180 }, { "entropy": 0.5595636347308754, "epoch": 0.3885887194043763, "grad_norm": 0.19346363842487335, "learning_rate": 0.0002, "loss": 0.731, "mean_token_accuracy": 0.8021578747779131, "num_tokens": 1523273961.0, "step": 5185 }, { "entropy": 0.5528078280389309, "epoch": 0.3889634433382282, "grad_norm": 0.20431330800056458, "learning_rate": 0.0002, "loss": 0.7184, "mean_token_accuracy": 0.8014503370970487, "num_tokens": 1524709396.0, "step": 5190 }, { "entropy": 0.556840124167502, "epoch": 0.38933816727208004, "grad_norm": 0.19623368978500366, "learning_rate": 0.0002, "loss": 0.723, "mean_token_accuracy": 0.8016678776592017, "num_tokens": 1526178236.0, "step": 5195 }, { "entropy": 0.5648952011018992, "epoch": 0.3897128912059319, "grad_norm": 0.1750521957874298, "learning_rate": 0.0002, "loss": 0.7169, "mean_token_accuracy": 0.8004106797277928, "num_tokens": 1527704177.0, "step": 5200 }, { "entropy": 0.5548686914145946, "epoch": 0.39008761513978374, "grad_norm": 0.1804570108652115, "learning_rate": 0.0002, "loss": 0.7088, "mean_token_accuracy": 0.8059614017605782, "num_tokens": 1529215365.0, "step": 5205 }, { "entropy": 0.5732555920258164, "epoch": 0.3904623390736356, "grad_norm": 0.24703942239284515, "learning_rate": 0.0002, "loss": 0.7322, "mean_token_accuracy": 0.7985583394765854, "num_tokens": 1530694058.0, "step": 5210 }, { "entropy": 0.5751229061745107, "epoch": 0.39083706300748744, "grad_norm": 0.19191259145736694, "learning_rate": 0.0002, "loss": 0.7389, "mean_token_accuracy": 0.799874185398221, "num_tokens": 1532188807.0, "step": 5215 }, { "entropy": 0.5829500991851091, "epoch": 0.3912117869413393, "grad_norm": 0.22531245648860931, "learning_rate": 0.0002, "loss": 0.7379, "mean_token_accuracy": 0.7981281634420156, "num_tokens": 1533719798.0, "step": 5220 }, { "entropy": 0.5744785076007247, "epoch": 0.39158651087519114, "grad_norm": 0.19566231966018677, "learning_rate": 0.0002, "loss": 0.7425, "mean_token_accuracy": 0.7993212029337883, "num_tokens": 1535196627.0, "step": 5225 }, { "entropy": 0.5623600628226996, "epoch": 0.39196123480904305, "grad_norm": 0.2309974581003189, "learning_rate": 0.0002, "loss": 0.7282, "mean_token_accuracy": 0.8002359177917242, "num_tokens": 1536674786.0, "step": 5230 }, { "entropy": 0.5725797495804728, "epoch": 0.3923359587428949, "grad_norm": 0.20628027617931366, "learning_rate": 0.0002, "loss": 0.7349, "mean_token_accuracy": 0.7990479718893766, "num_tokens": 1538171076.0, "step": 5235 }, { "entropy": 0.5809566579759121, "epoch": 0.39271068267674675, "grad_norm": 0.1985527127981186, "learning_rate": 0.0002, "loss": 0.7487, "mean_token_accuracy": 0.799552871659398, "num_tokens": 1539630503.0, "step": 5240 }, { "entropy": 0.5619875064119697, "epoch": 0.3930854066105986, "grad_norm": 0.19066496193408966, "learning_rate": 0.0002, "loss": 0.7287, "mean_token_accuracy": 0.8021123342216014, "num_tokens": 1541078153.0, "step": 5245 }, { "entropy": 0.5561446577310563, "epoch": 0.39346013054445045, "grad_norm": 0.18123772740364075, "learning_rate": 0.0002, "loss": 0.7124, "mean_token_accuracy": 0.8023211102932691, "num_tokens": 1542569612.0, "step": 5250 }, { "entropy": 0.5584403882734478, "epoch": 0.3938348544783023, "grad_norm": 0.18054476380348206, "learning_rate": 0.0002, "loss": 0.7339, "mean_token_accuracy": 0.8006356105208396, "num_tokens": 1544037749.0, "step": 5255 }, { "entropy": 0.5655527668073773, "epoch": 0.39420957841215415, "grad_norm": 0.1694682240486145, "learning_rate": 0.0002, "loss": 0.7297, "mean_token_accuracy": 0.8004370152950286, "num_tokens": 1545517809.0, "step": 5260 }, { "entropy": 0.552287241909653, "epoch": 0.39458430234600606, "grad_norm": 0.17121483385562897, "learning_rate": 0.0002, "loss": 0.7314, "mean_token_accuracy": 0.8022218272089958, "num_tokens": 1546993713.0, "step": 5265 }, { "entropy": 0.5669811934232711, "epoch": 0.3949590262798579, "grad_norm": 0.17548337578773499, "learning_rate": 0.0002, "loss": 0.7493, "mean_token_accuracy": 0.7984397910535336, "num_tokens": 1548503550.0, "step": 5270 }, { "entropy": 0.544682895205915, "epoch": 0.39533375021370976, "grad_norm": 0.19057342410087585, "learning_rate": 0.0002, "loss": 0.7174, "mean_token_accuracy": 0.8027185074985027, "num_tokens": 1549956765.0, "step": 5275 }, { "entropy": 0.5529410971794277, "epoch": 0.3957084741475616, "grad_norm": 0.1866282969713211, "learning_rate": 0.0002, "loss": 0.7199, "mean_token_accuracy": 0.8014912437647581, "num_tokens": 1551451386.0, "step": 5280 }, { "entropy": 0.552817021124065, "epoch": 0.39608319808141346, "grad_norm": 0.200348898768425, "learning_rate": 0.0002, "loss": 0.7309, "mean_token_accuracy": 0.8007747646421194, "num_tokens": 1552951939.0, "step": 5285 }, { "entropy": 0.5694477636367082, "epoch": 0.3964579220152653, "grad_norm": 0.1990327388048172, "learning_rate": 0.0002, "loss": 0.7475, "mean_token_accuracy": 0.7979642644524574, "num_tokens": 1554474286.0, "step": 5290 }, { "entropy": 0.5625232242047786, "epoch": 0.39683264594911716, "grad_norm": 0.18059058487415314, "learning_rate": 0.0002, "loss": 0.7321, "mean_token_accuracy": 0.8009769644588232, "num_tokens": 1555941232.0, "step": 5295 }, { "entropy": 0.551236139331013, "epoch": 0.397207369882969, "grad_norm": 0.20172515511512756, "learning_rate": 0.0002, "loss": 0.7199, "mean_token_accuracy": 0.8044455971568822, "num_tokens": 1557418572.0, "step": 5300 }, { "entropy": 0.5418757854029537, "epoch": 0.3975820938168209, "grad_norm": 0.1896996945142746, "learning_rate": 0.0002, "loss": 0.7192, "mean_token_accuracy": 0.8027500584721565, "num_tokens": 1558848129.0, "step": 5305 }, { "entropy": 0.5633070291019976, "epoch": 0.39795681775067276, "grad_norm": 0.21724018454551697, "learning_rate": 0.0002, "loss": 0.7381, "mean_token_accuracy": 0.8005500745028258, "num_tokens": 1560288444.0, "step": 5310 }, { "entropy": 0.5529559953138232, "epoch": 0.3983315416845246, "grad_norm": 0.19990840554237366, "learning_rate": 0.0002, "loss": 0.7291, "mean_token_accuracy": 0.7971979960799217, "num_tokens": 1561722974.0, "step": 5315 }, { "entropy": 0.5512363022193313, "epoch": 0.39870626561837647, "grad_norm": 0.18606989085674286, "learning_rate": 0.0002, "loss": 0.7265, "mean_token_accuracy": 0.80326322093606, "num_tokens": 1563165571.0, "step": 5320 }, { "entropy": 0.564984686113894, "epoch": 0.3990809895522283, "grad_norm": 0.19955651462078094, "learning_rate": 0.0002, "loss": 0.7414, "mean_token_accuracy": 0.7993991728872061, "num_tokens": 1564650841.0, "step": 5325 }, { "entropy": 0.5504837343469262, "epoch": 0.39945571348608017, "grad_norm": 0.20974119007587433, "learning_rate": 0.0002, "loss": 0.7256, "mean_token_accuracy": 0.8026264671236276, "num_tokens": 1566090591.0, "step": 5330 }, { "entropy": 0.5375357761979103, "epoch": 0.399830437419932, "grad_norm": 0.17603978514671326, "learning_rate": 0.0002, "loss": 0.6971, "mean_token_accuracy": 0.8079009849578143, "num_tokens": 1567556196.0, "step": 5335 }, { "entropy": 0.5653341617435217, "epoch": 0.40020516135378387, "grad_norm": 0.18092913925647736, "learning_rate": 0.0002, "loss": 0.7423, "mean_token_accuracy": 0.8005578335374594, "num_tokens": 1569015038.0, "step": 5340 }, { "entropy": 0.5713722685351967, "epoch": 0.4005798852876358, "grad_norm": 0.19714383780956268, "learning_rate": 0.0002, "loss": 0.7455, "mean_token_accuracy": 0.7985258772969246, "num_tokens": 1570500345.0, "step": 5345 }, { "entropy": 0.5530624569393694, "epoch": 0.4009546092214876, "grad_norm": 0.1920454055070877, "learning_rate": 0.0002, "loss": 0.7242, "mean_token_accuracy": 0.8051728744059801, "num_tokens": 1571967492.0, "step": 5350 }, { "entropy": 0.5641338730230927, "epoch": 0.4013293331553395, "grad_norm": 0.18036098778247833, "learning_rate": 0.0002, "loss": 0.7186, "mean_token_accuracy": 0.8022800900042057, "num_tokens": 1573457651.0, "step": 5355 }, { "entropy": 0.5633590844459831, "epoch": 0.4017040570891913, "grad_norm": 1.9658324718475342, "learning_rate": 0.0002, "loss": 0.743, "mean_token_accuracy": 0.8011297576129437, "num_tokens": 1574922771.0, "step": 5360 }, { "entropy": 0.5625708708539605, "epoch": 0.4020787810230432, "grad_norm": 0.18425364792346954, "learning_rate": 0.0002, "loss": 0.7367, "mean_token_accuracy": 0.8000194165855646, "num_tokens": 1576351095.0, "step": 5365 }, { "entropy": 0.5447711745277047, "epoch": 0.402453504956895, "grad_norm": 0.20710985362529755, "learning_rate": 0.0002, "loss": 0.7129, "mean_token_accuracy": 0.8034960441291332, "num_tokens": 1577775325.0, "step": 5370 }, { "entropy": 0.5560759533196687, "epoch": 0.4028282288907469, "grad_norm": 0.17480456829071045, "learning_rate": 0.0002, "loss": 0.7343, "mean_token_accuracy": 0.79835186265409, "num_tokens": 1579241276.0, "step": 5375 }, { "entropy": 0.5617519721388817, "epoch": 0.4032029528245987, "grad_norm": 0.19292627274990082, "learning_rate": 0.0002, "loss": 0.7301, "mean_token_accuracy": 0.7975770995020867, "num_tokens": 1580761990.0, "step": 5380 }, { "entropy": 0.5771205224096775, "epoch": 0.40357767675845063, "grad_norm": 0.17023208737373352, "learning_rate": 0.0002, "loss": 0.7634, "mean_token_accuracy": 0.7963186386972666, "num_tokens": 1582238709.0, "step": 5385 }, { "entropy": 0.5571873301640153, "epoch": 0.4039524006923025, "grad_norm": 0.18187867105007172, "learning_rate": 0.0002, "loss": 0.7316, "mean_token_accuracy": 0.8011353299021721, "num_tokens": 1583691311.0, "step": 5390 }, { "entropy": 0.5652807282283903, "epoch": 0.40432712462615433, "grad_norm": 0.17883527278900146, "learning_rate": 0.0002, "loss": 0.7407, "mean_token_accuracy": 0.7979554157704115, "num_tokens": 1585160851.0, "step": 5395 }, { "entropy": 0.5665291244164109, "epoch": 0.4047018485600062, "grad_norm": 0.182265505194664, "learning_rate": 0.0002, "loss": 0.7309, "mean_token_accuracy": 0.7990654859691858, "num_tokens": 1586659002.0, "step": 5400 }, { "entropy": 0.5546058454550803, "epoch": 0.40507657249385803, "grad_norm": 0.20298247039318085, "learning_rate": 0.0002, "loss": 0.7215, "mean_token_accuracy": 0.8028571955859661, "num_tokens": 1485885.0, "step": 5405 }, { "entropy": 0.5479241024702788, "epoch": 0.4054512964277099, "grad_norm": 0.191498801112175, "learning_rate": 0.0002, "loss": 0.7182, "mean_token_accuracy": 0.8026126887649297, "num_tokens": 2911041.0, "step": 5410 }, { "entropy": 0.5604784058406949, "epoch": 0.40582602036156173, "grad_norm": 0.17960728704929352, "learning_rate": 0.0002, "loss": 0.7396, "mean_token_accuracy": 0.7993506096303463, "num_tokens": 4376425.0, "step": 5415 }, { "entropy": 0.5606479931622743, "epoch": 0.40620074429541364, "grad_norm": 0.181997150182724, "learning_rate": 0.0002, "loss": 0.7247, "mean_token_accuracy": 0.8015803396701813, "num_tokens": 5872508.0, "step": 5420 }, { "entropy": 0.5674757799133658, "epoch": 0.4065754682292655, "grad_norm": 0.1781710684299469, "learning_rate": 0.0002, "loss": 0.7315, "mean_token_accuracy": 0.8001731101423502, "num_tokens": 7399270.0, "step": 5425 }, { "entropy": 0.5656424999237061, "epoch": 0.40695019216311734, "grad_norm": 0.21331530809402466, "learning_rate": 0.0002, "loss": 0.733, "mean_token_accuracy": 0.799388699606061, "num_tokens": 8893772.0, "step": 5430 }, { "entropy": 0.5708539139479398, "epoch": 0.4073249160969692, "grad_norm": 0.181723952293396, "learning_rate": 0.0002, "loss": 0.7376, "mean_token_accuracy": 0.799806671962142, "num_tokens": 10351159.0, "step": 5435 }, { "entropy": 0.5538613451644778, "epoch": 0.40769964003082104, "grad_norm": 0.1783646196126938, "learning_rate": 0.0002, "loss": 0.7186, "mean_token_accuracy": 0.8013026718050241, "num_tokens": 11775085.0, "step": 5440 }, { "entropy": 0.5648490890860558, "epoch": 0.4080743639646729, "grad_norm": 0.19231411814689636, "learning_rate": 0.0002, "loss": 0.7297, "mean_token_accuracy": 0.8013088323175908, "num_tokens": 13250213.0, "step": 5445 }, { "entropy": 0.5524394562467932, "epoch": 0.40844908789852474, "grad_norm": 0.18910102546215057, "learning_rate": 0.0002, "loss": 0.7079, "mean_token_accuracy": 0.8067852955311536, "num_tokens": 14692295.0, "step": 5450 }, { "entropy": 0.5587844462133944, "epoch": 0.4088238118323766, "grad_norm": 0.20647138357162476, "learning_rate": 0.0002, "loss": 0.7246, "mean_token_accuracy": 0.8024815458804369, "num_tokens": 16105905.0, "step": 5455 }, { "entropy": 0.5631779169663786, "epoch": 0.4091985357662285, "grad_norm": 0.17861567437648773, "learning_rate": 0.0002, "loss": 0.7263, "mean_token_accuracy": 0.8016240060329437, "num_tokens": 17562541.0, "step": 5460 }, { "entropy": 0.5696421848610044, "epoch": 0.40957325970008035, "grad_norm": 0.18672317266464233, "learning_rate": 0.0002, "loss": 0.7353, "mean_token_accuracy": 0.8009946506470442, "num_tokens": 19007936.0, "step": 5465 }, { "entropy": 0.5597955936565995, "epoch": 0.4099479836339322, "grad_norm": 0.18015538156032562, "learning_rate": 0.0002, "loss": 0.7194, "mean_token_accuracy": 0.8015758685767651, "num_tokens": 20490786.0, "step": 5470 }, { "entropy": 0.5581633051857352, "epoch": 0.41032270756778405, "grad_norm": 0.19323688745498657, "learning_rate": 0.0002, "loss": 0.7113, "mean_token_accuracy": 0.8044087544083596, "num_tokens": 21955488.0, "step": 5475 }, { "entropy": 0.5613039527088404, "epoch": 0.4106974315016359, "grad_norm": 0.18994468450546265, "learning_rate": 0.0002, "loss": 0.7171, "mean_token_accuracy": 0.8011538531631232, "num_tokens": 23426536.0, "step": 5480 }, { "entropy": 0.5835170423611998, "epoch": 0.41107215543548775, "grad_norm": 0.2210090309381485, "learning_rate": 0.0002, "loss": 0.7367, "mean_token_accuracy": 0.7991932719945908, "num_tokens": 24949014.0, "step": 5485 }, { "entropy": 0.5913526672869921, "epoch": 0.4114468793693396, "grad_norm": 0.20742791891098022, "learning_rate": 0.0002, "loss": 0.7465, "mean_token_accuracy": 0.7964831449091434, "num_tokens": 26393411.0, "step": 5490 }, { "entropy": 0.5620619747787714, "epoch": 0.41182160330319145, "grad_norm": 0.19550298154354095, "learning_rate": 0.0002, "loss": 0.7143, "mean_token_accuracy": 0.8009897612035275, "num_tokens": 27873430.0, "step": 5495 }, { "entropy": 0.5670578807592392, "epoch": 0.41219632723704336, "grad_norm": 0.22952567040920258, "learning_rate": 0.0002, "loss": 0.7269, "mean_token_accuracy": 0.8011167246848345, "num_tokens": 29322782.0, "step": 5500 }, { "entropy": 0.5536579045467078, "epoch": 0.4125710511708952, "grad_norm": 0.19990751147270203, "learning_rate": 0.0002, "loss": 0.7094, "mean_token_accuracy": 0.8023496650159359, "num_tokens": 30754889.0, "step": 5505 }, { "entropy": 0.5608529481105506, "epoch": 0.41294577510474706, "grad_norm": 0.23558743298053741, "learning_rate": 0.0002, "loss": 0.7191, "mean_token_accuracy": 0.8011992558836937, "num_tokens": 32241551.0, "step": 5510 }, { "entropy": 0.5652048028074205, "epoch": 0.4133204990385989, "grad_norm": 0.20166943967342377, "learning_rate": 0.0002, "loss": 0.7294, "mean_token_accuracy": 0.8017805069684982, "num_tokens": 33695354.0, "step": 5515 }, { "entropy": 0.5642685236409306, "epoch": 0.41369522297245076, "grad_norm": 0.18113182485103607, "learning_rate": 0.0002, "loss": 0.7266, "mean_token_accuracy": 0.8024164408445358, "num_tokens": 35192339.0, "step": 5520 }, { "entropy": 0.5662362427450717, "epoch": 0.4140699469063026, "grad_norm": 0.1931617110967636, "learning_rate": 0.0002, "loss": 0.7299, "mean_token_accuracy": 0.8021017257124186, "num_tokens": 36685264.0, "step": 5525 }, { "entropy": 0.5526924582198263, "epoch": 0.41444467084015446, "grad_norm": 0.20442648231983185, "learning_rate": 0.0002, "loss": 0.7164, "mean_token_accuracy": 0.802295234426856, "num_tokens": 38158025.0, "step": 5530 }, { "entropy": 0.550849131308496, "epoch": 0.4148193947740063, "grad_norm": 0.173475444316864, "learning_rate": 0.0002, "loss": 0.7031, "mean_token_accuracy": 0.8016922928392887, "num_tokens": 39627439.0, "step": 5535 }, { "entropy": 0.5593643047846854, "epoch": 0.4151941187078582, "grad_norm": 0.18207287788391113, "learning_rate": 0.0002, "loss": 0.7136, "mean_token_accuracy": 0.8045295890420675, "num_tokens": 41067485.0, "step": 5540 }, { "entropy": 0.5632121905684471, "epoch": 0.41556884264171007, "grad_norm": 0.18016685545444489, "learning_rate": 0.0002, "loss": 0.7193, "mean_token_accuracy": 0.8003503069281578, "num_tokens": 42542374.0, "step": 5545 }, { "entropy": 0.5836598549038172, "epoch": 0.4159435665755619, "grad_norm": 0.1859140396118164, "learning_rate": 0.0002, "loss": 0.7465, "mean_token_accuracy": 0.7991529665887356, "num_tokens": 44034296.0, "step": 5550 }, { "entropy": 0.5736728387884795, "epoch": 0.41631829050941377, "grad_norm": 0.1966867297887802, "learning_rate": 0.0002, "loss": 0.7344, "mean_token_accuracy": 0.7990712653845549, "num_tokens": 45504040.0, "step": 5555 }, { "entropy": 0.564608798827976, "epoch": 0.4166930144432656, "grad_norm": 0.1900351643562317, "learning_rate": 0.0002, "loss": 0.7162, "mean_token_accuracy": 0.8042087413370609, "num_tokens": 47009170.0, "step": 5560 }, { "entropy": 0.5715851521119475, "epoch": 0.41706773837711747, "grad_norm": 0.19300131499767303, "learning_rate": 0.0002, "loss": 0.7284, "mean_token_accuracy": 0.7997116442769766, "num_tokens": 48464148.0, "step": 5565 }, { "entropy": 0.5620608939789236, "epoch": 0.4174424623109693, "grad_norm": 0.1978474110364914, "learning_rate": 0.0002, "loss": 0.7188, "mean_token_accuracy": 0.8024834588170051, "num_tokens": 49924931.0, "step": 5570 }, { "entropy": 0.5744460733607412, "epoch": 0.41781718624482117, "grad_norm": 0.18698257207870483, "learning_rate": 0.0002, "loss": 0.7301, "mean_token_accuracy": 0.8018319372087717, "num_tokens": 51399357.0, "step": 5575 }, { "entropy": 0.5692311311140656, "epoch": 0.4181919101786731, "grad_norm": 0.17476381361484528, "learning_rate": 0.0002, "loss": 0.7182, "mean_token_accuracy": 0.8012085117399692, "num_tokens": 52881237.0, "step": 5580 }, { "entropy": 0.5673853455111384, "epoch": 0.4185666341125249, "grad_norm": 0.17534051835536957, "learning_rate": 0.0002, "loss": 0.7211, "mean_token_accuracy": 0.8007724523544312, "num_tokens": 54396924.0, "step": 5585 }, { "entropy": 0.561608224734664, "epoch": 0.4189413580463768, "grad_norm": 0.18155121803283691, "learning_rate": 0.0002, "loss": 0.7173, "mean_token_accuracy": 0.805029933527112, "num_tokens": 55866547.0, "step": 5590 }, { "entropy": 0.5792715574614704, "epoch": 0.4193160819802286, "grad_norm": 0.2032286524772644, "learning_rate": 0.0002, "loss": 0.7455, "mean_token_accuracy": 0.796748973429203, "num_tokens": 57311155.0, "step": 5595 }, { "entropy": 0.5637252507731318, "epoch": 0.4196908059140805, "grad_norm": 0.1949235498905182, "learning_rate": 0.0002, "loss": 0.7089, "mean_token_accuracy": 0.8017688106745482, "num_tokens": 58794960.0, "step": 5600 }, { "entropy": 0.5677880309522152, "epoch": 0.42006552984793233, "grad_norm": 0.18833322823047638, "learning_rate": 0.0002, "loss": 0.7169, "mean_token_accuracy": 0.8037907391786575, "num_tokens": 60281234.0, "step": 5605 }, { "entropy": 0.5560304651968181, "epoch": 0.4204402537817842, "grad_norm": 0.1904069483280182, "learning_rate": 0.0002, "loss": 0.7094, "mean_token_accuracy": 0.806665014103055, "num_tokens": 61710910.0, "step": 5610 }, { "entropy": 0.5785152425058186, "epoch": 0.4208149777156361, "grad_norm": 0.19262322783470154, "learning_rate": 0.0002, "loss": 0.7289, "mean_token_accuracy": 0.8000565398484468, "num_tokens": 63193563.0, "step": 5615 }, { "entropy": 0.5763456331565976, "epoch": 0.42118970164948794, "grad_norm": 0.19316042959690094, "learning_rate": 0.0002, "loss": 0.729, "mean_token_accuracy": 0.8004499308764934, "num_tokens": 64647551.0, "step": 5620 }, { "entropy": 0.5510069914162159, "epoch": 0.4215644255833398, "grad_norm": 0.20269285142421722, "learning_rate": 0.0002, "loss": 0.7037, "mean_token_accuracy": 0.803431560844183, "num_tokens": 66050924.0, "step": 5625 }, { "entropy": 0.5801758896559477, "epoch": 0.42193914951719164, "grad_norm": 0.20293265581130981, "learning_rate": 0.0002, "loss": 0.7249, "mean_token_accuracy": 0.8033314563333989, "num_tokens": 67556272.0, "step": 5630 }, { "entropy": 0.5688318070024252, "epoch": 0.4223138734510435, "grad_norm": 0.19119632244110107, "learning_rate": 0.0002, "loss": 0.7274, "mean_token_accuracy": 0.8008541870862246, "num_tokens": 68988772.0, "step": 5635 }, { "entropy": 0.5609268076717854, "epoch": 0.42268859738489534, "grad_norm": 0.18967914581298828, "learning_rate": 0.0002, "loss": 0.7126, "mean_token_accuracy": 0.8038977798074484, "num_tokens": 70479199.0, "step": 5640 }, { "entropy": 0.55955094024539, "epoch": 0.4230633213187472, "grad_norm": 0.18605652451515198, "learning_rate": 0.0002, "loss": 0.714, "mean_token_accuracy": 0.8032560460269451, "num_tokens": 71938716.0, "step": 5645 }, { "entropy": 0.5705979825928807, "epoch": 0.42343804525259904, "grad_norm": 0.19506984949111938, "learning_rate": 0.0002, "loss": 0.7236, "mean_token_accuracy": 0.8005380127578974, "num_tokens": 73389694.0, "step": 5650 }, { "entropy": 0.5686640682630241, "epoch": 0.42381276918645094, "grad_norm": 0.1894492357969284, "learning_rate": 0.0002, "loss": 0.7151, "mean_token_accuracy": 0.8021986860781908, "num_tokens": 74867780.0, "step": 5655 }, { "entropy": 0.5681798869743944, "epoch": 0.4241874931203028, "grad_norm": 0.20855113863945007, "learning_rate": 0.0002, "loss": 0.7307, "mean_token_accuracy": 0.8023861963301897, "num_tokens": 76317063.0, "step": 5660 }, { "entropy": 0.5653234113939106, "epoch": 0.42456221705415464, "grad_norm": 0.18883666396141052, "learning_rate": 0.0002, "loss": 0.7218, "mean_token_accuracy": 0.8027939945459366, "num_tokens": 77791209.0, "step": 5665 }, { "entropy": 0.5811999838799238, "epoch": 0.4249369409880065, "grad_norm": 0.17625825107097626, "learning_rate": 0.0002, "loss": 0.7332, "mean_token_accuracy": 0.8002711772918701, "num_tokens": 79298281.0, "step": 5670 }, { "entropy": 0.5763323941268027, "epoch": 0.42531166492185835, "grad_norm": 0.18136157095432281, "learning_rate": 0.0002, "loss": 0.7358, "mean_token_accuracy": 0.799370326474309, "num_tokens": 80805889.0, "step": 5675 }, { "entropy": 0.5685816790908576, "epoch": 0.4256863888557102, "grad_norm": 0.18181763589382172, "learning_rate": 0.0002, "loss": 0.7374, "mean_token_accuracy": 0.7969714485108852, "num_tokens": 82282949.0, "step": 5680 }, { "entropy": 0.5652149170637131, "epoch": 0.42606111278956205, "grad_norm": 0.1855533868074417, "learning_rate": 0.0002, "loss": 0.7195, "mean_token_accuracy": 0.8049558233469725, "num_tokens": 83782169.0, "step": 5685 }, { "entropy": 0.5722405733540654, "epoch": 0.4264358367234139, "grad_norm": 0.18248598277568817, "learning_rate": 0.0002, "loss": 0.7361, "mean_token_accuracy": 0.7989892639219761, "num_tokens": 85265503.0, "step": 5690 }, { "entropy": 0.5577335271984338, "epoch": 0.4268105606572658, "grad_norm": 0.18826673924922943, "learning_rate": 0.0002, "loss": 0.7104, "mean_token_accuracy": 0.8009180743247271, "num_tokens": 86747291.0, "step": 5695 }, { "entropy": 0.5536971117369831, "epoch": 0.42718528459111765, "grad_norm": 0.1938558965921402, "learning_rate": 0.0002, "loss": 0.7012, "mean_token_accuracy": 0.806043554469943, "num_tokens": 88220325.0, "step": 5700 }, { "entropy": 0.5846503974869848, "epoch": 0.4275600085249695, "grad_norm": 0.20286184549331665, "learning_rate": 0.0002, "loss": 0.7322, "mean_token_accuracy": 0.8015277646481991, "num_tokens": 89705591.0, "step": 5705 }, { "entropy": 0.5884869936853647, "epoch": 0.42793473245882135, "grad_norm": 0.1868835836648941, "learning_rate": 0.0002, "loss": 0.7374, "mean_token_accuracy": 0.7982123397290707, "num_tokens": 91198641.0, "step": 5710 }, { "entropy": 0.5782938873395324, "epoch": 0.4283094563926732, "grad_norm": 0.1867235153913498, "learning_rate": 0.0002, "loss": 0.7283, "mean_token_accuracy": 0.8007389511913061, "num_tokens": 92687648.0, "step": 5715 }, { "entropy": 0.5930353774689138, "epoch": 0.42868418032652506, "grad_norm": 0.18952061235904694, "learning_rate": 0.0002, "loss": 0.746, "mean_token_accuracy": 0.7988660782575607, "num_tokens": 94204928.0, "step": 5720 }, { "entropy": 0.5666006520390511, "epoch": 0.4290589042603769, "grad_norm": 0.22215032577514648, "learning_rate": 0.0002, "loss": 0.73, "mean_token_accuracy": 0.8040490988641977, "num_tokens": 95661483.0, "step": 5725 }, { "entropy": 0.5613255186006427, "epoch": 0.42943362819422876, "grad_norm": 0.17205989360809326, "learning_rate": 0.0002, "loss": 0.7264, "mean_token_accuracy": 0.80254628919065, "num_tokens": 97081886.0, "step": 5730 }, { "entropy": 0.5638255435973406, "epoch": 0.42980835212808066, "grad_norm": 0.19742368161678314, "learning_rate": 0.0002, "loss": 0.7427, "mean_token_accuracy": 0.7965445578098297, "num_tokens": 98566157.0, "step": 5735 }, { "entropy": 0.5547383457422257, "epoch": 0.4301830760619325, "grad_norm": 0.2410697042942047, "learning_rate": 0.0002, "loss": 0.7182, "mean_token_accuracy": 0.8043371316045522, "num_tokens": 100052105.0, "step": 5740 }, { "entropy": 0.5532501108944416, "epoch": 0.43055779999578436, "grad_norm": 0.205755352973938, "learning_rate": 0.0002, "loss": 0.7214, "mean_token_accuracy": 0.8032098859548569, "num_tokens": 101540512.0, "step": 5745 }, { "entropy": 0.5708063103258609, "epoch": 0.4309325239296362, "grad_norm": 0.17969755828380585, "learning_rate": 0.0002, "loss": 0.747, "mean_token_accuracy": 0.7963840901851654, "num_tokens": 103077454.0, "step": 5750 }, { "entropy": 0.5737206907942891, "epoch": 0.43130724786348806, "grad_norm": 0.18538227677345276, "learning_rate": 0.0002, "loss": 0.7407, "mean_token_accuracy": 0.7987205788493157, "num_tokens": 104569502.0, "step": 5755 }, { "entropy": 0.5563986409455538, "epoch": 0.4316819717973399, "grad_norm": 0.1687060445547104, "learning_rate": 0.0002, "loss": 0.7208, "mean_token_accuracy": 0.802626795321703, "num_tokens": 106032634.0, "step": 5760 }, { "entropy": 0.5480329301208258, "epoch": 0.43205669573119176, "grad_norm": 0.23853352665901184, "learning_rate": 0.0002, "loss": 0.7164, "mean_token_accuracy": 0.8067463137209415, "num_tokens": 107489204.0, "step": 5765 }, { "entropy": 0.5735791875049472, "epoch": 0.43243141966504367, "grad_norm": 0.18799157440662384, "learning_rate": 0.0002, "loss": 0.743, "mean_token_accuracy": 0.796773624420166, "num_tokens": 108983762.0, "step": 5770 }, { "entropy": 0.5455772588029504, "epoch": 0.4328061435988955, "grad_norm": 0.1855669468641281, "learning_rate": 0.0002, "loss": 0.7071, "mean_token_accuracy": 0.8033250242471695, "num_tokens": 110481819.0, "step": 5775 }, { "entropy": 0.5397538780234754, "epoch": 0.43318086753274737, "grad_norm": 0.18549607694149017, "learning_rate": 0.0002, "loss": 0.7013, "mean_token_accuracy": 0.8078886363655329, "num_tokens": 111967368.0, "step": 5780 }, { "entropy": 0.5411180576309562, "epoch": 0.4335555914665992, "grad_norm": 0.19139784574508667, "learning_rate": 0.0002, "loss": 0.7004, "mean_token_accuracy": 0.8051552273333072, "num_tokens": 113455717.0, "step": 5785 }, { "entropy": 0.5622915763407945, "epoch": 0.43393031540045107, "grad_norm": 0.17958752810955048, "learning_rate": 0.0002, "loss": 0.7296, "mean_token_accuracy": 0.8002548299729824, "num_tokens": 114923081.0, "step": 5790 }, { "entropy": 0.5523214412853121, "epoch": 0.4343050393343029, "grad_norm": 0.17241309583187103, "learning_rate": 0.0002, "loss": 0.7142, "mean_token_accuracy": 0.8042429514229298, "num_tokens": 116372281.0, "step": 5795 }, { "entropy": 0.5500665668398141, "epoch": 0.4346797632681548, "grad_norm": 0.2040933072566986, "learning_rate": 0.0002, "loss": 0.7133, "mean_token_accuracy": 0.8066270086914301, "num_tokens": 117821864.0, "step": 5800 }, { "entropy": 0.5653836423531174, "epoch": 0.4350544872020066, "grad_norm": 0.1934960037469864, "learning_rate": 0.0002, "loss": 0.7139, "mean_token_accuracy": 0.8020522605627776, "num_tokens": 119325331.0, "step": 5805 }, { "entropy": 0.5617340339347721, "epoch": 0.43542921113585853, "grad_norm": 0.1810407042503357, "learning_rate": 0.0002, "loss": 0.7128, "mean_token_accuracy": 0.8028455022722483, "num_tokens": 120777554.0, "step": 5810 }, { "entropy": 0.5795811934396624, "epoch": 0.4358039350697104, "grad_norm": 0.2016572654247284, "learning_rate": 0.0002, "loss": 0.7373, "mean_token_accuracy": 0.8008449412882328, "num_tokens": 122248647.0, "step": 5815 }, { "entropy": 0.5623559813946486, "epoch": 0.43617865900356223, "grad_norm": 0.17747119069099426, "learning_rate": 0.0002, "loss": 0.7057, "mean_token_accuracy": 0.8044580463320017, "num_tokens": 123733187.0, "step": 5820 }, { "entropy": 0.5580526772886515, "epoch": 0.4365533829374141, "grad_norm": 0.1941380649805069, "learning_rate": 0.0002, "loss": 0.7219, "mean_token_accuracy": 0.8048528552055358, "num_tokens": 125177502.0, "step": 5825 }, { "entropy": 0.5844762273132801, "epoch": 0.43692810687126593, "grad_norm": 0.18650652468204498, "learning_rate": 0.0002, "loss": 0.7352, "mean_token_accuracy": 0.7996958117932081, "num_tokens": 126708764.0, "step": 5830 }, { "entropy": 0.5676785707473755, "epoch": 0.4373028308051178, "grad_norm": 0.18264684081077576, "learning_rate": 0.0002, "loss": 0.7221, "mean_token_accuracy": 0.8013656694442034, "num_tokens": 128219735.0, "step": 5835 }, { "entropy": 0.5659491794183851, "epoch": 0.43767755473896963, "grad_norm": 0.1854492723941803, "learning_rate": 0.0002, "loss": 0.7218, "mean_token_accuracy": 0.8036191441118717, "num_tokens": 129661304.0, "step": 5840 }, { "entropy": 0.5791681262664496, "epoch": 0.4380522786728215, "grad_norm": 0.18832896649837494, "learning_rate": 0.0002, "loss": 0.7375, "mean_token_accuracy": 0.8012065663933754, "num_tokens": 131156122.0, "step": 5845 }, { "entropy": 0.5678353996947407, "epoch": 0.4384270026066734, "grad_norm": 0.18168477714061737, "learning_rate": 0.0002, "loss": 0.7157, "mean_token_accuracy": 0.8023030050098896, "num_tokens": 132635064.0, "step": 5850 }, { "entropy": 0.5736117651686072, "epoch": 0.43880172654052524, "grad_norm": 0.19398310780525208, "learning_rate": 0.0002, "loss": 0.7363, "mean_token_accuracy": 0.800086997821927, "num_tokens": 134083445.0, "step": 5855 }, { "entropy": 0.5855325811542571, "epoch": 0.4391764504743771, "grad_norm": 0.1899084895849228, "learning_rate": 0.0002, "loss": 0.744, "mean_token_accuracy": 0.8006644807755947, "num_tokens": 135597586.0, "step": 5860 }, { "entropy": 0.57105702329427, "epoch": 0.43955117440822894, "grad_norm": 0.2082265019416809, "learning_rate": 0.0002, "loss": 0.7331, "mean_token_accuracy": 0.7985539633780718, "num_tokens": 137075301.0, "step": 5865 }, { "entropy": 0.5752566371113061, "epoch": 0.4399258983420808, "grad_norm": 0.2200389951467514, "learning_rate": 0.0002, "loss": 0.7342, "mean_token_accuracy": 0.8019175838679076, "num_tokens": 138549347.0, "step": 5870 }, { "entropy": 0.5722615486010909, "epoch": 0.44030062227593264, "grad_norm": 0.18111298978328705, "learning_rate": 0.0002, "loss": 0.7153, "mean_token_accuracy": 0.8016845677047968, "num_tokens": 140056145.0, "step": 5875 }, { "entropy": 0.5645474571734667, "epoch": 0.4406753462097845, "grad_norm": 0.19472083449363708, "learning_rate": 0.0002, "loss": 0.7261, "mean_token_accuracy": 0.8032300688326359, "num_tokens": 141544516.0, "step": 5880 }, { "entropy": 0.5537991154007613, "epoch": 0.44105007014363634, "grad_norm": 0.19374021887779236, "learning_rate": 0.0002, "loss": 0.714, "mean_token_accuracy": 0.8039215590804816, "num_tokens": 142977652.0, "step": 5885 }, { "entropy": 0.5732108395546675, "epoch": 0.44142479407748825, "grad_norm": 0.19562549889087677, "learning_rate": 0.0002, "loss": 0.7221, "mean_token_accuracy": 0.8045172590762377, "num_tokens": 144477596.0, "step": 5890 }, { "entropy": 0.5601601902395487, "epoch": 0.4417995180113401, "grad_norm": 0.18729355931282043, "learning_rate": 0.0002, "loss": 0.7126, "mean_token_accuracy": 0.8053222883492708, "num_tokens": 145970426.0, "step": 5895 }, { "entropy": 0.5658760151825846, "epoch": 0.44217424194519195, "grad_norm": 0.18932875990867615, "learning_rate": 0.0002, "loss": 0.719, "mean_token_accuracy": 0.8012721970677376, "num_tokens": 147491763.0, "step": 5900 }, { "entropy": 0.5809668026398868, "epoch": 0.4425489658790438, "grad_norm": 0.17942695319652557, "learning_rate": 0.0002, "loss": 0.7534, "mean_token_accuracy": 0.797233922034502, "num_tokens": 148976963.0, "step": 5905 }, { "entropy": 0.5884192737750709, "epoch": 0.44292368981289565, "grad_norm": 0.18539753556251526, "learning_rate": 0.0002, "loss": 0.7461, "mean_token_accuracy": 0.7969976037740707, "num_tokens": 150477547.0, "step": 5910 }, { "entropy": 0.550876026879996, "epoch": 0.4432984137467475, "grad_norm": 0.18414448201656342, "learning_rate": 0.0002, "loss": 0.7055, "mean_token_accuracy": 0.8076436072587967, "num_tokens": 151986785.0, "step": 5915 }, { "entropy": 0.5609020356088876, "epoch": 0.44367313768059935, "grad_norm": 0.18041513860225677, "learning_rate": 0.0002, "loss": 0.7253, "mean_token_accuracy": 0.8006397452205419, "num_tokens": 153462873.0, "step": 5920 }, { "entropy": 0.5753432454541325, "epoch": 0.44404786161445126, "grad_norm": 0.2032923400402069, "learning_rate": 0.0002, "loss": 0.7283, "mean_token_accuracy": 0.8035691332072019, "num_tokens": 154982858.0, "step": 5925 }, { "entropy": 0.5709438351914287, "epoch": 0.4444225855483031, "grad_norm": 0.18740752339363098, "learning_rate": 0.0002, "loss": 0.729, "mean_token_accuracy": 0.8026599541306496, "num_tokens": 156496632.0, "step": 5930 }, { "entropy": 0.5842517327517271, "epoch": 0.44479730948215496, "grad_norm": 0.17598238587379456, "learning_rate": 0.0002, "loss": 0.7479, "mean_token_accuracy": 0.7976978823542595, "num_tokens": 158019470.0, "step": 5935 }, { "entropy": 0.5684919559396804, "epoch": 0.4451720334160068, "grad_norm": 0.2071516364812851, "learning_rate": 0.0002, "loss": 0.7341, "mean_token_accuracy": 0.8020080976188183, "num_tokens": 159472999.0, "step": 5940 }, { "entropy": 0.5550756022334099, "epoch": 0.44554675734985866, "grad_norm": 0.19059042632579803, "learning_rate": 0.0002, "loss": 0.7193, "mean_token_accuracy": 0.800651091337204, "num_tokens": 160879316.0, "step": 5945 }, { "entropy": 0.5715855317190289, "epoch": 0.4459214812837105, "grad_norm": 0.17316652834415436, "learning_rate": 0.0002, "loss": 0.7282, "mean_token_accuracy": 0.8029984898865223, "num_tokens": 162385651.0, "step": 5950 }, { "entropy": 0.5348846428096294, "epoch": 0.44629620521756236, "grad_norm": 0.1842014044523239, "learning_rate": 0.0002, "loss": 0.6897, "mean_token_accuracy": 0.8111164405941963, "num_tokens": 163784629.0, "step": 5955 }, { "entropy": 0.5599678181111812, "epoch": 0.4466709291514142, "grad_norm": 0.21545769274234772, "learning_rate": 0.0002, "loss": 0.7124, "mean_token_accuracy": 0.8030474074184895, "num_tokens": 165243800.0, "step": 5960 }, { "entropy": 0.5604405296035111, "epoch": 0.4470456530852661, "grad_norm": 0.20092467963695526, "learning_rate": 0.0002, "loss": 0.7142, "mean_token_accuracy": 0.8028687331825495, "num_tokens": 166692644.0, "step": 5965 }, { "entropy": 0.5585650744847953, "epoch": 0.44742037701911797, "grad_norm": 0.18647122383117676, "learning_rate": 0.0002, "loss": 0.703, "mean_token_accuracy": 0.8057953923940658, "num_tokens": 168168242.0, "step": 5970 }, { "entropy": 0.5732357574626803, "epoch": 0.4477951009529698, "grad_norm": 0.18345783650875092, "learning_rate": 0.0002, "loss": 0.7288, "mean_token_accuracy": 0.7997241385281086, "num_tokens": 169656654.0, "step": 5975 }, { "entropy": 0.5703855266794562, "epoch": 0.44816982488682167, "grad_norm": 0.1971818208694458, "learning_rate": 0.0002, "loss": 0.7199, "mean_token_accuracy": 0.8031940866261721, "num_tokens": 171154517.0, "step": 5980 }, { "entropy": 0.5849383687600493, "epoch": 0.4485445488206735, "grad_norm": 0.19015131890773773, "learning_rate": 0.0002, "loss": 0.747, "mean_token_accuracy": 0.8004543866962195, "num_tokens": 172640524.0, "step": 5985 }, { "entropy": 0.5613339229486882, "epoch": 0.44891927275452537, "grad_norm": 0.19768789410591125, "learning_rate": 0.0002, "loss": 0.7079, "mean_token_accuracy": 0.8065158676356077, "num_tokens": 174138511.0, "step": 5990 }, { "entropy": 0.5664570886641741, "epoch": 0.4492939966883772, "grad_norm": 0.25654202699661255, "learning_rate": 0.0002, "loss": 0.7096, "mean_token_accuracy": 0.8045822899788618, "num_tokens": 175605898.0, "step": 5995 }, { "entropy": 0.5684980014339089, "epoch": 0.44966872062222907, "grad_norm": 0.19295141100883484, "learning_rate": 0.0002, "loss": 0.7148, "mean_token_accuracy": 0.8039294566959143, "num_tokens": 177039596.0, "step": 6000 }, { "entropy": 0.5841399099677801, "epoch": 0.450043444556081, "grad_norm": 0.18804915249347687, "learning_rate": 0.0002, "loss": 0.7152, "mean_token_accuracy": 0.8032599590718746, "num_tokens": 178510494.0, "step": 6005 }, { "entropy": 0.5919969333335757, "epoch": 0.4504181684899328, "grad_norm": 0.18216978013515472, "learning_rate": 0.0002, "loss": 0.725, "mean_token_accuracy": 0.803064713999629, "num_tokens": 180005947.0, "step": 6010 }, { "entropy": 0.5747440787032246, "epoch": 0.4507928924237847, "grad_norm": 0.2061820775270462, "learning_rate": 0.0002, "loss": 0.7294, "mean_token_accuracy": 0.8023578573018313, "num_tokens": 181430023.0, "step": 6015 }, { "entropy": 0.5572536661289632, "epoch": 0.4511676163576365, "grad_norm": 0.1847982108592987, "learning_rate": 0.0002, "loss": 0.7063, "mean_token_accuracy": 0.8051946878433227, "num_tokens": 182903416.0, "step": 6020 }, { "entropy": 0.5608206098899245, "epoch": 0.4515423402914884, "grad_norm": 0.18562442064285278, "learning_rate": 0.0002, "loss": 0.7004, "mean_token_accuracy": 0.8067097168415784, "num_tokens": 184376906.0, "step": 6025 }, { "entropy": 0.5824912481009961, "epoch": 0.4519170642253402, "grad_norm": 0.19587139785289764, "learning_rate": 0.0002, "loss": 0.7356, "mean_token_accuracy": 0.7998513143509627, "num_tokens": 185854266.0, "step": 6030 }, { "entropy": 0.5675329182296991, "epoch": 0.4522917881591921, "grad_norm": 0.2122100293636322, "learning_rate": 0.0002, "loss": 0.7203, "mean_token_accuracy": 0.8028729651123285, "num_tokens": 187315636.0, "step": 6035 }, { "entropy": 0.5700642900541425, "epoch": 0.4526665120930439, "grad_norm": 0.24884024262428284, "learning_rate": 0.0002, "loss": 0.7246, "mean_token_accuracy": 0.8012557737529278, "num_tokens": 188753277.0, "step": 6040 }, { "entropy": 0.5593139139935375, "epoch": 0.45304123602689583, "grad_norm": 0.2109822928905487, "learning_rate": 0.0002, "loss": 0.7169, "mean_token_accuracy": 0.8044453486800194, "num_tokens": 190212272.0, "step": 6045 }, { "entropy": 0.5662208877503871, "epoch": 0.4534159599607477, "grad_norm": 0.21057990193367004, "learning_rate": 0.0002, "loss": 0.7056, "mean_token_accuracy": 0.8050284713506699, "num_tokens": 191661465.0, "step": 6050 }, { "entropy": 0.5747633513063193, "epoch": 0.45379068389459953, "grad_norm": 0.23561722040176392, "learning_rate": 0.0002, "loss": 0.7242, "mean_token_accuracy": 0.8020137503743172, "num_tokens": 193124623.0, "step": 6055 }, { "entropy": 0.5762643218040466, "epoch": 0.4541654078284514, "grad_norm": 0.18290957808494568, "learning_rate": 0.0002, "loss": 0.7197, "mean_token_accuracy": 0.8027803663164377, "num_tokens": 194611723.0, "step": 6060 }, { "entropy": 0.5814003359526396, "epoch": 0.45454013176230323, "grad_norm": 0.2089502066373825, "learning_rate": 0.0002, "loss": 0.7292, "mean_token_accuracy": 0.7998986665159464, "num_tokens": 196079820.0, "step": 6065 }, { "entropy": 0.6014971725642682, "epoch": 0.4549148556961551, "grad_norm": 0.20267291367053986, "learning_rate": 0.0002, "loss": 0.7406, "mean_token_accuracy": 0.7997817408293486, "num_tokens": 197554961.0, "step": 6070 }, { "entropy": 0.5840981792658567, "epoch": 0.45528957963000694, "grad_norm": 0.19395296275615692, "learning_rate": 0.0002, "loss": 0.7223, "mean_token_accuracy": 0.8035356406122446, "num_tokens": 199028149.0, "step": 6075 }, { "entropy": 0.564953170530498, "epoch": 0.4556643035638588, "grad_norm": 0.1901029497385025, "learning_rate": 0.0002, "loss": 0.7061, "mean_token_accuracy": 0.803486455604434, "num_tokens": 200523837.0, "step": 6080 }, { "entropy": 0.556080668978393, "epoch": 0.4560390274977107, "grad_norm": 0.20861336588859558, "learning_rate": 0.0002, "loss": 0.7108, "mean_token_accuracy": 0.8045687153935432, "num_tokens": 201980761.0, "step": 6085 }, { "entropy": 0.5642703076824546, "epoch": 0.45641375143156254, "grad_norm": 0.1816645860671997, "learning_rate": 0.0002, "loss": 0.7156, "mean_token_accuracy": 0.80333844833076, "num_tokens": 203465764.0, "step": 6090 }, { "entropy": 0.5491903698071837, "epoch": 0.4567884753654144, "grad_norm": 0.18830153346061707, "learning_rate": 0.0002, "loss": 0.7105, "mean_token_accuracy": 0.8062271982431412, "num_tokens": 204926469.0, "step": 6095 }, { "entropy": 0.5602709606289864, "epoch": 0.45716319929926624, "grad_norm": 0.1823458969593048, "learning_rate": 0.0002, "loss": 0.725, "mean_token_accuracy": 0.8001467656344176, "num_tokens": 206403136.0, "step": 6100 }, { "entropy": 0.5610052439384162, "epoch": 0.4575379232331181, "grad_norm": 0.1988830417394638, "learning_rate": 0.0002, "loss": 0.73, "mean_token_accuracy": 0.8024370279163122, "num_tokens": 207878735.0, "step": 6105 }, { "entropy": 0.5585511427372694, "epoch": 0.45791264716696994, "grad_norm": 0.19251585006713867, "learning_rate": 0.0002, "loss": 0.7238, "mean_token_accuracy": 0.8021120067685843, "num_tokens": 209336462.0, "step": 6110 }, { "entropy": 0.5520011832937598, "epoch": 0.4582873711008218, "grad_norm": 0.18565645813941956, "learning_rate": 0.0002, "loss": 0.7237, "mean_token_accuracy": 0.805689762160182, "num_tokens": 210755660.0, "step": 6115 }, { "entropy": 0.5692046537995339, "epoch": 0.4586620950346737, "grad_norm": 0.19174672663211823, "learning_rate": 0.0002, "loss": 0.7318, "mean_token_accuracy": 0.7998870264738798, "num_tokens": 212240641.0, "step": 6120 }, { "entropy": 0.5848788321018219, "epoch": 0.45903681896852555, "grad_norm": 0.18580195307731628, "learning_rate": 0.0002, "loss": 0.7467, "mean_token_accuracy": 0.7977655492722988, "num_tokens": 213759854.0, "step": 6125 }, { "entropy": 0.5716417377814651, "epoch": 0.4594115429023774, "grad_norm": 0.18674010038375854, "learning_rate": 0.0002, "loss": 0.7349, "mean_token_accuracy": 0.7980889990925789, "num_tokens": 215205381.0, "step": 6130 }, { "entropy": 0.5398157356306911, "epoch": 0.45978626683622925, "grad_norm": 0.18681515753269196, "learning_rate": 0.0002, "loss": 0.6984, "mean_token_accuracy": 0.804551200568676, "num_tokens": 216624696.0, "step": 6135 }, { "entropy": 0.5508135348558426, "epoch": 0.4601609907700811, "grad_norm": 0.18660904467105865, "learning_rate": 0.0002, "loss": 0.7061, "mean_token_accuracy": 0.8049354955554009, "num_tokens": 218083715.0, "step": 6140 }, { "entropy": 0.5621537227183581, "epoch": 0.46053571470393295, "grad_norm": 0.20019425451755524, "learning_rate": 0.0002, "loss": 0.719, "mean_token_accuracy": 0.8033699478954077, "num_tokens": 219551761.0, "step": 6145 }, { "entropy": 0.56479875985533, "epoch": 0.4609104386377848, "grad_norm": 1.4008941650390625, "learning_rate": 0.0002, "loss": 0.7224, "mean_token_accuracy": 0.8009526196867227, "num_tokens": 221064195.0, "step": 6150 }, { "entropy": 0.556078418623656, "epoch": 0.46128516257163665, "grad_norm": 0.21408437192440033, "learning_rate": 0.0002, "loss": 0.7127, "mean_token_accuracy": 0.8054736495018006, "num_tokens": 222514710.0, "step": 6155 }, { "entropy": 0.567907502874732, "epoch": 0.46165988650548856, "grad_norm": 0.20723502337932587, "learning_rate": 0.0002, "loss": 0.7241, "mean_token_accuracy": 0.8024130396544933, "num_tokens": 223990155.0, "step": 6160 }, { "entropy": 0.5677668899297714, "epoch": 0.4620346104393404, "grad_norm": 0.19655221700668335, "learning_rate": 0.0002, "loss": 0.7386, "mean_token_accuracy": 0.7982829868793487, "num_tokens": 225432184.0, "step": 6165 }, { "entropy": 0.5471007916145026, "epoch": 0.46240933437319226, "grad_norm": 0.1973390132188797, "learning_rate": 0.0002, "loss": 0.7038, "mean_token_accuracy": 0.806104677543044, "num_tokens": 226870483.0, "step": 6170 }, { "entropy": 0.5529609955847263, "epoch": 0.4627840583070441, "grad_norm": 0.19587701559066772, "learning_rate": 0.0002, "loss": 0.7077, "mean_token_accuracy": 0.8079354651272297, "num_tokens": 228371150.0, "step": 6175 }, { "entropy": 0.575756819639355, "epoch": 0.46315878224089596, "grad_norm": 0.2766522467136383, "learning_rate": 0.0002, "loss": 0.7315, "mean_token_accuracy": 0.8011742018163204, "num_tokens": 229839769.0, "step": 6180 }, { "entropy": 0.5647493152879178, "epoch": 0.4635335061747478, "grad_norm": 0.1808801293373108, "learning_rate": 0.0002, "loss": 0.7212, "mean_token_accuracy": 0.8007138434797525, "num_tokens": 231336054.0, "step": 6185 }, { "entropy": 0.5466858141124249, "epoch": 0.46390823010859966, "grad_norm": 0.2041601836681366, "learning_rate": 0.0002, "loss": 0.7054, "mean_token_accuracy": 0.8057255383580924, "num_tokens": 232778652.0, "step": 6190 }, { "entropy": 0.5610025350935757, "epoch": 0.4642829540424515, "grad_norm": 0.19213034212589264, "learning_rate": 0.0002, "loss": 0.7235, "mean_token_accuracy": 0.8003431372344494, "num_tokens": 234262478.0, "step": 6195 }, { "entropy": 0.5715068995952606, "epoch": 0.4646576779763034, "grad_norm": 0.18266229331493378, "learning_rate": 0.0002, "loss": 0.7296, "mean_token_accuracy": 0.8013206005096436, "num_tokens": 235729894.0, "step": 6200 }, { "entropy": 0.5659889055415988, "epoch": 0.46503240191015527, "grad_norm": 0.17454510927200317, "learning_rate": 0.0002, "loss": 0.7165, "mean_token_accuracy": 0.8028098851442337, "num_tokens": 237238757.0, "step": 6205 }, { "entropy": 0.5650300735607743, "epoch": 0.4654071258440071, "grad_norm": 0.19519200921058655, "learning_rate": 0.0002, "loss": 0.713, "mean_token_accuracy": 0.807767092064023, "num_tokens": 238761654.0, "step": 6210 }, { "entropy": 0.5665958039462566, "epoch": 0.46578184977785897, "grad_norm": 0.20506355166435242, "learning_rate": 0.0002, "loss": 0.7166, "mean_token_accuracy": 0.80225063636899, "num_tokens": 240212432.0, "step": 6215 }, { "entropy": 0.5699425552971661, "epoch": 0.4661565737117108, "grad_norm": 0.21868127584457397, "learning_rate": 0.0002, "loss": 0.7224, "mean_token_accuracy": 0.8048253364861011, "num_tokens": 241640698.0, "step": 6220 }, { "entropy": 0.5764842040836811, "epoch": 0.46653129764556267, "grad_norm": 0.19797249138355255, "learning_rate": 0.0002, "loss": 0.7106, "mean_token_accuracy": 0.8038494635373354, "num_tokens": 243125888.0, "step": 6225 }, { "entropy": 0.5767360355705022, "epoch": 0.4669060215794145, "grad_norm": 0.19101957976818085, "learning_rate": 0.0002, "loss": 0.7193, "mean_token_accuracy": 0.8028814356774092, "num_tokens": 244616873.0, "step": 6230 }, { "entropy": 0.5936159741133451, "epoch": 0.46728074551326637, "grad_norm": 0.2070942223072052, "learning_rate": 0.0002, "loss": 0.7297, "mean_token_accuracy": 0.8033620819449425, "num_tokens": 246099477.0, "step": 6235 }, { "entropy": 0.5999559057876468, "epoch": 0.4676554694471183, "grad_norm": 0.20372208952903748, "learning_rate": 0.0002, "loss": 0.7263, "mean_token_accuracy": 0.8022713448852301, "num_tokens": 247567416.0, "step": 6240 }, { "entropy": 0.6046799918636679, "epoch": 0.4680301933809701, "grad_norm": 0.18810789287090302, "learning_rate": 0.0002, "loss": 0.7293, "mean_token_accuracy": 0.8030561566352844, "num_tokens": 249026156.0, "step": 6245 }, { "entropy": 0.5951093541458249, "epoch": 0.468404917314822, "grad_norm": 0.17989800870418549, "learning_rate": 0.0002, "loss": 0.7262, "mean_token_accuracy": 0.8036297541111708, "num_tokens": 250503944.0, "step": 6250 }, { "entropy": 0.600920625589788, "epoch": 0.46877964124867383, "grad_norm": 0.23522715270519257, "learning_rate": 0.0002, "loss": 0.7352, "mean_token_accuracy": 0.8003701366484165, "num_tokens": 251990139.0, "step": 6255 }, { "entropy": 0.592614889703691, "epoch": 0.4691543651825257, "grad_norm": 0.19141313433647156, "learning_rate": 0.0002, "loss": 0.7143, "mean_token_accuracy": 0.8053893599659204, "num_tokens": 253522317.0, "step": 6260 }, { "entropy": 0.59354744926095, "epoch": 0.46952908911637753, "grad_norm": 0.20137102901935577, "learning_rate": 0.0002, "loss": 0.7342, "mean_token_accuracy": 0.8029232483357192, "num_tokens": 255004525.0, "step": 6265 }, { "entropy": 0.58102509342134, "epoch": 0.4699038130502294, "grad_norm": 0.1868356466293335, "learning_rate": 0.0002, "loss": 0.7136, "mean_token_accuracy": 0.8066398736089468, "num_tokens": 256493550.0, "step": 6270 }, { "entropy": 0.5546394551172853, "epoch": 0.4702785369840813, "grad_norm": 0.20215986669063568, "learning_rate": 0.0002, "loss": 0.6983, "mean_token_accuracy": 0.809065243601799, "num_tokens": 257973461.0, "step": 6275 }, { "entropy": 0.5706901052035391, "epoch": 0.47065326091793314, "grad_norm": 0.17879270017147064, "learning_rate": 0.0002, "loss": 0.7156, "mean_token_accuracy": 0.8030083321034909, "num_tokens": 259452565.0, "step": 6280 }, { "entropy": 0.5680967746302485, "epoch": 0.471027984851785, "grad_norm": 0.22151300311088562, "learning_rate": 0.0002, "loss": 0.7196, "mean_token_accuracy": 0.8067294519394637, "num_tokens": 260899793.0, "step": 6285 }, { "entropy": 0.5692691963165999, "epoch": 0.47140270878563684, "grad_norm": 0.20667481422424316, "learning_rate": 0.0002, "loss": 0.7127, "mean_token_accuracy": 0.8044274248182773, "num_tokens": 262334137.0, "step": 6290 }, { "entropy": 0.5772839400917291, "epoch": 0.4717774327194887, "grad_norm": 0.1921345442533493, "learning_rate": 0.0002, "loss": 0.7265, "mean_token_accuracy": 0.8013227239251137, "num_tokens": 263802046.0, "step": 6295 }, { "entropy": 0.5688043627887964, "epoch": 0.47215215665334054, "grad_norm": 0.17980676889419556, "learning_rate": 0.0002, "loss": 0.7056, "mean_token_accuracy": 0.8038245864212513, "num_tokens": 265294985.0, "step": 6300 }, { "entropy": 0.5740045865997672, "epoch": 0.4725268805871924, "grad_norm": 0.19011490046977997, "learning_rate": 0.0002, "loss": 0.717, "mean_token_accuracy": 0.805035013705492, "num_tokens": 266745509.0, "step": 6305 }, { "entropy": 0.601470286771655, "epoch": 0.47290160452104424, "grad_norm": 0.1872113049030304, "learning_rate": 0.0002, "loss": 0.7356, "mean_token_accuracy": 0.8020713001489639, "num_tokens": 268189322.0, "step": 6310 }, { "entropy": 0.5983931008726359, "epoch": 0.47327632845489614, "grad_norm": 0.18171553313732147, "learning_rate": 0.0002, "loss": 0.7302, "mean_token_accuracy": 0.8025830037891865, "num_tokens": 269647635.0, "step": 6315 }, { "entropy": 0.5726879429072141, "epoch": 0.473651052388748, "grad_norm": 0.20708398520946503, "learning_rate": 0.0002, "loss": 0.7036, "mean_token_accuracy": 0.8056941587477923, "num_tokens": 271101050.0, "step": 6320 }, { "entropy": 0.5805528450757265, "epoch": 0.47402577632259985, "grad_norm": 0.19648925960063934, "learning_rate": 0.0002, "loss": 0.7141, "mean_token_accuracy": 0.80748101323843, "num_tokens": 272562723.0, "step": 6325 }, { "entropy": 0.5778873411938548, "epoch": 0.4744005002564517, "grad_norm": 0.19792988896369934, "learning_rate": 0.0002, "loss": 0.7098, "mean_token_accuracy": 0.8040607817471027, "num_tokens": 274014642.0, "step": 6330 }, { "entropy": 0.5830101709812879, "epoch": 0.47477522419030355, "grad_norm": 0.19356417655944824, "learning_rate": 0.0002, "loss": 0.7208, "mean_token_accuracy": 0.802961977571249, "num_tokens": 275519050.0, "step": 6335 }, { "entropy": 0.5943233067169785, "epoch": 0.4751499481241554, "grad_norm": 0.21164415776729584, "learning_rate": 0.0002, "loss": 0.732, "mean_token_accuracy": 0.8023677594959736, "num_tokens": 277013041.0, "step": 6340 }, { "entropy": 0.5892818232998251, "epoch": 0.47552467205800725, "grad_norm": 0.19672439992427826, "learning_rate": 0.0002, "loss": 0.712, "mean_token_accuracy": 0.8052817631512881, "num_tokens": 278526327.0, "step": 6345 }, { "entropy": 0.5746184427291154, "epoch": 0.4758993959918591, "grad_norm": 0.2020604908466339, "learning_rate": 0.0002, "loss": 0.7036, "mean_token_accuracy": 0.8080575425177813, "num_tokens": 279939181.0, "step": 6350 }, { "entropy": 0.5914259871467948, "epoch": 0.476274119925711, "grad_norm": 0.18974298238754272, "learning_rate": 0.0002, "loss": 0.7124, "mean_token_accuracy": 0.8022800378501416, "num_tokens": 281440670.0, "step": 6355 }, { "entropy": 0.5819566836580634, "epoch": 0.47664884385956285, "grad_norm": 0.19177910685539246, "learning_rate": 0.0002, "loss": 0.7144, "mean_token_accuracy": 0.8021631129086018, "num_tokens": 282869258.0, "step": 6360 }, { "entropy": 0.5850373312830925, "epoch": 0.4770235677934147, "grad_norm": 0.25405168533325195, "learning_rate": 0.0002, "loss": 0.7145, "mean_token_accuracy": 0.8051694732159376, "num_tokens": 284336539.0, "step": 6365 }, { "entropy": 0.5846051732078195, "epoch": 0.47739829172726655, "grad_norm": 0.18375270068645477, "learning_rate": 0.0002, "loss": 0.7151, "mean_token_accuracy": 0.8050051931291818, "num_tokens": 285820503.0, "step": 6370 }, { "entropy": 0.5732131084427238, "epoch": 0.4777730156611184, "grad_norm": 0.20320269465446472, "learning_rate": 0.0002, "loss": 0.7105, "mean_token_accuracy": 0.8064710546284914, "num_tokens": 287273067.0, "step": 6375 }, { "entropy": 0.5748923320323229, "epoch": 0.47814773959497026, "grad_norm": 0.1921539157629013, "learning_rate": 0.0002, "loss": 0.7012, "mean_token_accuracy": 0.8074766460806131, "num_tokens": 288719863.0, "step": 6380 }, { "entropy": 0.5820526340976357, "epoch": 0.4785224635288221, "grad_norm": 0.19500498473644257, "learning_rate": 0.0002, "loss": 0.7166, "mean_token_accuracy": 0.8074325866997242, "num_tokens": 290181198.0, "step": 6385 }, { "entropy": 0.5746378304436803, "epoch": 0.47889718746267396, "grad_norm": 0.18629369139671326, "learning_rate": 0.0002, "loss": 0.7082, "mean_token_accuracy": 0.8044414613395929, "num_tokens": 291679834.0, "step": 6390 }, { "entropy": 0.5718065637163818, "epoch": 0.47927191139652586, "grad_norm": 0.19696173071861267, "learning_rate": 0.0002, "loss": 0.7099, "mean_token_accuracy": 0.7997111346572637, "num_tokens": 293138320.0, "step": 6395 }, { "entropy": 0.5894918859004974, "epoch": 0.4796466353303777, "grad_norm": 0.2009781450033188, "learning_rate": 0.0002, "loss": 0.7173, "mean_token_accuracy": 0.8021941006183624, "num_tokens": 294639252.0, "step": 6400 }, { "entropy": 0.5680445181205869, "epoch": 0.48002135926422956, "grad_norm": 0.2084948718547821, "learning_rate": 0.0002, "loss": 0.7032, "mean_token_accuracy": 0.8074259892106056, "num_tokens": 296090192.0, "step": 6405 }, { "entropy": 0.5803944178856909, "epoch": 0.4803960831980814, "grad_norm": 0.17748700082302094, "learning_rate": 0.0002, "loss": 0.7182, "mean_token_accuracy": 0.8047603443264961, "num_tokens": 297584295.0, "step": 6410 }, { "entropy": 0.5808683911338448, "epoch": 0.48077080713193326, "grad_norm": 0.30221813917160034, "learning_rate": 0.0002, "loss": 0.7184, "mean_token_accuracy": 0.802220631763339, "num_tokens": 299085629.0, "step": 6415 }, { "entropy": 0.5702798064798117, "epoch": 0.4811455310657851, "grad_norm": 0.20363041758537292, "learning_rate": 0.0002, "loss": 0.7128, "mean_token_accuracy": 0.8058562610298395, "num_tokens": 300522927.0, "step": 6420 }, { "entropy": 0.5683101432397961, "epoch": 0.48152025499963697, "grad_norm": 0.18989428877830505, "learning_rate": 0.0002, "loss": 0.7103, "mean_token_accuracy": 0.8066803567111492, "num_tokens": 301968652.0, "step": 6425 }, { "entropy": 0.5871374191716313, "epoch": 0.48189497893348887, "grad_norm": 0.18991880118846893, "learning_rate": 0.0002, "loss": 0.7262, "mean_token_accuracy": 0.8006836019456387, "num_tokens": 303402949.0, "step": 6430 }, { "entropy": 0.5625840074382722, "epoch": 0.4822697028673407, "grad_norm": 0.1991778165102005, "learning_rate": 0.0002, "loss": 0.6985, "mean_token_accuracy": 0.8077242203056813, "num_tokens": 304856824.0, "step": 6435 }, { "entropy": 0.5899238057434559, "epoch": 0.48264442680119257, "grad_norm": 0.19198299944400787, "learning_rate": 0.0002, "loss": 0.7278, "mean_token_accuracy": 0.800427820906043, "num_tokens": 306310214.0, "step": 6440 }, { "entropy": 0.5870103092864156, "epoch": 0.4830191507350444, "grad_norm": 0.19977761805057526, "learning_rate": 0.0002, "loss": 0.7286, "mean_token_accuracy": 0.8014704424887895, "num_tokens": 307778200.0, "step": 6445 }, { "entropy": 0.5769342506304383, "epoch": 0.4833938746688963, "grad_norm": 0.1869092732667923, "learning_rate": 0.0002, "loss": 0.7174, "mean_token_accuracy": 0.8046053562313318, "num_tokens": 309298240.0, "step": 6450 }, { "entropy": 0.5778072119690478, "epoch": 0.4837685986027481, "grad_norm": 0.1974366456270218, "learning_rate": 0.0002, "loss": 0.7178, "mean_token_accuracy": 0.8037523463368416, "num_tokens": 310804287.0, "step": 6455 }, { "entropy": 0.5784501519054175, "epoch": 0.4841433225366, "grad_norm": 0.20298421382904053, "learning_rate": 0.0002, "loss": 0.7223, "mean_token_accuracy": 0.8065130639821291, "num_tokens": 312285079.0, "step": 6460 }, { "entropy": 0.568978612497449, "epoch": 0.4845180464704518, "grad_norm": 0.20997601747512817, "learning_rate": 0.0002, "loss": 0.7037, "mean_token_accuracy": 0.8036875370889902, "num_tokens": 313721803.0, "step": 6465 }, { "entropy": 0.5745416539721191, "epoch": 0.48489277040430373, "grad_norm": 0.20352813601493835, "learning_rate": 0.0002, "loss": 0.7173, "mean_token_accuracy": 0.8044396106153726, "num_tokens": 315176942.0, "step": 6470 }, { "entropy": 0.5658980324864388, "epoch": 0.4852674943381556, "grad_norm": 0.19925184547901154, "learning_rate": 0.0002, "loss": 0.6939, "mean_token_accuracy": 0.8091164842247963, "num_tokens": 316669552.0, "step": 6475 }, { "entropy": 0.5825986759737134, "epoch": 0.48564221827200743, "grad_norm": 0.18520250916481018, "learning_rate": 0.0002, "loss": 0.7069, "mean_token_accuracy": 0.806248252466321, "num_tokens": 318142576.0, "step": 6480 }, { "entropy": 0.5844343276694417, "epoch": 0.4860169422058593, "grad_norm": 0.1891908049583435, "learning_rate": 0.0002, "loss": 0.7123, "mean_token_accuracy": 0.8056124798953533, "num_tokens": 319623211.0, "step": 6485 }, { "entropy": 0.5744618671014905, "epoch": 0.48639166613971113, "grad_norm": 0.1866021603345871, "learning_rate": 0.0002, "loss": 0.7035, "mean_token_accuracy": 0.8037358894944191, "num_tokens": 321126575.0, "step": 6490 }, { "entropy": 0.5598480451852084, "epoch": 0.486766390073563, "grad_norm": 0.18539291620254517, "learning_rate": 0.0002, "loss": 0.6956, "mean_token_accuracy": 0.8084948536008596, "num_tokens": 322587193.0, "step": 6495 }, { "entropy": 0.5758932360447944, "epoch": 0.48714111400741483, "grad_norm": 0.19182328879833221, "learning_rate": 0.0002, "loss": 0.7092, "mean_token_accuracy": 0.8055532418191433, "num_tokens": 324059753.0, "step": 6500 }, { "entropy": 0.5775821207091212, "epoch": 0.4875158379412667, "grad_norm": 0.18832947313785553, "learning_rate": 0.0002, "loss": 0.7167, "mean_token_accuracy": 0.8052501283586025, "num_tokens": 325534878.0, "step": 6505 }, { "entropy": 0.5740286806598306, "epoch": 0.4878905618751186, "grad_norm": 0.19901610910892487, "learning_rate": 0.0002, "loss": 0.7173, "mean_token_accuracy": 0.8038663167506457, "num_tokens": 327007938.0, "step": 6510 }, { "entropy": 0.5806177958846093, "epoch": 0.48826528580897044, "grad_norm": 0.1959144026041031, "learning_rate": 0.0002, "loss": 0.7294, "mean_token_accuracy": 0.8050541900098324, "num_tokens": 328466967.0, "step": 6515 }, { "entropy": 0.5691538986749947, "epoch": 0.4886400097428223, "grad_norm": 0.17542730271816254, "learning_rate": 0.0002, "loss": 0.7049, "mean_token_accuracy": 0.8088202029466629, "num_tokens": 329963939.0, "step": 6520 }, { "entropy": 0.5603298638015985, "epoch": 0.48901473367667414, "grad_norm": 0.19494707882404327, "learning_rate": 0.0002, "loss": 0.6972, "mean_token_accuracy": 0.8081126254051924, "num_tokens": 331442060.0, "step": 6525 }, { "entropy": 0.5572044292464853, "epoch": 0.489389457610526, "grad_norm": 0.189096137881279, "learning_rate": 0.0002, "loss": 0.6961, "mean_token_accuracy": 0.8039431191980839, "num_tokens": 332913858.0, "step": 6530 }, { "entropy": 0.561104622296989, "epoch": 0.48976418154437784, "grad_norm": 0.187826007604599, "learning_rate": 0.0002, "loss": 0.7131, "mean_token_accuracy": 0.808697609975934, "num_tokens": 334347656.0, "step": 6535 }, { "entropy": 0.5812317208386958, "epoch": 0.4901389054782297, "grad_norm": 0.19124871492385864, "learning_rate": 0.0002, "loss": 0.7129, "mean_token_accuracy": 0.8061353303492069, "num_tokens": 335833840.0, "step": 6540 }, { "entropy": 0.574548764526844, "epoch": 0.49051362941208154, "grad_norm": 0.20000040531158447, "learning_rate": 0.0002, "loss": 0.7145, "mean_token_accuracy": 0.8049633525311947, "num_tokens": 337312869.0, "step": 6545 }, { "entropy": 0.5741809627041221, "epoch": 0.49088835334593345, "grad_norm": 0.2109362632036209, "learning_rate": 0.0002, "loss": 0.7208, "mean_token_accuracy": 0.8054828435182572, "num_tokens": 338750792.0, "step": 6550 }, { "entropy": 0.5749991558492183, "epoch": 0.4912630772797853, "grad_norm": 0.2583298981189728, "learning_rate": 0.0002, "loss": 0.6985, "mean_token_accuracy": 0.8053514081984758, "num_tokens": 340255585.0, "step": 6555 }, { "entropy": 0.5770681924186647, "epoch": 0.49163780121363715, "grad_norm": 0.1935085952281952, "learning_rate": 0.0002, "loss": 0.7062, "mean_token_accuracy": 0.8064804576337338, "num_tokens": 341720507.0, "step": 6560 }, { "entropy": 0.5739391635172069, "epoch": 0.492012525147489, "grad_norm": 0.19007104635238647, "learning_rate": 0.0002, "loss": 0.699, "mean_token_accuracy": 0.8053915459662676, "num_tokens": 343183697.0, "step": 6565 }, { "entropy": 0.577253672014922, "epoch": 0.49238724908134085, "grad_norm": 0.20273710787296295, "learning_rate": 0.0002, "loss": 0.7109, "mean_token_accuracy": 0.8046928316354751, "num_tokens": 344658928.0, "step": 6570 }, { "entropy": 0.5897765915840865, "epoch": 0.4927619730151927, "grad_norm": 0.23945939540863037, "learning_rate": 0.0002, "loss": 0.7183, "mean_token_accuracy": 0.8031329981982708, "num_tokens": 346116022.0, "step": 6575 }, { "entropy": 0.587875815294683, "epoch": 0.49313669694904455, "grad_norm": 0.1990686058998108, "learning_rate": 0.0002, "loss": 0.721, "mean_token_accuracy": 0.8046697482466698, "num_tokens": 347583420.0, "step": 6580 }, { "entropy": 0.5750679195858538, "epoch": 0.4935114208828964, "grad_norm": 0.2220117449760437, "learning_rate": 0.0002, "loss": 0.7129, "mean_token_accuracy": 0.8022493656724692, "num_tokens": 349112139.0, "step": 6585 }, { "entropy": 0.5658280348405242, "epoch": 0.4938861448167483, "grad_norm": 0.20793810486793518, "learning_rate": 0.0002, "loss": 0.7153, "mean_token_accuracy": 0.8061930544674396, "num_tokens": 350574178.0, "step": 6590 }, { "entropy": 0.5517780978232623, "epoch": 0.49426086875060016, "grad_norm": 0.23891036212444305, "learning_rate": 0.0002, "loss": 0.7006, "mean_token_accuracy": 0.805527414008975, "num_tokens": 352025832.0, "step": 6595 }, { "entropy": 0.5688492215238512, "epoch": 0.494635592684452, "grad_norm": 0.19305545091629028, "learning_rate": 0.0002, "loss": 0.7093, "mean_token_accuracy": 0.8077768862247467, "num_tokens": 353465764.0, "step": 6600 }, { "entropy": 0.5562643198296428, "epoch": 0.49501031661830386, "grad_norm": 0.19163565337657928, "learning_rate": 0.0002, "loss": 0.7139, "mean_token_accuracy": 0.806814493238926, "num_tokens": 354930131.0, "step": 6605 }, { "entropy": 0.5802584463730455, "epoch": 0.4953850405521557, "grad_norm": 0.19535332918167114, "learning_rate": 0.0002, "loss": 0.7307, "mean_token_accuracy": 0.800765760615468, "num_tokens": 356426462.0, "step": 6610 }, { "entropy": 0.582194359228015, "epoch": 0.49575976448600756, "grad_norm": 0.21887600421905518, "learning_rate": 0.0002, "loss": 0.7284, "mean_token_accuracy": 0.8038565628230572, "num_tokens": 357930706.0, "step": 6615 }, { "entropy": 0.5754471430554986, "epoch": 0.4961344884198594, "grad_norm": 0.17630256712436676, "learning_rate": 0.0002, "loss": 0.7143, "mean_token_accuracy": 0.8036405932158232, "num_tokens": 359402670.0, "step": 6620 }, { "entropy": 0.5523218039423228, "epoch": 0.4965092123537113, "grad_norm": 0.19104072451591492, "learning_rate": 0.0002, "loss": 0.6999, "mean_token_accuracy": 0.8068166073411703, "num_tokens": 360867234.0, "step": 6625 }, { "entropy": 0.5782146191224455, "epoch": 0.49688393628756317, "grad_norm": 0.20311005413532257, "learning_rate": 0.0002, "loss": 0.7109, "mean_token_accuracy": 0.8040988434106111, "num_tokens": 362368128.0, "step": 6630 }, { "entropy": 0.5697874968871475, "epoch": 0.497258660221415, "grad_norm": 0.24586431682109833, "learning_rate": 0.0002, "loss": 0.7181, "mean_token_accuracy": 0.8040745705366135, "num_tokens": 363852354.0, "step": 6635 }, { "entropy": 0.5693161325529218, "epoch": 0.49763338415526687, "grad_norm": 0.19704608619213104, "learning_rate": 0.0002, "loss": 0.7097, "mean_token_accuracy": 0.8004159759730101, "num_tokens": 365345060.0, "step": 6640 }, { "entropy": 0.553676551580429, "epoch": 0.4980081080891187, "grad_norm": 0.2033737748861313, "learning_rate": 0.0002, "loss": 0.6982, "mean_token_accuracy": 0.8062996499240398, "num_tokens": 366812210.0, "step": 6645 }, { "entropy": 0.5761458126828074, "epoch": 0.49838283202297057, "grad_norm": 0.20122486352920532, "learning_rate": 0.0002, "loss": 0.7172, "mean_token_accuracy": 0.8021112956106663, "num_tokens": 368319970.0, "step": 6650 }, { "entropy": 0.5519876315258443, "epoch": 0.4987575559568224, "grad_norm": 0.18532223999500275, "learning_rate": 0.0002, "loss": 0.7034, "mean_token_accuracy": 0.8072867635637522, "num_tokens": 369798025.0, "step": 6655 }, { "entropy": 0.5708202857524156, "epoch": 0.49913227989067427, "grad_norm": 0.21151699125766754, "learning_rate": 0.0002, "loss": 0.7313, "mean_token_accuracy": 0.80389738753438, "num_tokens": 371225069.0, "step": 6660 }, { "entropy": 0.5802095331251621, "epoch": 0.4995070038245262, "grad_norm": 0.19335505366325378, "learning_rate": 0.0002, "loss": 0.7258, "mean_token_accuracy": 0.8004586465656758, "num_tokens": 372674954.0, "step": 6665 }, { "entropy": 0.5703861763700843, "epoch": 0.499881727758378, "grad_norm": 0.20493437349796295, "learning_rate": 0.0002, "loss": 0.7248, "mean_token_accuracy": 0.8023037403821945, "num_tokens": 374164653.0, "step": 6670 }, { "entropy": 0.5613131614401936, "epoch": 0.5002564516922299, "grad_norm": 0.1870698481798172, "learning_rate": 0.0002, "loss": 0.7096, "mean_token_accuracy": 0.8066774763166904, "num_tokens": 375617095.0, "step": 6675 }, { "entropy": 0.5622303519397974, "epoch": 0.5006311756260817, "grad_norm": 0.187578022480011, "learning_rate": 0.0002, "loss": 0.7038, "mean_token_accuracy": 0.8038191571831703, "num_tokens": 377117091.0, "step": 6680 }, { "entropy": 0.5569082166999578, "epoch": 0.5010058995599336, "grad_norm": 0.2041359692811966, "learning_rate": 0.0002, "loss": 0.7104, "mean_token_accuracy": 0.8046543259173632, "num_tokens": 378580172.0, "step": 6685 }, { "entropy": 0.5596237087622284, "epoch": 0.5013806234937854, "grad_norm": 0.19281573593616486, "learning_rate": 0.0002, "loss": 0.7192, "mean_token_accuracy": 0.8039921071380377, "num_tokens": 380039730.0, "step": 6690 }, { "entropy": 0.566476595774293, "epoch": 0.5017553474276373, "grad_norm": 0.20439107716083527, "learning_rate": 0.0002, "loss": 0.7097, "mean_token_accuracy": 0.8040733397006988, "num_tokens": 381488534.0, "step": 6695 }, { "entropy": 0.5630180229432881, "epoch": 0.5021300713614891, "grad_norm": 0.19888901710510254, "learning_rate": 0.0002, "loss": 0.7152, "mean_token_accuracy": 0.805031030997634, "num_tokens": 382962543.0, "step": 6700 }, { "entropy": 0.5569295693188906, "epoch": 0.502504795295341, "grad_norm": 0.21002881228923798, "learning_rate": 0.0002, "loss": 0.7093, "mean_token_accuracy": 0.8048169128596783, "num_tokens": 384419695.0, "step": 6705 }, { "entropy": 0.566155424527824, "epoch": 0.5028795192291928, "grad_norm": 0.20074869692325592, "learning_rate": 0.0002, "loss": 0.7199, "mean_token_accuracy": 0.8042807139456272, "num_tokens": 385905530.0, "step": 6710 }, { "entropy": 0.5635008564218879, "epoch": 0.5032542431630447, "grad_norm": 0.20540881156921387, "learning_rate": 0.0002, "loss": 0.7106, "mean_token_accuracy": 0.8070353262126446, "num_tokens": 387385541.0, "step": 6715 }, { "entropy": 0.5519575557671488, "epoch": 0.5036289670968965, "grad_norm": 0.18706832826137543, "learning_rate": 0.0002, "loss": 0.6926, "mean_token_accuracy": 0.8067164912819862, "num_tokens": 388872168.0, "step": 6720 }, { "entropy": 0.568402848765254, "epoch": 0.5040036910307485, "grad_norm": 0.20493526756763458, "learning_rate": 0.0002, "loss": 0.718, "mean_token_accuracy": 0.8041048046201468, "num_tokens": 390366955.0, "step": 6725 }, { "entropy": 0.5817664645612239, "epoch": 0.5043784149646003, "grad_norm": 0.28459450602531433, "learning_rate": 0.0002, "loss": 0.725, "mean_token_accuracy": 0.8043787159025669, "num_tokens": 391857185.0, "step": 6730 }, { "entropy": 0.5671103086322546, "epoch": 0.5047531388984522, "grad_norm": 0.20103764533996582, "learning_rate": 0.0002, "loss": 0.7089, "mean_token_accuracy": 0.8038605559617281, "num_tokens": 393325843.0, "step": 6735 }, { "entropy": 0.5432322006672621, "epoch": 0.505127862832304, "grad_norm": 0.20560821890830994, "learning_rate": 0.0002, "loss": 0.6904, "mean_token_accuracy": 0.8082158833742141, "num_tokens": 394738156.0, "step": 6740 }, { "entropy": 0.5713327271863818, "epoch": 0.5055025867661559, "grad_norm": 0.20299845933914185, "learning_rate": 0.0002, "loss": 0.7084, "mean_token_accuracy": 0.8030953258275986, "num_tokens": 396229770.0, "step": 6745 }, { "entropy": 0.573204574920237, "epoch": 0.5058773107000077, "grad_norm": 0.2482006847858429, "learning_rate": 0.0002, "loss": 0.7149, "mean_token_accuracy": 0.8039033468812704, "num_tokens": 397714663.0, "step": 6750 }, { "entropy": 0.5733205077238381, "epoch": 0.5062520346338596, "grad_norm": 0.20228508114814758, "learning_rate": 0.0002, "loss": 0.7215, "mean_token_accuracy": 0.8042657382786274, "num_tokens": 399141121.0, "step": 6755 }, { "entropy": 0.5537089459598065, "epoch": 0.5066267585677114, "grad_norm": 0.20052340626716614, "learning_rate": 0.0002, "loss": 0.7034, "mean_token_accuracy": 0.8040527787059546, "num_tokens": 400574437.0, "step": 6760 }, { "entropy": 0.5581931412220001, "epoch": 0.5070014825015633, "grad_norm": 0.1949377804994583, "learning_rate": 0.0002, "loss": 0.7011, "mean_token_accuracy": 0.8084216505289078, "num_tokens": 402025223.0, "step": 6765 }, { "entropy": 0.5742569636553526, "epoch": 0.5073762064354151, "grad_norm": 0.2043093740940094, "learning_rate": 0.0002, "loss": 0.7175, "mean_token_accuracy": 0.8044337682425976, "num_tokens": 403519058.0, "step": 6770 }, { "entropy": 0.5644060633145273, "epoch": 0.507750930369267, "grad_norm": 0.2025926411151886, "learning_rate": 0.0002, "loss": 0.7169, "mean_token_accuracy": 0.804786528646946, "num_tokens": 404987635.0, "step": 6775 }, { "entropy": 0.575438023544848, "epoch": 0.5081256543031188, "grad_norm": 0.1983557939529419, "learning_rate": 0.0002, "loss": 0.7335, "mean_token_accuracy": 0.8033240612596273, "num_tokens": 406489883.0, "step": 6780 }, { "entropy": 0.5363465380854905, "epoch": 0.5085003782369707, "grad_norm": 0.18620125949382782, "learning_rate": 0.0002, "loss": 0.6795, "mean_token_accuracy": 0.8098006498068571, "num_tokens": 407924770.0, "step": 6785 }, { "entropy": 0.551512124761939, "epoch": 0.5088751021708225, "grad_norm": 0.17931324243545532, "learning_rate": 0.0002, "loss": 0.7071, "mean_token_accuracy": 0.8033021990209818, "num_tokens": 409381375.0, "step": 6790 }, { "entropy": 0.5625958652235568, "epoch": 0.5092498261046744, "grad_norm": 0.1882510781288147, "learning_rate": 0.0002, "loss": 0.7148, "mean_token_accuracy": 0.8095925197005271, "num_tokens": 410879853.0, "step": 6795 }, { "entropy": 0.5394654715433717, "epoch": 0.5096245500385264, "grad_norm": 0.1843593865633011, "learning_rate": 0.0002, "loss": 0.7043, "mean_token_accuracy": 0.8062427539378405, "num_tokens": 412277322.0, "step": 6800 }, { "entropy": 0.5503414396196604, "epoch": 0.5099992739723782, "grad_norm": 0.1921103298664093, "learning_rate": 0.0002, "loss": 0.7106, "mean_token_accuracy": 0.8051510531455278, "num_tokens": 413726113.0, "step": 6805 }, { "entropy": 0.5614959362894296, "epoch": 0.5103739979062301, "grad_norm": 0.19517242908477783, "learning_rate": 0.0002, "loss": 0.7174, "mean_token_accuracy": 0.8037893824279309, "num_tokens": 415218016.0, "step": 6810 }, { "entropy": 0.550840107165277, "epoch": 0.5107487218400819, "grad_norm": 0.1891271471977234, "learning_rate": 0.0002, "loss": 0.699, "mean_token_accuracy": 0.8077724777162075, "num_tokens": 416699649.0, "step": 6815 }, { "entropy": 0.5429429818876088, "epoch": 0.5111234457739338, "grad_norm": 0.20341293513774872, "learning_rate": 0.0002, "loss": 0.6904, "mean_token_accuracy": 0.8124392334371805, "num_tokens": 418154113.0, "step": 6820 }, { "entropy": 0.5546896474435925, "epoch": 0.5114981697077856, "grad_norm": 0.1982382833957672, "learning_rate": 0.0002, "loss": 0.7073, "mean_token_accuracy": 0.8061025205999612, "num_tokens": 419611015.0, "step": 6825 }, { "entropy": 0.5650325451977551, "epoch": 0.5118728936416375, "grad_norm": 0.18573366105556488, "learning_rate": 0.0002, "loss": 0.7153, "mean_token_accuracy": 0.8043062150478363, "num_tokens": 421082836.0, "step": 6830 }, { "entropy": 0.555726302228868, "epoch": 0.5122476175754893, "grad_norm": 0.20255890488624573, "learning_rate": 0.0002, "loss": 0.7019, "mean_token_accuracy": 0.8066173452883959, "num_tokens": 422521939.0, "step": 6835 }, { "entropy": 0.5677885536104441, "epoch": 0.5126223415093412, "grad_norm": 0.20771971344947815, "learning_rate": 0.0002, "loss": 0.7198, "mean_token_accuracy": 0.8068758774548769, "num_tokens": 423983260.0, "step": 6840 }, { "entropy": 0.5670061141252518, "epoch": 0.512997065443193, "grad_norm": 0.18944910168647766, "learning_rate": 0.0002, "loss": 0.7166, "mean_token_accuracy": 0.8035662490874529, "num_tokens": 425455075.0, "step": 6845 }, { "entropy": 0.5590897025540471, "epoch": 0.5133717893770449, "grad_norm": 0.1970333606004715, "learning_rate": 0.0002, "loss": 0.7151, "mean_token_accuracy": 0.8051090221852064, "num_tokens": 426942646.0, "step": 6850 }, { "entropy": 0.5645433090627193, "epoch": 0.5137465133108967, "grad_norm": 0.20928625762462616, "learning_rate": 0.0002, "loss": 0.7151, "mean_token_accuracy": 0.8038314275443554, "num_tokens": 428365151.0, "step": 6855 }, { "entropy": 0.5684306766837836, "epoch": 0.5141212372447486, "grad_norm": 0.21312452852725983, "learning_rate": 0.0002, "loss": 0.7116, "mean_token_accuracy": 0.8049528367817402, "num_tokens": 429854471.0, "step": 6860 }, { "entropy": 0.558095621690154, "epoch": 0.5144959611786004, "grad_norm": 0.20249970257282257, "learning_rate": 0.0002, "loss": 0.707, "mean_token_accuracy": 0.8078664299100637, "num_tokens": 431307295.0, "step": 6865 }, { "entropy": 0.5545700648799539, "epoch": 0.5148706851124523, "grad_norm": 0.18601781129837036, "learning_rate": 0.0002, "loss": 0.7051, "mean_token_accuracy": 0.8043225560337305, "num_tokens": 432729979.0, "step": 6870 }, { "entropy": 0.5737195190042257, "epoch": 0.5152454090463041, "grad_norm": 0.19489602744579315, "learning_rate": 0.0002, "loss": 0.72, "mean_token_accuracy": 0.8019707795232535, "num_tokens": 434209956.0, "step": 6875 }, { "entropy": 0.5695656031370163, "epoch": 0.5156201329801561, "grad_norm": 0.19365328550338745, "learning_rate": 0.0002, "loss": 0.7144, "mean_token_accuracy": 0.8040817383676767, "num_tokens": 435644434.0, "step": 6880 }, { "entropy": 0.5732458421960474, "epoch": 0.5159948569140079, "grad_norm": 0.18116629123687744, "learning_rate": 0.0002, "loss": 0.7151, "mean_token_accuracy": 0.8048831839114428, "num_tokens": 437101061.0, "step": 6885 }, { "entropy": 0.5883892649784684, "epoch": 0.5163695808478598, "grad_norm": 0.18954770267009735, "learning_rate": 0.0002, "loss": 0.7281, "mean_token_accuracy": 0.8027772754430771, "num_tokens": 438574451.0, "step": 6890 }, { "entropy": 0.5698805424384773, "epoch": 0.5167443047817116, "grad_norm": 0.18269981443881989, "learning_rate": 0.0002, "loss": 0.7048, "mean_token_accuracy": 0.805233795940876, "num_tokens": 439994666.0, "step": 6895 }, { "entropy": 0.5876870954409241, "epoch": 0.5171190287155635, "grad_norm": 0.21733665466308594, "learning_rate": 0.0002, "loss": 0.72, "mean_token_accuracy": 0.8038408324122429, "num_tokens": 441452257.0, "step": 6900 }, { "entropy": 0.5719728540629149, "epoch": 0.5174937526494153, "grad_norm": 0.2017091065645218, "learning_rate": 0.0002, "loss": 0.7114, "mean_token_accuracy": 0.8043426562100648, "num_tokens": 442940727.0, "step": 6905 }, { "entropy": 0.5801536941900849, "epoch": 0.5178684765832672, "grad_norm": 0.19054287672042847, "learning_rate": 0.0002, "loss": 0.716, "mean_token_accuracy": 0.8017243225127458, "num_tokens": 444428736.0, "step": 6910 }, { "entropy": 0.5854903698898852, "epoch": 0.518243200517119, "grad_norm": 0.1790088266134262, "learning_rate": 0.0002, "loss": 0.7199, "mean_token_accuracy": 0.8009200539439917, "num_tokens": 445937787.0, "step": 6915 }, { "entropy": 0.5889429304748773, "epoch": 0.5186179244509709, "grad_norm": 0.19035136699676514, "learning_rate": 0.0002, "loss": 0.7242, "mean_token_accuracy": 0.8030304044485093, "num_tokens": 447426775.0, "step": 6920 }, { "entropy": 0.5683792505413294, "epoch": 0.5189926483848227, "grad_norm": 0.19838757812976837, "learning_rate": 0.0002, "loss": 0.7058, "mean_token_accuracy": 0.8054485630244017, "num_tokens": 448872101.0, "step": 6925 }, { "entropy": 0.5598808147013188, "epoch": 0.5193673723186746, "grad_norm": 0.20060235261917114, "learning_rate": 0.0002, "loss": 0.6928, "mean_token_accuracy": 0.805380055308342, "num_tokens": 450339557.0, "step": 6930 }, { "entropy": 0.5727101493626833, "epoch": 0.5197420962525264, "grad_norm": 0.1882556527853012, "learning_rate": 0.0002, "loss": 0.7048, "mean_token_accuracy": 0.8054290764033795, "num_tokens": 451833167.0, "step": 6935 }, { "entropy": 0.5693571411073208, "epoch": 0.5201168201863783, "grad_norm": 0.19886858761310577, "learning_rate": 0.0002, "loss": 0.7035, "mean_token_accuracy": 0.805060613900423, "num_tokens": 453304661.0, "step": 6940 }, { "entropy": 0.57584341596812, "epoch": 0.5204915441202301, "grad_norm": 0.18729320168495178, "learning_rate": 0.0002, "loss": 0.7149, "mean_token_accuracy": 0.8036036714911461, "num_tokens": 454825258.0, "step": 6945 }, { "entropy": 0.5840437766164541, "epoch": 0.520866268054082, "grad_norm": 0.21152974665164948, "learning_rate": 0.0002, "loss": 0.7179, "mean_token_accuracy": 0.8055635649710894, "num_tokens": 456298380.0, "step": 6950 }, { "entropy": 0.575241038762033, "epoch": 0.5212409919879339, "grad_norm": 0.19690266251564026, "learning_rate": 0.0002, "loss": 0.7012, "mean_token_accuracy": 0.8051123701035976, "num_tokens": 457763620.0, "step": 6955 }, { "entropy": 0.5791049560531973, "epoch": 0.5216157159217858, "grad_norm": 0.20173241198062897, "learning_rate": 0.0002, "loss": 0.7097, "mean_token_accuracy": 0.8055694811046124, "num_tokens": 459236374.0, "step": 6960 }, { "entropy": 0.5884912457317114, "epoch": 0.5219904398556376, "grad_norm": 0.2079961746931076, "learning_rate": 0.0002, "loss": 0.7142, "mean_token_accuracy": 0.8049900528043509, "num_tokens": 460669470.0, "step": 6965 }, { "entropy": 0.5648861357942223, "epoch": 0.5223651637894895, "grad_norm": 0.19318334758281708, "learning_rate": 0.0002, "loss": 0.7043, "mean_token_accuracy": 0.8053955167531968, "num_tokens": 462100554.0, "step": 6970 }, { "entropy": 0.5724186933599412, "epoch": 0.5227398877233413, "grad_norm": 0.19490018486976624, "learning_rate": 0.0002, "loss": 0.6993, "mean_token_accuracy": 0.80627580024302, "num_tokens": 463565547.0, "step": 6975 }, { "entropy": 0.5787977574393153, "epoch": 0.5231146116571932, "grad_norm": 0.2082165628671646, "learning_rate": 0.0002, "loss": 0.7122, "mean_token_accuracy": 0.8049663547426462, "num_tokens": 465033641.0, "step": 6980 }, { "entropy": 0.5844777343794704, "epoch": 0.523489335591045, "grad_norm": 0.2039259970188141, "learning_rate": 0.0002, "loss": 0.7064, "mean_token_accuracy": 0.8062567044049501, "num_tokens": 466527803.0, "step": 6985 }, { "entropy": 0.5764939583837986, "epoch": 0.5238640595248969, "grad_norm": 0.21058209240436554, "learning_rate": 0.0002, "loss": 0.6999, "mean_token_accuracy": 0.8067972056567669, "num_tokens": 467988317.0, "step": 6990 }, { "entropy": 0.5788717973977328, "epoch": 0.5242387834587487, "grad_norm": 0.1931866705417633, "learning_rate": 0.0002, "loss": 0.7108, "mean_token_accuracy": 0.80607840269804, "num_tokens": 469409160.0, "step": 6995 }, { "entropy": 0.5863593252375722, "epoch": 0.5246135073926006, "grad_norm": 0.21215227246284485, "learning_rate": 0.0002, "loss": 0.7084, "mean_token_accuracy": 0.8065876439213753, "num_tokens": 470895287.0, "step": 7000 }, { "entropy": 0.5921842474490404, "epoch": 0.5249882313264524, "grad_norm": 0.1926666498184204, "learning_rate": 0.0002, "loss": 0.7171, "mean_token_accuracy": 0.8056606281548738, "num_tokens": 472362795.0, "step": 7005 }, { "entropy": 0.6000145126134158, "epoch": 0.5253629552603043, "grad_norm": 0.19377222657203674, "learning_rate": 0.0002, "loss": 0.7224, "mean_token_accuracy": 0.8033628903329373, "num_tokens": 473815850.0, "step": 7010 }, { "entropy": 0.5845732264220714, "epoch": 0.5257376791941561, "grad_norm": 0.1960756480693817, "learning_rate": 0.0002, "loss": 0.7028, "mean_token_accuracy": 0.8056566316634417, "num_tokens": 475293132.0, "step": 7015 }, { "entropy": 0.5913247555494309, "epoch": 0.526112403128008, "grad_norm": 0.1872503161430359, "learning_rate": 0.0002, "loss": 0.7128, "mean_token_accuracy": 0.8054156377911568, "num_tokens": 476763117.0, "step": 7020 }, { "entropy": 0.5961612426675856, "epoch": 0.5264871270618598, "grad_norm": 0.17608486115932465, "learning_rate": 0.0002, "loss": 0.72, "mean_token_accuracy": 0.8013118922710418, "num_tokens": 478253169.0, "step": 7025 }, { "entropy": 0.5687848962843418, "epoch": 0.5268618509957117, "grad_norm": 0.18899083137512207, "learning_rate": 0.0002, "loss": 0.6817, "mean_token_accuracy": 0.8083280924707651, "num_tokens": 479721443.0, "step": 7030 }, { "entropy": 0.5798401245847344, "epoch": 0.5272365749295637, "grad_norm": 0.20322395861148834, "learning_rate": 0.0002, "loss": 0.6909, "mean_token_accuracy": 0.8081312790513039, "num_tokens": 481213691.0, "step": 7035 }, { "entropy": 0.5884888136759401, "epoch": 0.5276112988634155, "grad_norm": 0.20053118467330933, "learning_rate": 0.0002, "loss": 0.7013, "mean_token_accuracy": 0.8054501350969077, "num_tokens": 482672379.0, "step": 7040 }, { "entropy": 0.596379648707807, "epoch": 0.5279860227972674, "grad_norm": 0.20732718706130981, "learning_rate": 0.0002, "loss": 0.7236, "mean_token_accuracy": 0.8045814778655768, "num_tokens": 484126670.0, "step": 7045 }, { "entropy": 0.585325488075614, "epoch": 0.5283607467311192, "grad_norm": 0.21417492628097534, "learning_rate": 0.0002, "loss": 0.7008, "mean_token_accuracy": 0.8065591983497142, "num_tokens": 485552881.0, "step": 7050 }, { "entropy": 0.5905826878733933, "epoch": 0.5287354706649711, "grad_norm": 0.1998925805091858, "learning_rate": 0.0002, "loss": 0.7139, "mean_token_accuracy": 0.8039754565805197, "num_tokens": 486968167.0, "step": 7055 }, { "entropy": 0.5846349166706204, "epoch": 0.5291101945988229, "grad_norm": 0.20833951234817505, "learning_rate": 0.0002, "loss": 0.7083, "mean_token_accuracy": 0.8009915314614773, "num_tokens": 488435332.0, "step": 7060 }, { "entropy": 0.5858986103907228, "epoch": 0.5294849185326748, "grad_norm": 0.2905161678791046, "learning_rate": 0.0002, "loss": 0.7066, "mean_token_accuracy": 0.8053287751972675, "num_tokens": 489885452.0, "step": 7065 }, { "entropy": 0.6075857579708099, "epoch": 0.5298596424665266, "grad_norm": 0.18459512293338776, "learning_rate": 0.0002, "loss": 0.7171, "mean_token_accuracy": 0.8015652529895305, "num_tokens": 491362005.0, "step": 7070 }, { "entropy": 0.5763543371111155, "epoch": 0.5302343664003785, "grad_norm": 0.18608391284942627, "learning_rate": 0.0002, "loss": 0.6984, "mean_token_accuracy": 0.8071139328181743, "num_tokens": 492816959.0, "step": 7075 }, { "entropy": 0.5938618989661336, "epoch": 0.5306090903342303, "grad_norm": 0.1921825110912323, "learning_rate": 0.0002, "loss": 0.7142, "mean_token_accuracy": 0.8056585349142551, "num_tokens": 494321909.0, "step": 7080 }, { "entropy": 0.5816818850114942, "epoch": 0.5309838142680822, "grad_norm": 0.19632992148399353, "learning_rate": 0.0002, "loss": 0.7158, "mean_token_accuracy": 0.8021732043474913, "num_tokens": 495790838.0, "step": 7085 }, { "entropy": 0.588327082619071, "epoch": 0.531358538201934, "grad_norm": 0.19375108182430267, "learning_rate": 0.0002, "loss": 0.7168, "mean_token_accuracy": 0.8022254507988691, "num_tokens": 497307781.0, "step": 7090 }, { "entropy": 0.5718763487413525, "epoch": 0.5317332621357859, "grad_norm": 0.1863706111907959, "learning_rate": 0.0002, "loss": 0.699, "mean_token_accuracy": 0.8086799006909132, "num_tokens": 498770664.0, "step": 7095 }, { "entropy": 0.5698075633496046, "epoch": 0.5321079860696377, "grad_norm": 0.19931283593177795, "learning_rate": 0.0002, "loss": 0.6975, "mean_token_accuracy": 0.8079872619360685, "num_tokens": 500260774.0, "step": 7100 }, { "entropy": 0.5791182111948728, "epoch": 0.5324827100034896, "grad_norm": 0.18577861785888672, "learning_rate": 0.0002, "loss": 0.7127, "mean_token_accuracy": 0.8062004633247852, "num_tokens": 501744624.0, "step": 7105 }, { "entropy": 0.5811051471158862, "epoch": 0.5328574339373415, "grad_norm": 0.20147855579853058, "learning_rate": 0.0002, "loss": 0.7052, "mean_token_accuracy": 0.8065615054219961, "num_tokens": 503184600.0, "step": 7110 }, { "entropy": 0.5906685208901763, "epoch": 0.5332321578711934, "grad_norm": 0.1927463263273239, "learning_rate": 0.0002, "loss": 0.6998, "mean_token_accuracy": 0.8041404880583286, "num_tokens": 504677902.0, "step": 7115 }, { "entropy": 0.575703433342278, "epoch": 0.5336068818050452, "grad_norm": 0.17892248928546906, "learning_rate": 0.0002, "loss": 0.6943, "mean_token_accuracy": 0.8093731228262186, "num_tokens": 506145537.0, "step": 7120 }, { "entropy": 0.5917358350008726, "epoch": 0.5339816057388971, "grad_norm": 0.19334670901298523, "learning_rate": 0.0002, "loss": 0.7208, "mean_token_accuracy": 0.8021117962896824, "num_tokens": 507612376.0, "step": 7125 }, { "entropy": 0.5767823936417699, "epoch": 0.5343563296727489, "grad_norm": 0.19008567929267883, "learning_rate": 0.0002, "loss": 0.6983, "mean_token_accuracy": 0.8094220321625472, "num_tokens": 509081131.0, "step": 7130 }, { "entropy": 0.5912152303382754, "epoch": 0.5347310536066008, "grad_norm": 0.19262389838695526, "learning_rate": 0.0002, "loss": 0.7071, "mean_token_accuracy": 0.8046695630997419, "num_tokens": 510561051.0, "step": 7135 }, { "entropy": 0.5740980837494135, "epoch": 0.5351057775404526, "grad_norm": 0.1920168697834015, "learning_rate": 0.0002, "loss": 0.6859, "mean_token_accuracy": 0.8091751705855131, "num_tokens": 512020613.0, "step": 7140 }, { "entropy": 0.5896951848641038, "epoch": 0.5354805014743045, "grad_norm": 0.20208698511123657, "learning_rate": 0.0002, "loss": 0.7119, "mean_token_accuracy": 0.805272388830781, "num_tokens": 513486170.0, "step": 7145 }, { "entropy": 0.5913738527335226, "epoch": 0.5358552254081563, "grad_norm": 0.1986195594072342, "learning_rate": 0.0002, "loss": 0.7093, "mean_token_accuracy": 0.8062884237617254, "num_tokens": 514995421.0, "step": 7150 }, { "entropy": 0.5870693627744913, "epoch": 0.5362299493420082, "grad_norm": 0.17757155001163483, "learning_rate": 0.0002, "loss": 0.7068, "mean_token_accuracy": 0.8062805369496345, "num_tokens": 516495046.0, "step": 7155 }, { "entropy": 0.5747575471177697, "epoch": 0.53660467327586, "grad_norm": 0.1882915049791336, "learning_rate": 0.0002, "loss": 0.7139, "mean_token_accuracy": 0.808461257815361, "num_tokens": 517948466.0, "step": 7160 }, { "entropy": 0.5743806896731257, "epoch": 0.5369793972097119, "grad_norm": 0.17847269773483276, "learning_rate": 0.0002, "loss": 0.7065, "mean_token_accuracy": 0.805897070094943, "num_tokens": 519425773.0, "step": 7165 }, { "entropy": 0.5658199764788151, "epoch": 0.5373541211435637, "grad_norm": 0.2000131905078888, "learning_rate": 0.0002, "loss": 0.7069, "mean_token_accuracy": 0.8048939023166894, "num_tokens": 520884000.0, "step": 7170 }, { "entropy": 0.5637850837782026, "epoch": 0.5377288450774156, "grad_norm": 0.1885307878255844, "learning_rate": 0.0002, "loss": 0.7023, "mean_token_accuracy": 0.8081496667116881, "num_tokens": 522320093.0, "step": 7175 }, { "entropy": 0.5545734852552414, "epoch": 0.5381035690112674, "grad_norm": 0.2077111452817917, "learning_rate": 0.0002, "loss": 0.6955, "mean_token_accuracy": 0.8086020849645138, "num_tokens": 523763784.0, "step": 7180 }, { "entropy": 0.5617074586451054, "epoch": 0.5384782929451193, "grad_norm": 0.2521677613258362, "learning_rate": 0.0002, "loss": 0.699, "mean_token_accuracy": 0.8030217289924622, "num_tokens": 525198669.0, "step": 7185 }, { "entropy": 0.5565117220394313, "epoch": 0.5388530168789712, "grad_norm": 0.18750612437725067, "learning_rate": 0.0002, "loss": 0.677, "mean_token_accuracy": 0.8119407065212727, "num_tokens": 526693382.0, "step": 7190 }, { "entropy": 0.5759574346244335, "epoch": 0.5392277408128231, "grad_norm": 0.20895493030548096, "learning_rate": 0.0002, "loss": 0.719, "mean_token_accuracy": 0.8035143744200468, "num_tokens": 528202601.0, "step": 7195 }, { "entropy": 0.5842557806521654, "epoch": 0.539602464746675, "grad_norm": 0.1995818316936493, "learning_rate": 0.0002, "loss": 0.7116, "mean_token_accuracy": 0.8018699280917645, "num_tokens": 529700586.0, "step": 7200 }, { "entropy": 0.5807992540299892, "epoch": 0.5399771886805268, "grad_norm": 0.21379069983959198, "learning_rate": 0.0002, "loss": 0.698, "mean_token_accuracy": 0.807707604765892, "num_tokens": 531191723.0, "step": 7205 }, { "entropy": 0.5822959824465215, "epoch": 0.5403519126143786, "grad_norm": 0.2090209275484085, "learning_rate": 0.0002, "loss": 0.7189, "mean_token_accuracy": 0.8029172930866479, "num_tokens": 532648022.0, "step": 7210 }, { "entropy": 0.5742923406884074, "epoch": 0.5407266365482305, "grad_norm": 0.19164730608463287, "learning_rate": 0.0002, "loss": 0.7076, "mean_token_accuracy": 0.8084150191396475, "num_tokens": 534102842.0, "step": 7215 }, { "entropy": 0.5840278409421444, "epoch": 0.5411013604820823, "grad_norm": 0.18375739455223083, "learning_rate": 0.0002, "loss": 0.7024, "mean_token_accuracy": 0.8053976789116859, "num_tokens": 535561520.0, "step": 7220 }, { "entropy": 0.5933985473588109, "epoch": 0.5414760844159342, "grad_norm": 0.20938901603221893, "learning_rate": 0.0002, "loss": 0.7184, "mean_token_accuracy": 0.8029989048838615, "num_tokens": 537039200.0, "step": 7225 }, { "entropy": 0.5845063871704042, "epoch": 0.541850808349786, "grad_norm": 0.1885576695203781, "learning_rate": 0.0002, "loss": 0.7132, "mean_token_accuracy": 0.8052143890410661, "num_tokens": 538495843.0, "step": 7230 }, { "entropy": 0.5898212708532811, "epoch": 0.5422255322836379, "grad_norm": 0.18795058131217957, "learning_rate": 0.0002, "loss": 0.7267, "mean_token_accuracy": 0.8020065203309059, "num_tokens": 539928745.0, "step": 7235 }, { "entropy": 0.5832986464723945, "epoch": 0.5426002562174898, "grad_norm": 0.22738374769687653, "learning_rate": 0.0002, "loss": 0.7104, "mean_token_accuracy": 0.805775647982955, "num_tokens": 541386125.0, "step": 7240 }, { "entropy": 0.5795387651771307, "epoch": 0.5429749801513416, "grad_norm": 0.18721385300159454, "learning_rate": 0.0002, "loss": 0.7154, "mean_token_accuracy": 0.8066803880035878, "num_tokens": 542869686.0, "step": 7245 }, { "entropy": 0.5669100122526288, "epoch": 0.5433497040851935, "grad_norm": 0.1889769434928894, "learning_rate": 0.0002, "loss": 0.6906, "mean_token_accuracy": 0.8075472727417946, "num_tokens": 544341632.0, "step": 7250 }, { "entropy": 0.5823581364005804, "epoch": 0.5437244280190453, "grad_norm": 0.1821509599685669, "learning_rate": 0.0002, "loss": 0.7206, "mean_token_accuracy": 0.8043073296546936, "num_tokens": 545840451.0, "step": 7255 }, { "entropy": 0.5716875409707427, "epoch": 0.5440991519528972, "grad_norm": 0.19804388284683228, "learning_rate": 0.0002, "loss": 0.7094, "mean_token_accuracy": 0.806785624474287, "num_tokens": 547305034.0, "step": 7260 }, { "entropy": 0.5734949437901378, "epoch": 0.544473875886749, "grad_norm": 0.18214420974254608, "learning_rate": 0.0002, "loss": 0.7058, "mean_token_accuracy": 0.8043270338326692, "num_tokens": 548814745.0, "step": 7265 }, { "entropy": 0.5673665817826986, "epoch": 0.544848599820601, "grad_norm": 0.1950073391199112, "learning_rate": 0.0002, "loss": 0.7036, "mean_token_accuracy": 0.8054627284407616, "num_tokens": 550271968.0, "step": 7270 }, { "entropy": 0.5822775585576891, "epoch": 0.5452233237544528, "grad_norm": 0.19767847657203674, "learning_rate": 0.0002, "loss": 0.7139, "mean_token_accuracy": 0.8037148181349039, "num_tokens": 551746093.0, "step": 7275 }, { "entropy": 0.5762991359457373, "epoch": 0.5455980476883047, "grad_norm": 0.19470158219337463, "learning_rate": 0.0002, "loss": 0.7148, "mean_token_accuracy": 0.8048220001161098, "num_tokens": 553236918.0, "step": 7280 }, { "entropy": 0.5789443692192435, "epoch": 0.5459727716221565, "grad_norm": 0.18942423164844513, "learning_rate": 0.0002, "loss": 0.7004, "mean_token_accuracy": 0.8060458250343799, "num_tokens": 554756595.0, "step": 7285 }, { "entropy": 0.5895624900236726, "epoch": 0.5463474955560084, "grad_norm": 0.2032380849123001, "learning_rate": 0.0002, "loss": 0.7265, "mean_token_accuracy": 0.8036397445946932, "num_tokens": 556211198.0, "step": 7290 }, { "entropy": 0.5726151478476822, "epoch": 0.5467222194898602, "grad_norm": 0.18947216868400574, "learning_rate": 0.0002, "loss": 0.6947, "mean_token_accuracy": 0.8071858309209347, "num_tokens": 557662166.0, "step": 7295 }, { "entropy": 0.5860589502379299, "epoch": 0.5470969434237121, "grad_norm": 0.19506427645683289, "learning_rate": 0.0002, "loss": 0.7221, "mean_token_accuracy": 0.8047168008983135, "num_tokens": 559112618.0, "step": 7300 }, { "entropy": 0.5902962602674962, "epoch": 0.5474716673575639, "grad_norm": 0.1903241127729416, "learning_rate": 0.0002, "loss": 0.7103, "mean_token_accuracy": 0.8054472990334034, "num_tokens": 560572431.0, "step": 7305 }, { "entropy": 0.5876293404959142, "epoch": 0.5478463912914158, "grad_norm": 0.20942333340644836, "learning_rate": 0.0002, "loss": 0.6974, "mean_token_accuracy": 0.8058855399489403, "num_tokens": 562050209.0, "step": 7310 }, { "entropy": 0.5798368845134974, "epoch": 0.5482211152252676, "grad_norm": 0.18923404812812805, "learning_rate": 0.0002, "loss": 0.7032, "mean_token_accuracy": 0.8082741156220437, "num_tokens": 563521481.0, "step": 7315 }, { "entropy": 0.5591843519359827, "epoch": 0.5485958391591195, "grad_norm": 0.18741363286972046, "learning_rate": 0.0002, "loss": 0.6845, "mean_token_accuracy": 0.8059575870633126, "num_tokens": 564989642.0, "step": 7320 }, { "entropy": 0.5699400492012501, "epoch": 0.5489705630929713, "grad_norm": 0.1964019387960434, "learning_rate": 0.0002, "loss": 0.7001, "mean_token_accuracy": 0.8079774990677834, "num_tokens": 566480939.0, "step": 7325 }, { "entropy": 0.5639858927577734, "epoch": 0.5493452870268232, "grad_norm": 0.20066803693771362, "learning_rate": 0.0002, "loss": 0.7063, "mean_token_accuracy": 0.805413207784295, "num_tokens": 567912143.0, "step": 7330 }, { "entropy": 0.5753792360424995, "epoch": 0.549720010960675, "grad_norm": 0.19786427915096283, "learning_rate": 0.0002, "loss": 0.7163, "mean_token_accuracy": 0.8041970040649176, "num_tokens": 569398217.0, "step": 7335 }, { "entropy": 0.5757997332140803, "epoch": 0.5500947348945269, "grad_norm": 0.18198613822460175, "learning_rate": 0.0002, "loss": 0.716, "mean_token_accuracy": 0.8031549844890833, "num_tokens": 570886339.0, "step": 7340 }, { "entropy": 0.5721546098589897, "epoch": 0.5504694588283788, "grad_norm": 0.2121933400630951, "learning_rate": 0.0002, "loss": 0.7135, "mean_token_accuracy": 0.8052106626331806, "num_tokens": 572413819.0, "step": 7345 }, { "entropy": 0.5780767250806094, "epoch": 0.5508441827622307, "grad_norm": 0.21226683259010315, "learning_rate": 0.0002, "loss": 0.7134, "mean_token_accuracy": 0.8032973080873489, "num_tokens": 573857761.0, "step": 7350 }, { "entropy": 0.5727241748943925, "epoch": 0.5512189066960825, "grad_norm": 0.19672225415706635, "learning_rate": 0.0002, "loss": 0.6943, "mean_token_accuracy": 0.8092303991317749, "num_tokens": 575286479.0, "step": 7355 }, { "entropy": 0.5954177767038346, "epoch": 0.5515936306299344, "grad_norm": 0.1704392284154892, "learning_rate": 0.0002, "loss": 0.7063, "mean_token_accuracy": 0.8037355102598667, "num_tokens": 576775169.0, "step": 7360 }, { "entropy": 0.5973560269922018, "epoch": 0.5519683545637862, "grad_norm": 0.20939750969409943, "learning_rate": 0.0002, "loss": 0.7127, "mean_token_accuracy": 0.8056595209985972, "num_tokens": 578275674.0, "step": 7365 }, { "entropy": 0.580442500859499, "epoch": 0.5523430784976381, "grad_norm": 0.18868279457092285, "learning_rate": 0.0002, "loss": 0.6863, "mean_token_accuracy": 0.8082554806023836, "num_tokens": 579737641.0, "step": 7370 }, { "entropy": 0.5993097197264433, "epoch": 0.5527178024314899, "grad_norm": 0.19803379476070404, "learning_rate": 0.0002, "loss": 0.7121, "mean_token_accuracy": 0.8060292482376099, "num_tokens": 581222558.0, "step": 7375 }, { "entropy": 0.5878266923129558, "epoch": 0.5530925263653418, "grad_norm": 0.18603187799453735, "learning_rate": 0.0002, "loss": 0.6971, "mean_token_accuracy": 0.8083024576306344, "num_tokens": 582698915.0, "step": 7380 }, { "entropy": 0.5991018308326602, "epoch": 0.5534672502991936, "grad_norm": 0.19470544159412384, "learning_rate": 0.0002, "loss": 0.7224, "mean_token_accuracy": 0.8047881182283163, "num_tokens": 584156019.0, "step": 7385 }, { "entropy": 0.5962937435135245, "epoch": 0.5538419742330455, "grad_norm": 0.1869860738515854, "learning_rate": 0.0002, "loss": 0.7113, "mean_token_accuracy": 0.8043368089944124, "num_tokens": 585608022.0, "step": 7390 }, { "entropy": 0.5721300749108196, "epoch": 0.5542166981668973, "grad_norm": 0.19279029965400696, "learning_rate": 0.0002, "loss": 0.6856, "mean_token_accuracy": 0.8066269230097532, "num_tokens": 587113076.0, "step": 7395 }, { "entropy": 0.5964579291641712, "epoch": 0.5545914221007492, "grad_norm": 0.19615788757801056, "learning_rate": 0.0002, "loss": 0.7167, "mean_token_accuracy": 0.8027335584163666, "num_tokens": 588567530.0, "step": 7400 }, { "entropy": 0.5934633506461978, "epoch": 0.554966146034601, "grad_norm": 0.19669592380523682, "learning_rate": 0.0002, "loss": 0.705, "mean_token_accuracy": 0.80463593788445, "num_tokens": 590064331.0, "step": 7405 }, { "entropy": 0.6147742157801985, "epoch": 0.5553408699684529, "grad_norm": 0.1926412731409073, "learning_rate": 0.0002, "loss": 0.7239, "mean_token_accuracy": 0.8025365032255649, "num_tokens": 591550159.0, "step": 7410 }, { "entropy": 0.6105181492865086, "epoch": 0.5557155939023047, "grad_norm": 0.20128081738948822, "learning_rate": 0.0002, "loss": 0.7194, "mean_token_accuracy": 0.8039324834942818, "num_tokens": 593038154.0, "step": 7415 }, { "entropy": 0.5963605942204595, "epoch": 0.5560903178361566, "grad_norm": 0.23242223262786865, "learning_rate": 0.0002, "loss": 0.7098, "mean_token_accuracy": 0.8041646454483271, "num_tokens": 594486428.0, "step": 7420 }, { "entropy": 0.5933815114200115, "epoch": 0.5564650417700086, "grad_norm": 0.1838495284318924, "learning_rate": 0.0002, "loss": 0.7135, "mean_token_accuracy": 0.8039201308041811, "num_tokens": 595975085.0, "step": 7425 }, { "entropy": 0.5902781380340457, "epoch": 0.5568397657038604, "grad_norm": 0.20146295428276062, "learning_rate": 0.0002, "loss": 0.7141, "mean_token_accuracy": 0.8048522152006626, "num_tokens": 597422094.0, "step": 7430 }, { "entropy": 0.5829999778419733, "epoch": 0.5572144896377123, "grad_norm": 0.17914356291294098, "learning_rate": 0.0002, "loss": 0.703, "mean_token_accuracy": 0.8055457297712565, "num_tokens": 598902513.0, "step": 7435 }, { "entropy": 0.5785612151026726, "epoch": 0.5575892135715641, "grad_norm": 0.18271026015281677, "learning_rate": 0.0002, "loss": 0.6993, "mean_token_accuracy": 0.8082588218152523, "num_tokens": 600371082.0, "step": 7440 }, { "entropy": 0.581974446773529, "epoch": 0.557963937505416, "grad_norm": 0.20836769044399261, "learning_rate": 0.0002, "loss": 0.7068, "mean_token_accuracy": 0.8063400521874428, "num_tokens": 601817920.0, "step": 7445 }, { "entropy": 0.5900952322408557, "epoch": 0.5583386614392678, "grad_norm": 0.19322878122329712, "learning_rate": 0.0002, "loss": 0.7078, "mean_token_accuracy": 0.8041128639131785, "num_tokens": 603291206.0, "step": 7450 }, { "entropy": 0.5676261960528791, "epoch": 0.5587133853731197, "grad_norm": 0.1855902522802353, "learning_rate": 0.0002, "loss": 0.6779, "mean_token_accuracy": 0.8126859400421381, "num_tokens": 604740506.0, "step": 7455 }, { "entropy": 0.566250681784004, "epoch": 0.5590881093069715, "grad_norm": 0.1938765048980713, "learning_rate": 0.0002, "loss": 0.6885, "mean_token_accuracy": 0.8105572819709778, "num_tokens": 606173859.0, "step": 7460 }, { "entropy": 0.5844726532697677, "epoch": 0.5594628332408234, "grad_norm": 0.19365571439266205, "learning_rate": 0.0002, "loss": 0.7088, "mean_token_accuracy": 0.8063007041811943, "num_tokens": 607601834.0, "step": 7465 }, { "entropy": 0.583695775642991, "epoch": 0.5598375571746752, "grad_norm": 0.20877116918563843, "learning_rate": 0.0002, "loss": 0.7031, "mean_token_accuracy": 0.8031056370586157, "num_tokens": 609116258.0, "step": 7470 }, { "entropy": 0.5834782822057605, "epoch": 0.560212281108527, "grad_norm": 0.20406122505664825, "learning_rate": 0.0002, "loss": 0.6993, "mean_token_accuracy": 0.8063200004398823, "num_tokens": 610620037.0, "step": 7475 }, { "entropy": 0.5748168956488371, "epoch": 0.5605870050423789, "grad_norm": 0.18569554388523102, "learning_rate": 0.0002, "loss": 0.686, "mean_token_accuracy": 0.8121142037212848, "num_tokens": 612085519.0, "step": 7480 }, { "entropy": 0.600178087875247, "epoch": 0.5609617289762308, "grad_norm": 0.19815514981746674, "learning_rate": 0.0002, "loss": 0.7151, "mean_token_accuracy": 0.8064014919102191, "num_tokens": 613561458.0, "step": 7485 }, { "entropy": 0.5930198866873979, "epoch": 0.5613364529100826, "grad_norm": 0.1792614459991455, "learning_rate": 0.0002, "loss": 0.7086, "mean_token_accuracy": 0.8059932678937912, "num_tokens": 615020850.0, "step": 7490 }, { "entropy": 0.583706708624959, "epoch": 0.5617111768439345, "grad_norm": 0.1867334246635437, "learning_rate": 0.0002, "loss": 0.6955, "mean_token_accuracy": 0.8111580312252045, "num_tokens": 616508995.0, "step": 7495 }, { "entropy": 0.6018721261993051, "epoch": 0.5620859007777864, "grad_norm": 0.18677999079227448, "learning_rate": 0.0002, "loss": 0.7151, "mean_token_accuracy": 0.8052266262471676, "num_tokens": 617974645.0, "step": 7500 }, { "entropy": 0.5906938776373863, "epoch": 0.5624606247116383, "grad_norm": 0.19459226727485657, "learning_rate": 0.0002, "loss": 0.6971, "mean_token_accuracy": 0.8061888292431831, "num_tokens": 619423725.0, "step": 7505 }, { "entropy": 0.5798800211399794, "epoch": 0.5628353486454901, "grad_norm": 0.19587403535842896, "learning_rate": 0.0002, "loss": 0.6764, "mean_token_accuracy": 0.8083746958523989, "num_tokens": 620914147.0, "step": 7510 }, { "entropy": 0.5995255688205361, "epoch": 0.563210072579342, "grad_norm": 0.19070236384868622, "learning_rate": 0.0002, "loss": 0.7005, "mean_token_accuracy": 0.8053442861884832, "num_tokens": 622394464.0, "step": 7515 }, { "entropy": 0.5921591643244029, "epoch": 0.5635847965131938, "grad_norm": 0.2117854803800583, "learning_rate": 0.0002, "loss": 0.686, "mean_token_accuracy": 0.8097874693572521, "num_tokens": 623901618.0, "step": 7520 }, { "entropy": 0.6120941564440727, "epoch": 0.5639595204470457, "grad_norm": 0.2001645863056183, "learning_rate": 0.0002, "loss": 0.7106, "mean_token_accuracy": 0.805100978538394, "num_tokens": 625377221.0, "step": 7525 }, { "entropy": 0.6083260817453265, "epoch": 0.5643342443808975, "grad_norm": 0.23490792512893677, "learning_rate": 0.0002, "loss": 0.7073, "mean_token_accuracy": 0.8083327013999224, "num_tokens": 626846166.0, "step": 7530 }, { "entropy": 0.6125145003199577, "epoch": 0.5647089683147494, "grad_norm": 0.19664810597896576, "learning_rate": 0.0002, "loss": 0.7191, "mean_token_accuracy": 0.8048329189419746, "num_tokens": 628313798.0, "step": 7535 }, { "entropy": 0.6136679196730256, "epoch": 0.5650836922486012, "grad_norm": 0.19378547370433807, "learning_rate": 0.0002, "loss": 0.7119, "mean_token_accuracy": 0.8055468782782554, "num_tokens": 629809035.0, "step": 7540 }, { "entropy": 0.6053543008863926, "epoch": 0.5654584161824531, "grad_norm": 0.1950511783361435, "learning_rate": 0.0002, "loss": 0.6951, "mean_token_accuracy": 0.8080556385219098, "num_tokens": 631302490.0, "step": 7545 }, { "entropy": 0.601255901530385, "epoch": 0.5658331401163049, "grad_norm": 0.1984206587076187, "learning_rate": 0.0002, "loss": 0.6988, "mean_token_accuracy": 0.8057555742561817, "num_tokens": 632805451.0, "step": 7550 }, { "entropy": 0.5983289642259478, "epoch": 0.5662078640501568, "grad_norm": 0.20093242824077606, "learning_rate": 0.0002, "loss": 0.6933, "mean_token_accuracy": 0.8072681825608015, "num_tokens": 634237500.0, "step": 7555 }, { "entropy": 0.6215622378513217, "epoch": 0.5665825879840086, "grad_norm": 0.2315250039100647, "learning_rate": 0.0002, "loss": 0.7107, "mean_token_accuracy": 0.802265214920044, "num_tokens": 635718303.0, "step": 7560 }, { "entropy": 0.6169630696997046, "epoch": 0.5669573119178605, "grad_norm": 0.1915183812379837, "learning_rate": 0.0002, "loss": 0.7099, "mean_token_accuracy": 0.8035264778882265, "num_tokens": 637223850.0, "step": 7565 }, { "entropy": 0.5992787033319473, "epoch": 0.5673320358517123, "grad_norm": 0.19152645766735077, "learning_rate": 0.0002, "loss": 0.7114, "mean_token_accuracy": 0.8077162228524685, "num_tokens": 638698568.0, "step": 7570 }, { "entropy": 0.5931390238925814, "epoch": 0.5677067597855642, "grad_norm": 0.1960638463497162, "learning_rate": 0.0002, "loss": 0.6997, "mean_token_accuracy": 0.8073445502668619, "num_tokens": 640165537.0, "step": 7575 }, { "entropy": 0.5912628129124642, "epoch": 0.5680814837194161, "grad_norm": 0.2015029639005661, "learning_rate": 0.0002, "loss": 0.7036, "mean_token_accuracy": 0.806988125294447, "num_tokens": 641639463.0, "step": 7580 }, { "entropy": 0.565935049764812, "epoch": 0.568456207653268, "grad_norm": 0.17860755324363708, "learning_rate": 0.0002, "loss": 0.6777, "mean_token_accuracy": 0.8099491722881794, "num_tokens": 643107157.0, "step": 7585 }, { "entropy": 0.5953907832503319, "epoch": 0.5688309315871198, "grad_norm": 0.2122647911310196, "learning_rate": 0.0002, "loss": 0.7212, "mean_token_accuracy": 0.803196394816041, "num_tokens": 644593414.0, "step": 7590 }, { "entropy": 0.5876904988661409, "epoch": 0.5692056555209717, "grad_norm": 0.19592836499214172, "learning_rate": 0.0002, "loss": 0.7095, "mean_token_accuracy": 0.8054693106561899, "num_tokens": 646054036.0, "step": 7595 }, { "entropy": 0.5734860491007566, "epoch": 0.5695803794548235, "grad_norm": 0.20400184392929077, "learning_rate": 0.0002, "loss": 0.6982, "mean_token_accuracy": 0.8109839677810669, "num_tokens": 647550625.0, "step": 7600 }, { "entropy": 0.593635493144393, "epoch": 0.5699551033886754, "grad_norm": 0.19006314873695374, "learning_rate": 0.0002, "loss": 0.711, "mean_token_accuracy": 0.8061530534178019, "num_tokens": 649033649.0, "step": 7605 }, { "entropy": 0.5694069884717464, "epoch": 0.5703298273225272, "grad_norm": 0.20431019365787506, "learning_rate": 0.0002, "loss": 0.7018, "mean_token_accuracy": 0.807955901697278, "num_tokens": 650474973.0, "step": 7610 }, { "entropy": 0.5676112191751599, "epoch": 0.5707045512563791, "grad_norm": 0.20034658908843994, "learning_rate": 0.0002, "loss": 0.6928, "mean_token_accuracy": 0.8097437206655741, "num_tokens": 651942822.0, "step": 7615 }, { "entropy": 0.5634347690269351, "epoch": 0.5710792751902309, "grad_norm": 0.20112209022045135, "learning_rate": 0.0002, "loss": 0.6873, "mean_token_accuracy": 0.8069677438586951, "num_tokens": 653425972.0, "step": 7620 }, { "entropy": 0.5875279868021608, "epoch": 0.5714539991240828, "grad_norm": 0.21834594011306763, "learning_rate": 0.0002, "loss": 0.7042, "mean_token_accuracy": 0.8037209417670965, "num_tokens": 654865597.0, "step": 7625 }, { "entropy": 0.6001356922090053, "epoch": 0.5718287230579346, "grad_norm": 0.20048150420188904, "learning_rate": 0.0002, "loss": 0.723, "mean_token_accuracy": 0.803244448453188, "num_tokens": 656372806.0, "step": 7630 }, { "entropy": 0.6028474155813456, "epoch": 0.5722034469917865, "grad_norm": 0.2053007334470749, "learning_rate": 0.0002, "loss": 0.7259, "mean_token_accuracy": 0.8047601167112589, "num_tokens": 657837686.0, "step": 7635 }, { "entropy": 0.5672413131222129, "epoch": 0.5725781709256383, "grad_norm": 0.1833409070968628, "learning_rate": 0.0002, "loss": 0.6813, "mean_token_accuracy": 0.80884840041399, "num_tokens": 659265169.0, "step": 7640 }, { "entropy": 0.5794876063242554, "epoch": 0.5729528948594902, "grad_norm": 0.19697216153144836, "learning_rate": 0.0002, "loss": 0.7035, "mean_token_accuracy": 0.8068448659032583, "num_tokens": 660709212.0, "step": 7645 }, { "entropy": 0.5942346950992942, "epoch": 0.573327618793342, "grad_norm": 0.19349394738674164, "learning_rate": 0.0002, "loss": 0.716, "mean_token_accuracy": 0.8067252360284328, "num_tokens": 662161271.0, "step": 7650 }, { "entropy": 0.5873289234936238, "epoch": 0.573702342727194, "grad_norm": 0.18789193034172058, "learning_rate": 0.0002, "loss": 0.6969, "mean_token_accuracy": 0.809282659739256, "num_tokens": 663628157.0, "step": 7655 }, { "entropy": 0.5892319433391094, "epoch": 0.5740770666610459, "grad_norm": 0.18984587490558624, "learning_rate": 0.0002, "loss": 0.6978, "mean_token_accuracy": 0.8128319192677737, "num_tokens": 665081779.0, "step": 7660 }, { "entropy": 0.6051113065332174, "epoch": 0.5744517905948977, "grad_norm": 0.1965593546628952, "learning_rate": 0.0002, "loss": 0.7102, "mean_token_accuracy": 0.8047284685075283, "num_tokens": 666600576.0, "step": 7665 }, { "entropy": 0.6059761710464955, "epoch": 0.5748265145287496, "grad_norm": 0.2142118364572525, "learning_rate": 0.0002, "loss": 0.7082, "mean_token_accuracy": 0.8071706756949425, "num_tokens": 668060800.0, "step": 7670 }, { "entropy": 0.5941949628293515, "epoch": 0.5752012384626014, "grad_norm": 0.18021433055400848, "learning_rate": 0.0002, "loss": 0.7052, "mean_token_accuracy": 0.8064910747110844, "num_tokens": 669529787.0, "step": 7675 }, { "entropy": 0.5742913898080587, "epoch": 0.5755759623964533, "grad_norm": 0.18684989213943481, "learning_rate": 0.0002, "loss": 0.6851, "mean_token_accuracy": 0.8110403452068567, "num_tokens": 670954738.0, "step": 7680 }, { "entropy": 0.595868831500411, "epoch": 0.5759506863303051, "grad_norm": 0.20933103561401367, "learning_rate": 0.0002, "loss": 0.7009, "mean_token_accuracy": 0.8038170706480742, "num_tokens": 672405664.0, "step": 7685 }, { "entropy": 0.6084006506949663, "epoch": 0.576325410264157, "grad_norm": 0.1958962380886078, "learning_rate": 0.0002, "loss": 0.7201, "mean_token_accuracy": 0.8041846621781588, "num_tokens": 673844487.0, "step": 7690 }, { "entropy": 0.597033080086112, "epoch": 0.5767001341980088, "grad_norm": 0.19019848108291626, "learning_rate": 0.0002, "loss": 0.7055, "mean_token_accuracy": 0.806320334598422, "num_tokens": 675281677.0, "step": 7695 }, { "entropy": 0.5916355211287737, "epoch": 0.5770748581318607, "grad_norm": 0.18993963301181793, "learning_rate": 0.0002, "loss": 0.6987, "mean_token_accuracy": 0.8094860464334488, "num_tokens": 676743910.0, "step": 7700 }, { "entropy": 0.5650250352919102, "epoch": 0.5774495820657125, "grad_norm": 0.20239509642124176, "learning_rate": 0.0002, "loss": 0.6731, "mean_token_accuracy": 0.8118349462747574, "num_tokens": 678214673.0, "step": 7705 }, { "entropy": 0.5797105899080635, "epoch": 0.5778243059995644, "grad_norm": 0.22720885276794434, "learning_rate": 0.0002, "loss": 0.6918, "mean_token_accuracy": 0.8083794921636581, "num_tokens": 679661728.0, "step": 7710 }, { "entropy": 0.5935319060459733, "epoch": 0.5781990299334162, "grad_norm": 0.23104548454284668, "learning_rate": 0.0002, "loss": 0.7074, "mean_token_accuracy": 0.8056869558990002, "num_tokens": 681119681.0, "step": 7715 }, { "entropy": 0.599073476716876, "epoch": 0.5785737538672681, "grad_norm": 0.1944766640663147, "learning_rate": 0.0002, "loss": 0.6969, "mean_token_accuracy": 0.8070223983377218, "num_tokens": 682590305.0, "step": 7720 }, { "entropy": 0.5789775440469385, "epoch": 0.5789484778011199, "grad_norm": 0.21108490228652954, "learning_rate": 0.0002, "loss": 0.692, "mean_token_accuracy": 0.8107600849121809, "num_tokens": 684035256.0, "step": 7725 }, { "entropy": 0.5858387984335423, "epoch": 0.5793232017349718, "grad_norm": 0.2270599901676178, "learning_rate": 0.0002, "loss": 0.6954, "mean_token_accuracy": 0.8113955523818731, "num_tokens": 685486337.0, "step": 7730 }, { "entropy": 0.5842491878196597, "epoch": 0.5796979256688237, "grad_norm": 0.22827233374118805, "learning_rate": 0.0002, "loss": 0.6907, "mean_token_accuracy": 0.8094263240695, "num_tokens": 686897542.0, "step": 7735 }, { "entropy": 0.5877562798559666, "epoch": 0.5800726496026756, "grad_norm": 0.19864612817764282, "learning_rate": 0.0002, "loss": 0.7006, "mean_token_accuracy": 0.8082044087350368, "num_tokens": 688340519.0, "step": 7740 }, { "entropy": 0.5906383128836751, "epoch": 0.5804473735365274, "grad_norm": 0.20670181512832642, "learning_rate": 0.0002, "loss": 0.7064, "mean_token_accuracy": 0.8046660333871841, "num_tokens": 689839298.0, "step": 7745 }, { "entropy": 0.5756105359643697, "epoch": 0.5808220974703793, "grad_norm": 0.20056399703025818, "learning_rate": 0.0002, "loss": 0.698, "mean_token_accuracy": 0.8077678322792053, "num_tokens": 691290648.0, "step": 7750 }, { "entropy": 0.5931109117344022, "epoch": 0.5811968214042311, "grad_norm": 0.2060350924730301, "learning_rate": 0.0002, "loss": 0.7138, "mean_token_accuracy": 0.8033034101128578, "num_tokens": 692765725.0, "step": 7755 }, { "entropy": 0.5665806801989675, "epoch": 0.581571545338083, "grad_norm": 0.17332406342029572, "learning_rate": 0.0002, "loss": 0.6985, "mean_token_accuracy": 0.8046334616839885, "num_tokens": 694222213.0, "step": 7760 }, { "entropy": 0.570283168181777, "epoch": 0.5819462692719348, "grad_norm": 0.20416082441806793, "learning_rate": 0.0002, "loss": 0.6998, "mean_token_accuracy": 0.8088464982807636, "num_tokens": 695672797.0, "step": 7765 }, { "entropy": 0.5840190278366209, "epoch": 0.5823209932057867, "grad_norm": 0.18195323646068573, "learning_rate": 0.0002, "loss": 0.7103, "mean_token_accuracy": 0.8056580279022455, "num_tokens": 697175410.0, "step": 7770 }, { "entropy": 0.5866118997335434, "epoch": 0.5826957171396385, "grad_norm": 0.19626007974147797, "learning_rate": 0.0002, "loss": 0.7091, "mean_token_accuracy": 0.8043894909322262, "num_tokens": 698674856.0, "step": 7775 }, { "entropy": 0.5810930689796805, "epoch": 0.5830704410734904, "grad_norm": 0.20008717477321625, "learning_rate": 0.0002, "loss": 0.7022, "mean_token_accuracy": 0.8073858119547367, "num_tokens": 700132479.0, "step": 7780 }, { "entropy": 0.5978312637656927, "epoch": 0.5834451650073422, "grad_norm": 0.18433286249637604, "learning_rate": 0.0002, "loss": 0.7115, "mean_token_accuracy": 0.8071970012038946, "num_tokens": 701646036.0, "step": 7785 }, { "entropy": 0.5667931105941534, "epoch": 0.5838198889411941, "grad_norm": 0.1813700646162033, "learning_rate": 0.0002, "loss": 0.6747, "mean_token_accuracy": 0.8104138150811195, "num_tokens": 703092855.0, "step": 7790 }, { "entropy": 0.590162162296474, "epoch": 0.5841946128750459, "grad_norm": 0.1904485821723938, "learning_rate": 0.0002, "loss": 0.6946, "mean_token_accuracy": 0.8061810102313757, "num_tokens": 704573970.0, "step": 7795 }, { "entropy": 0.6017726756632328, "epoch": 0.5845693368088978, "grad_norm": 0.21526546776294708, "learning_rate": 0.0002, "loss": 0.7072, "mean_token_accuracy": 0.8068605948239564, "num_tokens": 706076203.0, "step": 7800 }, { "entropy": 0.5969136860221624, "epoch": 0.5849440607427496, "grad_norm": 0.17493298649787903, "learning_rate": 0.0002, "loss": 0.6984, "mean_token_accuracy": 0.8069991961121559, "num_tokens": 707555032.0, "step": 7805 }, { "entropy": 0.6007966425269842, "epoch": 0.5853187846766016, "grad_norm": 0.22381724417209625, "learning_rate": 0.0002, "loss": 0.7039, "mean_token_accuracy": 0.8086880639195442, "num_tokens": 709011889.0, "step": 7810 }, { "entropy": 0.5993842575699091, "epoch": 0.5856935086104534, "grad_norm": 0.19134360551834106, "learning_rate": 0.0002, "loss": 0.69, "mean_token_accuracy": 0.8073661305010319, "num_tokens": 710439856.0, "step": 7815 }, { "entropy": 0.5765126321464777, "epoch": 0.5860682325443053, "grad_norm": 0.19740228354930878, "learning_rate": 0.0002, "loss": 0.693, "mean_token_accuracy": 0.811051132157445, "num_tokens": 711906667.0, "step": 7820 }, { "entropy": 0.5790290739387274, "epoch": 0.5864429564781571, "grad_norm": 0.1942550539970398, "learning_rate": 0.0002, "loss": 0.6998, "mean_token_accuracy": 0.8071294821798801, "num_tokens": 713381927.0, "step": 7825 }, { "entropy": 0.5661481386050582, "epoch": 0.586817680412009, "grad_norm": 0.21869227290153503, "learning_rate": 0.0002, "loss": 0.6907, "mean_token_accuracy": 0.806854734942317, "num_tokens": 714844573.0, "step": 7830 }, { "entropy": 0.5775679843500257, "epoch": 0.5871924043458608, "grad_norm": 0.19849729537963867, "learning_rate": 0.0002, "loss": 0.7104, "mean_token_accuracy": 0.8023919899016618, "num_tokens": 716276142.0, "step": 7835 }, { "entropy": 0.5792938619852066, "epoch": 0.5875671282797127, "grad_norm": 0.1991315484046936, "learning_rate": 0.0002, "loss": 0.7061, "mean_token_accuracy": 0.8047155320644379, "num_tokens": 717765580.0, "step": 7840 }, { "entropy": 0.5651520744897425, "epoch": 0.5879418522135645, "grad_norm": 0.19578516483306885, "learning_rate": 0.0002, "loss": 0.687, "mean_token_accuracy": 0.8099517721682787, "num_tokens": 719216923.0, "step": 7845 }, { "entropy": 0.579817665554583, "epoch": 0.5883165761474164, "grad_norm": 0.18895196914672852, "learning_rate": 0.0002, "loss": 0.7004, "mean_token_accuracy": 0.8049699068069458, "num_tokens": 720710436.0, "step": 7850 }, { "entropy": 0.5677756506949663, "epoch": 0.5886913000812682, "grad_norm": 0.18685902655124664, "learning_rate": 0.0002, "loss": 0.695, "mean_token_accuracy": 0.8096627373248338, "num_tokens": 722182385.0, "step": 7855 }, { "entropy": 0.5767222128808498, "epoch": 0.5890660240151201, "grad_norm": 0.19704760611057281, "learning_rate": 0.0002, "loss": 0.708, "mean_token_accuracy": 0.8051927961409092, "num_tokens": 723665565.0, "step": 7860 }, { "entropy": 0.5741860877722502, "epoch": 0.5894407479489719, "grad_norm": 0.20466279983520508, "learning_rate": 0.0002, "loss": 0.703, "mean_token_accuracy": 0.8046926710754633, "num_tokens": 725140230.0, "step": 7865 }, { "entropy": 0.5668004786595702, "epoch": 0.5898154718828238, "grad_norm": 0.20640574395656586, "learning_rate": 0.0002, "loss": 0.6864, "mean_token_accuracy": 0.8110966246575118, "num_tokens": 726625300.0, "step": 7870 }, { "entropy": 0.5999583769589663, "epoch": 0.5901901958166756, "grad_norm": 0.18661759793758392, "learning_rate": 0.0002, "loss": 0.7139, "mean_token_accuracy": 0.8039695240557194, "num_tokens": 728108719.0, "step": 7875 }, { "entropy": 0.5967548925429582, "epoch": 0.5905649197505275, "grad_norm": 0.1872444748878479, "learning_rate": 0.0002, "loss": 0.7133, "mean_token_accuracy": 0.8043331354856491, "num_tokens": 729591262.0, "step": 7880 }, { "entropy": 0.5855586666613817, "epoch": 0.5909396436843793, "grad_norm": 0.2008020132780075, "learning_rate": 0.0002, "loss": 0.703, "mean_token_accuracy": 0.8075286515057087, "num_tokens": 731061577.0, "step": 7885 }, { "entropy": 0.5845367008820176, "epoch": 0.5913143676182313, "grad_norm": 0.1979912966489792, "learning_rate": 0.0002, "loss": 0.6952, "mean_token_accuracy": 0.8096553564071656, "num_tokens": 732536111.0, "step": 7890 }, { "entropy": 0.5928134037181735, "epoch": 0.5916890915520832, "grad_norm": 0.20574632287025452, "learning_rate": 0.0002, "loss": 0.7033, "mean_token_accuracy": 0.8056111950427294, "num_tokens": 733982466.0, "step": 7895 }, { "entropy": 0.5848807269707322, "epoch": 0.592063815485935, "grad_norm": 0.1847999542951584, "learning_rate": 0.0002, "loss": 0.6949, "mean_token_accuracy": 0.8083018094301224, "num_tokens": 735474465.0, "step": 7900 }, { "entropy": 0.5966286605224014, "epoch": 0.5924385394197869, "grad_norm": 0.2032739222049713, "learning_rate": 0.0002, "loss": 0.7066, "mean_token_accuracy": 0.807586082443595, "num_tokens": 736923806.0, "step": 7905 }, { "entropy": 0.5965378133580088, "epoch": 0.5928132633536387, "grad_norm": 0.2087286114692688, "learning_rate": 0.0002, "loss": 0.7081, "mean_token_accuracy": 0.8071969885379076, "num_tokens": 738404911.0, "step": 7910 }, { "entropy": 0.5767922313883901, "epoch": 0.5931879872874906, "grad_norm": 0.18538770079612732, "learning_rate": 0.0002, "loss": 0.6906, "mean_token_accuracy": 0.8095645755529404, "num_tokens": 739861098.0, "step": 7915 }, { "entropy": 0.5877749249339104, "epoch": 0.5935627112213424, "grad_norm": 0.18372981250286102, "learning_rate": 0.0002, "loss": 0.6931, "mean_token_accuracy": 0.8093736756592989, "num_tokens": 741367686.0, "step": 7920 }, { "entropy": 0.5879287998192012, "epoch": 0.5939374351551943, "grad_norm": 0.19215402007102966, "learning_rate": 0.0002, "loss": 0.6999, "mean_token_accuracy": 0.8076953437179327, "num_tokens": 742796866.0, "step": 7925 }, { "entropy": 0.5905577506870031, "epoch": 0.5943121590890461, "grad_norm": 0.19905756413936615, "learning_rate": 0.0002, "loss": 0.6962, "mean_token_accuracy": 0.8063331492245197, "num_tokens": 744282084.0, "step": 7930 }, { "entropy": 0.587031290307641, "epoch": 0.594686883022898, "grad_norm": 0.19955319166183472, "learning_rate": 0.0002, "loss": 0.6945, "mean_token_accuracy": 0.8079735834151507, "num_tokens": 745771569.0, "step": 7935 }, { "entropy": 0.5730623427778483, "epoch": 0.5950616069567498, "grad_norm": 0.18736045062541962, "learning_rate": 0.0002, "loss": 0.6746, "mean_token_accuracy": 0.8114134393632412, "num_tokens": 747229649.0, "step": 7940 }, { "entropy": 0.5902777593582869, "epoch": 0.5954363308906017, "grad_norm": 0.18328183889389038, "learning_rate": 0.0002, "loss": 0.7002, "mean_token_accuracy": 0.8065231818705797, "num_tokens": 748702922.0, "step": 7945 }, { "entropy": 0.5687355633825064, "epoch": 0.5958110548244535, "grad_norm": 0.19669070839881897, "learning_rate": 0.0002, "loss": 0.6821, "mean_token_accuracy": 0.8102533511817456, "num_tokens": 750143374.0, "step": 7950 }, { "entropy": 0.5847763823345303, "epoch": 0.5961857787583054, "grad_norm": 0.18540766835212708, "learning_rate": 0.0002, "loss": 0.7076, "mean_token_accuracy": 0.8066551204770803, "num_tokens": 751621726.0, "step": 7955 }, { "entropy": 0.586669372394681, "epoch": 0.5965605026921572, "grad_norm": 0.19026677310466766, "learning_rate": 0.0002, "loss": 0.7042, "mean_token_accuracy": 0.80835069604218, "num_tokens": 753080099.0, "step": 7960 }, { "entropy": 0.5841529309749603, "epoch": 0.5969352266260092, "grad_norm": 0.18590420484542847, "learning_rate": 0.0002, "loss": 0.6943, "mean_token_accuracy": 0.8030907459557056, "num_tokens": 754561033.0, "step": 7965 }, { "entropy": 0.5756285410374403, "epoch": 0.597309950559861, "grad_norm": 0.20476053655147552, "learning_rate": 0.0002, "loss": 0.6817, "mean_token_accuracy": 0.8071594867855311, "num_tokens": 756019908.0, "step": 7970 }, { "entropy": 0.5717176388949156, "epoch": 0.5976846744937129, "grad_norm": 0.21982760727405548, "learning_rate": 0.0002, "loss": 0.6815, "mean_token_accuracy": 0.8121380593627692, "num_tokens": 757483373.0, "step": 7975 }, { "entropy": 0.5906906824558973, "epoch": 0.5980593984275647, "grad_norm": 0.1902521550655365, "learning_rate": 0.0002, "loss": 0.6868, "mean_token_accuracy": 0.809734508395195, "num_tokens": 758952490.0, "step": 7980 }, { "entropy": 0.5868288261815906, "epoch": 0.5984341223614166, "grad_norm": 0.18939441442489624, "learning_rate": 0.0002, "loss": 0.6983, "mean_token_accuracy": 0.807635186240077, "num_tokens": 760447063.0, "step": 7985 }, { "entropy": 0.5956419611349701, "epoch": 0.5988088462952684, "grad_norm": 0.19270208477973938, "learning_rate": 0.0002, "loss": 0.704, "mean_token_accuracy": 0.8062552362680435, "num_tokens": 761923059.0, "step": 7990 }, { "entropy": 0.5773014040663839, "epoch": 0.5991835702291203, "grad_norm": 0.17398521304130554, "learning_rate": 0.0002, "loss": 0.6874, "mean_token_accuracy": 0.8077679663896561, "num_tokens": 763396398.0, "step": 7995 }, { "entropy": 0.5764816455543041, "epoch": 0.5995582941629721, "grad_norm": 0.19050125777721405, "learning_rate": 0.0002, "loss": 0.7017, "mean_token_accuracy": 0.806984619051218, "num_tokens": 764813823.0, "step": 8000 }, { "entropy": 0.5687777297571301, "epoch": 0.599933018096824, "grad_norm": 0.19997826218605042, "learning_rate": 0.0002, "loss": 0.6866, "mean_token_accuracy": 0.8110523007810115, "num_tokens": 766259775.0, "step": 8005 }, { "entropy": 0.5864621784538031, "epoch": 0.6003077420306758, "grad_norm": 0.20564059913158417, "learning_rate": 0.0002, "loss": 0.6978, "mean_token_accuracy": 0.8072557952255011, "num_tokens": 767784213.0, "step": 8010 }, { "entropy": 0.5595513289794326, "epoch": 0.6006824659645277, "grad_norm": 0.19509784877300262, "learning_rate": 0.0002, "loss": 0.674, "mean_token_accuracy": 0.8143621690571308, "num_tokens": 769178056.0, "step": 8015 }, { "entropy": 0.5693026002496481, "epoch": 0.6010571898983795, "grad_norm": 0.18054136633872986, "learning_rate": 0.0002, "loss": 0.683, "mean_token_accuracy": 0.8100642297416926, "num_tokens": 770677971.0, "step": 8020 }, { "entropy": 0.5722713623195886, "epoch": 0.6014319138322314, "grad_norm": 0.2111079841852188, "learning_rate": 0.0002, "loss": 0.6872, "mean_token_accuracy": 0.8137331161648035, "num_tokens": 772148696.0, "step": 8025 }, { "entropy": 0.6057047044858337, "epoch": 0.6018066377660832, "grad_norm": 0.1980004608631134, "learning_rate": 0.0002, "loss": 0.7191, "mean_token_accuracy": 0.8033263549208641, "num_tokens": 773643616.0, "step": 8030 }, { "entropy": 0.6016471538692713, "epoch": 0.6021813616999351, "grad_norm": 0.19902174174785614, "learning_rate": 0.0002, "loss": 0.7037, "mean_token_accuracy": 0.8057134173810482, "num_tokens": 775164430.0, "step": 8035 }, { "entropy": 0.6182386832311749, "epoch": 0.6025560856337869, "grad_norm": 0.19253374636173248, "learning_rate": 0.0002, "loss": 0.7284, "mean_token_accuracy": 0.8036291383206844, "num_tokens": 776636551.0, "step": 8040 }, { "entropy": 0.5899399966932833, "epoch": 0.6029308095676389, "grad_norm": 0.18910938501358032, "learning_rate": 0.0002, "loss": 0.687, "mean_token_accuracy": 0.8092796556651592, "num_tokens": 778140688.0, "step": 8045 }, { "entropy": 0.5916596699506045, "epoch": 0.6033055335014907, "grad_norm": 0.19330236315727234, "learning_rate": 0.0002, "loss": 0.6955, "mean_token_accuracy": 0.807006585970521, "num_tokens": 779603148.0, "step": 8050 }, { "entropy": 0.5798724723979831, "epoch": 0.6036802574353426, "grad_norm": 0.18617714941501617, "learning_rate": 0.0002, "loss": 0.6796, "mean_token_accuracy": 0.8101030580699444, "num_tokens": 781086713.0, "step": 8055 }, { "entropy": 0.5949612932279706, "epoch": 0.6040549813691944, "grad_norm": 0.21224214136600494, "learning_rate": 0.0002, "loss": 0.7035, "mean_token_accuracy": 0.8069075252860785, "num_tokens": 782564153.0, "step": 8060 }, { "entropy": 0.5959294877946377, "epoch": 0.6044297053030463, "grad_norm": 0.20205168426036835, "learning_rate": 0.0002, "loss": 0.7052, "mean_token_accuracy": 0.805767672881484, "num_tokens": 784034382.0, "step": 8065 }, { "entropy": 0.5973371878266335, "epoch": 0.6048044292368981, "grad_norm": 0.1929030418395996, "learning_rate": 0.0002, "loss": 0.7001, "mean_token_accuracy": 0.8063626457005739, "num_tokens": 785543135.0, "step": 8070 }, { "entropy": 0.596559798438102, "epoch": 0.60517915317075, "grad_norm": 0.19094890356063843, "learning_rate": 0.0002, "loss": 0.7041, "mean_token_accuracy": 0.8069100607186556, "num_tokens": 787052029.0, "step": 8075 }, { "entropy": 0.5728386703878641, "epoch": 0.6055538771046018, "grad_norm": 0.2184302657842636, "learning_rate": 0.0002, "loss": 0.6966, "mean_token_accuracy": 0.8100721724331379, "num_tokens": 788545648.0, "step": 8080 }, { "entropy": 0.5798616575077176, "epoch": 0.6059286010384537, "grad_norm": 0.19891680777072906, "learning_rate": 0.0002, "loss": 0.7106, "mean_token_accuracy": 0.8053294770419598, "num_tokens": 790028147.0, "step": 8085 }, { "entropy": 0.5553605649620295, "epoch": 0.6063033249723055, "grad_norm": 0.19162768125534058, "learning_rate": 0.0002, "loss": 0.6917, "mean_token_accuracy": 0.8074747435748577, "num_tokens": 791456158.0, "step": 8090 }, { "entropy": 0.5748262438923121, "epoch": 0.6066780489061574, "grad_norm": 0.19148604571819305, "learning_rate": 0.0002, "loss": 0.7148, "mean_token_accuracy": 0.804475998878479, "num_tokens": 792956360.0, "step": 8095 }, { "entropy": 0.5650876758620142, "epoch": 0.6070527728400092, "grad_norm": 0.19328798353672028, "learning_rate": 0.0002, "loss": 0.6922, "mean_token_accuracy": 0.8100174646824598, "num_tokens": 794379485.0, "step": 8100 }, { "entropy": 0.5906645119190216, "epoch": 0.6074274967738611, "grad_norm": 0.18306800723075867, "learning_rate": 0.0002, "loss": 0.7086, "mean_token_accuracy": 0.8057189371436835, "num_tokens": 795916423.0, "step": 8105 }, { "entropy": 0.5702934812754392, "epoch": 0.607802220707713, "grad_norm": 0.1889035403728485, "learning_rate": 0.0002, "loss": 0.6923, "mean_token_accuracy": 0.8095515720546246, "num_tokens": 797393546.0, "step": 8110 }, { "entropy": 0.5804077293723822, "epoch": 0.6081769446415648, "grad_norm": 0.21237418055534363, "learning_rate": 0.0002, "loss": 0.7243, "mean_token_accuracy": 0.8029906261712313, "num_tokens": 798882757.0, "step": 8115 }, { "entropy": 0.5750229225493968, "epoch": 0.6085516685754168, "grad_norm": 0.19595623016357422, "learning_rate": 0.0002, "loss": 0.7051, "mean_token_accuracy": 0.8066011060029268, "num_tokens": 800355108.0, "step": 8120 }, { "entropy": 0.5753758157603442, "epoch": 0.6089263925092686, "grad_norm": 0.18919703364372253, "learning_rate": 0.0002, "loss": 0.7166, "mean_token_accuracy": 0.8054553620517254, "num_tokens": 801841815.0, "step": 8125 }, { "entropy": 0.5475903509184719, "epoch": 0.6093011164431205, "grad_norm": 0.19017471373081207, "learning_rate": 0.0002, "loss": 0.6879, "mean_token_accuracy": 0.8079722169786692, "num_tokens": 803285033.0, "step": 8130 }, { "entropy": 0.5608978549018502, "epoch": 0.6096758403769723, "grad_norm": 0.1947588324546814, "learning_rate": 0.0002, "loss": 0.6862, "mean_token_accuracy": 0.8091987024992704, "num_tokens": 804763520.0, "step": 8135 }, { "entropy": 0.5641767201945186, "epoch": 0.6100505643108242, "grad_norm": 0.20011289417743683, "learning_rate": 0.0002, "loss": 0.6914, "mean_token_accuracy": 0.8080226477235556, "num_tokens": 806249987.0, "step": 8140 }, { "entropy": 0.5773737928830087, "epoch": 0.610425288244676, "grad_norm": 0.18978092074394226, "learning_rate": 0.0002, "loss": 0.7088, "mean_token_accuracy": 0.8051832035183907, "num_tokens": 807728481.0, "step": 8145 }, { "entropy": 0.5738533044233918, "epoch": 0.6108000121785279, "grad_norm": 0.2211359292268753, "learning_rate": 0.0002, "loss": 0.6945, "mean_token_accuracy": 0.8087116539478302, "num_tokens": 809212904.0, "step": 8150 }, { "entropy": 0.5658152872696519, "epoch": 0.6111747361123797, "grad_norm": 0.3157092332839966, "learning_rate": 0.0002, "loss": 0.6897, "mean_token_accuracy": 0.8090641211718321, "num_tokens": 810678910.0, "step": 8155 }, { "entropy": 0.5769775984808803, "epoch": 0.6115494600462316, "grad_norm": 0.19576676189899445, "learning_rate": 0.0002, "loss": 0.705, "mean_token_accuracy": 0.806564512476325, "num_tokens": 812145877.0, "step": 8160 }, { "entropy": 0.5558397851884365, "epoch": 0.6119241839800834, "grad_norm": 0.21386829018592834, "learning_rate": 0.0002, "loss": 0.6865, "mean_token_accuracy": 0.8103952255100012, "num_tokens": 813595479.0, "step": 8165 }, { "entropy": 0.5483615014702081, "epoch": 0.6122989079139353, "grad_norm": 0.1882874071598053, "learning_rate": 0.0002, "loss": 0.6875, "mean_token_accuracy": 0.8104403927922249, "num_tokens": 815017045.0, "step": 8170 }, { "entropy": 0.5751571854576468, "epoch": 0.6126736318477871, "grad_norm": 0.20590369403362274, "learning_rate": 0.0002, "loss": 0.7108, "mean_token_accuracy": 0.8089197184890509, "num_tokens": 816483711.0, "step": 8175 }, { "entropy": 0.5703013062477111, "epoch": 0.613048355781639, "grad_norm": 0.19656412303447723, "learning_rate": 0.0002, "loss": 0.6995, "mean_token_accuracy": 0.8103145685046911, "num_tokens": 817951235.0, "step": 8180 }, { "entropy": 0.5598161876201629, "epoch": 0.6134230797154908, "grad_norm": 0.21594884991645813, "learning_rate": 0.0002, "loss": 0.6994, "mean_token_accuracy": 0.8112949971109629, "num_tokens": 819393785.0, "step": 8185 }, { "entropy": 0.5651267623528838, "epoch": 0.6137978036493427, "grad_norm": 0.21566656231880188, "learning_rate": 0.0002, "loss": 0.6908, "mean_token_accuracy": 0.809461671486497, "num_tokens": 820859816.0, "step": 8190 }, { "entropy": 0.5662884585559368, "epoch": 0.6141725275831945, "grad_norm": 0.1833394467830658, "learning_rate": 0.0002, "loss": 0.6892, "mean_token_accuracy": 0.8074553176760674, "num_tokens": 822349571.0, "step": 8195 }, { "entropy": 0.5582444187253713, "epoch": 0.6145472515170465, "grad_norm": 0.19691596925258636, "learning_rate": 0.0002, "loss": 0.6824, "mean_token_accuracy": 0.8116187080740929, "num_tokens": 823793413.0, "step": 8200 }, { "entropy": 0.579080256074667, "epoch": 0.6149219754508983, "grad_norm": 0.19438564777374268, "learning_rate": 0.0002, "loss": 0.7004, "mean_token_accuracy": 0.8061364602297545, "num_tokens": 825282487.0, "step": 8205 }, { "entropy": 0.5439118601381778, "epoch": 0.6152966993847502, "grad_norm": 0.2058333158493042, "learning_rate": 0.0002, "loss": 0.6568, "mean_token_accuracy": 0.8160980753600597, "num_tokens": 826690633.0, "step": 8210 }, { "entropy": 0.5660943957976997, "epoch": 0.615671423318602, "grad_norm": 0.20509080588817596, "learning_rate": 0.0002, "loss": 0.6952, "mean_token_accuracy": 0.8085428606718779, "num_tokens": 828137093.0, "step": 8215 }, { "entropy": 0.5734768467023968, "epoch": 0.6160461472524539, "grad_norm": 0.20607373118400574, "learning_rate": 0.0002, "loss": 0.7059, "mean_token_accuracy": 0.8064258933067322, "num_tokens": 829627689.0, "step": 8220 }, { "entropy": 0.5552794401533901, "epoch": 0.6164208711863057, "grad_norm": 0.18687503039836884, "learning_rate": 0.0002, "loss": 0.6813, "mean_token_accuracy": 0.8111786939203739, "num_tokens": 831077961.0, "step": 8225 }, { "entropy": 0.5595267966389657, "epoch": 0.6167955951201576, "grad_norm": 0.19023694097995758, "learning_rate": 0.0002, "loss": 0.6985, "mean_token_accuracy": 0.8095310762524605, "num_tokens": 832544569.0, "step": 8230 }, { "entropy": 0.5604632142931223, "epoch": 0.6171703190540094, "grad_norm": 0.2092631310224533, "learning_rate": 0.0002, "loss": 0.6991, "mean_token_accuracy": 0.8081362251192331, "num_tokens": 834013438.0, "step": 8235 }, { "entropy": 0.5756641753017903, "epoch": 0.6175450429878613, "grad_norm": 0.19028054177761078, "learning_rate": 0.0002, "loss": 0.7156, "mean_token_accuracy": 0.8077949162572622, "num_tokens": 835470293.0, "step": 8240 }, { "entropy": 0.5543070618063212, "epoch": 0.6179197669217131, "grad_norm": 0.18457470834255219, "learning_rate": 0.0002, "loss": 0.6862, "mean_token_accuracy": 0.8086331736296415, "num_tokens": 836924523.0, "step": 8245 }, { "entropy": 0.5725725408643484, "epoch": 0.618294490855565, "grad_norm": 0.20722317695617676, "learning_rate": 0.0002, "loss": 0.7036, "mean_token_accuracy": 0.8051132149994373, "num_tokens": 838410557.0, "step": 8250 }, { "entropy": 0.5403832323849201, "epoch": 0.6186692147894168, "grad_norm": 0.220123752951622, "learning_rate": 0.0002, "loss": 0.6683, "mean_token_accuracy": 0.8135446071624756, "num_tokens": 839861382.0, "step": 8255 }, { "entropy": 0.559863312356174, "epoch": 0.6190439387232687, "grad_norm": 0.18079736828804016, "learning_rate": 0.0002, "loss": 0.678, "mean_token_accuracy": 0.8093907952308654, "num_tokens": 841356990.0, "step": 8260 }, { "entropy": 0.5631854427978397, "epoch": 0.6194186626571205, "grad_norm": 0.18131352961063385, "learning_rate": 0.0002, "loss": 0.6758, "mean_token_accuracy": 0.8106828235089779, "num_tokens": 842792888.0, "step": 8265 }, { "entropy": 0.5776104310527443, "epoch": 0.6197933865909724, "grad_norm": 0.2033277451992035, "learning_rate": 0.0002, "loss": 0.6995, "mean_token_accuracy": 0.8089812833815813, "num_tokens": 844215135.0, "step": 8270 }, { "entropy": 0.5700858352705837, "epoch": 0.6201681105248242, "grad_norm": 0.20585450530052185, "learning_rate": 0.0002, "loss": 0.6877, "mean_token_accuracy": 0.8098358035087585, "num_tokens": 845693321.0, "step": 8275 }, { "entropy": 0.5786365846171975, "epoch": 0.6205428344586762, "grad_norm": 0.19399569928646088, "learning_rate": 0.0002, "loss": 0.6889, "mean_token_accuracy": 0.8094106808304786, "num_tokens": 847139839.0, "step": 8280 }, { "entropy": 0.5678062645718456, "epoch": 0.620917558392528, "grad_norm": 0.20432007312774658, "learning_rate": 0.0002, "loss": 0.6872, "mean_token_accuracy": 0.8059116385877132, "num_tokens": 848610294.0, "step": 8285 }, { "entropy": 0.5818573461845518, "epoch": 0.6212922823263799, "grad_norm": 0.20652829110622406, "learning_rate": 0.0002, "loss": 0.696, "mean_token_accuracy": 0.8095889698714018, "num_tokens": 850087783.0, "step": 8290 }, { "entropy": 0.5814870866015553, "epoch": 0.6216670062602317, "grad_norm": 0.19550567865371704, "learning_rate": 0.0002, "loss": 0.7039, "mean_token_accuracy": 0.8082747898995877, "num_tokens": 851509698.0, "step": 8295 }, { "entropy": 0.5658791824243963, "epoch": 0.6220417301940836, "grad_norm": 0.20596009492874146, "learning_rate": 0.0002, "loss": 0.6832, "mean_token_accuracy": 0.8121348273009061, "num_tokens": 852965587.0, "step": 8300 }, { "entropy": 0.5809785714372993, "epoch": 0.6224164541279354, "grad_norm": 0.21009626984596252, "learning_rate": 0.0002, "loss": 0.6972, "mean_token_accuracy": 0.8094910971820355, "num_tokens": 854441202.0, "step": 8305 }, { "entropy": 0.5555258888751269, "epoch": 0.6227911780617873, "grad_norm": 0.29920825362205505, "learning_rate": 0.0002, "loss": 0.677, "mean_token_accuracy": 0.8136997051537037, "num_tokens": 855878126.0, "step": 8310 }, { "entropy": 0.5664542412385345, "epoch": 0.6231659019956391, "grad_norm": 0.2008151113986969, "learning_rate": 0.0002, "loss": 0.6884, "mean_token_accuracy": 0.8092805773019791, "num_tokens": 857344111.0, "step": 8315 }, { "entropy": 0.5676280679181218, "epoch": 0.623540625929491, "grad_norm": 0.19093848764896393, "learning_rate": 0.0002, "loss": 0.6947, "mean_token_accuracy": 0.8085448898375034, "num_tokens": 858826602.0, "step": 8320 }, { "entropy": 0.5745018338784575, "epoch": 0.6239153498633428, "grad_norm": 0.19585295021533966, "learning_rate": 0.0002, "loss": 0.6988, "mean_token_accuracy": 0.8079898171126842, "num_tokens": 860270393.0, "step": 8325 }, { "entropy": 0.5774150079116225, "epoch": 0.6242900737971947, "grad_norm": 0.20016410946846008, "learning_rate": 0.0002, "loss": 0.6979, "mean_token_accuracy": 0.8066683877259493, "num_tokens": 861747868.0, "step": 8330 }, { "entropy": 0.5807904439978302, "epoch": 0.6246647977310466, "grad_norm": 0.21056479215621948, "learning_rate": 0.0002, "loss": 0.6994, "mean_token_accuracy": 0.8090552642941475, "num_tokens": 863277647.0, "step": 8335 }, { "entropy": 0.5731282191351056, "epoch": 0.6250395216648984, "grad_norm": 0.18497444689273834, "learning_rate": 0.0002, "loss": 0.6961, "mean_token_accuracy": 0.8067377053201199, "num_tokens": 864718720.0, "step": 8340 }, { "entropy": 0.567909861728549, "epoch": 0.6254142455987503, "grad_norm": 0.19236025214195251, "learning_rate": 0.0002, "loss": 0.692, "mean_token_accuracy": 0.8087088353931904, "num_tokens": 866157936.0, "step": 8345 }, { "entropy": 0.56201185407117, "epoch": 0.6257889695326021, "grad_norm": 0.19527870416641235, "learning_rate": 0.0002, "loss": 0.6961, "mean_token_accuracy": 0.8054614279419183, "num_tokens": 867614805.0, "step": 8350 }, { "entropy": 0.567673066444695, "epoch": 0.6261636934664541, "grad_norm": 0.2146463841199875, "learning_rate": 0.0002, "loss": 0.695, "mean_token_accuracy": 0.811019928753376, "num_tokens": 869095419.0, "step": 8355 }, { "entropy": 0.5568877485580742, "epoch": 0.6265384174003059, "grad_norm": 0.20819926261901855, "learning_rate": 0.0002, "loss": 0.6728, "mean_token_accuracy": 0.8123465236276388, "num_tokens": 870516989.0, "step": 8360 }, { "entropy": 0.5711867371574044, "epoch": 0.6269131413341578, "grad_norm": 0.1814757138490677, "learning_rate": 0.0002, "loss": 0.676, "mean_token_accuracy": 0.8099500216543675, "num_tokens": 872008621.0, "step": 8365 }, { "entropy": 0.5775429047644138, "epoch": 0.6272878652680096, "grad_norm": 0.20236459374427795, "learning_rate": 0.0002, "loss": 0.6996, "mean_token_accuracy": 0.8062312006950378, "num_tokens": 873451061.0, "step": 8370 }, { "entropy": 0.5800029223784804, "epoch": 0.6276625892018615, "grad_norm": 0.1962941735982895, "learning_rate": 0.0002, "loss": 0.6971, "mean_token_accuracy": 0.8085700076073408, "num_tokens": 874905449.0, "step": 8375 }, { "entropy": 0.5735911060124635, "epoch": 0.6280373131357133, "grad_norm": 0.2079906314611435, "learning_rate": 0.0002, "loss": 0.7005, "mean_token_accuracy": 0.8056292936205864, "num_tokens": 876340453.0, "step": 8380 }, { "entropy": 0.562756372615695, "epoch": 0.6284120370695652, "grad_norm": 0.19131428003311157, "learning_rate": 0.0002, "loss": 0.6852, "mean_token_accuracy": 0.8085511807352305, "num_tokens": 877837777.0, "step": 8385 }, { "entropy": 0.5839243924245239, "epoch": 0.628786761003417, "grad_norm": 0.1918434202671051, "learning_rate": 0.0002, "loss": 0.7173, "mean_token_accuracy": 0.8060014024376869, "num_tokens": 879272514.0, "step": 8390 }, { "entropy": 0.5781015571206808, "epoch": 0.6291614849372689, "grad_norm": 0.19736668467521667, "learning_rate": 0.0002, "loss": 0.6972, "mean_token_accuracy": 0.8107492838054895, "num_tokens": 880768381.0, "step": 8395 }, { "entropy": 0.5687227116897702, "epoch": 0.6295362088711207, "grad_norm": 0.19855698943138123, "learning_rate": 0.0002, "loss": 0.6832, "mean_token_accuracy": 0.8098194018006325, "num_tokens": 882214192.0, "step": 8400 }, { "entropy": 0.5888054274022579, "epoch": 0.6299109328049726, "grad_norm": 0.1911585032939911, "learning_rate": 0.0002, "loss": 0.7031, "mean_token_accuracy": 0.805242034420371, "num_tokens": 1496410.0, "step": 8405 }, { "entropy": 0.5755172619596124, "epoch": 0.6302856567388244, "grad_norm": 0.23546084761619568, "learning_rate": 0.0002, "loss": 0.6852, "mean_token_accuracy": 0.8118073720484972, "num_tokens": 2930060.0, "step": 8410 }, { "entropy": 0.5808069422841072, "epoch": 0.6306603806726763, "grad_norm": 0.23282359540462494, "learning_rate": 0.0002, "loss": 0.6847, "mean_token_accuracy": 0.8110078930854797, "num_tokens": 4396099.0, "step": 8415 }, { "entropy": 0.5587007310241461, "epoch": 0.6310351046065281, "grad_norm": 0.22069253027439117, "learning_rate": 0.0002, "loss": 0.6799, "mean_token_accuracy": 0.8121158525347709, "num_tokens": 5773603.0, "step": 8420 }, { "entropy": 0.5846344409510493, "epoch": 0.63140982854038, "grad_norm": 0.22575435042381287, "learning_rate": 0.0002, "loss": 0.6967, "mean_token_accuracy": 0.8073075465857983, "num_tokens": 7212918.0, "step": 8425 }, { "entropy": 0.5789365356788039, "epoch": 0.6317845524742318, "grad_norm": 0.1881130337715149, "learning_rate": 0.0002, "loss": 0.6852, "mean_token_accuracy": 0.8097123075276613, "num_tokens": 8691708.0, "step": 8430 }, { "entropy": 0.5981750005856157, "epoch": 0.6321592764080838, "grad_norm": 0.2886039614677429, "learning_rate": 0.0002, "loss": 0.7163, "mean_token_accuracy": 0.8048824116587638, "num_tokens": 10180300.0, "step": 8435 }, { "entropy": 0.5689307535067201, "epoch": 0.6325340003419356, "grad_norm": 0.20759589970111847, "learning_rate": 0.0002, "loss": 0.6663, "mean_token_accuracy": 0.8138830494135618, "num_tokens": 11688205.0, "step": 8440 }, { "entropy": 0.5710395069792866, "epoch": 0.6329087242757875, "grad_norm": 0.20845401287078857, "learning_rate": 0.0002, "loss": 0.6874, "mean_token_accuracy": 0.8074986040592194, "num_tokens": 13169638.0, "step": 8445 }, { "entropy": 0.579454711638391, "epoch": 0.6332834482096393, "grad_norm": 0.2095518559217453, "learning_rate": 0.0002, "loss": 0.7065, "mean_token_accuracy": 0.8062692306935787, "num_tokens": 14633992.0, "step": 8450 }, { "entropy": 0.5625411363318562, "epoch": 0.6336581721434912, "grad_norm": 0.19334529340267181, "learning_rate": 0.0002, "loss": 0.6938, "mean_token_accuracy": 0.8073469400405884, "num_tokens": 16149736.0, "step": 8455 }, { "entropy": 0.5510533917695284, "epoch": 0.634032896077343, "grad_norm": 0.19582128524780273, "learning_rate": 0.0002, "loss": 0.6957, "mean_token_accuracy": 0.8064533285796642, "num_tokens": 17618085.0, "step": 8460 }, { "entropy": 0.5559302946552634, "epoch": 0.6344076200111949, "grad_norm": 0.19284266233444214, "learning_rate": 0.0002, "loss": 0.6932, "mean_token_accuracy": 0.8104426458477973, "num_tokens": 19059619.0, "step": 8465 }, { "entropy": 0.5406697784550488, "epoch": 0.6347823439450467, "grad_norm": 0.21088118851184845, "learning_rate": 0.0002, "loss": 0.6733, "mean_token_accuracy": 0.815197629109025, "num_tokens": 20506475.0, "step": 8470 }, { "entropy": 0.5678552784025669, "epoch": 0.6351570678788986, "grad_norm": 0.18218819797039032, "learning_rate": 0.0002, "loss": 0.7036, "mean_token_accuracy": 0.8072493024170398, "num_tokens": 21993183.0, "step": 8475 }, { "entropy": 0.5518258097581565, "epoch": 0.6355317918127504, "grad_norm": 0.19617252051830292, "learning_rate": 0.0002, "loss": 0.6836, "mean_token_accuracy": 0.812757333368063, "num_tokens": 23460257.0, "step": 8480 }, { "entropy": 0.5563441006466746, "epoch": 0.6359065157466023, "grad_norm": 0.20884260535240173, "learning_rate": 0.0002, "loss": 0.6871, "mean_token_accuracy": 0.8085479572415352, "num_tokens": 24938407.0, "step": 8485 }, { "entropy": 0.5468738533556461, "epoch": 0.6362812396804541, "grad_norm": 0.19096732139587402, "learning_rate": 0.0002, "loss": 0.6772, "mean_token_accuracy": 0.8108162205666304, "num_tokens": 26451525.0, "step": 8490 }, { "entropy": 0.5616157784126699, "epoch": 0.636655963614306, "grad_norm": 0.18939629197120667, "learning_rate": 0.0002, "loss": 0.6937, "mean_token_accuracy": 0.8096457369625568, "num_tokens": 27944976.0, "step": 8495 }, { "entropy": 0.569039125367999, "epoch": 0.6370306875481578, "grad_norm": 0.19134652614593506, "learning_rate": 0.0002, "loss": 0.7001, "mean_token_accuracy": 0.8059010803699493, "num_tokens": 29412733.0, "step": 8500 }, { "entropy": 0.5610488253645599, "epoch": 0.6374054114820097, "grad_norm": 0.20746880769729614, "learning_rate": 0.0002, "loss": 0.6976, "mean_token_accuracy": 0.8095976337790489, "num_tokens": 30869609.0, "step": 8505 }, { "entropy": 0.5598270410671831, "epoch": 0.6377801354158616, "grad_norm": 0.20074328780174255, "learning_rate": 0.0002, "loss": 0.7002, "mean_token_accuracy": 0.8075224459171295, "num_tokens": 32349347.0, "step": 8510 }, { "entropy": 0.551480202190578, "epoch": 0.6381548593497135, "grad_norm": 0.2060740739107132, "learning_rate": 0.0002, "loss": 0.6881, "mean_token_accuracy": 0.813231660053134, "num_tokens": 33802970.0, "step": 8515 }, { "entropy": 0.546760201267898, "epoch": 0.6385295832835653, "grad_norm": 0.18739746510982513, "learning_rate": 0.0002, "loss": 0.6857, "mean_token_accuracy": 0.8102746866643429, "num_tokens": 35299543.0, "step": 8520 }, { "entropy": 0.537922834791243, "epoch": 0.6389043072174172, "grad_norm": 0.18480037152767181, "learning_rate": 0.0002, "loss": 0.6819, "mean_token_accuracy": 0.809236577898264, "num_tokens": 36736607.0, "step": 8525 }, { "entropy": 0.5462459186092019, "epoch": 0.639279031151269, "grad_norm": 0.19691883027553558, "learning_rate": 0.0002, "loss": 0.696, "mean_token_accuracy": 0.8089580003172159, "num_tokens": 38221703.0, "step": 8530 }, { "entropy": 0.5492604221217334, "epoch": 0.6396537550851209, "grad_norm": 0.19990985095500946, "learning_rate": 0.0002, "loss": 0.6898, "mean_token_accuracy": 0.8054715771228075, "num_tokens": 39686801.0, "step": 8535 }, { "entropy": 0.5525470120832324, "epoch": 0.6400284790189728, "grad_norm": 0.20403143763542175, "learning_rate": 0.0002, "loss": 0.6937, "mean_token_accuracy": 0.8069779109209776, "num_tokens": 41131083.0, "step": 8540 }, { "entropy": 0.5413818411529064, "epoch": 0.6404032029528246, "grad_norm": 0.20404945313930511, "learning_rate": 0.0002, "loss": 0.6781, "mean_token_accuracy": 0.8129302434623241, "num_tokens": 42554671.0, "step": 8545 }, { "entropy": 0.558983477205038, "epoch": 0.6407779268866765, "grad_norm": 0.2297065109014511, "learning_rate": 0.0002, "loss": 0.682, "mean_token_accuracy": 0.8126106388866902, "num_tokens": 44047030.0, "step": 8550 }, { "entropy": 0.5592643596231938, "epoch": 0.6411526508205283, "grad_norm": 0.2130797803401947, "learning_rate": 0.0002, "loss": 0.6776, "mean_token_accuracy": 0.8114345874637365, "num_tokens": 45510497.0, "step": 8555 }, { "entropy": 0.5676627283915877, "epoch": 0.6415273747543802, "grad_norm": 0.1960885226726532, "learning_rate": 0.0002, "loss": 0.691, "mean_token_accuracy": 0.8065406002104283, "num_tokens": 46989925.0, "step": 8560 }, { "entropy": 0.5708889385685325, "epoch": 0.641902098688232, "grad_norm": 0.1981040984392166, "learning_rate": 0.0002, "loss": 0.6904, "mean_token_accuracy": 0.8097889862954617, "num_tokens": 48459310.0, "step": 8565 }, { "entropy": 0.5695927575230598, "epoch": 0.6422768226220839, "grad_norm": 0.1961108297109604, "learning_rate": 0.0002, "loss": 0.6905, "mean_token_accuracy": 0.8097909111529589, "num_tokens": 49906215.0, "step": 8570 }, { "entropy": 0.5770621059462429, "epoch": 0.6426515465559357, "grad_norm": 0.2062181681394577, "learning_rate": 0.0002, "loss": 0.6883, "mean_token_accuracy": 0.8088529571890831, "num_tokens": 51422567.0, "step": 8575 }, { "entropy": 0.5731509869918228, "epoch": 0.6430262704897876, "grad_norm": 0.18470458686351776, "learning_rate": 0.0002, "loss": 0.6959, "mean_token_accuracy": 0.8098501697182655, "num_tokens": 52882588.0, "step": 8580 }, { "entropy": 0.5826034488156437, "epoch": 0.6434009944236394, "grad_norm": 0.1951655000448227, "learning_rate": 0.0002, "loss": 0.7011, "mean_token_accuracy": 0.8081632945686579, "num_tokens": 54342416.0, "step": 8585 }, { "entropy": 0.5793723916634917, "epoch": 0.6437757183574914, "grad_norm": 0.2015705108642578, "learning_rate": 0.0002, "loss": 0.6919, "mean_token_accuracy": 0.8087500408291817, "num_tokens": 55773262.0, "step": 8590 }, { "entropy": 0.5834096387028694, "epoch": 0.6441504422913432, "grad_norm": 0.18481437861919403, "learning_rate": 0.0002, "loss": 0.6972, "mean_token_accuracy": 0.8086368825286627, "num_tokens": 57242607.0, "step": 8595 }, { "entropy": 0.581971126422286, "epoch": 0.6445251662251951, "grad_norm": 0.19084137678146362, "learning_rate": 0.0002, "loss": 0.7043, "mean_token_accuracy": 0.8073382880538702, "num_tokens": 58705761.0, "step": 8600 }, { "entropy": 0.5713413381949067, "epoch": 0.6448998901590469, "grad_norm": 0.20334690809249878, "learning_rate": 0.0002, "loss": 0.6905, "mean_token_accuracy": 0.8096685912460089, "num_tokens": 60176527.0, "step": 8605 }, { "entropy": 0.5670973364263773, "epoch": 0.6452746140928988, "grad_norm": 0.20114032924175262, "learning_rate": 0.0002, "loss": 0.6875, "mean_token_accuracy": 0.8047778647392988, "num_tokens": 61632999.0, "step": 8610 }, { "entropy": 0.5769799906760454, "epoch": 0.6456493380267506, "grad_norm": 0.18690837919712067, "learning_rate": 0.0002, "loss": 0.6982, "mean_token_accuracy": 0.8084209587424993, "num_tokens": 63120525.0, "step": 8615 }, { "entropy": 0.5726185817271471, "epoch": 0.6460240619606025, "grad_norm": 0.27100327610969543, "learning_rate": 0.0002, "loss": 0.6964, "mean_token_accuracy": 0.8063342798501253, "num_tokens": 64579346.0, "step": 8620 }, { "entropy": 0.5941944573074579, "epoch": 0.6463987858944543, "grad_norm": 0.186022087931633, "learning_rate": 0.0002, "loss": 0.7161, "mean_token_accuracy": 0.8024027045816183, "num_tokens": 66040201.0, "step": 8625 }, { "entropy": 0.5769894793629646, "epoch": 0.6467735098283062, "grad_norm": 0.18814796209335327, "learning_rate": 0.0002, "loss": 0.6969, "mean_token_accuracy": 0.8076300878077746, "num_tokens": 67550134.0, "step": 8630 }, { "entropy": 0.5453541431576013, "epoch": 0.647148233762158, "grad_norm": 0.21156731247901917, "learning_rate": 0.0002, "loss": 0.6611, "mean_token_accuracy": 0.8145249966531992, "num_tokens": 69001254.0, "step": 8635 }, { "entropy": 0.5653457913547755, "epoch": 0.6475229576960099, "grad_norm": 0.215678870677948, "learning_rate": 0.0002, "loss": 0.6898, "mean_token_accuracy": 0.8086681064218283, "num_tokens": 70406555.0, "step": 8640 }, { "entropy": 0.5678335951641202, "epoch": 0.6478976816298617, "grad_norm": 0.19801416993141174, "learning_rate": 0.0002, "loss": 0.694, "mean_token_accuracy": 0.8087244365364313, "num_tokens": 71847213.0, "step": 8645 }, { "entropy": 0.5608452854678034, "epoch": 0.6482724055637136, "grad_norm": 0.226779043674469, "learning_rate": 0.0002, "loss": 0.6926, "mean_token_accuracy": 0.8077531050890684, "num_tokens": 73315709.0, "step": 8650 }, { "entropy": 0.577993737347424, "epoch": 0.6486471294975654, "grad_norm": 0.193668395280838, "learning_rate": 0.0002, "loss": 0.6986, "mean_token_accuracy": 0.806943853572011, "num_tokens": 74776275.0, "step": 8655 }, { "entropy": 0.5690838759765029, "epoch": 0.6490218534314173, "grad_norm": 0.1839076280593872, "learning_rate": 0.0002, "loss": 0.6894, "mean_token_accuracy": 0.808586011454463, "num_tokens": 76222010.0, "step": 8660 }, { "entropy": 0.5684005826711654, "epoch": 0.6493965773652692, "grad_norm": 0.21151287853717804, "learning_rate": 0.0002, "loss": 0.6801, "mean_token_accuracy": 0.8097742985934019, "num_tokens": 77711872.0, "step": 8665 }, { "entropy": 0.5708202332258224, "epoch": 0.6497713012991211, "grad_norm": 0.19817091524600983, "learning_rate": 0.0002, "loss": 0.6924, "mean_token_accuracy": 0.8101702984422445, "num_tokens": 79200415.0, "step": 8670 }, { "entropy": 0.5869948253035545, "epoch": 0.6501460252329729, "grad_norm": 0.19797123968601227, "learning_rate": 0.0002, "loss": 0.7027, "mean_token_accuracy": 0.8071863129734993, "num_tokens": 80613372.0, "step": 8675 }, { "entropy": 0.5822664242237806, "epoch": 0.6505207491668248, "grad_norm": 0.19880083203315735, "learning_rate": 0.0002, "loss": 0.6895, "mean_token_accuracy": 0.8070543106645346, "num_tokens": 82082847.0, "step": 8680 }, { "entropy": 0.5791427478194237, "epoch": 0.6508954731006766, "grad_norm": 0.20377899706363678, "learning_rate": 0.0002, "loss": 0.7061, "mean_token_accuracy": 0.8098480939865113, "num_tokens": 83548790.0, "step": 8685 }, { "entropy": 0.5749524630606174, "epoch": 0.6512701970345285, "grad_norm": 0.191803976893425, "learning_rate": 0.0002, "loss": 0.7009, "mean_token_accuracy": 0.8107928164303303, "num_tokens": 85021210.0, "step": 8690 }, { "entropy": 0.5761828595772386, "epoch": 0.6516449209683803, "grad_norm": 0.20027513802051544, "learning_rate": 0.0002, "loss": 0.6926, "mean_token_accuracy": 0.8116225130856037, "num_tokens": 86476274.0, "step": 8695 }, { "entropy": 0.5761985827237368, "epoch": 0.6520196449022322, "grad_norm": 0.20933090150356293, "learning_rate": 0.0002, "loss": 0.6987, "mean_token_accuracy": 0.8097898788750172, "num_tokens": 87911173.0, "step": 8700 }, { "entropy": 0.5623636208474636, "epoch": 0.652394368836084, "grad_norm": 0.26050320267677307, "learning_rate": 0.0002, "loss": 0.6724, "mean_token_accuracy": 0.8101261738687754, "num_tokens": 89370374.0, "step": 8705 }, { "entropy": 0.5814249807037413, "epoch": 0.6527690927699359, "grad_norm": 0.19781731069087982, "learning_rate": 0.0002, "loss": 0.6838, "mean_token_accuracy": 0.8083756312727928, "num_tokens": 90898672.0, "step": 8710 }, { "entropy": 0.5905889384448528, "epoch": 0.6531438167037877, "grad_norm": 0.18385039269924164, "learning_rate": 0.0002, "loss": 0.6859, "mean_token_accuracy": 0.809320218116045, "num_tokens": 92389423.0, "step": 8715 }, { "entropy": 0.5876932037994266, "epoch": 0.6535185406376396, "grad_norm": 0.18813864886760712, "learning_rate": 0.0002, "loss": 0.7013, "mean_token_accuracy": 0.8081163119524717, "num_tokens": 93858283.0, "step": 8720 }, { "entropy": 0.596122420206666, "epoch": 0.6538932645714914, "grad_norm": 0.20324824750423431, "learning_rate": 0.0002, "loss": 0.6897, "mean_token_accuracy": 0.8084863942116499, "num_tokens": 95304144.0, "step": 8725 }, { "entropy": 0.5819827310740948, "epoch": 0.6542679885053433, "grad_norm": 0.19888535141944885, "learning_rate": 0.0002, "loss": 0.6975, "mean_token_accuracy": 0.8109719477593899, "num_tokens": 96737231.0, "step": 8730 }, { "entropy": 0.5563973736017942, "epoch": 0.6546427124391951, "grad_norm": 0.19321641325950623, "learning_rate": 0.0002, "loss": 0.6841, "mean_token_accuracy": 0.8080523051321507, "num_tokens": 98175488.0, "step": 8735 }, { "entropy": 0.5603059965185822, "epoch": 0.655017436373047, "grad_norm": 0.22084487974643707, "learning_rate": 0.0002, "loss": 0.6802, "mean_token_accuracy": 0.8104587394744158, "num_tokens": 99693202.0, "step": 8740 }, { "entropy": 0.5743392518721521, "epoch": 0.655392160306899, "grad_norm": 0.2119465470314026, "learning_rate": 0.0002, "loss": 0.6852, "mean_token_accuracy": 0.810924569889903, "num_tokens": 101208801.0, "step": 8745 }, { "entropy": 0.5651632038876414, "epoch": 0.6557668842407508, "grad_norm": 0.19678038358688354, "learning_rate": 0.0002, "loss": 0.6807, "mean_token_accuracy": 0.811205055192113, "num_tokens": 102664874.0, "step": 8750 }, { "entropy": 0.5646142058074475, "epoch": 0.6561416081746027, "grad_norm": 0.21980001032352448, "learning_rate": 0.0002, "loss": 0.6857, "mean_token_accuracy": 0.8109080404043197, "num_tokens": 104099322.0, "step": 8755 }, { "entropy": 0.5780180606991052, "epoch": 0.6565163321084545, "grad_norm": 0.2248496562242508, "learning_rate": 0.0002, "loss": 0.7037, "mean_token_accuracy": 0.8070934232324362, "num_tokens": 105563878.0, "step": 8760 }, { "entropy": 0.5852313162758946, "epoch": 0.6568910560423064, "grad_norm": 0.19650156795978546, "learning_rate": 0.0002, "loss": 0.6907, "mean_token_accuracy": 0.8072824817150831, "num_tokens": 107039851.0, "step": 8765 }, { "entropy": 0.5697753534652292, "epoch": 0.6572657799761582, "grad_norm": 0.21986687183380127, "learning_rate": 0.0002, "loss": 0.6926, "mean_token_accuracy": 0.8075875218957662, "num_tokens": 108501191.0, "step": 8770 }, { "entropy": 0.5707916723564267, "epoch": 0.65764050391001, "grad_norm": 0.21602872014045715, "learning_rate": 0.0002, "loss": 0.7028, "mean_token_accuracy": 0.8074234060943126, "num_tokens": 109963453.0, "step": 8775 }, { "entropy": 0.5703456372022628, "epoch": 0.6580152278438619, "grad_norm": 0.21008418500423431, "learning_rate": 0.0002, "loss": 0.6831, "mean_token_accuracy": 0.8088516768068075, "num_tokens": 111432178.0, "step": 8780 }, { "entropy": 0.5685243390500545, "epoch": 0.6583899517777138, "grad_norm": 0.21793080866336823, "learning_rate": 0.0002, "loss": 0.6975, "mean_token_accuracy": 0.8098353412002325, "num_tokens": 112921951.0, "step": 8785 }, { "entropy": 0.5748433137312532, "epoch": 0.6587646757115656, "grad_norm": 0.18088218569755554, "learning_rate": 0.0002, "loss": 0.701, "mean_token_accuracy": 0.8076816953718662, "num_tokens": 114371472.0, "step": 8790 }, { "entropy": 0.5717011159285903, "epoch": 0.6591393996454175, "grad_norm": 0.2119349092245102, "learning_rate": 0.0002, "loss": 0.6947, "mean_token_accuracy": 0.8088817920535802, "num_tokens": 115830188.0, "step": 8795 }, { "entropy": 0.5757096532732249, "epoch": 0.6595141235792693, "grad_norm": 0.21579429507255554, "learning_rate": 0.0002, "loss": 0.6904, "mean_token_accuracy": 0.8101357307285071, "num_tokens": 117309251.0, "step": 8800 }, { "entropy": 0.5914191605523229, "epoch": 0.6598888475131212, "grad_norm": 0.19164599478244781, "learning_rate": 0.0002, "loss": 0.7069, "mean_token_accuracy": 0.806368550658226, "num_tokens": 118799332.0, "step": 8805 }, { "entropy": 0.5542774179950356, "epoch": 0.660263571446973, "grad_norm": 0.1998702436685562, "learning_rate": 0.0002, "loss": 0.6672, "mean_token_accuracy": 0.813937944918871, "num_tokens": 120204893.0, "step": 8810 }, { "entropy": 0.5746725667268038, "epoch": 0.6606382953808249, "grad_norm": 0.20053455233573914, "learning_rate": 0.0002, "loss": 0.677, "mean_token_accuracy": 0.8108076307922601, "num_tokens": 121679630.0, "step": 8815 }, { "entropy": 0.579914634861052, "epoch": 0.6610130193146768, "grad_norm": 0.19699405133724213, "learning_rate": 0.0002, "loss": 0.6817, "mean_token_accuracy": 0.808728438988328, "num_tokens": 123179554.0, "step": 8820 }, { "entropy": 0.6085537414997816, "epoch": 0.6613877432485287, "grad_norm": 0.18440335988998413, "learning_rate": 0.0002, "loss": 0.7097, "mean_token_accuracy": 0.806881207600236, "num_tokens": 124675034.0, "step": 8825 }, { "entropy": 0.6030332835391163, "epoch": 0.6617624671823805, "grad_norm": 0.20188429951667786, "learning_rate": 0.0002, "loss": 0.6899, "mean_token_accuracy": 0.8092405457049608, "num_tokens": 126143035.0, "step": 8830 }, { "entropy": 0.5984531674534083, "epoch": 0.6621371911162324, "grad_norm": 0.193935826420784, "learning_rate": 0.0002, "loss": 0.7025, "mean_token_accuracy": 0.8082299046218395, "num_tokens": 127661493.0, "step": 8835 }, { "entropy": 0.5641464740037918, "epoch": 0.6625119150500842, "grad_norm": 0.1879323124885559, "learning_rate": 0.0002, "loss": 0.6782, "mean_token_accuracy": 0.8109245292842389, "num_tokens": 129139773.0, "step": 8840 }, { "entropy": 0.5743764961138368, "epoch": 0.6628866389839361, "grad_norm": 0.19059154391288757, "learning_rate": 0.0002, "loss": 0.6969, "mean_token_accuracy": 0.8078914698213339, "num_tokens": 130630502.0, "step": 8845 }, { "entropy": 0.5711457682773471, "epoch": 0.6632613629177879, "grad_norm": 0.19288752973079681, "learning_rate": 0.0002, "loss": 0.6903, "mean_token_accuracy": 0.809267695993185, "num_tokens": 132136917.0, "step": 8850 }, { "entropy": 0.5765966830775142, "epoch": 0.6636360868516398, "grad_norm": 0.2198989987373352, "learning_rate": 0.0002, "loss": 0.6969, "mean_token_accuracy": 0.8063497994095087, "num_tokens": 133651189.0, "step": 8855 }, { "entropy": 0.5767378704622388, "epoch": 0.6640108107854916, "grad_norm": 0.21669413149356842, "learning_rate": 0.0002, "loss": 0.7038, "mean_token_accuracy": 0.8073947541415691, "num_tokens": 135125285.0, "step": 8860 }, { "entropy": 0.5579169292002917, "epoch": 0.6643855347193435, "grad_norm": 0.20026569068431854, "learning_rate": 0.0002, "loss": 0.6758, "mean_token_accuracy": 0.8136023204773665, "num_tokens": 136605387.0, "step": 8865 }, { "entropy": 0.5585828656330705, "epoch": 0.6647602586531953, "grad_norm": 0.2021680772304535, "learning_rate": 0.0002, "loss": 0.6826, "mean_token_accuracy": 0.8136448469012976, "num_tokens": 138053724.0, "step": 8870 }, { "entropy": 0.5575320355594158, "epoch": 0.6651349825870472, "grad_norm": 0.18833491206169128, "learning_rate": 0.0002, "loss": 0.6905, "mean_token_accuracy": 0.8084268566220999, "num_tokens": 139567359.0, "step": 8875 }, { "entropy": 0.5662820022553205, "epoch": 0.665509706520899, "grad_norm": 0.1924443393945694, "learning_rate": 0.0002, "loss": 0.6977, "mean_token_accuracy": 0.8085336111485958, "num_tokens": 141069635.0, "step": 8880 }, { "entropy": 0.5546599773690104, "epoch": 0.6658844304547509, "grad_norm": 0.20662815868854523, "learning_rate": 0.0002, "loss": 0.6972, "mean_token_accuracy": 0.8074016332626343, "num_tokens": 142508239.0, "step": 8885 }, { "entropy": 0.5568461997434497, "epoch": 0.6662591543886027, "grad_norm": 0.21860404312610626, "learning_rate": 0.0002, "loss": 0.694, "mean_token_accuracy": 0.8104878421872854, "num_tokens": 143951680.0, "step": 8890 }, { "entropy": 0.5609617548063397, "epoch": 0.6666338783224546, "grad_norm": 0.20741046965122223, "learning_rate": 0.0002, "loss": 0.6927, "mean_token_accuracy": 0.8099694509059191, "num_tokens": 145454370.0, "step": 8895 }, { "entropy": 0.5760274022817612, "epoch": 0.6670086022563065, "grad_norm": 0.21321925520896912, "learning_rate": 0.0002, "loss": 0.7034, "mean_token_accuracy": 0.805579998716712, "num_tokens": 146956614.0, "step": 8900 }, { "entropy": 0.5686179386451841, "epoch": 0.6673833261901584, "grad_norm": 0.20321384072303772, "learning_rate": 0.0002, "loss": 0.7047, "mean_token_accuracy": 0.8046811398118734, "num_tokens": 148467003.0, "step": 8905 }, { "entropy": 0.5661126192659139, "epoch": 0.6677580501240102, "grad_norm": 0.21433976292610168, "learning_rate": 0.0002, "loss": 0.6906, "mean_token_accuracy": 0.8090635381639004, "num_tokens": 149945998.0, "step": 8910 }, { "entropy": 0.5571287432685494, "epoch": 0.6681327740578621, "grad_norm": 0.19607661664485931, "learning_rate": 0.0002, "loss": 0.6907, "mean_token_accuracy": 0.8120898067951202, "num_tokens": 151417073.0, "step": 8915 }, { "entropy": 0.5581950359046459, "epoch": 0.6685074979917139, "grad_norm": 0.19918368756771088, "learning_rate": 0.0002, "loss": 0.6926, "mean_token_accuracy": 0.8089442666620016, "num_tokens": 152904023.0, "step": 8920 }, { "entropy": 0.5568174466490745, "epoch": 0.6688822219255658, "grad_norm": 0.2314944863319397, "learning_rate": 0.0002, "loss": 0.6956, "mean_token_accuracy": 0.8092631235718727, "num_tokens": 154344633.0, "step": 8925 }, { "entropy": 0.5678830741904676, "epoch": 0.6692569458594176, "grad_norm": 0.22213736176490784, "learning_rate": 0.0002, "loss": 0.7071, "mean_token_accuracy": 0.8064409505575896, "num_tokens": 155793490.0, "step": 8930 }, { "entropy": 0.5460827864706517, "epoch": 0.6696316697932695, "grad_norm": 0.19722233712673187, "learning_rate": 0.0002, "loss": 0.6785, "mean_token_accuracy": 0.8109843447804451, "num_tokens": 157278761.0, "step": 8935 }, { "entropy": 0.5517586566507816, "epoch": 0.6700063937271213, "grad_norm": 0.2041502594947815, "learning_rate": 0.0002, "loss": 0.6719, "mean_token_accuracy": 0.8081135839223862, "num_tokens": 158786291.0, "step": 8940 }, { "entropy": 0.546484770718962, "epoch": 0.6703811176609732, "grad_norm": 0.20371313393115997, "learning_rate": 0.0002, "loss": 0.6862, "mean_token_accuracy": 0.8098129782825708, "num_tokens": 160236807.0, "step": 8945 }, { "entropy": 0.5637400499545038, "epoch": 0.670755841594825, "grad_norm": 0.20929396152496338, "learning_rate": 0.0002, "loss": 0.6856, "mean_token_accuracy": 0.8086141340434552, "num_tokens": 161663970.0, "step": 8950 }, { "entropy": 0.5775301699526608, "epoch": 0.6711305655286769, "grad_norm": 0.21279333531856537, "learning_rate": 0.0002, "loss": 0.6845, "mean_token_accuracy": 0.8103465884923935, "num_tokens": 163161770.0, "step": 8955 }, { "entropy": 0.5621178966015578, "epoch": 0.6715052894625287, "grad_norm": 0.20368540287017822, "learning_rate": 0.0002, "loss": 0.6819, "mean_token_accuracy": 0.8114964615553617, "num_tokens": 164595319.0, "step": 8960 }, { "entropy": 0.5786595139652491, "epoch": 0.6718800133963806, "grad_norm": 0.21201135218143463, "learning_rate": 0.0002, "loss": 0.7032, "mean_token_accuracy": 0.8064602233469487, "num_tokens": 166046477.0, "step": 8965 }, { "entropy": 0.5580886678770185, "epoch": 0.6722547373302324, "grad_norm": 0.205311119556427, "learning_rate": 0.0002, "loss": 0.6727, "mean_token_accuracy": 0.813159741833806, "num_tokens": 167487888.0, "step": 8970 }, { "entropy": 0.5674699062481523, "epoch": 0.6726294612640844, "grad_norm": 0.203612819314003, "learning_rate": 0.0002, "loss": 0.6972, "mean_token_accuracy": 0.8099332392215729, "num_tokens": 168913214.0, "step": 8975 }, { "entropy": 0.5837869113311172, "epoch": 0.6730041851979363, "grad_norm": 0.20636308193206787, "learning_rate": 0.0002, "loss": 0.7005, "mean_token_accuracy": 0.8093649551272393, "num_tokens": 170390645.0, "step": 8980 }, { "entropy": 0.5715874034911395, "epoch": 0.6733789091317881, "grad_norm": 0.20765885710716248, "learning_rate": 0.0002, "loss": 0.7013, "mean_token_accuracy": 0.8106307946145535, "num_tokens": 171843757.0, "step": 8985 }, { "entropy": 0.5734463004395366, "epoch": 0.67375363306564, "grad_norm": 0.18244491517543793, "learning_rate": 0.0002, "loss": 0.7015, "mean_token_accuracy": 0.8073928449302912, "num_tokens": 173364380.0, "step": 8990 }, { "entropy": 0.5472411831840873, "epoch": 0.6741283569994918, "grad_norm": 0.18661321699619293, "learning_rate": 0.0002, "loss": 0.6807, "mean_token_accuracy": 0.8091681204736233, "num_tokens": 174806553.0, "step": 8995 }, { "entropy": 0.5607635343447328, "epoch": 0.6745030809333437, "grad_norm": 0.21728909015655518, "learning_rate": 0.0002, "loss": 0.6979, "mean_token_accuracy": 0.8106600705534219, "num_tokens": 176257030.0, "step": 9000 }, { "entropy": 0.5469781465828418, "epoch": 0.6748778048671955, "grad_norm": 0.18325121700763702, "learning_rate": 0.0002, "loss": 0.6708, "mean_token_accuracy": 0.8131164114922285, "num_tokens": 177744955.0, "step": 9005 }, { "entropy": 0.5594745030626654, "epoch": 0.6752525288010474, "grad_norm": 0.2094789445400238, "learning_rate": 0.0002, "loss": 0.7006, "mean_token_accuracy": 0.8103986769914627, "num_tokens": 179240641.0, "step": 9010 }, { "entropy": 0.5589252021163702, "epoch": 0.6756272527348992, "grad_norm": 0.20882214605808258, "learning_rate": 0.0002, "loss": 0.6919, "mean_token_accuracy": 0.8077965781092644, "num_tokens": 180726700.0, "step": 9015 }, { "entropy": 0.5438504736870527, "epoch": 0.6760019766687511, "grad_norm": 0.19996365904808044, "learning_rate": 0.0002, "loss": 0.6753, "mean_token_accuracy": 0.8113720454275608, "num_tokens": 182173057.0, "step": 9020 }, { "entropy": 0.5586946949362754, "epoch": 0.6763767006026029, "grad_norm": 0.2276037335395813, "learning_rate": 0.0002, "loss": 0.6951, "mean_token_accuracy": 0.8075786910951137, "num_tokens": 183626630.0, "step": 9025 }, { "entropy": 0.5570791793987155, "epoch": 0.6767514245364548, "grad_norm": 0.2244814783334732, "learning_rate": 0.0002, "loss": 0.6942, "mean_token_accuracy": 0.8107522763311863, "num_tokens": 185089641.0, "step": 9030 }, { "entropy": 0.5529836066067219, "epoch": 0.6771261484703066, "grad_norm": 0.20379705727100372, "learning_rate": 0.0002, "loss": 0.6792, "mean_token_accuracy": 0.8092548206448555, "num_tokens": 186582556.0, "step": 9035 }, { "entropy": 0.5573127653449774, "epoch": 0.6775008724041585, "grad_norm": 0.2100895494222641, "learning_rate": 0.0002, "loss": 0.6899, "mean_token_accuracy": 0.8095039911568165, "num_tokens": 188032837.0, "step": 9040 }, { "entropy": 0.5489915238693357, "epoch": 0.6778755963380103, "grad_norm": 0.19994409382343292, "learning_rate": 0.0002, "loss": 0.6795, "mean_token_accuracy": 0.8142535593360662, "num_tokens": 189496599.0, "step": 9045 }, { "entropy": 0.5641831092536449, "epoch": 0.6782503202718622, "grad_norm": 0.18870925903320312, "learning_rate": 0.0002, "loss": 0.6986, "mean_token_accuracy": 0.8074972029775381, "num_tokens": 191005792.0, "step": 9050 }, { "entropy": 0.5523669173009693, "epoch": 0.6786250442057141, "grad_norm": 0.19186195731163025, "learning_rate": 0.0002, "loss": 0.6817, "mean_token_accuracy": 0.8120878357440233, "num_tokens": 192496236.0, "step": 9055 }, { "entropy": 0.5598044337704777, "epoch": 0.678999768139566, "grad_norm": 0.21367278695106506, "learning_rate": 0.0002, "loss": 0.6936, "mean_token_accuracy": 0.8092241737991571, "num_tokens": 193952460.0, "step": 9060 }, { "entropy": 0.5532370926812291, "epoch": 0.6793744920734178, "grad_norm": 0.20122574269771576, "learning_rate": 0.0002, "loss": 0.7018, "mean_token_accuracy": 0.8118499521166086, "num_tokens": 195399516.0, "step": 9065 }, { "entropy": 0.5460626963526011, "epoch": 0.6797492160072697, "grad_norm": 0.20467646420001984, "learning_rate": 0.0002, "loss": 0.6832, "mean_token_accuracy": 0.8134499758481979, "num_tokens": 196820007.0, "step": 9070 }, { "entropy": 0.5526738118380308, "epoch": 0.6801239399411215, "grad_norm": 0.20568911731243134, "learning_rate": 0.0002, "loss": 0.7067, "mean_token_accuracy": 0.8053404461592436, "num_tokens": 198269528.0, "step": 9075 }, { "entropy": 0.5546795919537544, "epoch": 0.6804986638749734, "grad_norm": 0.1940842866897583, "learning_rate": 0.0002, "loss": 0.6986, "mean_token_accuracy": 0.8075628586113452, "num_tokens": 199726896.0, "step": 9080 }, { "entropy": 0.5411200986243785, "epoch": 0.6808733878088252, "grad_norm": 0.20041145384311676, "learning_rate": 0.0002, "loss": 0.6755, "mean_token_accuracy": 0.8118481777608395, "num_tokens": 201148432.0, "step": 9085 }, { "entropy": 0.5549153448082507, "epoch": 0.6812481117426771, "grad_norm": 0.2192068248987198, "learning_rate": 0.0002, "loss": 0.6879, "mean_token_accuracy": 0.808594747632742, "num_tokens": 202592193.0, "step": 9090 }, { "entropy": 0.5655572084710002, "epoch": 0.6816228356765289, "grad_norm": 0.20177920162677765, "learning_rate": 0.0002, "loss": 0.6999, "mean_token_accuracy": 0.8086969520896673, "num_tokens": 204050093.0, "step": 9095 }, { "entropy": 0.5540594425052404, "epoch": 0.6819975596103808, "grad_norm": 0.18365873396396637, "learning_rate": 0.0002, "loss": 0.6823, "mean_token_accuracy": 0.8127973280847073, "num_tokens": 205470026.0, "step": 9100 }, { "entropy": 0.5712278252467513, "epoch": 0.6823722835442326, "grad_norm": 0.22976966202259064, "learning_rate": 0.0002, "loss": 0.7036, "mean_token_accuracy": 0.808131943270564, "num_tokens": 206938010.0, "step": 9105 }, { "entropy": 0.5711413573473691, "epoch": 0.6827470074780845, "grad_norm": 0.20322518050670624, "learning_rate": 0.0002, "loss": 0.695, "mean_token_accuracy": 0.8081599473953247, "num_tokens": 208457122.0, "step": 9110 }, { "entropy": 0.5460259074345231, "epoch": 0.6831217314119363, "grad_norm": 0.19843657314777374, "learning_rate": 0.0002, "loss": 0.6737, "mean_token_accuracy": 0.811813572049141, "num_tokens": 209916294.0, "step": 9115 }, { "entropy": 0.5505036115646362, "epoch": 0.6834964553457882, "grad_norm": 0.20029638707637787, "learning_rate": 0.0002, "loss": 0.6811, "mean_token_accuracy": 0.8121244564652443, "num_tokens": 211339968.0, "step": 9120 }, { "entropy": 0.5573441937565804, "epoch": 0.68387117927964, "grad_norm": 0.1951913982629776, "learning_rate": 0.0002, "loss": 0.6963, "mean_token_accuracy": 0.8079810388386249, "num_tokens": 212800522.0, "step": 9125 }, { "entropy": 0.544712056685239, "epoch": 0.684245903213492, "grad_norm": 0.21435868740081787, "learning_rate": 0.0002, "loss": 0.6767, "mean_token_accuracy": 0.8143212907016277, "num_tokens": 214237818.0, "step": 9130 }, { "entropy": 0.5650360442698001, "epoch": 0.6846206271473438, "grad_norm": 0.22520844638347626, "learning_rate": 0.0002, "loss": 0.6868, "mean_token_accuracy": 0.8089212913066148, "num_tokens": 215743378.0, "step": 9135 }, { "entropy": 0.5697091825306415, "epoch": 0.6849953510811957, "grad_norm": 0.21463395655155182, "learning_rate": 0.0002, "loss": 0.6977, "mean_token_accuracy": 0.8075072657316923, "num_tokens": 217244566.0, "step": 9140 }, { "entropy": 0.5546705506742, "epoch": 0.6853700750150475, "grad_norm": 0.19887536764144897, "learning_rate": 0.0002, "loss": 0.6864, "mean_token_accuracy": 0.8108701415359973, "num_tokens": 218737628.0, "step": 9145 }, { "entropy": 0.5619656886905432, "epoch": 0.6857447989488994, "grad_norm": 0.19196076691150665, "learning_rate": 0.0002, "loss": 0.6867, "mean_token_accuracy": 0.8086641110479832, "num_tokens": 220262891.0, "step": 9150 }, { "entropy": 0.5744221860542893, "epoch": 0.6861195228827512, "grad_norm": 0.221007838845253, "learning_rate": 0.0002, "loss": 0.7022, "mean_token_accuracy": 0.8088461615145206, "num_tokens": 221767164.0, "step": 9155 }, { "entropy": 0.5624970091506839, "epoch": 0.6864942468166031, "grad_norm": 0.1909516602754593, "learning_rate": 0.0002, "loss": 0.6744, "mean_token_accuracy": 0.813494211807847, "num_tokens": 223226506.0, "step": 9160 }, { "entropy": 0.5623449506238103, "epoch": 0.6868689707504549, "grad_norm": 0.19920077919960022, "learning_rate": 0.0002, "loss": 0.6852, "mean_token_accuracy": 0.8075623389333486, "num_tokens": 224738292.0, "step": 9165 }, { "entropy": 0.5535861153155566, "epoch": 0.6872436946843068, "grad_norm": 0.19877851009368896, "learning_rate": 0.0002, "loss": 0.6688, "mean_token_accuracy": 0.8138801120221615, "num_tokens": 226166784.0, "step": 9170 }, { "entropy": 0.5704408928751945, "epoch": 0.6876184186181586, "grad_norm": 0.2272275984287262, "learning_rate": 0.0002, "loss": 0.6867, "mean_token_accuracy": 0.8097728822380305, "num_tokens": 227615083.0, "step": 9175 }, { "entropy": 0.5645706355571747, "epoch": 0.6879931425520105, "grad_norm": 0.19913746416568756, "learning_rate": 0.0002, "loss": 0.685, "mean_token_accuracy": 0.813049491494894, "num_tokens": 229074262.0, "step": 9180 }, { "entropy": 0.5662490891292691, "epoch": 0.6883678664858623, "grad_norm": 0.1923271119594574, "learning_rate": 0.0002, "loss": 0.6975, "mean_token_accuracy": 0.8082740064710379, "num_tokens": 230529785.0, "step": 9185 }, { "entropy": 0.5600707856938243, "epoch": 0.6887425904197142, "grad_norm": 0.2010805755853653, "learning_rate": 0.0002, "loss": 0.6716, "mean_token_accuracy": 0.8128542814403772, "num_tokens": 231993571.0, "step": 9190 }, { "entropy": 0.5746359419077635, "epoch": 0.689117314353566, "grad_norm": 0.1835465133190155, "learning_rate": 0.0002, "loss": 0.7066, "mean_token_accuracy": 0.8067587275058031, "num_tokens": 233458468.0, "step": 9195 }, { "entropy": 0.5625707872211934, "epoch": 0.6894920382874179, "grad_norm": 0.2035030722618103, "learning_rate": 0.0002, "loss": 0.6853, "mean_token_accuracy": 0.8106963008642196, "num_tokens": 234897507.0, "step": 9200 }, { "entropy": 0.564222939312458, "epoch": 0.6898667622212697, "grad_norm": 0.23543575406074524, "learning_rate": 0.0002, "loss": 0.6871, "mean_token_accuracy": 0.810631412640214, "num_tokens": 236398726.0, "step": 9205 }, { "entropy": 0.5667139615863561, "epoch": 0.6902414861551217, "grad_norm": 0.20211949944496155, "learning_rate": 0.0002, "loss": 0.6868, "mean_token_accuracy": 0.8140230376273394, "num_tokens": 237847534.0, "step": 9210 }, { "entropy": 0.5726947329938412, "epoch": 0.6906162100889736, "grad_norm": 0.18443265557289124, "learning_rate": 0.0002, "loss": 0.6845, "mean_token_accuracy": 0.8083224695175886, "num_tokens": 239312051.0, "step": 9215 }, { "entropy": 0.5868649639189243, "epoch": 0.6909909340228254, "grad_norm": 0.2064056098461151, "learning_rate": 0.0002, "loss": 0.7041, "mean_token_accuracy": 0.8077993534505368, "num_tokens": 240801520.0, "step": 9220 }, { "entropy": 0.5774728732183576, "epoch": 0.6913656579566773, "grad_norm": 0.1988067775964737, "learning_rate": 0.0002, "loss": 0.6932, "mean_token_accuracy": 0.8060361009091139, "num_tokens": 242252560.0, "step": 9225 }, { "entropy": 0.58578976765275, "epoch": 0.6917403818905291, "grad_norm": 0.31958866119384766, "learning_rate": 0.0002, "loss": 0.7006, "mean_token_accuracy": 0.8079989098012448, "num_tokens": 243683655.0, "step": 9230 }, { "entropy": 0.5747304463759064, "epoch": 0.692115105824381, "grad_norm": 0.19945840537548065, "learning_rate": 0.0002, "loss": 0.6819, "mean_token_accuracy": 0.8109968606382608, "num_tokens": 245187185.0, "step": 9235 }, { "entropy": 0.5653606133535505, "epoch": 0.6924898297582328, "grad_norm": 0.22856296598911285, "learning_rate": 0.0002, "loss": 0.6804, "mean_token_accuracy": 0.8102148849517107, "num_tokens": 246642022.0, "step": 9240 }, { "entropy": 0.605586196295917, "epoch": 0.6928645536920847, "grad_norm": 0.18270501494407654, "learning_rate": 0.0002, "loss": 0.7132, "mean_token_accuracy": 0.8069069661200047, "num_tokens": 248130165.0, "step": 9245 }, { "entropy": 0.5579721601679921, "epoch": 0.6932392776259365, "grad_norm": 0.20584233105182648, "learning_rate": 0.0002, "loss": 0.6565, "mean_token_accuracy": 0.8169690523296594, "num_tokens": 249621330.0, "step": 9250 }, { "entropy": 0.573118249233812, "epoch": 0.6936140015597884, "grad_norm": 0.19015097618103027, "learning_rate": 0.0002, "loss": 0.6836, "mean_token_accuracy": 0.8095465011894702, "num_tokens": 251023960.0, "step": 9255 }, { "entropy": 0.583728133328259, "epoch": 0.6939887254936402, "grad_norm": 0.20142067968845367, "learning_rate": 0.0002, "loss": 0.6947, "mean_token_accuracy": 0.8114604599773884, "num_tokens": 252456785.0, "step": 9260 }, { "entropy": 0.5660674273967743, "epoch": 0.6943634494274921, "grad_norm": 0.25858616828918457, "learning_rate": 0.0002, "loss": 0.6801, "mean_token_accuracy": 0.8109177500009537, "num_tokens": 253937391.0, "step": 9265 }, { "entropy": 0.5650628665462136, "epoch": 0.6947381733613439, "grad_norm": 0.2556462585926056, "learning_rate": 0.0002, "loss": 0.6857, "mean_token_accuracy": 0.8094343099743128, "num_tokens": 255441578.0, "step": 9270 }, { "entropy": 0.5626763731241227, "epoch": 0.6951128972951958, "grad_norm": 0.22442008554935455, "learning_rate": 0.0002, "loss": 0.6763, "mean_token_accuracy": 0.8116638828068972, "num_tokens": 256872064.0, "step": 9275 }, { "entropy": 0.559494623169303, "epoch": 0.6954876212290476, "grad_norm": 0.20980772376060486, "learning_rate": 0.0002, "loss": 0.6751, "mean_token_accuracy": 0.8109619416296482, "num_tokens": 258314987.0, "step": 9280 }, { "entropy": 0.5687774166464805, "epoch": 0.6958623451628995, "grad_norm": 0.19264748692512512, "learning_rate": 0.0002, "loss": 0.6853, "mean_token_accuracy": 0.8101686771959067, "num_tokens": 259799683.0, "step": 9285 }, { "entropy": 0.579772038012743, "epoch": 0.6962370690967514, "grad_norm": 0.21326081454753876, "learning_rate": 0.0002, "loss": 0.6963, "mean_token_accuracy": 0.8089012857526541, "num_tokens": 261234859.0, "step": 9290 }, { "entropy": 0.5721841083839536, "epoch": 0.6966117930306033, "grad_norm": 0.2013809233903885, "learning_rate": 0.0002, "loss": 0.6873, "mean_token_accuracy": 0.8104832138866186, "num_tokens": 262692755.0, "step": 9295 }, { "entropy": 0.5840925443917513, "epoch": 0.6969865169644551, "grad_norm": 0.19612526893615723, "learning_rate": 0.0002, "loss": 0.7104, "mean_token_accuracy": 0.8080903869122267, "num_tokens": 264123467.0, "step": 9300 }, { "entropy": 0.5458680627867579, "epoch": 0.697361240898307, "grad_norm": 0.20880380272865295, "learning_rate": 0.0002, "loss": 0.6566, "mean_token_accuracy": 0.8144063673913479, "num_tokens": 265533816.0, "step": 9305 }, { "entropy": 0.5651582239195705, "epoch": 0.6977359648321588, "grad_norm": 0.21246451139450073, "learning_rate": 0.0002, "loss": 0.6669, "mean_token_accuracy": 0.8128350172191858, "num_tokens": 267004087.0, "step": 9310 }, { "entropy": 0.5713465427979827, "epoch": 0.6981106887660107, "grad_norm": 0.2084866464138031, "learning_rate": 0.0002, "loss": 0.6838, "mean_token_accuracy": 0.8118864629417658, "num_tokens": 268521185.0, "step": 9315 }, { "entropy": 0.5579501422122121, "epoch": 0.6984854126998625, "grad_norm": 0.1775418519973755, "learning_rate": 0.0002, "loss": 0.6765, "mean_token_accuracy": 0.8112402942031622, "num_tokens": 270001701.0, "step": 9320 }, { "entropy": 0.5619613079354167, "epoch": 0.6988601366337144, "grad_norm": 0.23297585546970367, "learning_rate": 0.0002, "loss": 0.7048, "mean_token_accuracy": 0.8054498881101608, "num_tokens": 271503188.0, "step": 9325 }, { "entropy": 0.5332535175606609, "epoch": 0.6992348605675662, "grad_norm": 0.20100349187850952, "learning_rate": 0.0002, "loss": 0.6701, "mean_token_accuracy": 0.8108392875641585, "num_tokens": 272993808.0, "step": 9330 }, { "entropy": 0.5450028222054243, "epoch": 0.6996095845014181, "grad_norm": 0.1937745362520218, "learning_rate": 0.0002, "loss": 0.6828, "mean_token_accuracy": 0.8114616796374321, "num_tokens": 274464451.0, "step": 9335 }, { "entropy": 0.5398764159530401, "epoch": 0.6999843084352699, "grad_norm": 0.22471533715724945, "learning_rate": 0.0002, "loss": 0.6804, "mean_token_accuracy": 0.8101696170866489, "num_tokens": 275969429.0, "step": 9340 }, { "entropy": 0.535898114927113, "epoch": 0.7003590323691218, "grad_norm": 0.19690635800361633, "learning_rate": 0.0002, "loss": 0.6777, "mean_token_accuracy": 0.8091764066368341, "num_tokens": 277434387.0, "step": 9345 }, { "entropy": 0.5566369112581014, "epoch": 0.7007337563029736, "grad_norm": 0.20744509994983673, "learning_rate": 0.0002, "loss": 0.7045, "mean_token_accuracy": 0.8081741563975811, "num_tokens": 278950564.0, "step": 9350 }, { "entropy": 0.548059621360153, "epoch": 0.7011084802368255, "grad_norm": 0.19504769146442413, "learning_rate": 0.0002, "loss": 0.6838, "mean_token_accuracy": 0.8077988784760237, "num_tokens": 280459706.0, "step": 9355 }, { "entropy": 0.5367607980966568, "epoch": 0.7014832041706773, "grad_norm": 0.20044344663619995, "learning_rate": 0.0002, "loss": 0.6693, "mean_token_accuracy": 0.8100342396646738, "num_tokens": 281919321.0, "step": 9360 }, { "entropy": 0.5673394832760096, "epoch": 0.7018579281045293, "grad_norm": 0.20335634052753448, "learning_rate": 0.0002, "loss": 0.7062, "mean_token_accuracy": 0.8058579780161381, "num_tokens": 283404843.0, "step": 9365 }, { "entropy": 0.538521664403379, "epoch": 0.7022326520383811, "grad_norm": 0.18772481381893158, "learning_rate": 0.0002, "loss": 0.6724, "mean_token_accuracy": 0.8109875399619341, "num_tokens": 284846274.0, "step": 9370 }, { "entropy": 0.5513892110437155, "epoch": 0.702607375972233, "grad_norm": 0.19759675860404968, "learning_rate": 0.0002, "loss": 0.6883, "mean_token_accuracy": 0.8099147137254477, "num_tokens": 286257804.0, "step": 9375 }, { "entropy": 0.5567292889580131, "epoch": 0.7029820999060848, "grad_norm": 0.21086803078651428, "learning_rate": 0.0002, "loss": 0.6841, "mean_token_accuracy": 0.8130553219467401, "num_tokens": 287756701.0, "step": 9380 }, { "entropy": 0.5576374275609851, "epoch": 0.7033568238399367, "grad_norm": 0.18818247318267822, "learning_rate": 0.0002, "loss": 0.6792, "mean_token_accuracy": 0.8071217477321625, "num_tokens": 289231887.0, "step": 9385 }, { "entropy": 0.5740386936813593, "epoch": 0.7037315477737885, "grad_norm": 0.1842556595802307, "learning_rate": 0.0002, "loss": 0.6928, "mean_token_accuracy": 0.8074708696454763, "num_tokens": 290710464.0, "step": 9390 }, { "entropy": 0.5670117631554603, "epoch": 0.7041062717076404, "grad_norm": 0.18750251829624176, "learning_rate": 0.0002, "loss": 0.7005, "mean_token_accuracy": 0.8057280883193016, "num_tokens": 292191580.0, "step": 9395 }, { "entropy": 0.5643478512763977, "epoch": 0.7044809956414922, "grad_norm": 0.21905435621738434, "learning_rate": 0.0002, "loss": 0.6915, "mean_token_accuracy": 0.8084769040346146, "num_tokens": 293684419.0, "step": 9400 }, { "entropy": 0.5665431465022266, "epoch": 0.7048557195753441, "grad_norm": 0.2109280675649643, "learning_rate": 0.0002, "loss": 0.6986, "mean_token_accuracy": 0.8076136004179716, "num_tokens": 295185888.0, "step": 9405 }, { "entropy": 0.5544861959293484, "epoch": 0.705230443509196, "grad_norm": 0.1921771615743637, "learning_rate": 0.0002, "loss": 0.6735, "mean_token_accuracy": 0.8113161228597164, "num_tokens": 296671439.0, "step": 9410 }, { "entropy": 0.5592985795810819, "epoch": 0.7056051674430478, "grad_norm": 0.19619013369083405, "learning_rate": 0.0002, "loss": 0.6976, "mean_token_accuracy": 0.8102676089853048, "num_tokens": 298132483.0, "step": 9415 }, { "entropy": 0.5534486586228013, "epoch": 0.7059798913768996, "grad_norm": 0.2013750970363617, "learning_rate": 0.0002, "loss": 0.6893, "mean_token_accuracy": 0.8087922632694244, "num_tokens": 299599241.0, "step": 9420 }, { "entropy": 0.5350468529388308, "epoch": 0.7063546153107515, "grad_norm": 0.2063247412443161, "learning_rate": 0.0002, "loss": 0.6613, "mean_token_accuracy": 0.8141646772623062, "num_tokens": 301083516.0, "step": 9425 }, { "entropy": 0.5428760152310133, "epoch": 0.7067293392446033, "grad_norm": 0.19795668125152588, "learning_rate": 0.0002, "loss": 0.673, "mean_token_accuracy": 0.8156925357878209, "num_tokens": 302537038.0, "step": 9430 }, { "entropy": 0.5373608775436878, "epoch": 0.7071040631784552, "grad_norm": 0.19614139199256897, "learning_rate": 0.0002, "loss": 0.6692, "mean_token_accuracy": 0.8117856051772833, "num_tokens": 303966324.0, "step": 9435 }, { "entropy": 0.550593795068562, "epoch": 0.707478787112307, "grad_norm": 0.19410653412342072, "learning_rate": 0.0002, "loss": 0.6738, "mean_token_accuracy": 0.8115297842770814, "num_tokens": 305442354.0, "step": 9440 }, { "entropy": 0.5707940187305212, "epoch": 0.707853511046159, "grad_norm": 0.19318945705890656, "learning_rate": 0.0002, "loss": 0.6967, "mean_token_accuracy": 0.8057822115719319, "num_tokens": 306958311.0, "step": 9445 }, { "entropy": 0.5798225520178676, "epoch": 0.7082282349800109, "grad_norm": 0.2069564163684845, "learning_rate": 0.0002, "loss": 0.7131, "mean_token_accuracy": 0.8056434914469719, "num_tokens": 308398979.0, "step": 9450 }, { "entropy": 0.5629785567522049, "epoch": 0.7086029589138627, "grad_norm": 0.1895407736301422, "learning_rate": 0.0002, "loss": 0.6791, "mean_token_accuracy": 0.8142413780093193, "num_tokens": 309878983.0, "step": 9455 }, { "entropy": 0.5802291940897704, "epoch": 0.7089776828477146, "grad_norm": 0.1894954890012741, "learning_rate": 0.0002, "loss": 0.6957, "mean_token_accuracy": 0.8108474887907505, "num_tokens": 311409944.0, "step": 9460 }, { "entropy": 0.581038399040699, "epoch": 0.7093524067815664, "grad_norm": 0.19308386743068695, "learning_rate": 0.0002, "loss": 0.685, "mean_token_accuracy": 0.8082330286502838, "num_tokens": 312904546.0, "step": 9465 }, { "entropy": 0.5594838980585337, "epoch": 0.7097271307154183, "grad_norm": 0.20135784149169922, "learning_rate": 0.0002, "loss": 0.6691, "mean_token_accuracy": 0.812221136316657, "num_tokens": 314355892.0, "step": 9470 }, { "entropy": 0.5532593097537756, "epoch": 0.7101018546492701, "grad_norm": 0.18977202475070953, "learning_rate": 0.0002, "loss": 0.6686, "mean_token_accuracy": 0.8130472760647536, "num_tokens": 315756597.0, "step": 9475 }, { "entropy": 0.5789201125502587, "epoch": 0.710476578583122, "grad_norm": 0.1845875233411789, "learning_rate": 0.0002, "loss": 0.6893, "mean_token_accuracy": 0.8094297964125872, "num_tokens": 317252129.0, "step": 9480 }, { "entropy": 0.5703954190015793, "epoch": 0.7108513025169738, "grad_norm": 0.20658034086227417, "learning_rate": 0.0002, "loss": 0.6869, "mean_token_accuracy": 0.8105187516659498, "num_tokens": 318707246.0, "step": 9485 }, { "entropy": 0.573135489411652, "epoch": 0.7112260264508257, "grad_norm": 0.18360723555088043, "learning_rate": 0.0002, "loss": 0.6812, "mean_token_accuracy": 0.8099189288914204, "num_tokens": 320212963.0, "step": 9490 }, { "entropy": 0.5605486676096916, "epoch": 0.7116007503846775, "grad_norm": 0.1966058909893036, "learning_rate": 0.0002, "loss": 0.6737, "mean_token_accuracy": 0.8099576100707054, "num_tokens": 321701166.0, "step": 9495 }, { "entropy": 0.5678812980651855, "epoch": 0.7119754743185294, "grad_norm": 0.22762195765972137, "learning_rate": 0.0002, "loss": 0.6841, "mean_token_accuracy": 0.8098420679569245, "num_tokens": 323177441.0, "step": 9500 }, { "entropy": 0.5642101498320699, "epoch": 0.7123501982523812, "grad_norm": 0.20162831246852875, "learning_rate": 0.0002, "loss": 0.6818, "mean_token_accuracy": 0.8109998904168606, "num_tokens": 324639324.0, "step": 9505 }, { "entropy": 0.5833527786657214, "epoch": 0.7127249221862331, "grad_norm": 0.21069352328777313, "learning_rate": 0.0002, "loss": 0.6996, "mean_token_accuracy": 0.8080143861472606, "num_tokens": 326128856.0, "step": 9510 }, { "entropy": 0.5573318047448993, "epoch": 0.7130996461200849, "grad_norm": 0.2060549259185791, "learning_rate": 0.0002, "loss": 0.6591, "mean_token_accuracy": 0.8158526379615069, "num_tokens": 327594317.0, "step": 9515 }, { "entropy": 0.5603201799094677, "epoch": 0.7134743700539369, "grad_norm": 0.20472416281700134, "learning_rate": 0.0002, "loss": 0.6799, "mean_token_accuracy": 0.8118957843631506, "num_tokens": 329022740.0, "step": 9520 }, { "entropy": 0.5594128089025616, "epoch": 0.7138490939877887, "grad_norm": 0.22907957434654236, "learning_rate": 0.0002, "loss": 0.6746, "mean_token_accuracy": 0.8130889032036066, "num_tokens": 330446062.0, "step": 9525 }, { "entropy": 0.5923684272915125, "epoch": 0.7142238179216406, "grad_norm": 0.2052043378353119, "learning_rate": 0.0002, "loss": 0.7068, "mean_token_accuracy": 0.8065712064504623, "num_tokens": 331932169.0, "step": 9530 }, { "entropy": 0.5913100687786936, "epoch": 0.7145985418554924, "grad_norm": 0.18984313309192657, "learning_rate": 0.0002, "loss": 0.6983, "mean_token_accuracy": 0.8081477429717779, "num_tokens": 333445139.0, "step": 9535 }, { "entropy": 0.5846819406375289, "epoch": 0.7149732657893443, "grad_norm": 0.19013404846191406, "learning_rate": 0.0002, "loss": 0.6852, "mean_token_accuracy": 0.8086851816624403, "num_tokens": 334926059.0, "step": 9540 }, { "entropy": 0.5803574208170176, "epoch": 0.7153479897231961, "grad_norm": 0.18898101150989532, "learning_rate": 0.0002, "loss": 0.6775, "mean_token_accuracy": 0.8127848967909813, "num_tokens": 336437333.0, "step": 9545 }, { "entropy": 0.5637122118845582, "epoch": 0.715722713657048, "grad_norm": 0.1932593137025833, "learning_rate": 0.0002, "loss": 0.6727, "mean_token_accuracy": 0.8116721354424954, "num_tokens": 337860254.0, "step": 9550 }, { "entropy": 0.5881282959133387, "epoch": 0.7160974375908998, "grad_norm": 0.1912548542022705, "learning_rate": 0.0002, "loss": 0.6975, "mean_token_accuracy": 0.8076846711337566, "num_tokens": 339377237.0, "step": 9555 }, { "entropy": 0.5812396293506026, "epoch": 0.7164721615247517, "grad_norm": 0.2046729028224945, "learning_rate": 0.0002, "loss": 0.6879, "mean_token_accuracy": 0.8101447757333518, "num_tokens": 340832627.0, "step": 9560 }, { "entropy": 0.5666244607418776, "epoch": 0.7168468854586035, "grad_norm": 0.20713500678539276, "learning_rate": 0.0002, "loss": 0.6678, "mean_token_accuracy": 0.8128194112330676, "num_tokens": 342304261.0, "step": 9565 }, { "entropy": 0.6004724988713861, "epoch": 0.7172216093924554, "grad_norm": 0.20983876287937164, "learning_rate": 0.0002, "loss": 0.7053, "mean_token_accuracy": 0.8064231306314469, "num_tokens": 343791999.0, "step": 9570 }, { "entropy": 0.582763725053519, "epoch": 0.7175963333263072, "grad_norm": 0.2057403177022934, "learning_rate": 0.0002, "loss": 0.6856, "mean_token_accuracy": 0.8129936080425978, "num_tokens": 345258362.0, "step": 9575 }, { "entropy": 0.5884500622749329, "epoch": 0.7179710572601591, "grad_norm": 0.22250626981258392, "learning_rate": 0.0002, "loss": 0.6914, "mean_token_accuracy": 0.8102622516453266, "num_tokens": 346713247.0, "step": 9580 }, { "entropy": 0.6032623887062073, "epoch": 0.7183457811940109, "grad_norm": 0.20010168850421906, "learning_rate": 0.0002, "loss": 0.708, "mean_token_accuracy": 0.8055873289704323, "num_tokens": 348200398.0, "step": 9585 }, { "entropy": 0.6019841047003865, "epoch": 0.7187205051278628, "grad_norm": 0.20983999967575073, "learning_rate": 0.0002, "loss": 0.6977, "mean_token_accuracy": 0.8069313012063504, "num_tokens": 349697767.0, "step": 9590 }, { "entropy": 0.5786174656823277, "epoch": 0.7190952290617146, "grad_norm": 0.1898883432149887, "learning_rate": 0.0002, "loss": 0.6736, "mean_token_accuracy": 0.8145160052925349, "num_tokens": 351147517.0, "step": 9595 }, { "entropy": 0.5815604954957962, "epoch": 0.7194699529955666, "grad_norm": 0.20895925164222717, "learning_rate": 0.0002, "loss": 0.6707, "mean_token_accuracy": 0.81516892015934, "num_tokens": 352612311.0, "step": 9600 }, { "entropy": 0.5926272094249725, "epoch": 0.7198446769294184, "grad_norm": 0.2185683250427246, "learning_rate": 0.0002, "loss": 0.6855, "mean_token_accuracy": 0.8113894801586866, "num_tokens": 354061228.0, "step": 9605 }, { "entropy": 0.591436761803925, "epoch": 0.7202194008632703, "grad_norm": 0.205888032913208, "learning_rate": 0.0002, "loss": 0.6894, "mean_token_accuracy": 0.8105201806873084, "num_tokens": 355527544.0, "step": 9610 }, { "entropy": 0.564955064561218, "epoch": 0.7205941247971221, "grad_norm": 0.18847297132015228, "learning_rate": 0.0002, "loss": 0.6692, "mean_token_accuracy": 0.8140731532126665, "num_tokens": 356968148.0, "step": 9615 }, { "entropy": 0.580063903145492, "epoch": 0.720968848730974, "grad_norm": 0.19728969037532806, "learning_rate": 0.0002, "loss": 0.6947, "mean_token_accuracy": 0.8055280890315771, "num_tokens": 358444106.0, "step": 9620 }, { "entropy": 0.5771507998928428, "epoch": 0.7213435726648258, "grad_norm": 0.2181500345468521, "learning_rate": 0.0002, "loss": 0.6884, "mean_token_accuracy": 0.8097044203430415, "num_tokens": 359917105.0, "step": 9625 }, { "entropy": 0.5862113179638981, "epoch": 0.7217182965986777, "grad_norm": 0.1944039762020111, "learning_rate": 0.0002, "loss": 0.6992, "mean_token_accuracy": 0.8081128090620041, "num_tokens": 361403550.0, "step": 9630 }, { "entropy": 0.5865960566326975, "epoch": 0.7220930205325296, "grad_norm": 0.2563723921775818, "learning_rate": 0.0002, "loss": 0.6965, "mean_token_accuracy": 0.8097056817263365, "num_tokens": 362870983.0, "step": 9635 }, { "entropy": 0.5531126894056797, "epoch": 0.7224677444663814, "grad_norm": 0.20597311854362488, "learning_rate": 0.0002, "loss": 0.6779, "mean_token_accuracy": 0.8144817858934402, "num_tokens": 364290153.0, "step": 9640 }, { "entropy": 0.5689105667173863, "epoch": 0.7228424684002333, "grad_norm": 0.2082528918981552, "learning_rate": 0.0002, "loss": 0.6866, "mean_token_accuracy": 0.80965376496315, "num_tokens": 365752102.0, "step": 9645 }, { "entropy": 0.5620084438472986, "epoch": 0.7232171923340851, "grad_norm": 0.21058258414268494, "learning_rate": 0.0002, "loss": 0.678, "mean_token_accuracy": 0.8101997945457697, "num_tokens": 367226496.0, "step": 9650 }, { "entropy": 0.5759396640583873, "epoch": 0.723591916267937, "grad_norm": 0.19343042373657227, "learning_rate": 0.0002, "loss": 0.6972, "mean_token_accuracy": 0.8077963508665562, "num_tokens": 368669958.0, "step": 9655 }, { "entropy": 0.578104374743998, "epoch": 0.7239666402017888, "grad_norm": 0.1989438682794571, "learning_rate": 0.0002, "loss": 0.7018, "mean_token_accuracy": 0.8055240839719773, "num_tokens": 370135507.0, "step": 9660 }, { "entropy": 0.5688631923869252, "epoch": 0.7243413641356407, "grad_norm": 0.232672318816185, "learning_rate": 0.0002, "loss": 0.6978, "mean_token_accuracy": 0.8099156148731709, "num_tokens": 371578715.0, "step": 9665 }, { "entropy": 0.5595905182883143, "epoch": 0.7247160880694925, "grad_norm": 0.21287275850772858, "learning_rate": 0.0002, "loss": 0.6801, "mean_token_accuracy": 0.812260128557682, "num_tokens": 373023188.0, "step": 9670 }, { "entropy": 0.570568575244397, "epoch": 0.7250908120033445, "grad_norm": 0.2115774005651474, "learning_rate": 0.0002, "loss": 0.6857, "mean_token_accuracy": 0.8077544260770082, "num_tokens": 374467497.0, "step": 9675 }, { "entropy": 0.5602902522310614, "epoch": 0.7254655359371963, "grad_norm": 0.19747577607631683, "learning_rate": 0.0002, "loss": 0.6711, "mean_token_accuracy": 0.8097302611917258, "num_tokens": 375916117.0, "step": 9680 }, { "entropy": 0.5670629642903805, "epoch": 0.7258402598710482, "grad_norm": 0.21055440604686737, "learning_rate": 0.0002, "loss": 0.6786, "mean_token_accuracy": 0.8134722288697958, "num_tokens": 377395786.0, "step": 9685 }, { "entropy": 0.55989635810256, "epoch": 0.7262149838049, "grad_norm": 0.20885126292705536, "learning_rate": 0.0002, "loss": 0.6758, "mean_token_accuracy": 0.8149845380336046, "num_tokens": 378850009.0, "step": 9690 }, { "entropy": 0.5577881928533316, "epoch": 0.7265897077387519, "grad_norm": 0.20097985863685608, "learning_rate": 0.0002, "loss": 0.6762, "mean_token_accuracy": 0.8136957913637162, "num_tokens": 380328019.0, "step": 9695 }, { "entropy": 0.5784922679886222, "epoch": 0.7269644316726037, "grad_norm": 0.4594259560108185, "learning_rate": 0.0002, "loss": 0.6985, "mean_token_accuracy": 0.8100153423845768, "num_tokens": 381854214.0, "step": 9700 }, { "entropy": 0.5615364631637931, "epoch": 0.7273391556064556, "grad_norm": 0.20550836622714996, "learning_rate": 0.0002, "loss": 0.6791, "mean_token_accuracy": 0.8098490975797177, "num_tokens": 383315495.0, "step": 9705 }, { "entropy": 0.5594747066497803, "epoch": 0.7277138795403074, "grad_norm": 0.22412194311618805, "learning_rate": 0.0002, "loss": 0.687, "mean_token_accuracy": 0.8117369052022696, "num_tokens": 384736169.0, "step": 9710 }, { "entropy": 0.5600243709981442, "epoch": 0.7280886034741593, "grad_norm": 0.18947947025299072, "learning_rate": 0.0002, "loss": 0.6752, "mean_token_accuracy": 0.8084039311856032, "num_tokens": 386164108.0, "step": 9715 }, { "entropy": 0.5735881582833826, "epoch": 0.7284633274080111, "grad_norm": 0.32225501537323, "learning_rate": 0.0002, "loss": 0.7025, "mean_token_accuracy": 0.809577253088355, "num_tokens": 387640691.0, "step": 9720 }, { "entropy": 0.5744219653308391, "epoch": 0.728838051341863, "grad_norm": 0.2045966237783432, "learning_rate": 0.0002, "loss": 0.6908, "mean_token_accuracy": 0.8087959170341492, "num_tokens": 389145395.0, "step": 9725 }, { "entropy": 0.5663927881047129, "epoch": 0.7292127752757148, "grad_norm": 0.19935296475887299, "learning_rate": 0.0002, "loss": 0.6816, "mean_token_accuracy": 0.8116070576012134, "num_tokens": 390634878.0, "step": 9730 }, { "entropy": 0.5631224548444151, "epoch": 0.7295874992095667, "grad_norm": 0.23440048098564148, "learning_rate": 0.0002, "loss": 0.6748, "mean_token_accuracy": 0.8142853397876024, "num_tokens": 392085691.0, "step": 9735 }, { "entropy": 0.5464721901342273, "epoch": 0.7299622231434185, "grad_norm": 0.21272003650665283, "learning_rate": 0.0002, "loss": 0.6586, "mean_token_accuracy": 0.8164637066423893, "num_tokens": 393513077.0, "step": 9740 }, { "entropy": 0.5486749384552241, "epoch": 0.7303369470772704, "grad_norm": 0.20832574367523193, "learning_rate": 0.0002, "loss": 0.6641, "mean_token_accuracy": 0.8127151094377041, "num_tokens": 394949881.0, "step": 9745 }, { "entropy": 0.5683700585737824, "epoch": 0.7307116710111222, "grad_norm": 0.19971387088298798, "learning_rate": 0.0002, "loss": 0.6868, "mean_token_accuracy": 0.813614097610116, "num_tokens": 396427125.0, "step": 9750 }, { "entropy": 0.5831880729645491, "epoch": 0.7310863949449742, "grad_norm": 0.2020214945077896, "learning_rate": 0.0002, "loss": 0.7055, "mean_token_accuracy": 0.8078326854854823, "num_tokens": 397926786.0, "step": 9755 }, { "entropy": 0.5688781829550862, "epoch": 0.731461118878826, "grad_norm": 0.19753314554691315, "learning_rate": 0.0002, "loss": 0.6863, "mean_token_accuracy": 0.8091260008513927, "num_tokens": 399355814.0, "step": 9760 }, { "entropy": 0.5733280846849084, "epoch": 0.7318358428126779, "grad_norm": 0.2215949296951294, "learning_rate": 0.0002, "loss": 0.6925, "mean_token_accuracy": 0.8098347425460816, "num_tokens": 400861510.0, "step": 9765 }, { "entropy": 0.5585992600768804, "epoch": 0.7322105667465297, "grad_norm": 0.21057170629501343, "learning_rate": 0.0002, "loss": 0.6708, "mean_token_accuracy": 0.8151062563061714, "num_tokens": 402286772.0, "step": 9770 }, { "entropy": 0.5635543234646321, "epoch": 0.7325852906803816, "grad_norm": 0.2211710512638092, "learning_rate": 0.0002, "loss": 0.6849, "mean_token_accuracy": 0.813905593752861, "num_tokens": 403708174.0, "step": 9775 }, { "entropy": 0.5728855377063156, "epoch": 0.7329600146142334, "grad_norm": 0.21433904767036438, "learning_rate": 0.0002, "loss": 0.6962, "mean_token_accuracy": 0.8058967232704163, "num_tokens": 405167246.0, "step": 9780 }, { "entropy": 0.5601021261885762, "epoch": 0.7333347385480853, "grad_norm": 0.19191478192806244, "learning_rate": 0.0002, "loss": 0.6738, "mean_token_accuracy": 0.8120449833571911, "num_tokens": 406641011.0, "step": 9785 }, { "entropy": 0.5514616264030338, "epoch": 0.7337094624819371, "grad_norm": 0.20756745338439941, "learning_rate": 0.0002, "loss": 0.6676, "mean_token_accuracy": 0.8152942329645156, "num_tokens": 408082408.0, "step": 9790 }, { "entropy": 0.5651532022282482, "epoch": 0.734084186415789, "grad_norm": 0.19745852053165436, "learning_rate": 0.0002, "loss": 0.6883, "mean_token_accuracy": 0.8105798970907927, "num_tokens": 409565510.0, "step": 9795 }, { "entropy": 0.5618660701438785, "epoch": 0.7344589103496408, "grad_norm": 0.20197834074497223, "learning_rate": 0.0002, "loss": 0.6867, "mean_token_accuracy": 0.8095463324338198, "num_tokens": 411045159.0, "step": 9800 }, { "entropy": 0.5481777552515268, "epoch": 0.7348336342834927, "grad_norm": 0.18848438560962677, "learning_rate": 0.0002, "loss": 0.6727, "mean_token_accuracy": 0.8127046428620815, "num_tokens": 1458601.0, "step": 9805 }, { "entropy": 0.5701336683705449, "epoch": 0.7352083582173445, "grad_norm": 0.18821334838867188, "learning_rate": 0.0002, "loss": 0.6996, "mean_token_accuracy": 0.8087221644818783, "num_tokens": 2933064.0, "step": 9810 }, { "entropy": 0.5493646238930523, "epoch": 0.7355830821511964, "grad_norm": 0.1930852234363556, "learning_rate": 0.0002, "loss": 0.6666, "mean_token_accuracy": 0.813911448046565, "num_tokens": 4359916.0, "step": 9815 }, { "entropy": 0.5538445513695478, "epoch": 0.7359578060850482, "grad_norm": 0.19317267835140228, "learning_rate": 0.0002, "loss": 0.6786, "mean_token_accuracy": 0.8126622550189495, "num_tokens": 5785430.0, "step": 9820 }, { "entropy": 0.5595841763541103, "epoch": 0.7363325300189001, "grad_norm": 0.19704246520996094, "learning_rate": 0.0002, "loss": 0.6894, "mean_token_accuracy": 0.8068298660218716, "num_tokens": 7221882.0, "step": 9825 }, { "entropy": 0.5657623042352498, "epoch": 0.736707253952752, "grad_norm": 0.20216166973114014, "learning_rate": 0.0002, "loss": 0.6931, "mean_token_accuracy": 0.8076296031475068, "num_tokens": 8730160.0, "step": 9830 }, { "entropy": 0.5802229376509785, "epoch": 0.7370819778866039, "grad_norm": 0.22145217657089233, "learning_rate": 0.0002, "loss": 0.6999, "mean_token_accuracy": 0.8088170316070318, "num_tokens": 10237149.0, "step": 9835 }, { "entropy": 0.5586304560303688, "epoch": 0.7374567018204558, "grad_norm": 0.1927095502614975, "learning_rate": 0.0002, "loss": 0.6857, "mean_token_accuracy": 0.8110189251601696, "num_tokens": 11679712.0, "step": 9840 }, { "entropy": 0.5504014706239104, "epoch": 0.7378314257543076, "grad_norm": 0.25766026973724365, "learning_rate": 0.0002, "loss": 0.6789, "mean_token_accuracy": 0.8116058278828859, "num_tokens": 13145654.0, "step": 9845 }, { "entropy": 0.5667012460529804, "epoch": 0.7382061496881595, "grad_norm": 0.2035420536994934, "learning_rate": 0.0002, "loss": 0.6944, "mean_token_accuracy": 0.8104210413992405, "num_tokens": 14610619.0, "step": 9850 }, { "entropy": 0.5617953235283494, "epoch": 0.7385808736220113, "grad_norm": 0.2155904918909073, "learning_rate": 0.0002, "loss": 0.6967, "mean_token_accuracy": 0.8080554895102978, "num_tokens": 16040509.0, "step": 9855 }, { "entropy": 0.5571979360654951, "epoch": 0.7389555975558632, "grad_norm": 0.20293740928173065, "learning_rate": 0.0002, "loss": 0.6847, "mean_token_accuracy": 0.8110688496381044, "num_tokens": 17467928.0, "step": 9860 }, { "entropy": 0.5596860019490123, "epoch": 0.739330321489715, "grad_norm": 0.22453077137470245, "learning_rate": 0.0002, "loss": 0.687, "mean_token_accuracy": 0.8114126797765493, "num_tokens": 18916481.0, "step": 9865 }, { "entropy": 0.560153223015368, "epoch": 0.7397050454235669, "grad_norm": 0.18653246760368347, "learning_rate": 0.0002, "loss": 0.6848, "mean_token_accuracy": 0.8128115881234408, "num_tokens": 20351307.0, "step": 9870 }, { "entropy": 0.5533657293766737, "epoch": 0.7400797693574187, "grad_norm": 0.19614648818969727, "learning_rate": 0.0002, "loss": 0.6722, "mean_token_accuracy": 0.8138990584760905, "num_tokens": 21791447.0, "step": 9875 }, { "entropy": 0.5601743249222636, "epoch": 0.7404544932912706, "grad_norm": 0.18637581169605255, "learning_rate": 0.0002, "loss": 0.668, "mean_token_accuracy": 0.814869312569499, "num_tokens": 23246292.0, "step": 9880 }, { "entropy": 0.5481826221570373, "epoch": 0.7408292172251224, "grad_norm": 0.21337288618087769, "learning_rate": 0.0002, "loss": 0.6621, "mean_token_accuracy": 0.8141466166824103, "num_tokens": 24751931.0, "step": 9885 }, { "entropy": 0.568215686827898, "epoch": 0.7412039411589743, "grad_norm": 0.2041051983833313, "learning_rate": 0.0002, "loss": 0.6927, "mean_token_accuracy": 0.8085038796067238, "num_tokens": 26228754.0, "step": 9890 }, { "entropy": 0.5661028077825904, "epoch": 0.7415786650928261, "grad_norm": 0.2022516429424286, "learning_rate": 0.0002, "loss": 0.6804, "mean_token_accuracy": 0.8157083127647639, "num_tokens": 27691254.0, "step": 9895 }, { "entropy": 0.5636758301407099, "epoch": 0.741953389026678, "grad_norm": 0.26545780897140503, "learning_rate": 0.0002, "loss": 0.6729, "mean_token_accuracy": 0.8122284445911646, "num_tokens": 29185334.0, "step": 9900 }, { "entropy": 0.5759978868067265, "epoch": 0.7423281129605298, "grad_norm": 0.20686395466327667, "learning_rate": 0.0002, "loss": 0.6843, "mean_token_accuracy": 0.8107663221657276, "num_tokens": 30673734.0, "step": 9905 }, { "entropy": 0.5560418909415603, "epoch": 0.7427028368943818, "grad_norm": 0.20183801651000977, "learning_rate": 0.0002, "loss": 0.6609, "mean_token_accuracy": 0.8113027516752481, "num_tokens": 32169552.0, "step": 9910 }, { "entropy": 0.5701857099309564, "epoch": 0.7430775608282336, "grad_norm": 0.33502620458602905, "learning_rate": 0.0002, "loss": 0.6852, "mean_token_accuracy": 0.8066281504929066, "num_tokens": 33646414.0, "step": 9915 }, { "entropy": 0.5758354945108295, "epoch": 0.7434522847620855, "grad_norm": 0.2103537917137146, "learning_rate": 0.0002, "loss": 0.6969, "mean_token_accuracy": 0.8083195634186268, "num_tokens": 35120880.0, "step": 9920 }, { "entropy": 0.5920243022963405, "epoch": 0.7438270086959373, "grad_norm": 0.2106325924396515, "learning_rate": 0.0002, "loss": 0.6998, "mean_token_accuracy": 0.8079452410340309, "num_tokens": 36631591.0, "step": 9925 }, { "entropy": 0.5789008060470223, "epoch": 0.7442017326297892, "grad_norm": 0.2078704684972763, "learning_rate": 0.0002, "loss": 0.7012, "mean_token_accuracy": 0.8105935651808978, "num_tokens": 38079761.0, "step": 9930 }, { "entropy": 0.5595704204402864, "epoch": 0.744576456563641, "grad_norm": 0.20770500600337982, "learning_rate": 0.0002, "loss": 0.6756, "mean_token_accuracy": 0.8111835084855556, "num_tokens": 39566297.0, "step": 9935 }, { "entropy": 0.5749794645234942, "epoch": 0.7449511804974929, "grad_norm": 0.22441568970680237, "learning_rate": 0.0002, "loss": 0.7041, "mean_token_accuracy": 0.8060854587703943, "num_tokens": 41060897.0, "step": 9940 }, { "entropy": 0.5548018703237176, "epoch": 0.7453259044313447, "grad_norm": 0.22395774722099304, "learning_rate": 0.0002, "loss": 0.6832, "mean_token_accuracy": 0.808983089402318, "num_tokens": 42555382.0, "step": 9945 }, { "entropy": 0.5533584730699659, "epoch": 0.7457006283651966, "grad_norm": 0.19673940539360046, "learning_rate": 0.0002, "loss": 0.6693, "mean_token_accuracy": 0.8154916282743215, "num_tokens": 44020816.0, "step": 9950 }, { "entropy": 0.5627813769504428, "epoch": 0.7460753522990484, "grad_norm": 0.19159235060214996, "learning_rate": 0.0002, "loss": 0.6822, "mean_token_accuracy": 0.8118107974529266, "num_tokens": 45489156.0, "step": 9955 }, { "entropy": 0.5562672972679138, "epoch": 0.7464500762329003, "grad_norm": 0.19923162460327148, "learning_rate": 0.0002, "loss": 0.678, "mean_token_accuracy": 0.8123088207095861, "num_tokens": 46978551.0, "step": 9960 }, { "entropy": 0.5763363055884838, "epoch": 0.7468248001667521, "grad_norm": 0.20063750445842743, "learning_rate": 0.0002, "loss": 0.6914, "mean_token_accuracy": 0.8115300331264734, "num_tokens": 48453758.0, "step": 9965 }, { "entropy": 0.5582985308021307, "epoch": 0.747199524100604, "grad_norm": 0.1947137415409088, "learning_rate": 0.0002, "loss": 0.6653, "mean_token_accuracy": 0.8147341769188643, "num_tokens": 49879942.0, "step": 9970 }, { "entropy": 0.5585894268006086, "epoch": 0.7475742480344558, "grad_norm": 0.2013792246580124, "learning_rate": 0.0002, "loss": 0.6604, "mean_token_accuracy": 0.8133472140878439, "num_tokens": 51366813.0, "step": 9975 }, { "entropy": 0.5670221941545606, "epoch": 0.7479489719683077, "grad_norm": 0.20650100708007812, "learning_rate": 0.0002, "loss": 0.6705, "mean_token_accuracy": 0.8131394796073437, "num_tokens": 52877726.0, "step": 9980 }, { "entropy": 0.5726255524903536, "epoch": 0.7483236959021596, "grad_norm": 0.21157114207744598, "learning_rate": 0.0002, "loss": 0.6796, "mean_token_accuracy": 0.8123737558722496, "num_tokens": 54328504.0, "step": 9985 }, { "entropy": 0.573940365202725, "epoch": 0.7486984198360115, "grad_norm": 0.19641129672527313, "learning_rate": 0.0002, "loss": 0.6792, "mean_token_accuracy": 0.811272332072258, "num_tokens": 55793069.0, "step": 9990 }, { "entropy": 0.6033023085445166, "epoch": 0.7490731437698633, "grad_norm": 0.19565068185329437, "learning_rate": 0.0002, "loss": 0.7024, "mean_token_accuracy": 0.807830560579896, "num_tokens": 57279623.0, "step": 9995 }, { "entropy": 0.5874843375757337, "epoch": 0.7494478677037152, "grad_norm": 0.25535526871681213, "learning_rate": 0.0002, "loss": 0.6848, "mean_token_accuracy": 0.8118246763944625, "num_tokens": 58772307.0, "step": 10000 }, { "entropy": 0.6015228437259793, "epoch": 0.749822591637567, "grad_norm": 0.2572895884513855, "learning_rate": 0.0002, "loss": 0.6955, "mean_token_accuracy": 0.8066255904734134, "num_tokens": 60240959.0, "step": 10005 }, { "entropy": 0.589034236036241, "epoch": 0.7501973155714189, "grad_norm": 0.22219473123550415, "learning_rate": 0.0002, "loss": 0.6803, "mean_token_accuracy": 0.8127103745937347, "num_tokens": 61730263.0, "step": 10010 }, { "entropy": 0.5664888793602586, "epoch": 0.7505720395052707, "grad_norm": 0.20837391912937164, "learning_rate": 0.0002, "loss": 0.6618, "mean_token_accuracy": 0.8154230404645204, "num_tokens": 63220383.0, "step": 10015 }, { "entropy": 0.5549443738535047, "epoch": 0.7509467634391226, "grad_norm": 0.19703342020511627, "learning_rate": 0.0002, "loss": 0.6492, "mean_token_accuracy": 0.817657633125782, "num_tokens": 64714396.0, "step": 10020 }, { "entropy": 0.578785265609622, "epoch": 0.7513214873729744, "grad_norm": 0.2315840721130371, "learning_rate": 0.0002, "loss": 0.6961, "mean_token_accuracy": 0.8087290994822979, "num_tokens": 66133796.0, "step": 10025 }, { "entropy": 0.5804629879072308, "epoch": 0.7516962113068263, "grad_norm": 0.21196430921554565, "learning_rate": 0.0002, "loss": 0.6933, "mean_token_accuracy": 0.8107944391667843, "num_tokens": 67588430.0, "step": 10030 }, { "entropy": 0.571128992177546, "epoch": 0.7520709352406781, "grad_norm": 0.2059342861175537, "learning_rate": 0.0002, "loss": 0.6802, "mean_token_accuracy": 0.813560176640749, "num_tokens": 69046325.0, "step": 10035 }, { "entropy": 0.5695220040157437, "epoch": 0.75244565917453, "grad_norm": 0.19712436199188232, "learning_rate": 0.0002, "loss": 0.6971, "mean_token_accuracy": 0.807468642666936, "num_tokens": 70514833.0, "step": 10040 }, { "entropy": 0.5685533254407347, "epoch": 0.7528203831083818, "grad_norm": 0.21414019167423248, "learning_rate": 0.0002, "loss": 0.6778, "mean_token_accuracy": 0.8100226357579231, "num_tokens": 71977379.0, "step": 10045 }, { "entropy": 0.5711461920291185, "epoch": 0.7531951070422337, "grad_norm": 0.19914937019348145, "learning_rate": 0.0002, "loss": 0.6821, "mean_token_accuracy": 0.8104952517896891, "num_tokens": 73452696.0, "step": 10050 }, { "entropy": 0.5797641094774008, "epoch": 0.7535698309760855, "grad_norm": 0.18892040848731995, "learning_rate": 0.0002, "loss": 0.6892, "mean_token_accuracy": 0.8109553880989552, "num_tokens": 74944605.0, "step": 10055 }, { "entropy": 0.5660814123228193, "epoch": 0.7539445549099374, "grad_norm": 0.19573093950748444, "learning_rate": 0.0002, "loss": 0.6663, "mean_token_accuracy": 0.8152664072811604, "num_tokens": 76395325.0, "step": 10060 }, { "entropy": 0.5576535664498806, "epoch": 0.7543192788437894, "grad_norm": 0.22068773210048676, "learning_rate": 0.0002, "loss": 0.6631, "mean_token_accuracy": 0.8155834171921015, "num_tokens": 77847560.0, "step": 10065 }, { "entropy": 0.5846087159588933, "epoch": 0.7546940027776412, "grad_norm": 0.1897910088300705, "learning_rate": 0.0002, "loss": 0.6948, "mean_token_accuracy": 0.80654720030725, "num_tokens": 79333973.0, "step": 10070 }, { "entropy": 0.560074610915035, "epoch": 0.755068726711493, "grad_norm": 0.2525058686733246, "learning_rate": 0.0002, "loss": 0.6644, "mean_token_accuracy": 0.8143374759703874, "num_tokens": 80797097.0, "step": 10075 }, { "entropy": 0.5739984393119812, "epoch": 0.7554434506453449, "grad_norm": 0.2036161571741104, "learning_rate": 0.0002, "loss": 0.6769, "mean_token_accuracy": 0.8109189916402102, "num_tokens": 82262222.0, "step": 10080 }, { "entropy": 0.5821411004289985, "epoch": 0.7558181745791968, "grad_norm": 0.18599092960357666, "learning_rate": 0.0002, "loss": 0.6829, "mean_token_accuracy": 0.8111226920038461, "num_tokens": 83736656.0, "step": 10085 }, { "entropy": 0.5707690828479827, "epoch": 0.7561928985130486, "grad_norm": 0.21470774710178375, "learning_rate": 0.0002, "loss": 0.672, "mean_token_accuracy": 0.8117883183062077, "num_tokens": 85191930.0, "step": 10090 }, { "entropy": 0.5717164596542716, "epoch": 0.7565676224469005, "grad_norm": 0.22449415922164917, "learning_rate": 0.0002, "loss": 0.6762, "mean_token_accuracy": 0.8139200378209352, "num_tokens": 86639586.0, "step": 10095 }, { "entropy": 0.5839122029021382, "epoch": 0.7569423463807523, "grad_norm": 0.20560689270496368, "learning_rate": 0.0002, "loss": 0.6807, "mean_token_accuracy": 0.8105049941688776, "num_tokens": 88109884.0, "step": 10100 }, { "entropy": 0.5780090058222414, "epoch": 0.7573170703146042, "grad_norm": 0.1997208148241043, "learning_rate": 0.0002, "loss": 0.6827, "mean_token_accuracy": 0.809601179882884, "num_tokens": 89545075.0, "step": 10105 }, { "entropy": 0.5800757881253957, "epoch": 0.757691794248456, "grad_norm": 0.19518880546092987, "learning_rate": 0.0002, "loss": 0.6914, "mean_token_accuracy": 0.8074586801230907, "num_tokens": 91030721.0, "step": 10110 }, { "entropy": 0.5834470642730594, "epoch": 0.7580665181823079, "grad_norm": 0.20315420627593994, "learning_rate": 0.0002, "loss": 0.6894, "mean_token_accuracy": 0.8079790994524956, "num_tokens": 92541729.0, "step": 10115 }, { "entropy": 0.5804402543231845, "epoch": 0.7584412421161597, "grad_norm": 0.20035609602928162, "learning_rate": 0.0002, "loss": 0.6952, "mean_token_accuracy": 0.808877108246088, "num_tokens": 94017711.0, "step": 10120 }, { "entropy": 0.5614628247916699, "epoch": 0.7588159660500116, "grad_norm": 0.1932367980480194, "learning_rate": 0.0002, "loss": 0.6704, "mean_token_accuracy": 0.8122741989791393, "num_tokens": 95483572.0, "step": 10125 }, { "entropy": 0.5709651317447424, "epoch": 0.7591906899838634, "grad_norm": 0.19021262228488922, "learning_rate": 0.0002, "loss": 0.6845, "mean_token_accuracy": 0.8093257788568735, "num_tokens": 96910594.0, "step": 10130 }, { "entropy": 0.5740485334768891, "epoch": 0.7595654139177153, "grad_norm": 0.1990092396736145, "learning_rate": 0.0002, "loss": 0.6814, "mean_token_accuracy": 0.8112134832888842, "num_tokens": 98364527.0, "step": 10135 }, { "entropy": 0.5617828394286335, "epoch": 0.7599401378515672, "grad_norm": 0.1960204392671585, "learning_rate": 0.0002, "loss": 0.6671, "mean_token_accuracy": 0.8166765261441469, "num_tokens": 99840122.0, "step": 10140 }, { "entropy": 0.5669731272384524, "epoch": 0.7603148617854191, "grad_norm": 0.1991107165813446, "learning_rate": 0.0002, "loss": 0.6794, "mean_token_accuracy": 0.8137098036706447, "num_tokens": 101285921.0, "step": 10145 }, { "entropy": 0.5420610783621669, "epoch": 0.7606895857192709, "grad_norm": 0.18616408109664917, "learning_rate": 0.0002, "loss": 0.6597, "mean_token_accuracy": 0.814639662578702, "num_tokens": 102743969.0, "step": 10150 }, { "entropy": 0.552033331990242, "epoch": 0.7610643096531228, "grad_norm": 0.19644956290721893, "learning_rate": 0.0002, "loss": 0.6777, "mean_token_accuracy": 0.8154577158391476, "num_tokens": 104175370.0, "step": 10155 }, { "entropy": 0.5589954994618893, "epoch": 0.7614390335869746, "grad_norm": 0.21914856135845184, "learning_rate": 0.0002, "loss": 0.6723, "mean_token_accuracy": 0.8109598677605391, "num_tokens": 105645122.0, "step": 10160 }, { "entropy": 0.5646612707525491, "epoch": 0.7618137575208265, "grad_norm": 0.20999932289123535, "learning_rate": 0.0002, "loss": 0.6786, "mean_token_accuracy": 0.8124109517782927, "num_tokens": 107099176.0, "step": 10165 }, { "entropy": 0.5537554489448666, "epoch": 0.7621884814546783, "grad_norm": 0.23554082214832306, "learning_rate": 0.0002, "loss": 0.6716, "mean_token_accuracy": 0.8133471388369798, "num_tokens": 108557991.0, "step": 10170 }, { "entropy": 0.56332171484828, "epoch": 0.7625632053885302, "grad_norm": 0.2857425808906555, "learning_rate": 0.0002, "loss": 0.6924, "mean_token_accuracy": 0.8106699384748935, "num_tokens": 109990959.0, "step": 10175 }, { "entropy": 0.5656483296304942, "epoch": 0.762937929322382, "grad_norm": 0.21210506558418274, "learning_rate": 0.0002, "loss": 0.6813, "mean_token_accuracy": 0.8094532188028097, "num_tokens": 111452227.0, "step": 10180 }, { "entropy": 0.5527910799719393, "epoch": 0.7633126532562339, "grad_norm": 0.21874772012233734, "learning_rate": 0.0002, "loss": 0.6643, "mean_token_accuracy": 0.8119579907506704, "num_tokens": 112881394.0, "step": 10185 }, { "entropy": 0.5588457737118006, "epoch": 0.7636873771900857, "grad_norm": 0.21661286056041718, "learning_rate": 0.0002, "loss": 0.6866, "mean_token_accuracy": 0.8127934023737907, "num_tokens": 114344608.0, "step": 10190 }, { "entropy": 0.5662413898855447, "epoch": 0.7640621011239376, "grad_norm": 0.18508125841617584, "learning_rate": 0.0002, "loss": 0.7057, "mean_token_accuracy": 0.8086641281843185, "num_tokens": 115797422.0, "step": 10195 }, { "entropy": 0.5568818168714642, "epoch": 0.7644368250577894, "grad_norm": 0.1950504630804062, "learning_rate": 0.0002, "loss": 0.6845, "mean_token_accuracy": 0.8093828465789557, "num_tokens": 117269856.0, "step": 10200 }, { "entropy": 0.557499011233449, "epoch": 0.7648115489916413, "grad_norm": 0.20133720338344574, "learning_rate": 0.0002, "loss": 0.683, "mean_token_accuracy": 0.8102143242955208, "num_tokens": 118774638.0, "step": 10205 }, { "entropy": 0.5612733339890837, "epoch": 0.7651862729254931, "grad_norm": 0.2106878012418747, "learning_rate": 0.0002, "loss": 0.6814, "mean_token_accuracy": 0.8117003887891769, "num_tokens": 120256629.0, "step": 10210 }, { "entropy": 0.5564417690038681, "epoch": 0.765560996859345, "grad_norm": 0.1933099627494812, "learning_rate": 0.0002, "loss": 0.6715, "mean_token_accuracy": 0.8132068756967783, "num_tokens": 121750710.0, "step": 10215 }, { "entropy": 0.5612802822142839, "epoch": 0.7659357207931969, "grad_norm": 0.1909349113702774, "learning_rate": 0.0002, "loss": 0.6817, "mean_token_accuracy": 0.8132397249341011, "num_tokens": 123242955.0, "step": 10220 }, { "entropy": 0.5503055168315768, "epoch": 0.7663104447270488, "grad_norm": 0.20075218379497528, "learning_rate": 0.0002, "loss": 0.6685, "mean_token_accuracy": 0.81414256952703, "num_tokens": 124678130.0, "step": 10225 }, { "entropy": 0.5712565118446946, "epoch": 0.7666851686609006, "grad_norm": 0.21570897102355957, "learning_rate": 0.0002, "loss": 0.6863, "mean_token_accuracy": 0.8113312017172575, "num_tokens": 126158891.0, "step": 10230 }, { "entropy": 0.5505262857303024, "epoch": 0.7670598925947525, "grad_norm": 0.19373291730880737, "learning_rate": 0.0002, "loss": 0.6595, "mean_token_accuracy": 0.8165093038231135, "num_tokens": 127600456.0, "step": 10235 }, { "entropy": 0.5583393584936858, "epoch": 0.7674346165286043, "grad_norm": 0.1968058943748474, "learning_rate": 0.0002, "loss": 0.6835, "mean_token_accuracy": 0.8067659039050341, "num_tokens": 129058292.0, "step": 10240 }, { "entropy": 0.554837504401803, "epoch": 0.7678093404624562, "grad_norm": 0.20208516716957092, "learning_rate": 0.0002, "loss": 0.6751, "mean_token_accuracy": 0.8152739182114601, "num_tokens": 130539569.0, "step": 10245 }, { "entropy": 0.5642950364388526, "epoch": 0.768184064396308, "grad_norm": 0.19831442832946777, "learning_rate": 0.0002, "loss": 0.6842, "mean_token_accuracy": 0.8126643273979426, "num_tokens": 131990656.0, "step": 10250 }, { "entropy": 0.5767976112663746, "epoch": 0.7685587883301599, "grad_norm": 0.19299675524234772, "learning_rate": 0.0002, "loss": 0.6907, "mean_token_accuracy": 0.8072094466537237, "num_tokens": 133465360.0, "step": 10255 }, { "entropy": 0.563090436346829, "epoch": 0.7689335122640117, "grad_norm": 0.37898507714271545, "learning_rate": 0.0002, "loss": 0.6759, "mean_token_accuracy": 0.8124324060976506, "num_tokens": 134882606.0, "step": 10260 }, { "entropy": 0.5673618175089359, "epoch": 0.7693082361978636, "grad_norm": 0.18453951179981232, "learning_rate": 0.0002, "loss": 0.6751, "mean_token_accuracy": 0.8139164078980684, "num_tokens": 136328811.0, "step": 10265 }, { "entropy": 0.5859864134341478, "epoch": 0.7696829601317154, "grad_norm": 0.20595915615558624, "learning_rate": 0.0002, "loss": 0.7012, "mean_token_accuracy": 0.8070973455905914, "num_tokens": 137794408.0, "step": 10270 }, { "entropy": 0.573388853482902, "epoch": 0.7700576840655673, "grad_norm": 0.1994549185037613, "learning_rate": 0.0002, "loss": 0.689, "mean_token_accuracy": 0.8078138504177332, "num_tokens": 139290350.0, "step": 10275 }, { "entropy": 0.5355982344597578, "epoch": 0.7704324079994191, "grad_norm": 0.19284197688102722, "learning_rate": 0.0002, "loss": 0.6695, "mean_token_accuracy": 0.8132123284041881, "num_tokens": 140716096.0, "step": 10280 }, { "entropy": 0.5353037592954933, "epoch": 0.770807131933271, "grad_norm": 0.21526364982128143, "learning_rate": 0.0002, "loss": 0.6746, "mean_token_accuracy": 0.8135915502905846, "num_tokens": 142151164.0, "step": 10285 }, { "entropy": 0.5661560531705618, "epoch": 0.7711818558671228, "grad_norm": 0.20207498967647552, "learning_rate": 0.0002, "loss": 0.7009, "mean_token_accuracy": 0.807619708776474, "num_tokens": 143603024.0, "step": 10290 }, { "entropy": 0.5496485602110625, "epoch": 0.7715565798009747, "grad_norm": 0.2146015465259552, "learning_rate": 0.0002, "loss": 0.6842, "mean_token_accuracy": 0.8115343578159809, "num_tokens": 145112062.0, "step": 10295 }, { "entropy": 0.5548124911263586, "epoch": 0.7719313037348267, "grad_norm": 0.20201608538627625, "learning_rate": 0.0002, "loss": 0.6968, "mean_token_accuracy": 0.8095180168747902, "num_tokens": 146575391.0, "step": 10300 }, { "entropy": 0.559838799200952, "epoch": 0.7723060276686785, "grad_norm": 0.20280559360980988, "learning_rate": 0.0002, "loss": 0.6946, "mean_token_accuracy": 0.8086317572742701, "num_tokens": 148035055.0, "step": 10305 }, { "entropy": 0.5476373668760062, "epoch": 0.7726807516025304, "grad_norm": 0.2072807252407074, "learning_rate": 0.0002, "loss": 0.6805, "mean_token_accuracy": 0.8137129466980696, "num_tokens": 149512603.0, "step": 10310 }, { "entropy": 0.5558377744629979, "epoch": 0.7730554755363822, "grad_norm": 0.1880168318748474, "learning_rate": 0.0002, "loss": 0.6846, "mean_token_accuracy": 0.8081666756421327, "num_tokens": 150998379.0, "step": 10315 }, { "entropy": 0.5250714852474629, "epoch": 0.7734301994702341, "grad_norm": 0.18558911979198456, "learning_rate": 0.0002, "loss": 0.6588, "mean_token_accuracy": 0.8161787323653698, "num_tokens": 152401356.0, "step": 10320 }, { "entropy": 0.5461616517975927, "epoch": 0.7738049234040859, "grad_norm": 0.19123773276805878, "learning_rate": 0.0002, "loss": 0.6757, "mean_token_accuracy": 0.8128229364752769, "num_tokens": 153826767.0, "step": 10325 }, { "entropy": 0.5576374901458621, "epoch": 0.7741796473379378, "grad_norm": 0.20459645986557007, "learning_rate": 0.0002, "loss": 0.6837, "mean_token_accuracy": 0.8110631871968508, "num_tokens": 155304204.0, "step": 10330 }, { "entropy": 0.5457320732995867, "epoch": 0.7745543712717896, "grad_norm": 0.18507206439971924, "learning_rate": 0.0002, "loss": 0.681, "mean_token_accuracy": 0.8128340549767017, "num_tokens": 156770036.0, "step": 10335 }, { "entropy": 0.5623500162735582, "epoch": 0.7749290952056415, "grad_norm": 0.20801347494125366, "learning_rate": 0.0002, "loss": 0.6837, "mean_token_accuracy": 0.8108413752168417, "num_tokens": 158273274.0, "step": 10340 }, { "entropy": 0.5704694351181387, "epoch": 0.7753038191394933, "grad_norm": 0.19261173903942108, "learning_rate": 0.0002, "loss": 0.6968, "mean_token_accuracy": 0.8083550810813904, "num_tokens": 159787407.0, "step": 10345 }, { "entropy": 0.5677743105217814, "epoch": 0.7756785430733452, "grad_norm": 0.20530399680137634, "learning_rate": 0.0002, "loss": 0.6946, "mean_token_accuracy": 0.8100682709366083, "num_tokens": 161210547.0, "step": 10350 }, { "entropy": 0.5770996080711484, "epoch": 0.776053267007197, "grad_norm": 0.18714390695095062, "learning_rate": 0.0002, "loss": 0.6995, "mean_token_accuracy": 0.8079606711864471, "num_tokens": 162712620.0, "step": 10355 }, { "entropy": 0.5754404441453517, "epoch": 0.7764279909410489, "grad_norm": 0.2080933004617691, "learning_rate": 0.0002, "loss": 0.6941, "mean_token_accuracy": 0.8076082956045866, "num_tokens": 164195580.0, "step": 10360 }, { "entropy": 0.5567493817768991, "epoch": 0.7768027148749007, "grad_norm": 0.20329494774341583, "learning_rate": 0.0002, "loss": 0.6721, "mean_token_accuracy": 0.8152179434895516, "num_tokens": 165681874.0, "step": 10365 }, { "entropy": 0.5633164405822754, "epoch": 0.7771774388087526, "grad_norm": 0.24730972945690155, "learning_rate": 0.0002, "loss": 0.6835, "mean_token_accuracy": 0.8113545447587966, "num_tokens": 167130671.0, "step": 10370 }, { "entropy": 0.5565703788772225, "epoch": 0.7775521627426045, "grad_norm": 0.21340683102607727, "learning_rate": 0.0002, "loss": 0.6805, "mean_token_accuracy": 0.808761915192008, "num_tokens": 168590253.0, "step": 10375 }, { "entropy": 0.5537916803732514, "epoch": 0.7779268866764564, "grad_norm": 0.20860391855239868, "learning_rate": 0.0002, "loss": 0.6724, "mean_token_accuracy": 0.8133628331124783, "num_tokens": 170043182.0, "step": 10380 }, { "entropy": 0.5704786438494921, "epoch": 0.7783016106103082, "grad_norm": 0.19949108362197876, "learning_rate": 0.0002, "loss": 0.6829, "mean_token_accuracy": 0.8096921734511853, "num_tokens": 171515074.0, "step": 10385 }, { "entropy": 0.5687261041253805, "epoch": 0.7786763345441601, "grad_norm": 0.2067728191614151, "learning_rate": 0.0002, "loss": 0.6764, "mean_token_accuracy": 0.8133930642157793, "num_tokens": 172947256.0, "step": 10390 }, { "entropy": 0.5658283466473222, "epoch": 0.7790510584780119, "grad_norm": 0.21511127054691315, "learning_rate": 0.0002, "loss": 0.6748, "mean_token_accuracy": 0.8132876567542553, "num_tokens": 174373879.0, "step": 10395 }, { "entropy": 0.5681820081546902, "epoch": 0.7794257824118638, "grad_norm": 0.20037850737571716, "learning_rate": 0.0002, "loss": 0.6824, "mean_token_accuracy": 0.8114471599459648, "num_tokens": 175843915.0, "step": 10400 }, { "entropy": 0.5725321274250745, "epoch": 0.7798005063457156, "grad_norm": 0.19921070337295532, "learning_rate": 0.0002, "loss": 0.6813, "mean_token_accuracy": 0.812675291672349, "num_tokens": 177290794.0, "step": 10405 }, { "entropy": 0.590223997272551, "epoch": 0.7801752302795675, "grad_norm": 0.1991547793149948, "learning_rate": 0.0002, "loss": 0.6969, "mean_token_accuracy": 0.8079689223319292, "num_tokens": 178749238.0, "step": 10410 }, { "entropy": 0.581967076100409, "epoch": 0.7805499542134193, "grad_norm": 0.20213349163532257, "learning_rate": 0.0002, "loss": 0.6822, "mean_token_accuracy": 0.8117789097130299, "num_tokens": 180213103.0, "step": 10415 }, { "entropy": 0.5762604198418557, "epoch": 0.7809246781472712, "grad_norm": 0.212645024061203, "learning_rate": 0.0002, "loss": 0.6723, "mean_token_accuracy": 0.8155659556388855, "num_tokens": 181665355.0, "step": 10420 }, { "entropy": 0.5644226977601647, "epoch": 0.781299402081123, "grad_norm": 0.2415713667869568, "learning_rate": 0.0002, "loss": 0.6654, "mean_token_accuracy": 0.8135440796613693, "num_tokens": 183123470.0, "step": 10425 }, { "entropy": 0.5776618892326951, "epoch": 0.7816741260149749, "grad_norm": 0.2022438794374466, "learning_rate": 0.0002, "loss": 0.6841, "mean_token_accuracy": 0.8110425155609846, "num_tokens": 184580258.0, "step": 10430 }, { "entropy": 0.5770567554980517, "epoch": 0.7820488499488267, "grad_norm": 0.20869091153144836, "learning_rate": 0.0002, "loss": 0.6797, "mean_token_accuracy": 0.8111612521111965, "num_tokens": 186070919.0, "step": 10435 }, { "entropy": 0.5739916985854506, "epoch": 0.7824235738826786, "grad_norm": 0.21607041358947754, "learning_rate": 0.0002, "loss": 0.6768, "mean_token_accuracy": 0.8133439462631941, "num_tokens": 187544522.0, "step": 10440 }, { "entropy": 0.5864045264199376, "epoch": 0.7827982978165304, "grad_norm": 0.1982320100069046, "learning_rate": 0.0002, "loss": 0.6779, "mean_token_accuracy": 0.8103376012295485, "num_tokens": 189049038.0, "step": 10445 }, { "entropy": 0.5854955952614546, "epoch": 0.7831730217503823, "grad_norm": 0.19323737919330597, "learning_rate": 0.0002, "loss": 0.6768, "mean_token_accuracy": 0.8107393674552441, "num_tokens": 190516521.0, "step": 10450 }, { "entropy": 0.5729714257642626, "epoch": 0.7835477456842342, "grad_norm": 0.19997933506965637, "learning_rate": 0.0002, "loss": 0.66, "mean_token_accuracy": 0.8137711349874734, "num_tokens": 192015577.0, "step": 10455 }, { "entropy": 0.587951141782105, "epoch": 0.7839224696180861, "grad_norm": 0.18858131766319275, "learning_rate": 0.0002, "loss": 0.6854, "mean_token_accuracy": 0.8100097257643938, "num_tokens": 193510714.0, "step": 10460 }, { "entropy": 0.572966781258583, "epoch": 0.7842971935519379, "grad_norm": 0.1939094215631485, "learning_rate": 0.0002, "loss": 0.6667, "mean_token_accuracy": 0.815099137276411, "num_tokens": 194980774.0, "step": 10465 }, { "entropy": 0.5748432077467441, "epoch": 0.7846719174857898, "grad_norm": 0.19897396862506866, "learning_rate": 0.0002, "loss": 0.6654, "mean_token_accuracy": 0.8160939503461122, "num_tokens": 196430244.0, "step": 10470 }, { "entropy": 0.5740883737802506, "epoch": 0.7850466414196416, "grad_norm": 0.22753065824508667, "learning_rate": 0.0002, "loss": 0.6713, "mean_token_accuracy": 0.8125271569937468, "num_tokens": 197907388.0, "step": 10475 }, { "entropy": 0.584853183850646, "epoch": 0.7854213653534935, "grad_norm": 0.20607927441596985, "learning_rate": 0.0002, "loss": 0.6789, "mean_token_accuracy": 0.8125476878136396, "num_tokens": 199351975.0, "step": 10480 }, { "entropy": 0.5933663506060839, "epoch": 0.7857960892873453, "grad_norm": 0.2191070020198822, "learning_rate": 0.0002, "loss": 0.6798, "mean_token_accuracy": 0.8116008348762989, "num_tokens": 200821957.0, "step": 10485 }, { "entropy": 0.5789101475849747, "epoch": 0.7861708132211972, "grad_norm": 0.19167892634868622, "learning_rate": 0.0002, "loss": 0.6593, "mean_token_accuracy": 0.8176299199461937, "num_tokens": 202310163.0, "step": 10490 }, { "entropy": 0.58479940649122, "epoch": 0.786545537155049, "grad_norm": 0.20526821911334991, "learning_rate": 0.0002, "loss": 0.67, "mean_token_accuracy": 0.8161196414381265, "num_tokens": 203780114.0, "step": 10495 }, { "entropy": 0.5939969474449753, "epoch": 0.7869202610889009, "grad_norm": 0.20241592824459076, "learning_rate": 0.0002, "loss": 0.6752, "mean_token_accuracy": 0.8137029595673084, "num_tokens": 205252711.0, "step": 10500 }, { "entropy": 0.6036513705737889, "epoch": 0.7872949850227527, "grad_norm": 0.19903329014778137, "learning_rate": 0.0002, "loss": 0.6936, "mean_token_accuracy": 0.8093349676579237, "num_tokens": 206722723.0, "step": 10505 }, { "entropy": 0.605019035935402, "epoch": 0.7876697089566046, "grad_norm": 0.19250914454460144, "learning_rate": 0.0002, "loss": 0.6863, "mean_token_accuracy": 0.811095517128706, "num_tokens": 208210598.0, "step": 10510 }, { "entropy": 0.5988626515492796, "epoch": 0.7880444328904564, "grad_norm": 0.20477402210235596, "learning_rate": 0.0002, "loss": 0.6811, "mean_token_accuracy": 0.8124646134674549, "num_tokens": 209691300.0, "step": 10515 }, { "entropy": 0.5898526765406131, "epoch": 0.7884191568243083, "grad_norm": 0.2081272304058075, "learning_rate": 0.0002, "loss": 0.6697, "mean_token_accuracy": 0.812967024743557, "num_tokens": 211210651.0, "step": 10520 }, { "entropy": 0.6000477438792586, "epoch": 0.7887938807581601, "grad_norm": 0.1852315068244934, "learning_rate": 0.0002, "loss": 0.6926, "mean_token_accuracy": 0.8076917983591556, "num_tokens": 212672413.0, "step": 10525 }, { "entropy": 0.5742446579039097, "epoch": 0.7891686046920121, "grad_norm": 0.21522845327854156, "learning_rate": 0.0002, "loss": 0.6718, "mean_token_accuracy": 0.8131202716380358, "num_tokens": 214142441.0, "step": 10530 }, { "entropy": 0.577957870811224, "epoch": 0.789543328625864, "grad_norm": 0.1980750560760498, "learning_rate": 0.0002, "loss": 0.6819, "mean_token_accuracy": 0.8100273925811052, "num_tokens": 215605214.0, "step": 10535 }, { "entropy": 0.5639872891828418, "epoch": 0.7899180525597158, "grad_norm": 0.18968504667282104, "learning_rate": 0.0002, "loss": 0.6732, "mean_token_accuracy": 0.8135593611747026, "num_tokens": 217053276.0, "step": 10540 }, { "entropy": 0.5504879277199507, "epoch": 0.7902927764935677, "grad_norm": 0.18511950969696045, "learning_rate": 0.0002, "loss": 0.6601, "mean_token_accuracy": 0.8122268460690976, "num_tokens": 218483237.0, "step": 10545 }, { "entropy": 0.5565538335591554, "epoch": 0.7906675004274195, "grad_norm": 0.18906714022159576, "learning_rate": 0.0002, "loss": 0.6598, "mean_token_accuracy": 0.8141150880604983, "num_tokens": 219952604.0, "step": 10550 }, { "entropy": 0.5634323559701443, "epoch": 0.7910422243612714, "grad_norm": 0.21581631898880005, "learning_rate": 0.0002, "loss": 0.6735, "mean_token_accuracy": 0.8149314444512129, "num_tokens": 221383977.0, "step": 10555 }, { "entropy": 0.569957727752626, "epoch": 0.7914169482951232, "grad_norm": 0.20789475739002228, "learning_rate": 0.0002, "loss": 0.6866, "mean_token_accuracy": 0.8132584992796182, "num_tokens": 222845754.0, "step": 10560 }, { "entropy": 0.5675086458213627, "epoch": 0.7917916722289751, "grad_norm": 0.2060377150774002, "learning_rate": 0.0002, "loss": 0.6804, "mean_token_accuracy": 0.8129398945719004, "num_tokens": 224281977.0, "step": 10565 }, { "entropy": 0.5884444447234273, "epoch": 0.7921663961628269, "grad_norm": 0.20482103526592255, "learning_rate": 0.0002, "loss": 0.6855, "mean_token_accuracy": 0.8101746514439583, "num_tokens": 225768594.0, "step": 10570 }, { "entropy": 0.5805645495653152, "epoch": 0.7925411200966788, "grad_norm": 0.21447589993476868, "learning_rate": 0.0002, "loss": 0.6828, "mean_token_accuracy": 0.8072754673659801, "num_tokens": 227244790.0, "step": 10575 }, { "entropy": 0.5788427470251918, "epoch": 0.7929158440305306, "grad_norm": 0.19903920590877533, "learning_rate": 0.0002, "loss": 0.6731, "mean_token_accuracy": 0.8127084679901599, "num_tokens": 228772069.0, "step": 10580 }, { "entropy": 0.5766531839966774, "epoch": 0.7932905679643825, "grad_norm": 0.18330878019332886, "learning_rate": 0.0002, "loss": 0.6715, "mean_token_accuracy": 0.8108808234333992, "num_tokens": 230224276.0, "step": 10585 }, { "entropy": 0.5669945150613784, "epoch": 0.7936652918982343, "grad_norm": 0.213231161236763, "learning_rate": 0.0002, "loss": 0.6687, "mean_token_accuracy": 0.8137528304010629, "num_tokens": 231639353.0, "step": 10590 }, { "entropy": 0.5571032969281078, "epoch": 0.7940400158320862, "grad_norm": 0.21057641506195068, "learning_rate": 0.0002, "loss": 0.6601, "mean_token_accuracy": 0.8135935720056295, "num_tokens": 233083563.0, "step": 10595 }, { "entropy": 0.5590359140187502, "epoch": 0.794414739765938, "grad_norm": 0.21421706676483154, "learning_rate": 0.0002, "loss": 0.6645, "mean_token_accuracy": 0.8124529678374529, "num_tokens": 234570612.0, "step": 10600 }, { "entropy": 0.570829270966351, "epoch": 0.7947894636997899, "grad_norm": 0.24218279123306274, "learning_rate": 0.0002, "loss": 0.682, "mean_token_accuracy": 0.8115904428064823, "num_tokens": 236044844.0, "step": 10605 }, { "entropy": 0.580439961887896, "epoch": 0.7951641876336418, "grad_norm": 0.19620642066001892, "learning_rate": 0.0002, "loss": 0.6961, "mean_token_accuracy": 0.8093026421964169, "num_tokens": 237532871.0, "step": 10610 }, { "entropy": 0.5657217562198639, "epoch": 0.7955389115674937, "grad_norm": 0.19820542633533478, "learning_rate": 0.0002, "loss": 0.6669, "mean_token_accuracy": 0.8147482484579086, "num_tokens": 239014974.0, "step": 10615 }, { "entropy": 0.5491089862771332, "epoch": 0.7959136355013455, "grad_norm": 0.21313422918319702, "learning_rate": 0.0002, "loss": 0.6594, "mean_token_accuracy": 0.820142575353384, "num_tokens": 240461284.0, "step": 10620 }, { "entropy": 0.5527254218235612, "epoch": 0.7962883594351974, "grad_norm": 0.2167268991470337, "learning_rate": 0.0002, "loss": 0.6677, "mean_token_accuracy": 0.8126881249248982, "num_tokens": 241933453.0, "step": 10625 }, { "entropy": 0.5692499585449695, "epoch": 0.7966630833690492, "grad_norm": 0.1981002241373062, "learning_rate": 0.0002, "loss": 0.6874, "mean_token_accuracy": 0.8103679642081261, "num_tokens": 243427154.0, "step": 10630 }, { "entropy": 0.5655683003365993, "epoch": 0.7970378073029011, "grad_norm": 0.19722874462604523, "learning_rate": 0.0002, "loss": 0.6812, "mean_token_accuracy": 0.8078640695661307, "num_tokens": 244909268.0, "step": 10635 }, { "entropy": 0.5559213355183601, "epoch": 0.7974125312367529, "grad_norm": 0.18609365820884705, "learning_rate": 0.0002, "loss": 0.6789, "mean_token_accuracy": 0.810693109035492, "num_tokens": 246362068.0, "step": 10640 }, { "entropy": 0.5671896416693926, "epoch": 0.7977872551706048, "grad_norm": 0.21290811896324158, "learning_rate": 0.0002, "loss": 0.6886, "mean_token_accuracy": 0.8094809550791979, "num_tokens": 247841377.0, "step": 10645 }, { "entropy": 0.5710425782948733, "epoch": 0.7981619791044566, "grad_norm": 0.1958487629890442, "learning_rate": 0.0002, "loss": 0.6845, "mean_token_accuracy": 0.8088053196668625, "num_tokens": 249299777.0, "step": 10650 }, { "entropy": 0.5754283959046006, "epoch": 0.7985367030383085, "grad_norm": 0.19995848834514618, "learning_rate": 0.0002, "loss": 0.677, "mean_token_accuracy": 0.8121997062116861, "num_tokens": 250782586.0, "step": 10655 }, { "entropy": 0.5702465173788369, "epoch": 0.7989114269721603, "grad_norm": 0.20106063783168793, "learning_rate": 0.0002, "loss": 0.6765, "mean_token_accuracy": 0.8145467128604651, "num_tokens": 252260986.0, "step": 10660 }, { "entropy": 0.5693119384348393, "epoch": 0.7992861509060122, "grad_norm": 0.2021607905626297, "learning_rate": 0.0002, "loss": 0.6795, "mean_token_accuracy": 0.8093441832810641, "num_tokens": 253729529.0, "step": 10665 }, { "entropy": 0.5691186131909489, "epoch": 0.799660874839864, "grad_norm": 0.2028864175081253, "learning_rate": 0.0002, "loss": 0.6882, "mean_token_accuracy": 0.8102823790162802, "num_tokens": 255200193.0, "step": 10670 }, { "entropy": 0.5615657076239586, "epoch": 0.8000355987737159, "grad_norm": 0.2031538486480713, "learning_rate": 0.0002, "loss": 0.6755, "mean_token_accuracy": 0.8123906578868627, "num_tokens": 256653624.0, "step": 10675 }, { "entropy": 0.5646479735150933, "epoch": 0.8004103227075677, "grad_norm": 0.19757850468158722, "learning_rate": 0.0002, "loss": 0.6749, "mean_token_accuracy": 0.8120345011353493, "num_tokens": 258137671.0, "step": 10680 }, { "entropy": 0.5620745738968254, "epoch": 0.8007850466414197, "grad_norm": 0.19633372128009796, "learning_rate": 0.0002, "loss": 0.6709, "mean_token_accuracy": 0.8133185718208551, "num_tokens": 259603934.0, "step": 10685 }, { "entropy": 0.5511412126943469, "epoch": 0.8011597705752715, "grad_norm": 0.20314519107341766, "learning_rate": 0.0002, "loss": 0.6595, "mean_token_accuracy": 0.8149406369775534, "num_tokens": 261097667.0, "step": 10690 }, { "entropy": 0.572960646264255, "epoch": 0.8015344945091234, "grad_norm": 0.21700026094913483, "learning_rate": 0.0002, "loss": 0.6882, "mean_token_accuracy": 0.812687249109149, "num_tokens": 262549804.0, "step": 10695 }, { "entropy": 0.5866281708702445, "epoch": 0.8019092184429752, "grad_norm": 0.19744396209716797, "learning_rate": 0.0002, "loss": 0.6879, "mean_token_accuracy": 0.8125022388994694, "num_tokens": 264064691.0, "step": 10700 }, { "entropy": 0.5763689387589693, "epoch": 0.8022839423768271, "grad_norm": 0.22409123182296753, "learning_rate": 0.0002, "loss": 0.6694, "mean_token_accuracy": 0.8110343609005213, "num_tokens": 265528079.0, "step": 10705 }, { "entropy": 0.5808911656960845, "epoch": 0.802658666310679, "grad_norm": 0.21907560527324677, "learning_rate": 0.0002, "loss": 0.6766, "mean_token_accuracy": 0.8122890498489141, "num_tokens": 266979983.0, "step": 10710 }, { "entropy": 0.5892563126981258, "epoch": 0.8030333902445308, "grad_norm": 0.19445329904556274, "learning_rate": 0.0002, "loss": 0.6937, "mean_token_accuracy": 0.8088293053209782, "num_tokens": 268372553.0, "step": 10715 }, { "entropy": 0.5795412605628372, "epoch": 0.8034081141783826, "grad_norm": 0.20940865576267242, "learning_rate": 0.0002, "loss": 0.6685, "mean_token_accuracy": 0.8135415527969598, "num_tokens": 269863340.0, "step": 10720 }, { "entropy": 0.6024334149435162, "epoch": 0.8037828381122345, "grad_norm": 0.2102898508310318, "learning_rate": 0.0002, "loss": 0.6965, "mean_token_accuracy": 0.8078064125031232, "num_tokens": 271366738.0, "step": 10725 }, { "entropy": 0.5761579247191548, "epoch": 0.8041575620460863, "grad_norm": 0.21059110760688782, "learning_rate": 0.0002, "loss": 0.6675, "mean_token_accuracy": 0.8153466489166021, "num_tokens": 272795914.0, "step": 10730 }, { "entropy": 0.586841551400721, "epoch": 0.8045322859799382, "grad_norm": 0.18718382716178894, "learning_rate": 0.0002, "loss": 0.685, "mean_token_accuracy": 0.8122293327003718, "num_tokens": 274247685.0, "step": 10735 }, { "entropy": 0.5838749345391989, "epoch": 0.80490700991379, "grad_norm": 0.22434668242931366, "learning_rate": 0.0002, "loss": 0.6821, "mean_token_accuracy": 0.8119066409766674, "num_tokens": 275721303.0, "step": 10740 }, { "entropy": 0.5950217634439469, "epoch": 0.8052817338476419, "grad_norm": 0.2157672643661499, "learning_rate": 0.0002, "loss": 0.6956, "mean_token_accuracy": 0.8107666920870542, "num_tokens": 277250288.0, "step": 10745 }, { "entropy": 0.5978714922443032, "epoch": 0.8056564577814938, "grad_norm": 0.20980755984783173, "learning_rate": 0.0002, "loss": 0.69, "mean_token_accuracy": 0.8103512112051249, "num_tokens": 278760103.0, "step": 10750 }, { "entropy": 0.6053952239453793, "epoch": 0.8060311817153456, "grad_norm": 0.19996921718120575, "learning_rate": 0.0002, "loss": 0.6929, "mean_token_accuracy": 0.8092877980321646, "num_tokens": 280261568.0, "step": 10755 }, { "entropy": 0.6023820446804166, "epoch": 0.8064059056491975, "grad_norm": 0.20601245760917664, "learning_rate": 0.0002, "loss": 0.6946, "mean_token_accuracy": 0.8085363991558552, "num_tokens": 281704523.0, "step": 10760 }, { "entropy": 0.6069532416760921, "epoch": 0.8067806295830494, "grad_norm": 0.22477075457572937, "learning_rate": 0.0002, "loss": 0.6946, "mean_token_accuracy": 0.8098358388990163, "num_tokens": 283198193.0, "step": 10765 }, { "entropy": 0.585908680036664, "epoch": 0.8071553535169013, "grad_norm": 0.20062774419784546, "learning_rate": 0.0002, "loss": 0.6746, "mean_token_accuracy": 0.8138935852795839, "num_tokens": 284719291.0, "step": 10770 }, { "entropy": 0.58261877335608, "epoch": 0.8075300774507531, "grad_norm": 0.217084139585495, "learning_rate": 0.0002, "loss": 0.6837, "mean_token_accuracy": 0.8074890196323394, "num_tokens": 286230995.0, "step": 10775 }, { "entropy": 0.5876636298373341, "epoch": 0.807904801384605, "grad_norm": 0.22068148851394653, "learning_rate": 0.0002, "loss": 0.686, "mean_token_accuracy": 0.8088357198983431, "num_tokens": 287721452.0, "step": 10780 }, { "entropy": 0.5722858976572752, "epoch": 0.8082795253184568, "grad_norm": 0.20159663259983063, "learning_rate": 0.0002, "loss": 0.6602, "mean_token_accuracy": 0.8157478176057339, "num_tokens": 289197582.0, "step": 10785 }, { "entropy": 0.5754757553339005, "epoch": 0.8086542492523087, "grad_norm": 0.1973356455564499, "learning_rate": 0.0002, "loss": 0.6789, "mean_token_accuracy": 0.8128834027796984, "num_tokens": 290662016.0, "step": 10790 }, { "entropy": 0.5713139425963163, "epoch": 0.8090289731861605, "grad_norm": 0.22583387792110443, "learning_rate": 0.0002, "loss": 0.6704, "mean_token_accuracy": 0.8143170721828937, "num_tokens": 292118270.0, "step": 10795 }, { "entropy": 0.5768582366406918, "epoch": 0.8094036971200124, "grad_norm": 0.21850408613681793, "learning_rate": 0.0002, "loss": 0.687, "mean_token_accuracy": 0.811512702330947, "num_tokens": 293548218.0, "step": 10800 }, { "entropy": 0.5731686908751726, "epoch": 0.8097784210538642, "grad_norm": 0.21446116268634796, "learning_rate": 0.0002, "loss": 0.6635, "mean_token_accuracy": 0.816955491527915, "num_tokens": 295030395.0, "step": 10805 }, { "entropy": 0.5861798502504826, "epoch": 0.8101531449877161, "grad_norm": 0.19047370553016663, "learning_rate": 0.0002, "loss": 0.6837, "mean_token_accuracy": 0.8091792676597833, "num_tokens": 296494235.0, "step": 10810 }, { "entropy": 0.5785240357741713, "epoch": 0.8105278689215679, "grad_norm": 0.1927640587091446, "learning_rate": 0.0002, "loss": 0.6644, "mean_token_accuracy": 0.8158360973000527, "num_tokens": 297978378.0, "step": 10815 }, { "entropy": 0.5854287272319197, "epoch": 0.8109025928554198, "grad_norm": 0.2120140790939331, "learning_rate": 0.0002, "loss": 0.6863, "mean_token_accuracy": 0.8149723447859287, "num_tokens": 299445596.0, "step": 10820 }, { "entropy": 0.6073330730199814, "epoch": 0.8112773167892716, "grad_norm": 0.2128450870513916, "learning_rate": 0.0002, "loss": 0.6977, "mean_token_accuracy": 0.8064688358455896, "num_tokens": 300909674.0, "step": 10825 }, { "entropy": 0.5782412510365248, "epoch": 0.8116520407231235, "grad_norm": 0.19398708641529083, "learning_rate": 0.0002, "loss": 0.6694, "mean_token_accuracy": 0.8138195291161537, "num_tokens": 302412105.0, "step": 10830 }, { "entropy": 0.6013998210430145, "epoch": 0.8120267646569753, "grad_norm": 0.20828808844089508, "learning_rate": 0.0002, "loss": 0.7019, "mean_token_accuracy": 0.8079470928758383, "num_tokens": 303932604.0, "step": 10835 }, { "entropy": 0.5897096123546361, "epoch": 0.8124014885908273, "grad_norm": 0.19336363673210144, "learning_rate": 0.0002, "loss": 0.6873, "mean_token_accuracy": 0.8105929169803858, "num_tokens": 305392338.0, "step": 10840 }, { "entropy": 0.5941529026255011, "epoch": 0.8127762125246791, "grad_norm": 0.20042560994625092, "learning_rate": 0.0002, "loss": 0.6917, "mean_token_accuracy": 0.8103685099631548, "num_tokens": 306899472.0, "step": 10845 }, { "entropy": 0.5976505098864436, "epoch": 0.813150936458531, "grad_norm": 0.19786852598190308, "learning_rate": 0.0002, "loss": 0.6929, "mean_token_accuracy": 0.8083691857755184, "num_tokens": 308413375.0, "step": 10850 }, { "entropy": 0.5785227339714766, "epoch": 0.8135256603923828, "grad_norm": 0.28091922402381897, "learning_rate": 0.0002, "loss": 0.6692, "mean_token_accuracy": 0.8140469491481781, "num_tokens": 309872161.0, "step": 10855 }, { "entropy": 0.5998728776350617, "epoch": 0.8139003843262347, "grad_norm": 0.18675006926059723, "learning_rate": 0.0002, "loss": 0.6906, "mean_token_accuracy": 0.809183056652546, "num_tokens": 311371346.0, "step": 10860 }, { "entropy": 0.5656189100816846, "epoch": 0.8142751082600865, "grad_norm": 0.2659045457839966, "learning_rate": 0.0002, "loss": 0.6586, "mean_token_accuracy": 0.8146158259361982, "num_tokens": 312814533.0, "step": 10865 }, { "entropy": 0.5694644810631871, "epoch": 0.8146498321939384, "grad_norm": 0.20319466292858124, "learning_rate": 0.0002, "loss": 0.6591, "mean_token_accuracy": 0.8161069065332412, "num_tokens": 314300434.0, "step": 10870 }, { "entropy": 0.582558635994792, "epoch": 0.8150245561277902, "grad_norm": 0.2556271553039551, "learning_rate": 0.0002, "loss": 0.6649, "mean_token_accuracy": 0.8154875446110964, "num_tokens": 315766116.0, "step": 10875 }, { "entropy": 0.5747982358559967, "epoch": 0.8153992800616421, "grad_norm": 0.20839205384254456, "learning_rate": 0.0002, "loss": 0.6743, "mean_token_accuracy": 0.8131104372441769, "num_tokens": 317201813.0, "step": 10880 }, { "entropy": 0.5781723974272609, "epoch": 0.8157740039954939, "grad_norm": 0.2081194669008255, "learning_rate": 0.0002, "loss": 0.6909, "mean_token_accuracy": 0.8086908597499132, "num_tokens": 318624701.0, "step": 10885 }, { "entropy": 0.5602839330211282, "epoch": 0.8161487279293458, "grad_norm": 0.20789626240730286, "learning_rate": 0.0002, "loss": 0.6651, "mean_token_accuracy": 0.8127234485000372, "num_tokens": 320127175.0, "step": 10890 }, { "entropy": 0.5565892646089197, "epoch": 0.8165234518631976, "grad_norm": 0.2147476226091385, "learning_rate": 0.0002, "loss": 0.6666, "mean_token_accuracy": 0.8102500960230827, "num_tokens": 321653017.0, "step": 10895 }, { "entropy": 0.5701033685356378, "epoch": 0.8168981757970495, "grad_norm": 0.20855078101158142, "learning_rate": 0.0002, "loss": 0.683, "mean_token_accuracy": 0.8116153471171856, "num_tokens": 323116443.0, "step": 10900 }, { "entropy": 0.5704444568604231, "epoch": 0.8172728997309013, "grad_norm": 0.21404580771923065, "learning_rate": 0.0002, "loss": 0.6782, "mean_token_accuracy": 0.8112135641276836, "num_tokens": 324585949.0, "step": 10905 }, { "entropy": 0.5723830746486783, "epoch": 0.8176476236647532, "grad_norm": 0.20956005156040192, "learning_rate": 0.0002, "loss": 0.6797, "mean_token_accuracy": 0.8144220493733882, "num_tokens": 326052655.0, "step": 10910 }, { "entropy": 0.5449826525524258, "epoch": 0.818022347598605, "grad_norm": 0.21397621929645538, "learning_rate": 0.0002, "loss": 0.6431, "mean_token_accuracy": 0.8180336095392704, "num_tokens": 327530112.0, "step": 10915 }, { "entropy": 0.5601852418854832, "epoch": 0.818397071532457, "grad_norm": 0.20623303949832916, "learning_rate": 0.0002, "loss": 0.6721, "mean_token_accuracy": 0.8106285352259874, "num_tokens": 328984470.0, "step": 10920 }, { "entropy": 0.5699095239862799, "epoch": 0.8187717954663088, "grad_norm": 0.18468832969665527, "learning_rate": 0.0002, "loss": 0.6752, "mean_token_accuracy": 0.8131400384008884, "num_tokens": 330459231.0, "step": 10925 }, { "entropy": 0.5833077359944582, "epoch": 0.8191465194001607, "grad_norm": 0.21632373332977295, "learning_rate": 0.0002, "loss": 0.6911, "mean_token_accuracy": 0.8088234391063451, "num_tokens": 331923251.0, "step": 10930 }, { "entropy": 0.5563632495701313, "epoch": 0.8195212433340126, "grad_norm": 0.20025089383125305, "learning_rate": 0.0002, "loss": 0.6636, "mean_token_accuracy": 0.8148979593068362, "num_tokens": 333376876.0, "step": 10935 }, { "entropy": 0.5667523650452495, "epoch": 0.8198959672678644, "grad_norm": 0.25400999188423157, "learning_rate": 0.0002, "loss": 0.6707, "mean_token_accuracy": 0.8145557977259159, "num_tokens": 334813350.0, "step": 10940 }, { "entropy": 0.5796640972606838, "epoch": 0.8202706912017163, "grad_norm": 0.21374109387397766, "learning_rate": 0.0002, "loss": 0.6655, "mean_token_accuracy": 0.8125911217182875, "num_tokens": 336267202.0, "step": 10945 }, { "entropy": 0.5824327919632196, "epoch": 0.8206454151355681, "grad_norm": 0.19932521879673004, "learning_rate": 0.0002, "loss": 0.6765, "mean_token_accuracy": 0.8103801731020213, "num_tokens": 337746520.0, "step": 10950 }, { "entropy": 0.5802007233723998, "epoch": 0.82102013906942, "grad_norm": 0.2187487632036209, "learning_rate": 0.0002, "loss": 0.6707, "mean_token_accuracy": 0.8112513698637486, "num_tokens": 339238512.0, "step": 10955 }, { "entropy": 0.5784679656848312, "epoch": 0.8213948630032718, "grad_norm": 0.24090223014354706, "learning_rate": 0.0002, "loss": 0.6697, "mean_token_accuracy": 0.810352674499154, "num_tokens": 340723132.0, "step": 10960 }, { "entropy": 0.5958922175690532, "epoch": 0.8217695869371237, "grad_norm": 0.20812714099884033, "learning_rate": 0.0002, "loss": 0.6845, "mean_token_accuracy": 0.812490364164114, "num_tokens": 342184109.0, "step": 10965 }, { "entropy": 0.6011210184544324, "epoch": 0.8221443108709755, "grad_norm": 0.1973821222782135, "learning_rate": 0.0002, "loss": 0.6952, "mean_token_accuracy": 0.8091065466403962, "num_tokens": 343652618.0, "step": 10970 }, { "entropy": 0.5888845853507518, "epoch": 0.8225190348048274, "grad_norm": 0.20673608779907227, "learning_rate": 0.0002, "loss": 0.6893, "mean_token_accuracy": 0.813544413074851, "num_tokens": 345086997.0, "step": 10975 }, { "entropy": 0.5872133746743202, "epoch": 0.8228937587386792, "grad_norm": 0.21504470705986023, "learning_rate": 0.0002, "loss": 0.6809, "mean_token_accuracy": 0.8122704360634089, "num_tokens": 346576998.0, "step": 10980 }, { "entropy": 0.591719551756978, "epoch": 0.823268482672531, "grad_norm": 0.19384026527404785, "learning_rate": 0.0002, "loss": 0.6728, "mean_token_accuracy": 0.8104731664061546, "num_tokens": 348087275.0, "step": 10985 }, { "entropy": 0.5830369867384434, "epoch": 0.8236432066063829, "grad_norm": 0.20011299848556519, "learning_rate": 0.0002, "loss": 0.6758, "mean_token_accuracy": 0.8150323469191789, "num_tokens": 349583769.0, "step": 10990 }, { "entropy": 0.5858105707913637, "epoch": 0.8240179305402349, "grad_norm": 0.20544472336769104, "learning_rate": 0.0002, "loss": 0.6837, "mean_token_accuracy": 0.8117873072624207, "num_tokens": 351097311.0, "step": 10995 }, { "entropy": 0.5899222608655691, "epoch": 0.8243926544740867, "grad_norm": 0.20313981175422668, "learning_rate": 0.0002, "loss": 0.6712, "mean_token_accuracy": 0.8132738217711448, "num_tokens": 352592970.0, "step": 11000 }, { "entropy": 0.6118162116035819, "epoch": 0.8247673784079386, "grad_norm": 0.21483010053634644, "learning_rate": 0.0002, "loss": 0.6991, "mean_token_accuracy": 0.8085182208567858, "num_tokens": 354108106.0, "step": 11005 }, { "entropy": 0.5862393969669938, "epoch": 0.8251421023417904, "grad_norm": 0.19050616025924683, "learning_rate": 0.0002, "loss": 0.6879, "mean_token_accuracy": 0.8113716810941696, "num_tokens": 355544566.0, "step": 11010 }, { "entropy": 0.592042189836502, "epoch": 0.8255168262756423, "grad_norm": 0.21786773204803467, "learning_rate": 0.0002, "loss": 0.6759, "mean_token_accuracy": 0.8125091642141342, "num_tokens": 357018511.0, "step": 11015 }, { "entropy": 0.5513476690277457, "epoch": 0.8258915502094941, "grad_norm": 0.20093028247356415, "learning_rate": 0.0002, "loss": 0.6619, "mean_token_accuracy": 0.8157477594912053, "num_tokens": 358431539.0, "step": 11020 }, { "entropy": 0.5625835714861751, "epoch": 0.826266274143346, "grad_norm": 0.22473295032978058, "learning_rate": 0.0002, "loss": 0.6721, "mean_token_accuracy": 0.8140765327960253, "num_tokens": 359892530.0, "step": 11025 }, { "entropy": 0.5424991420470178, "epoch": 0.8266409980771978, "grad_norm": 0.2000250518321991, "learning_rate": 0.0002, "loss": 0.6583, "mean_token_accuracy": 0.814133570343256, "num_tokens": 361305575.0, "step": 11030 }, { "entropy": 0.5489845331758261, "epoch": 0.8270157220110497, "grad_norm": 0.22776217758655548, "learning_rate": 0.0002, "loss": 0.6615, "mean_token_accuracy": 0.8167624711990357, "num_tokens": 362786279.0, "step": 11035 }, { "entropy": 0.5669179487973451, "epoch": 0.8273904459449015, "grad_norm": 0.2643877863883972, "learning_rate": 0.0002, "loss": 0.6842, "mean_token_accuracy": 0.8129611641168595, "num_tokens": 364297365.0, "step": 11040 }, { "entropy": 0.5703990725800395, "epoch": 0.8277651698787534, "grad_norm": 0.2176257222890854, "learning_rate": 0.0002, "loss": 0.6942, "mean_token_accuracy": 0.8088178500533104, "num_tokens": 365730819.0, "step": 11045 }, { "entropy": 0.5842772021889686, "epoch": 0.8281398938126052, "grad_norm": 0.22500213980674744, "learning_rate": 0.0002, "loss": 0.6944, "mean_token_accuracy": 0.8100766956806182, "num_tokens": 367180677.0, "step": 11050 }, { "entropy": 0.5708058968186378, "epoch": 0.8285146177464571, "grad_norm": 0.19541382789611816, "learning_rate": 0.0002, "loss": 0.6807, "mean_token_accuracy": 0.813231598213315, "num_tokens": 368672088.0, "step": 11055 }, { "entropy": 0.576916878297925, "epoch": 0.8288893416803089, "grad_norm": 0.21429958939552307, "learning_rate": 0.0002, "loss": 0.6812, "mean_token_accuracy": 0.8128389827907085, "num_tokens": 370148773.0, "step": 11060 }, { "entropy": 0.560053204279393, "epoch": 0.8292640656141608, "grad_norm": 0.18297185003757477, "learning_rate": 0.0002, "loss": 0.6595, "mean_token_accuracy": 0.8159781869500875, "num_tokens": 371620365.0, "step": 11065 }, { "entropy": 0.5903103252872824, "epoch": 0.8296387895480126, "grad_norm": 0.22753961384296417, "learning_rate": 0.0002, "loss": 0.699, "mean_token_accuracy": 0.8091725554317236, "num_tokens": 373108366.0, "step": 11070 }, { "entropy": 0.5797198727726937, "epoch": 0.8300135134818646, "grad_norm": 0.18041613698005676, "learning_rate": 0.0002, "loss": 0.6733, "mean_token_accuracy": 0.8118617717176676, "num_tokens": 374605468.0, "step": 11075 }, { "entropy": 0.586527569964528, "epoch": 0.8303882374157164, "grad_norm": 0.19077786803245544, "learning_rate": 0.0002, "loss": 0.6723, "mean_token_accuracy": 0.8112717803567648, "num_tokens": 376098758.0, "step": 11080 }, { "entropy": 0.5750628924928606, "epoch": 0.8307629613495683, "grad_norm": 0.21506860852241516, "learning_rate": 0.0002, "loss": 0.6756, "mean_token_accuracy": 0.8140463564544916, "num_tokens": 377582880.0, "step": 11085 }, { "entropy": 0.571205317787826, "epoch": 0.8311376852834201, "grad_norm": 0.19073261320590973, "learning_rate": 0.0002, "loss": 0.654, "mean_token_accuracy": 0.815552081167698, "num_tokens": 379065375.0, "step": 11090 }, { "entropy": 0.5776555577293039, "epoch": 0.831512409217272, "grad_norm": 0.210993692278862, "learning_rate": 0.0002, "loss": 0.6741, "mean_token_accuracy": 0.8145915351808071, "num_tokens": 380558015.0, "step": 11095 }, { "entropy": 0.5776999810710549, "epoch": 0.8318871331511238, "grad_norm": 0.21881428360939026, "learning_rate": 0.0002, "loss": 0.6827, "mean_token_accuracy": 0.8130808737128973, "num_tokens": 382014701.0, "step": 11100 }, { "entropy": 0.5875276977196335, "epoch": 0.8322618570849757, "grad_norm": 0.23279787600040436, "learning_rate": 0.0002, "loss": 0.6894, "mean_token_accuracy": 0.8066119190305472, "num_tokens": 383513029.0, "step": 11105 }, { "entropy": 0.577891237474978, "epoch": 0.8326365810188275, "grad_norm": 0.22438840568065643, "learning_rate": 0.0002, "loss": 0.6703, "mean_token_accuracy": 0.8160835772752761, "num_tokens": 384973544.0, "step": 11110 }, { "entropy": 0.5688722347840667, "epoch": 0.8330113049526794, "grad_norm": 0.20810069143772125, "learning_rate": 0.0002, "loss": 0.676, "mean_token_accuracy": 0.8152614302933217, "num_tokens": 386428503.0, "step": 11115 }, { "entropy": 0.5798116311430931, "epoch": 0.8333860288865312, "grad_norm": 0.2176765352487564, "learning_rate": 0.0002, "loss": 0.6858, "mean_token_accuracy": 0.810474106669426, "num_tokens": 387896252.0, "step": 11120 }, { "entropy": 0.5743724441155791, "epoch": 0.8337607528203831, "grad_norm": 0.2036724090576172, "learning_rate": 0.0002, "loss": 0.6629, "mean_token_accuracy": 0.8127602137625217, "num_tokens": 389347989.0, "step": 11125 }, { "entropy": 0.5838633032515645, "epoch": 0.8341354767542349, "grad_norm": 0.19957470893859863, "learning_rate": 0.0002, "loss": 0.6754, "mean_token_accuracy": 0.8112158220261335, "num_tokens": 390822992.0, "step": 11130 }, { "entropy": 0.5802147101610899, "epoch": 0.8345102006880868, "grad_norm": 0.20999403297901154, "learning_rate": 0.0002, "loss": 0.6688, "mean_token_accuracy": 0.8128511637449265, "num_tokens": 392285315.0, "step": 11135 }, { "entropy": 0.5913849886506796, "epoch": 0.8348849246219386, "grad_norm": 0.1985415816307068, "learning_rate": 0.0002, "loss": 0.678, "mean_token_accuracy": 0.8141532432287931, "num_tokens": 393713335.0, "step": 11140 }, { "entropy": 0.5869304710999131, "epoch": 0.8352596485557905, "grad_norm": 0.19826456904411316, "learning_rate": 0.0002, "loss": 0.6797, "mean_token_accuracy": 0.8135620471090078, "num_tokens": 395165246.0, "step": 11145 }, { "entropy": 0.5705240868031979, "epoch": 0.8356343724896423, "grad_norm": 0.1944052278995514, "learning_rate": 0.0002, "loss": 0.6576, "mean_token_accuracy": 0.8149515684694052, "num_tokens": 396633617.0, "step": 11150 }, { "entropy": 0.573450835607946, "epoch": 0.8360090964234943, "grad_norm": 0.20794415473937988, "learning_rate": 0.0002, "loss": 0.6709, "mean_token_accuracy": 0.8162610299885273, "num_tokens": 398086038.0, "step": 11155 }, { "entropy": 0.5799348002299667, "epoch": 0.8363838203573462, "grad_norm": 0.20053355395793915, "learning_rate": 0.0002, "loss": 0.6748, "mean_token_accuracy": 0.8134364530444145, "num_tokens": 399549834.0, "step": 11160 }, { "entropy": 0.5783989427611231, "epoch": 0.836758544291198, "grad_norm": 0.21161489188671112, "learning_rate": 0.0002, "loss": 0.6804, "mean_token_accuracy": 0.8167735528200865, "num_tokens": 400973636.0, "step": 11165 }, { "entropy": 0.5756048591807484, "epoch": 0.8371332682250499, "grad_norm": 0.21141350269317627, "learning_rate": 0.0002, "loss": 0.6708, "mean_token_accuracy": 0.8132316909730435, "num_tokens": 402398098.0, "step": 11170 }, { "entropy": 0.5793660778552294, "epoch": 0.8375079921589017, "grad_norm": 0.19525250792503357, "learning_rate": 0.0002, "loss": 0.6739, "mean_token_accuracy": 0.8153315845876932, "num_tokens": 403891728.0, "step": 11175 }, { "entropy": 0.570309747569263, "epoch": 0.8378827160927536, "grad_norm": 0.22081004083156586, "learning_rate": 0.0002, "loss": 0.6732, "mean_token_accuracy": 0.8138131476938725, "num_tokens": 405358131.0, "step": 11180 }, { "entropy": 0.5779308272525668, "epoch": 0.8382574400266054, "grad_norm": 0.21388185024261475, "learning_rate": 0.0002, "loss": 0.6853, "mean_token_accuracy": 0.8118697568774224, "num_tokens": 406809965.0, "step": 11185 }, { "entropy": 0.5683308273553849, "epoch": 0.8386321639604573, "grad_norm": 0.20083807408809662, "learning_rate": 0.0002, "loss": 0.6679, "mean_token_accuracy": 0.8137130469083786, "num_tokens": 408273049.0, "step": 11190 }, { "entropy": 0.5694773808121681, "epoch": 0.8390068878943091, "grad_norm": 0.20063740015029907, "learning_rate": 0.0002, "loss": 0.6779, "mean_token_accuracy": 0.815231253579259, "num_tokens": 409750805.0, "step": 11195 }, { "entropy": 0.5924386471509934, "epoch": 0.839381611828161, "grad_norm": 0.20481042563915253, "learning_rate": 0.0002, "loss": 0.6856, "mean_token_accuracy": 0.8109227228909731, "num_tokens": 411221649.0, "step": 11200 }, { "entropy": 0.5804875448346138, "epoch": 0.8397563357620128, "grad_norm": 0.2192552387714386, "learning_rate": 0.0002, "loss": 0.6702, "mean_token_accuracy": 0.8135802280157804, "num_tokens": 412683503.0, "step": 11205 }, { "entropy": 0.5842447830364108, "epoch": 0.8401310596958647, "grad_norm": 0.20968115329742432, "learning_rate": 0.0002, "loss": 0.6843, "mean_token_accuracy": 0.8132329910993576, "num_tokens": 414154797.0, "step": 11210 }, { "entropy": 0.5781424017623067, "epoch": 0.8405057836297165, "grad_norm": 0.20666879415512085, "learning_rate": 0.0002, "loss": 0.677, "mean_token_accuracy": 0.8113611798733473, "num_tokens": 415625084.0, "step": 11215 }, { "entropy": 0.5868405807763338, "epoch": 0.8408805075635684, "grad_norm": 0.1952890157699585, "learning_rate": 0.0002, "loss": 0.6835, "mean_token_accuracy": 0.812285989522934, "num_tokens": 417116072.0, "step": 11220 }, { "entropy": 0.5660469196736813, "epoch": 0.8412552314974202, "grad_norm": 0.19061851501464844, "learning_rate": 0.0002, "loss": 0.6614, "mean_token_accuracy": 0.8160952799022198, "num_tokens": 418594597.0, "step": 11225 }, { "entropy": 0.5693430613726378, "epoch": 0.8416299554312722, "grad_norm": 0.20859169960021973, "learning_rate": 0.0002, "loss": 0.667, "mean_token_accuracy": 0.8134893637150526, "num_tokens": 420042463.0, "step": 11230 }, { "entropy": 0.5815690565854311, "epoch": 0.842004679365124, "grad_norm": 0.19652780890464783, "learning_rate": 0.0002, "loss": 0.6843, "mean_token_accuracy": 0.8119194280356169, "num_tokens": 421516533.0, "step": 11235 }, { "entropy": 0.5673126077279449, "epoch": 0.8423794032989759, "grad_norm": 0.17577269673347473, "learning_rate": 0.0002, "loss": 0.6635, "mean_token_accuracy": 0.8146971251815558, "num_tokens": 422994091.0, "step": 11240 }, { "entropy": 0.5923528231680393, "epoch": 0.8427541272328277, "grad_norm": 0.19314636290073395, "learning_rate": 0.0002, "loss": 0.691, "mean_token_accuracy": 0.8111356135457755, "num_tokens": 424470342.0, "step": 11245 }, { "entropy": 0.5778177846223116, "epoch": 0.8431288511666796, "grad_norm": 0.18175315856933594, "learning_rate": 0.0002, "loss": 0.6816, "mean_token_accuracy": 0.812772398814559, "num_tokens": 425945839.0, "step": 11250 }, { "entropy": 0.5781935581937432, "epoch": 0.8435035751005314, "grad_norm": 0.20594096183776855, "learning_rate": 0.0002, "loss": 0.6794, "mean_token_accuracy": 0.8099473495036363, "num_tokens": 427394590.0, "step": 11255 }, { "entropy": 0.5714739309623837, "epoch": 0.8438782990343833, "grad_norm": 0.21837298572063446, "learning_rate": 0.0002, "loss": 0.6674, "mean_token_accuracy": 0.814265514537692, "num_tokens": 428847845.0, "step": 11260 }, { "entropy": 0.5885962268337608, "epoch": 0.8442530229682351, "grad_norm": 0.20720508694648743, "learning_rate": 0.0002, "loss": 0.6806, "mean_token_accuracy": 0.8113648038357496, "num_tokens": 430350152.0, "step": 11265 }, { "entropy": 0.5836049353703856, "epoch": 0.844627746902087, "grad_norm": 0.26100677251815796, "learning_rate": 0.0002, "loss": 0.6739, "mean_token_accuracy": 0.8109136413782835, "num_tokens": 431830460.0, "step": 11270 }, { "entropy": 0.5767727731727064, "epoch": 0.8450024708359388, "grad_norm": 0.1935531198978424, "learning_rate": 0.0002, "loss": 0.6718, "mean_token_accuracy": 0.8165711998939514, "num_tokens": 433297844.0, "step": 11275 }, { "entropy": 0.5928064010106027, "epoch": 0.8453771947697907, "grad_norm": 0.22910362482070923, "learning_rate": 0.0002, "loss": 0.6833, "mean_token_accuracy": 0.8109666656702756, "num_tokens": 434834898.0, "step": 11280 }, { "entropy": 0.5759054617956281, "epoch": 0.8457519187036425, "grad_norm": 0.21390599012374878, "learning_rate": 0.0002, "loss": 0.6785, "mean_token_accuracy": 0.8106406953185796, "num_tokens": 436313469.0, "step": 11285 }, { "entropy": 0.5500784607604146, "epoch": 0.8461266426374944, "grad_norm": 0.19783465564250946, "learning_rate": 0.0002, "loss": 0.6506, "mean_token_accuracy": 0.8143778756260872, "num_tokens": 437741051.0, "step": 11290 }, { "entropy": 0.5721522215753794, "epoch": 0.8465013665713462, "grad_norm": 0.20855790376663208, "learning_rate": 0.0002, "loss": 0.674, "mean_token_accuracy": 0.8115159511566162, "num_tokens": 439225828.0, "step": 11295 }, { "entropy": 0.5828640766441822, "epoch": 0.8468760905051981, "grad_norm": 0.19734062254428864, "learning_rate": 0.0002, "loss": 0.6845, "mean_token_accuracy": 0.8094083618372678, "num_tokens": 440687652.0, "step": 11300 }, { "entropy": 0.5624962709844112, "epoch": 0.8472508144390499, "grad_norm": 0.2029494196176529, "learning_rate": 0.0002, "loss": 0.6769, "mean_token_accuracy": 0.8124742917716503, "num_tokens": 442157075.0, "step": 11305 }, { "entropy": 0.5573581214994192, "epoch": 0.8476255383729019, "grad_norm": 0.20451706647872925, "learning_rate": 0.0002, "loss": 0.6587, "mean_token_accuracy": 0.8165427204221487, "num_tokens": 443609064.0, "step": 11310 }, { "entropy": 0.5718150559812785, "epoch": 0.8480002623067537, "grad_norm": 0.21709977090358734, "learning_rate": 0.0002, "loss": 0.6836, "mean_token_accuracy": 0.813011534884572, "num_tokens": 445095043.0, "step": 11315 }, { "entropy": 0.558296555839479, "epoch": 0.8483749862406056, "grad_norm": 0.21650958061218262, "learning_rate": 0.0002, "loss": 0.6649, "mean_token_accuracy": 0.81223305426538, "num_tokens": 446516827.0, "step": 11320 }, { "entropy": 0.5563839446753264, "epoch": 0.8487497101744574, "grad_norm": 0.2268914431333542, "learning_rate": 0.0002, "loss": 0.6761, "mean_token_accuracy": 0.8123511116951704, "num_tokens": 447969595.0, "step": 11325 }, { "entropy": 0.5618237506598234, "epoch": 0.8491244341083093, "grad_norm": 0.21193383634090424, "learning_rate": 0.0002, "loss": 0.6744, "mean_token_accuracy": 0.8124262310564518, "num_tokens": 449483438.0, "step": 11330 }, { "entropy": 0.5442532876506447, "epoch": 0.8494991580421611, "grad_norm": 0.22972401976585388, "learning_rate": 0.0002, "loss": 0.659, "mean_token_accuracy": 0.814897371456027, "num_tokens": 450939797.0, "step": 11335 }, { "entropy": 0.5586064418777823, "epoch": 0.849873881976013, "grad_norm": 0.21571041643619537, "learning_rate": 0.0002, "loss": 0.6724, "mean_token_accuracy": 0.8143838968127965, "num_tokens": 452396184.0, "step": 11340 }, { "entropy": 0.5600046435371041, "epoch": 0.8502486059098648, "grad_norm": 0.21972665190696716, "learning_rate": 0.0002, "loss": 0.6666, "mean_token_accuracy": 0.8126166250556708, "num_tokens": 453802000.0, "step": 11345 }, { "entropy": 0.575032964348793, "epoch": 0.8506233298437167, "grad_norm": 0.20313861966133118, "learning_rate": 0.0002, "loss": 0.6819, "mean_token_accuracy": 0.8121071621775627, "num_tokens": 455254949.0, "step": 11350 }, { "entropy": 0.5660049999132752, "epoch": 0.8509980537775685, "grad_norm": 0.21617433428764343, "learning_rate": 0.0002, "loss": 0.6626, "mean_token_accuracy": 0.8132874500006437, "num_tokens": 456686344.0, "step": 11355 }, { "entropy": 0.5711528358981013, "epoch": 0.8513727777114204, "grad_norm": 0.2149169147014618, "learning_rate": 0.0002, "loss": 0.6697, "mean_token_accuracy": 0.8134912692010403, "num_tokens": 458107399.0, "step": 11360 }, { "entropy": 0.56369295027107, "epoch": 0.8517475016452722, "grad_norm": 0.2271771878004074, "learning_rate": 0.0002, "loss": 0.6683, "mean_token_accuracy": 0.8134106636047364, "num_tokens": 459551087.0, "step": 11365 }, { "entropy": 0.5911231834441424, "epoch": 0.8521222255791241, "grad_norm": 0.21282607316970825, "learning_rate": 0.0002, "loss": 0.7017, "mean_token_accuracy": 0.8068844299763441, "num_tokens": 461024211.0, "step": 11370 }, { "entropy": 0.5756668141111732, "epoch": 0.8524969495129759, "grad_norm": 0.19575391709804535, "learning_rate": 0.0002, "loss": 0.6713, "mean_token_accuracy": 0.8130615293979645, "num_tokens": 462488082.0, "step": 11375 }, { "entropy": 0.5838382763788104, "epoch": 0.8528716734468278, "grad_norm": 0.2177809476852417, "learning_rate": 0.0002, "loss": 0.699, "mean_token_accuracy": 0.809083704650402, "num_tokens": 463936256.0, "step": 11380 }, { "entropy": 0.5680893525481224, "epoch": 0.8532463973806798, "grad_norm": 0.24357514083385468, "learning_rate": 0.0002, "loss": 0.6647, "mean_token_accuracy": 0.8134802877902985, "num_tokens": 465409272.0, "step": 11385 }, { "entropy": 0.5922229306772351, "epoch": 0.8536211213145316, "grad_norm": 0.2016853392124176, "learning_rate": 0.0002, "loss": 0.6934, "mean_token_accuracy": 0.8110261660069227, "num_tokens": 466867547.0, "step": 11390 }, { "entropy": 0.5800349144265056, "epoch": 0.8539958452483835, "grad_norm": 0.2267782837152481, "learning_rate": 0.0002, "loss": 0.68, "mean_token_accuracy": 0.8124820224940776, "num_tokens": 468326473.0, "step": 11395 }, { "entropy": 0.571750407293439, "epoch": 0.8543705691822353, "grad_norm": 0.22465583682060242, "learning_rate": 0.0002, "loss": 0.6651, "mean_token_accuracy": 0.8140407346189023, "num_tokens": 469807978.0, "step": 11400 }, { "entropy": 0.5896193472668528, "epoch": 0.8547452931160872, "grad_norm": 0.20569437742233276, "learning_rate": 0.0002, "loss": 0.6889, "mean_token_accuracy": 0.8114319920539856, "num_tokens": 471303061.0, "step": 11405 }, { "entropy": 0.5687002055346966, "epoch": 0.855120017049939, "grad_norm": 0.20143666863441467, "learning_rate": 0.0002, "loss": 0.6654, "mean_token_accuracy": 0.81458585485816, "num_tokens": 472735782.0, "step": 11410 }, { "entropy": 0.5773912420496344, "epoch": 0.8554947409837909, "grad_norm": 0.19356487691402435, "learning_rate": 0.0002, "loss": 0.656, "mean_token_accuracy": 0.8147111434489489, "num_tokens": 474207148.0, "step": 11415 }, { "entropy": 0.5779213577508926, "epoch": 0.8558694649176427, "grad_norm": 0.21417486667633057, "learning_rate": 0.0002, "loss": 0.6626, "mean_token_accuracy": 0.8144957613199949, "num_tokens": 475682287.0, "step": 11420 }, { "entropy": 0.5682059571146965, "epoch": 0.8562441888514946, "grad_norm": 0.20819315314292908, "learning_rate": 0.0002, "loss": 0.6632, "mean_token_accuracy": 0.8155433028936386, "num_tokens": 477115676.0, "step": 11425 }, { "entropy": 0.5750830091536046, "epoch": 0.8566189127853464, "grad_norm": 0.2680797874927521, "learning_rate": 0.0002, "loss": 0.666, "mean_token_accuracy": 0.8165854018181562, "num_tokens": 478593885.0, "step": 11430 }, { "entropy": 0.5863592021167279, "epoch": 0.8569936367191983, "grad_norm": 0.19909845292568207, "learning_rate": 0.0002, "loss": 0.6798, "mean_token_accuracy": 0.8134543094784021, "num_tokens": 480050910.0, "step": 11435 }, { "entropy": 0.5763909392058849, "epoch": 0.8573683606530501, "grad_norm": 0.19089356064796448, "learning_rate": 0.0002, "loss": 0.6594, "mean_token_accuracy": 0.8130563326179981, "num_tokens": 481560770.0, "step": 11440 }, { "entropy": 0.5797013351693749, "epoch": 0.857743084586902, "grad_norm": 0.21298567950725555, "learning_rate": 0.0002, "loss": 0.6714, "mean_token_accuracy": 0.8149261504411698, "num_tokens": 483056938.0, "step": 11445 }, { "entropy": 0.5927397765219211, "epoch": 0.8581178085207538, "grad_norm": 0.20418603718280792, "learning_rate": 0.0002, "loss": 0.6904, "mean_token_accuracy": 0.8083284564316273, "num_tokens": 484539753.0, "step": 11450 }, { "entropy": 0.5657330516725778, "epoch": 0.8584925324546057, "grad_norm": 0.1936536282300949, "learning_rate": 0.0002, "loss": 0.6604, "mean_token_accuracy": 0.8146614573895932, "num_tokens": 486010552.0, "step": 11455 }, { "entropy": 0.562697365321219, "epoch": 0.8588672563884575, "grad_norm": 0.1954784095287323, "learning_rate": 0.0002, "loss": 0.6598, "mean_token_accuracy": 0.8166926436126232, "num_tokens": 487464237.0, "step": 11460 }, { "entropy": 0.5754117779433727, "epoch": 0.8592419803223095, "grad_norm": 0.2086716741323471, "learning_rate": 0.0002, "loss": 0.6825, "mean_token_accuracy": 0.8133781429380178, "num_tokens": 488932868.0, "step": 11465 }, { "entropy": 0.5848508322611451, "epoch": 0.8596167042561613, "grad_norm": 0.19432029128074646, "learning_rate": 0.0002, "loss": 0.686, "mean_token_accuracy": 0.8110423061996699, "num_tokens": 490435296.0, "step": 11470 }, { "entropy": 0.5824873320758343, "epoch": 0.8599914281900132, "grad_norm": 0.1951759308576584, "learning_rate": 0.0002, "loss": 0.6795, "mean_token_accuracy": 0.8156022742390633, "num_tokens": 491936338.0, "step": 11475 }, { "entropy": 0.5827017512172461, "epoch": 0.860366152123865, "grad_norm": 0.2161404937505722, "learning_rate": 0.0002, "loss": 0.6802, "mean_token_accuracy": 0.8095098339021206, "num_tokens": 493417291.0, "step": 11480 }, { "entropy": 0.5665580749511718, "epoch": 0.8607408760577169, "grad_norm": 0.20224438607692719, "learning_rate": 0.0002, "loss": 0.6622, "mean_token_accuracy": 0.8156586814671755, "num_tokens": 494873872.0, "step": 11485 }, { "entropy": 0.5688052129000425, "epoch": 0.8611155999915687, "grad_norm": 0.22666677832603455, "learning_rate": 0.0002, "loss": 0.6629, "mean_token_accuracy": 0.8160819303244352, "num_tokens": 496294081.0, "step": 11490 }, { "entropy": 0.5586622392758727, "epoch": 0.8614903239254206, "grad_norm": 0.25392237305641174, "learning_rate": 0.0002, "loss": 0.6605, "mean_token_accuracy": 0.8188035123050212, "num_tokens": 497750035.0, "step": 11495 }, { "entropy": 0.5630251534283162, "epoch": 0.8618650478592724, "grad_norm": 0.18435579538345337, "learning_rate": 0.0002, "loss": 0.6621, "mean_token_accuracy": 0.8157978933304548, "num_tokens": 499228276.0, "step": 11500 }, { "entropy": 0.5738127476535737, "epoch": 0.8622397717931243, "grad_norm": 0.21077881753444672, "learning_rate": 0.0002, "loss": 0.6829, "mean_token_accuracy": 0.8124654237180948, "num_tokens": 500714198.0, "step": 11505 }, { "entropy": 0.5619939145632088, "epoch": 0.8626144957269761, "grad_norm": 0.2284451723098755, "learning_rate": 0.0002, "loss": 0.6719, "mean_token_accuracy": 0.8133852846920491, "num_tokens": 502175865.0, "step": 11510 }, { "entropy": 0.5667272750288248, "epoch": 0.862989219660828, "grad_norm": 0.2151719480752945, "learning_rate": 0.0002, "loss": 0.6809, "mean_token_accuracy": 0.8108080759644508, "num_tokens": 503647882.0, "step": 11515 }, { "entropy": 0.5492886897176504, "epoch": 0.8633639435946798, "grad_norm": 0.198433518409729, "learning_rate": 0.0002, "loss": 0.6629, "mean_token_accuracy": 0.8159799683839083, "num_tokens": 505133562.0, "step": 11520 }, { "entropy": 0.5647161236032844, "epoch": 0.8637386675285317, "grad_norm": 0.21669384837150574, "learning_rate": 0.0002, "loss": 0.6745, "mean_token_accuracy": 0.8141294579952956, "num_tokens": 506594169.0, "step": 11525 }, { "entropy": 0.574981801956892, "epoch": 0.8641133914623835, "grad_norm": 0.18374145030975342, "learning_rate": 0.0002, "loss": 0.6827, "mean_token_accuracy": 0.810106723383069, "num_tokens": 508067749.0, "step": 11530 }, { "entropy": 0.5803303172811866, "epoch": 0.8644881153962354, "grad_norm": 0.20076194405555725, "learning_rate": 0.0002, "loss": 0.6799, "mean_token_accuracy": 0.8124682564288378, "num_tokens": 509554472.0, "step": 11535 }, { "entropy": 0.5760074241086841, "epoch": 0.8648628393300873, "grad_norm": 0.23053723573684692, "learning_rate": 0.0002, "loss": 0.6778, "mean_token_accuracy": 0.8135972000658512, "num_tokens": 511018745.0, "step": 11540 }, { "entropy": 0.5536572767421604, "epoch": 0.8652375632639392, "grad_norm": 0.2224312722682953, "learning_rate": 0.0002, "loss": 0.648, "mean_token_accuracy": 0.8160308450460434, "num_tokens": 512452684.0, "step": 11545 }, { "entropy": 0.566802927851677, "epoch": 0.865612287197791, "grad_norm": 0.21219204366207123, "learning_rate": 0.0002, "loss": 0.6733, "mean_token_accuracy": 0.8151910364627838, "num_tokens": 513896378.0, "step": 11550 }, { "entropy": 0.5658261215314269, "epoch": 0.8659870111316429, "grad_norm": 0.20944175124168396, "learning_rate": 0.0002, "loss": 0.6692, "mean_token_accuracy": 0.8146393366158009, "num_tokens": 515336166.0, "step": 11555 }, { "entropy": 0.5704552520066499, "epoch": 0.8663617350654947, "grad_norm": 0.21240916848182678, "learning_rate": 0.0002, "loss": 0.6674, "mean_token_accuracy": 0.8129684466868639, "num_tokens": 516839957.0, "step": 11560 }, { "entropy": 0.5800895122811198, "epoch": 0.8667364589993466, "grad_norm": 0.20788398385047913, "learning_rate": 0.0002, "loss": 0.6738, "mean_token_accuracy": 0.8126779019832611, "num_tokens": 518320641.0, "step": 11565 }, { "entropy": 0.5687077663838863, "epoch": 0.8671111829331984, "grad_norm": 0.21133415400981903, "learning_rate": 0.0002, "loss": 0.6666, "mean_token_accuracy": 0.8139011818915606, "num_tokens": 519767364.0, "step": 11570 }, { "entropy": 0.5797597851604224, "epoch": 0.8674859068670503, "grad_norm": 0.21814574301242828, "learning_rate": 0.0002, "loss": 0.6799, "mean_token_accuracy": 0.812481839209795, "num_tokens": 521223078.0, "step": 11575 }, { "entropy": 0.5710658838972449, "epoch": 0.8678606308009021, "grad_norm": 0.18429067730903625, "learning_rate": 0.0002, "loss": 0.6732, "mean_token_accuracy": 0.8130495004355908, "num_tokens": 522682206.0, "step": 11580 }, { "entropy": 0.5863473922014236, "epoch": 0.868235354734754, "grad_norm": 0.2245045155286789, "learning_rate": 0.0002, "loss": 0.7006, "mean_token_accuracy": 0.8076728589832782, "num_tokens": 524146679.0, "step": 11585 }, { "entropy": 0.5469909904524684, "epoch": 0.8686100786686058, "grad_norm": 0.21804338693618774, "learning_rate": 0.0002, "loss": 0.6549, "mean_token_accuracy": 0.8179379802197217, "num_tokens": 525577188.0, "step": 11590 }, { "entropy": 0.5546038508415222, "epoch": 0.8689848026024577, "grad_norm": 0.18015894293785095, "learning_rate": 0.0002, "loss": 0.6605, "mean_token_accuracy": 0.8159789498895407, "num_tokens": 527068543.0, "step": 11595 }, { "entropy": 0.5631214348599315, "epoch": 0.8693595265363095, "grad_norm": 0.1940874308347702, "learning_rate": 0.0002, "loss": 0.6633, "mean_token_accuracy": 0.8134912334382534, "num_tokens": 528591055.0, "step": 11600 }, { "entropy": 0.5566829076968134, "epoch": 0.8697342504701614, "grad_norm": 0.22037987411022186, "learning_rate": 0.0002, "loss": 0.6687, "mean_token_accuracy": 0.8114486206322908, "num_tokens": 530073015.0, "step": 11605 }, { "entropy": 0.5357502550818026, "epoch": 0.8701089744040132, "grad_norm": 0.18867120146751404, "learning_rate": 0.0002, "loss": 0.655, "mean_token_accuracy": 0.8176407381892205, "num_tokens": 531506786.0, "step": 11610 }, { "entropy": 0.5443079796619713, "epoch": 0.8704836983378651, "grad_norm": 0.2169996052980423, "learning_rate": 0.0002, "loss": 0.6744, "mean_token_accuracy": 0.8133369598537683, "num_tokens": 532963976.0, "step": 11615 }, { "entropy": 0.5562760414555669, "epoch": 0.8708584222717171, "grad_norm": 0.1869686096906662, "learning_rate": 0.0002, "loss": 0.6722, "mean_token_accuracy": 0.8132312696427106, "num_tokens": 534458408.0, "step": 11620 }, { "entropy": 0.5327452784404159, "epoch": 0.8712331462055689, "grad_norm": 0.20062348246574402, "learning_rate": 0.0002, "loss": 0.6576, "mean_token_accuracy": 0.8153956573456526, "num_tokens": 535999295.0, "step": 11625 }, { "entropy": 0.5365106543526054, "epoch": 0.8716078701394208, "grad_norm": 0.21056871116161346, "learning_rate": 0.0002, "loss": 0.658, "mean_token_accuracy": 0.8147639591246844, "num_tokens": 537447087.0, "step": 11630 }, { "entropy": 0.5295411391183734, "epoch": 0.8719825940732726, "grad_norm": 0.18628399074077606, "learning_rate": 0.0002, "loss": 0.648, "mean_token_accuracy": 0.8169335223734379, "num_tokens": 538918783.0, "step": 11635 }, { "entropy": 0.5467723496258259, "epoch": 0.8723573180071245, "grad_norm": 0.20632033050060272, "learning_rate": 0.0002, "loss": 0.6697, "mean_token_accuracy": 0.8164742153137923, "num_tokens": 540399371.0, "step": 11640 }, { "entropy": 0.5482719326391816, "epoch": 0.8727320419409763, "grad_norm": 0.17837858200073242, "learning_rate": 0.0002, "loss": 0.6735, "mean_token_accuracy": 0.814836609363556, "num_tokens": 541879963.0, "step": 11645 }, { "entropy": 0.5425653424113989, "epoch": 0.8731067658748282, "grad_norm": 0.204787477850914, "learning_rate": 0.0002, "loss": 0.6609, "mean_token_accuracy": 0.8168742053210736, "num_tokens": 543356149.0, "step": 11650 }, { "entropy": 0.537669824063778, "epoch": 0.87348148980868, "grad_norm": 0.4304494559764862, "learning_rate": 0.0002, "loss": 0.6576, "mean_token_accuracy": 0.8162822265177965, "num_tokens": 544832921.0, "step": 11655 }, { "entropy": 0.5597095200791955, "epoch": 0.8738562137425319, "grad_norm": 0.21836644411087036, "learning_rate": 0.0002, "loss": 0.6741, "mean_token_accuracy": 0.8116815652698278, "num_tokens": 546325231.0, "step": 11660 }, { "entropy": 0.5692757483571768, "epoch": 0.8742309376763837, "grad_norm": 0.20292900502681732, "learning_rate": 0.0002, "loss": 0.6858, "mean_token_accuracy": 0.8129656385630369, "num_tokens": 547842618.0, "step": 11665 }, { "entropy": 0.5430613584816456, "epoch": 0.8746056616102356, "grad_norm": 0.2046264261007309, "learning_rate": 0.0002, "loss": 0.6641, "mean_token_accuracy": 0.8166099578142166, "num_tokens": 549266990.0, "step": 11670 }, { "entropy": 0.5450382970273495, "epoch": 0.8749803855440874, "grad_norm": 0.23082101345062256, "learning_rate": 0.0002, "loss": 0.6605, "mean_token_accuracy": 0.8149367365986109, "num_tokens": 550735065.0, "step": 11675 }, { "entropy": 0.5473066540434957, "epoch": 0.8753551094779393, "grad_norm": 0.2447034865617752, "learning_rate": 0.0002, "loss": 0.6552, "mean_token_accuracy": 0.816857073828578, "num_tokens": 552212142.0, "step": 11680 }, { "entropy": 0.5600558141246438, "epoch": 0.8757298334117911, "grad_norm": 0.20140030980110168, "learning_rate": 0.0002, "loss": 0.6682, "mean_token_accuracy": 0.8145247351378202, "num_tokens": 553672946.0, "step": 11685 }, { "entropy": 0.5541643872857094, "epoch": 0.876104557345643, "grad_norm": 0.2079235315322876, "learning_rate": 0.0002, "loss": 0.6709, "mean_token_accuracy": 0.8136086478829384, "num_tokens": 555136367.0, "step": 11690 }, { "entropy": 0.5608260462991893, "epoch": 0.8764792812794949, "grad_norm": 0.2029624581336975, "learning_rate": 0.0002, "loss": 0.6752, "mean_token_accuracy": 0.8128593944013118, "num_tokens": 556621967.0, "step": 11695 }, { "entropy": 0.5776057312265038, "epoch": 0.8768540052133468, "grad_norm": 0.21316222846508026, "learning_rate": 0.0002, "loss": 0.6976, "mean_token_accuracy": 0.809134216606617, "num_tokens": 558082954.0, "step": 11700 }, { "entropy": 0.5575248902663589, "epoch": 0.8772287291471986, "grad_norm": 0.21120190620422363, "learning_rate": 0.0002, "loss": 0.6718, "mean_token_accuracy": 0.8101135995239019, "num_tokens": 559594327.0, "step": 11705 }, { "entropy": 0.5425163343548774, "epoch": 0.8776034530810505, "grad_norm": 0.21445758640766144, "learning_rate": 0.0002, "loss": 0.6644, "mean_token_accuracy": 0.8155474044382572, "num_tokens": 561091757.0, "step": 11710 }, { "entropy": 0.5516666082665325, "epoch": 0.8779781770149023, "grad_norm": 0.21733061969280243, "learning_rate": 0.0002, "loss": 0.6831, "mean_token_accuracy": 0.8115248497575521, "num_tokens": 562532961.0, "step": 11715 }, { "entropy": 0.5530487289652228, "epoch": 0.8783529009487542, "grad_norm": 0.21660789847373962, "learning_rate": 0.0002, "loss": 0.6693, "mean_token_accuracy": 0.8137937925755978, "num_tokens": 564030714.0, "step": 11720 }, { "entropy": 0.5304805915802717, "epoch": 0.878727624882606, "grad_norm": 0.24330228567123413, "learning_rate": 0.0002, "loss": 0.6533, "mean_token_accuracy": 0.8198285892605781, "num_tokens": 565485727.0, "step": 11725 }, { "entropy": 0.5489035459235311, "epoch": 0.8791023488164579, "grad_norm": 0.22839444875717163, "learning_rate": 0.0002, "loss": 0.6655, "mean_token_accuracy": 0.8131830167025328, "num_tokens": 566969719.0, "step": 11730 }, { "entropy": 0.5585889043286443, "epoch": 0.8794770727503097, "grad_norm": 0.21978993713855743, "learning_rate": 0.0002, "loss": 0.675, "mean_token_accuracy": 0.8117383431643248, "num_tokens": 568478285.0, "step": 11735 }, { "entropy": 0.5656600009649992, "epoch": 0.8798517966841616, "grad_norm": 0.19229333102703094, "learning_rate": 0.0002, "loss": 0.6774, "mean_token_accuracy": 0.8124740712344647, "num_tokens": 569984164.0, "step": 11740 }, { "entropy": 0.5472802328877151, "epoch": 0.8802265206180134, "grad_norm": 0.21530599892139435, "learning_rate": 0.0002, "loss": 0.6606, "mean_token_accuracy": 0.8167372804135085, "num_tokens": 571464345.0, "step": 11745 }, { "entropy": 0.5735402019694448, "epoch": 0.8806012445518653, "grad_norm": 0.2123885601758957, "learning_rate": 0.0002, "loss": 0.681, "mean_token_accuracy": 0.8120206393301487, "num_tokens": 572959615.0, "step": 11750 }, { "entropy": 0.5544528050348163, "epoch": 0.8809759684857171, "grad_norm": 0.19990234076976776, "learning_rate": 0.0002, "loss": 0.6572, "mean_token_accuracy": 0.8161290347576141, "num_tokens": 574454150.0, "step": 11755 }, { "entropy": 0.5372038524597883, "epoch": 0.881350692419569, "grad_norm": 0.1964544802904129, "learning_rate": 0.0002, "loss": 0.6522, "mean_token_accuracy": 0.8203427139669657, "num_tokens": 575922050.0, "step": 11760 }, { "entropy": 0.5695933192968369, "epoch": 0.8817254163534208, "grad_norm": 0.20542378723621368, "learning_rate": 0.0002, "loss": 0.6828, "mean_token_accuracy": 0.809733422473073, "num_tokens": 577405393.0, "step": 11765 }, { "entropy": 0.5737552231177687, "epoch": 0.8821001402872727, "grad_norm": 0.19717921316623688, "learning_rate": 0.0002, "loss": 0.6925, "mean_token_accuracy": 0.8057466398924589, "num_tokens": 578887765.0, "step": 11770 }, { "entropy": 0.560514111071825, "epoch": 0.8824748642211246, "grad_norm": 0.2041478008031845, "learning_rate": 0.0002, "loss": 0.6732, "mean_token_accuracy": 0.8117651481181383, "num_tokens": 580367019.0, "step": 11775 }, { "entropy": 0.5411278638988734, "epoch": 0.8828495881549765, "grad_norm": 0.21801194548606873, "learning_rate": 0.0002, "loss": 0.6544, "mean_token_accuracy": 0.8155685193836689, "num_tokens": 581829107.0, "step": 11780 }, { "entropy": 0.5558366926386953, "epoch": 0.8832243120888283, "grad_norm": 0.21106064319610596, "learning_rate": 0.0002, "loss": 0.6846, "mean_token_accuracy": 0.8146879367530346, "num_tokens": 583255132.0, "step": 11785 }, { "entropy": 0.5532529544085264, "epoch": 0.8835990360226802, "grad_norm": 0.19972969591617584, "learning_rate": 0.0002, "loss": 0.6736, "mean_token_accuracy": 0.812060135230422, "num_tokens": 584728756.0, "step": 11790 }, { "entropy": 0.5642529157921672, "epoch": 0.883973759956532, "grad_norm": 0.19758905470371246, "learning_rate": 0.0002, "loss": 0.6836, "mean_token_accuracy": 0.8104962781071663, "num_tokens": 586235546.0, "step": 11795 }, { "entropy": 0.5213814381510019, "epoch": 0.8843484838903839, "grad_norm": 0.2359555959701538, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8216058209538459, "num_tokens": 587667676.0, "step": 11800 }, { "entropy": 0.5311359919607639, "epoch": 0.8847232078242357, "grad_norm": 0.21836096048355103, "learning_rate": 0.0002, "loss": 0.6645, "mean_token_accuracy": 0.8155560426414012, "num_tokens": 589140299.0, "step": 11805 }, { "entropy": 0.526430403906852, "epoch": 0.8850979317580876, "grad_norm": 0.20020131766796112, "learning_rate": 0.0002, "loss": 0.6661, "mean_token_accuracy": 0.8098848581314086, "num_tokens": 590602979.0, "step": 11810 }, { "entropy": 0.5186726471409202, "epoch": 0.8854726556919394, "grad_norm": 0.2466166913509369, "learning_rate": 0.0002, "loss": 0.665, "mean_token_accuracy": 0.8143559392541647, "num_tokens": 592073320.0, "step": 11815 }, { "entropy": 0.5216389622539281, "epoch": 0.8858473796257913, "grad_norm": 0.24703702330589294, "learning_rate": 0.0002, "loss": 0.6652, "mean_token_accuracy": 0.8163891971111298, "num_tokens": 593553333.0, "step": 11820 }, { "entropy": 0.5513039421290159, "epoch": 0.8862221035596431, "grad_norm": 0.20751893520355225, "learning_rate": 0.0002, "loss": 0.6934, "mean_token_accuracy": 0.8078687340021133, "num_tokens": 595011826.0, "step": 11825 }, { "entropy": 0.5265166697092354, "epoch": 0.886596827493495, "grad_norm": 0.20054005086421967, "learning_rate": 0.0002, "loss": 0.6608, "mean_token_accuracy": 0.8159965846687556, "num_tokens": 596465322.0, "step": 11830 }, { "entropy": 0.5501749575138092, "epoch": 0.8869715514273468, "grad_norm": 0.20462432503700256, "learning_rate": 0.0002, "loss": 0.6824, "mean_token_accuracy": 0.8109054174274206, "num_tokens": 597981085.0, "step": 11835 }, { "entropy": 0.5425692442804575, "epoch": 0.8873462753611987, "grad_norm": 0.18651235103607178, "learning_rate": 0.0002, "loss": 0.6591, "mean_token_accuracy": 0.8168051850050688, "num_tokens": 599480217.0, "step": 11840 }, { "entropy": 0.5401644740253687, "epoch": 0.8877209992950506, "grad_norm": 0.20072375237941742, "learning_rate": 0.0002, "loss": 0.6694, "mean_token_accuracy": 0.814818474650383, "num_tokens": 600917216.0, "step": 11845 }, { "entropy": 0.5556065239943564, "epoch": 0.8880957232289025, "grad_norm": 0.21568232774734497, "learning_rate": 0.0002, "loss": 0.6744, "mean_token_accuracy": 0.8143508363515138, "num_tokens": 602395156.0, "step": 11850 }, { "entropy": 0.534106113947928, "epoch": 0.8884704471627544, "grad_norm": 0.19493769109249115, "learning_rate": 0.0002, "loss": 0.6624, "mean_token_accuracy": 0.8144510760903358, "num_tokens": 603832877.0, "step": 11855 }, { "entropy": 0.5444348853081464, "epoch": 0.8888451710966062, "grad_norm": 0.20036610960960388, "learning_rate": 0.0002, "loss": 0.6763, "mean_token_accuracy": 0.8127038229256869, "num_tokens": 605256257.0, "step": 11860 }, { "entropy": 0.5473286891356111, "epoch": 0.8892198950304581, "grad_norm": 0.21536926925182343, "learning_rate": 0.0002, "loss": 0.6647, "mean_token_accuracy": 0.8122570905834436, "num_tokens": 606720097.0, "step": 11865 }, { "entropy": 0.5457576356828213, "epoch": 0.8895946189643099, "grad_norm": 0.20887809991836548, "learning_rate": 0.0002, "loss": 0.6632, "mean_token_accuracy": 0.8133640479296446, "num_tokens": 608242790.0, "step": 11870 }, { "entropy": 0.5421657003462315, "epoch": 0.8899693428981618, "grad_norm": 0.21163026988506317, "learning_rate": 0.0002, "loss": 0.6648, "mean_token_accuracy": 0.8162233162671327, "num_tokens": 609717184.0, "step": 11875 }, { "entropy": 0.5306983700953424, "epoch": 0.8903440668320136, "grad_norm": 0.22197772562503815, "learning_rate": 0.0002, "loss": 0.6545, "mean_token_accuracy": 0.8183684714138508, "num_tokens": 611154883.0, "step": 11880 }, { "entropy": 0.5551006719470024, "epoch": 0.8907187907658655, "grad_norm": 0.19404065608978271, "learning_rate": 0.0002, "loss": 0.6724, "mean_token_accuracy": 0.8126463640481234, "num_tokens": 612651790.0, "step": 11885 }, { "entropy": 0.5445368697866797, "epoch": 0.8910935146997173, "grad_norm": 0.2012370079755783, "learning_rate": 0.0002, "loss": 0.6774, "mean_token_accuracy": 0.8107840467244387, "num_tokens": 614130815.0, "step": 11890 }, { "entropy": 0.5323183108121157, "epoch": 0.8914682386335692, "grad_norm": 0.2098330855369568, "learning_rate": 0.0002, "loss": 0.6673, "mean_token_accuracy": 0.8146334972232581, "num_tokens": 615561159.0, "step": 11895 }, { "entropy": 0.5537765750661492, "epoch": 0.891842962567421, "grad_norm": 0.20828211307525635, "learning_rate": 0.0002, "loss": 0.6836, "mean_token_accuracy": 0.8122017260640859, "num_tokens": 617039157.0, "step": 11900 }, { "entropy": 0.5364983968436718, "epoch": 0.8922176865012729, "grad_norm": 0.2187623828649521, "learning_rate": 0.0002, "loss": 0.6587, "mean_token_accuracy": 0.8161520838737488, "num_tokens": 618517100.0, "step": 11905 }, { "entropy": 0.5451744658872485, "epoch": 0.8925924104351247, "grad_norm": 0.1984141618013382, "learning_rate": 0.0002, "loss": 0.656, "mean_token_accuracy": 0.8166241236031055, "num_tokens": 619954856.0, "step": 11910 }, { "entropy": 0.5692591890692711, "epoch": 0.8929671343689766, "grad_norm": 0.19859756529331207, "learning_rate": 0.0002, "loss": 0.6808, "mean_token_accuracy": 0.8116259142756462, "num_tokens": 621462741.0, "step": 11915 }, { "entropy": 0.5544564222916961, "epoch": 0.8933418583028284, "grad_norm": 0.2223050892353058, "learning_rate": 0.0002, "loss": 0.668, "mean_token_accuracy": 0.8142585456371307, "num_tokens": 622925363.0, "step": 11920 }, { "entropy": 0.5646908191032708, "epoch": 0.8937165822366803, "grad_norm": 0.22492803633213043, "learning_rate": 0.0002, "loss": 0.6738, "mean_token_accuracy": 0.8152030516415835, "num_tokens": 624415328.0, "step": 11925 }, { "entropy": 0.5584872793406248, "epoch": 0.8940913061705322, "grad_norm": 0.21963462233543396, "learning_rate": 0.0002, "loss": 0.6515, "mean_token_accuracy": 0.8173475287854671, "num_tokens": 625838657.0, "step": 11930 }, { "entropy": 0.5693356832489371, "epoch": 0.8944660301043841, "grad_norm": 0.21532122790813446, "learning_rate": 0.0002, "loss": 0.688, "mean_token_accuracy": 0.8113042175769806, "num_tokens": 627278259.0, "step": 11935 }, { "entropy": 0.5630715402774513, "epoch": 0.8948407540382359, "grad_norm": 0.21958068013191223, "learning_rate": 0.0002, "loss": 0.6658, "mean_token_accuracy": 0.813555408269167, "num_tokens": 628765908.0, "step": 11940 }, { "entropy": 0.564830182865262, "epoch": 0.8952154779720878, "grad_norm": 0.23740729689598083, "learning_rate": 0.0002, "loss": 0.6713, "mean_token_accuracy": 0.8117327336221933, "num_tokens": 630213141.0, "step": 11945 }, { "entropy": 0.5706725088879466, "epoch": 0.8955902019059396, "grad_norm": 0.22779616713523865, "learning_rate": 0.0002, "loss": 0.6852, "mean_token_accuracy": 0.8085206635296345, "num_tokens": 631739264.0, "step": 11950 }, { "entropy": 0.5706523861736059, "epoch": 0.8959649258397915, "grad_norm": 0.22937935590744019, "learning_rate": 0.0002, "loss": 0.6802, "mean_token_accuracy": 0.81177873685956, "num_tokens": 633204304.0, "step": 11955 }, { "entropy": 0.5783515693619847, "epoch": 0.8963396497736433, "grad_norm": 0.19802695512771606, "learning_rate": 0.0002, "loss": 0.6841, "mean_token_accuracy": 0.8112630687654019, "num_tokens": 634663945.0, "step": 11960 }, { "entropy": 0.5740055194124579, "epoch": 0.8967143737074952, "grad_norm": 0.2043110877275467, "learning_rate": 0.0002, "loss": 0.6701, "mean_token_accuracy": 0.8145978581160307, "num_tokens": 636132919.0, "step": 11965 }, { "entropy": 0.5676716210320591, "epoch": 0.897089097641347, "grad_norm": 0.2053823173046112, "learning_rate": 0.0002, "loss": 0.6629, "mean_token_accuracy": 0.8139162600040436, "num_tokens": 637595455.0, "step": 11970 }, { "entropy": 0.5761282740160822, "epoch": 0.8974638215751989, "grad_norm": 0.2259608656167984, "learning_rate": 0.0002, "loss": 0.6746, "mean_token_accuracy": 0.8131712097674608, "num_tokens": 639029583.0, "step": 11975 }, { "entropy": 0.5640015985816718, "epoch": 0.8978385455090507, "grad_norm": 0.20051033794879913, "learning_rate": 0.0002, "loss": 0.6684, "mean_token_accuracy": 0.8140872571617365, "num_tokens": 640483686.0, "step": 11980 }, { "entropy": 0.5776079948991537, "epoch": 0.8982132694429026, "grad_norm": 0.20838944613933563, "learning_rate": 0.0002, "loss": 0.6797, "mean_token_accuracy": 0.8124356780201196, "num_tokens": 641961111.0, "step": 11985 }, { "entropy": 0.5401847148314118, "epoch": 0.8985879933767544, "grad_norm": 0.20654384791851044, "learning_rate": 0.0002, "loss": 0.6387, "mean_token_accuracy": 0.8197998438030482, "num_tokens": 643459212.0, "step": 11990 }, { "entropy": 0.557703138049692, "epoch": 0.8989627173106063, "grad_norm": 0.21053147315979004, "learning_rate": 0.0002, "loss": 0.6619, "mean_token_accuracy": 0.8158624526113272, "num_tokens": 644978378.0, "step": 11995 }, { "entropy": 0.5410884134471416, "epoch": 0.8993374412444581, "grad_norm": 0.20555664598941803, "learning_rate": 0.0002, "loss": 0.6487, "mean_token_accuracy": 0.8168309848755598, "num_tokens": 646419372.0, "step": 12000 }, { "entropy": 0.5599353789351881, "epoch": 0.8997121651783101, "grad_norm": 0.21220478415489197, "learning_rate": 0.0002, "loss": 0.6694, "mean_token_accuracy": 0.8123102236539126, "num_tokens": 647864300.0, "step": 12005 }, { "entropy": 0.5739062836393714, "epoch": 0.900086889112162, "grad_norm": 0.21056900918483734, "learning_rate": 0.0002, "loss": 0.6803, "mean_token_accuracy": 0.8108804561197758, "num_tokens": 649360050.0, "step": 12010 }, { "entropy": 0.5682800780981779, "epoch": 0.9004616130460138, "grad_norm": 0.19312910735607147, "learning_rate": 0.0002, "loss": 0.6721, "mean_token_accuracy": 0.8163932364434003, "num_tokens": 650836278.0, "step": 12015 }, { "entropy": 0.5652580766007305, "epoch": 0.9008363369798656, "grad_norm": 0.19065620005130768, "learning_rate": 0.0002, "loss": 0.666, "mean_token_accuracy": 0.8131228681653738, "num_tokens": 652260220.0, "step": 12020 }, { "entropy": 0.5640876235440373, "epoch": 0.9012110609137175, "grad_norm": 0.19570372998714447, "learning_rate": 0.0002, "loss": 0.6627, "mean_token_accuracy": 0.8161051619797945, "num_tokens": 653704889.0, "step": 12025 }, { "entropy": 0.5586216337047517, "epoch": 0.9015857848475693, "grad_norm": 0.21524590253829956, "learning_rate": 0.0002, "loss": 0.6706, "mean_token_accuracy": 0.8131134130060673, "num_tokens": 655155349.0, "step": 12030 }, { "entropy": 0.5577216418460011, "epoch": 0.9019605087814212, "grad_norm": 0.23552724719047546, "learning_rate": 0.0002, "loss": 0.6557, "mean_token_accuracy": 0.817191506549716, "num_tokens": 656639307.0, "step": 12035 }, { "entropy": 0.5840440323576331, "epoch": 0.902335232715273, "grad_norm": 0.2384471744298935, "learning_rate": 0.0002, "loss": 0.6858, "mean_token_accuracy": 0.8124539602547884, "num_tokens": 658106147.0, "step": 12040 }, { "entropy": 0.569543469324708, "epoch": 0.9027099566491249, "grad_norm": 0.19044862687587738, "learning_rate": 0.0002, "loss": 0.6705, "mean_token_accuracy": 0.8134495005011558, "num_tokens": 659563237.0, "step": 12045 }, { "entropy": 0.5701109370216727, "epoch": 0.9030846805829768, "grad_norm": 0.23214201629161835, "learning_rate": 0.0002, "loss": 0.6744, "mean_token_accuracy": 0.814281340688467, "num_tokens": 661021870.0, "step": 12050 }, { "entropy": 0.5618982798419893, "epoch": 0.9034594045168286, "grad_norm": 0.23163387179374695, "learning_rate": 0.0002, "loss": 0.6634, "mean_token_accuracy": 0.8156173467636109, "num_tokens": 662505674.0, "step": 12055 }, { "entropy": 0.5620999669656157, "epoch": 0.9038341284506805, "grad_norm": 0.20640237629413605, "learning_rate": 0.0002, "loss": 0.6657, "mean_token_accuracy": 0.8134221728891134, "num_tokens": 664024959.0, "step": 12060 }, { "entropy": 0.5502842362970114, "epoch": 0.9042088523845323, "grad_norm": 0.2520785629749298, "learning_rate": 0.0002, "loss": 0.6539, "mean_token_accuracy": 0.8167013514786958, "num_tokens": 665477903.0, "step": 12065 }, { "entropy": 0.5660300647839904, "epoch": 0.9045835763183842, "grad_norm": 0.23209865391254425, "learning_rate": 0.0002, "loss": 0.6638, "mean_token_accuracy": 0.8137812487781048, "num_tokens": 666944843.0, "step": 12070 }, { "entropy": 0.5820109201595187, "epoch": 0.904958300252236, "grad_norm": 0.2064007967710495, "learning_rate": 0.0002, "loss": 0.6816, "mean_token_accuracy": 0.8109530698508024, "num_tokens": 668382965.0, "step": 12075 }, { "entropy": 0.5851652543991804, "epoch": 0.9053330241860879, "grad_norm": 0.2131270468235016, "learning_rate": 0.0002, "loss": 0.6833, "mean_token_accuracy": 0.8098719876259566, "num_tokens": 669847885.0, "step": 12080 }, { "entropy": 0.5768699888139963, "epoch": 0.9057077481199398, "grad_norm": 0.2196718007326126, "learning_rate": 0.0002, "loss": 0.6777, "mean_token_accuracy": 0.811494430527091, "num_tokens": 671343764.0, "step": 12085 }, { "entropy": 0.5774549514055252, "epoch": 0.9060824720537917, "grad_norm": 0.2307625561952591, "learning_rate": 0.0002, "loss": 0.6671, "mean_token_accuracy": 0.8130867935717105, "num_tokens": 672813587.0, "step": 12090 }, { "entropy": 0.5608637265861034, "epoch": 0.9064571959876435, "grad_norm": 0.21244508028030396, "learning_rate": 0.0002, "loss": 0.6578, "mean_token_accuracy": 0.814246666058898, "num_tokens": 674291012.0, "step": 12095 }, { "entropy": 0.5625492472201585, "epoch": 0.9068319199214954, "grad_norm": 0.21630427241325378, "learning_rate": 0.0002, "loss": 0.6601, "mean_token_accuracy": 0.8148893974721432, "num_tokens": 675775366.0, "step": 12100 }, { "entropy": 0.5688029944896698, "epoch": 0.9072066438553472, "grad_norm": 0.213215172290802, "learning_rate": 0.0002, "loss": 0.6664, "mean_token_accuracy": 0.8162872213870287, "num_tokens": 677252702.0, "step": 12105 }, { "entropy": 0.5783463513478637, "epoch": 0.9075813677891991, "grad_norm": 0.3056935667991638, "learning_rate": 0.0002, "loss": 0.6808, "mean_token_accuracy": 0.8095012806355953, "num_tokens": 678778578.0, "step": 12110 }, { "entropy": 0.5684222560375929, "epoch": 0.9079560917230509, "grad_norm": 0.21878990530967712, "learning_rate": 0.0002, "loss": 0.6823, "mean_token_accuracy": 0.8091754823923111, "num_tokens": 680238008.0, "step": 12115 }, { "entropy": 0.5538608090952039, "epoch": 0.9083308156569028, "grad_norm": 0.19718037545681, "learning_rate": 0.0002, "loss": 0.6614, "mean_token_accuracy": 0.8133160773664713, "num_tokens": 681725149.0, "step": 12120 }, { "entropy": 0.5485669668763876, "epoch": 0.9087055395907546, "grad_norm": 0.1945296674966812, "learning_rate": 0.0002, "loss": 0.6711, "mean_token_accuracy": 0.8110955104231834, "num_tokens": 683205649.0, "step": 12125 }, { "entropy": 0.5712034171447158, "epoch": 0.9090802635246065, "grad_norm": 0.19776421785354614, "learning_rate": 0.0002, "loss": 0.694, "mean_token_accuracy": 0.8099219396710395, "num_tokens": 684703568.0, "step": 12130 }, { "entropy": 0.5760577647015452, "epoch": 0.9094549874584583, "grad_norm": 0.20910952985286713, "learning_rate": 0.0002, "loss": 0.6862, "mean_token_accuracy": 0.8087835006415844, "num_tokens": 686199194.0, "step": 12135 }, { "entropy": 0.5674379339441658, "epoch": 0.9098297113923102, "grad_norm": 0.20754805207252502, "learning_rate": 0.0002, "loss": 0.6696, "mean_token_accuracy": 0.8154521461576223, "num_tokens": 687679391.0, "step": 12140 }, { "entropy": 0.5592344148084522, "epoch": 0.910204435326162, "grad_norm": 0.21329298615455627, "learning_rate": 0.0002, "loss": 0.6622, "mean_token_accuracy": 0.8202100440859794, "num_tokens": 689123289.0, "step": 12145 }, { "entropy": 0.5732990259304642, "epoch": 0.9105791592600139, "grad_norm": 0.2236274927854538, "learning_rate": 0.0002, "loss": 0.6819, "mean_token_accuracy": 0.8156679190695286, "num_tokens": 690520873.0, "step": 12150 }, { "entropy": 0.5645358197391033, "epoch": 0.9109538831938657, "grad_norm": 0.19817772507667542, "learning_rate": 0.0002, "loss": 0.6628, "mean_token_accuracy": 0.8164033133536577, "num_tokens": 691976252.0, "step": 12155 }, { "entropy": 0.5551721941679716, "epoch": 0.9113286071277176, "grad_norm": 0.20268107950687408, "learning_rate": 0.0002, "loss": 0.6542, "mean_token_accuracy": 0.8199469517916441, "num_tokens": 693439855.0, "step": 12160 }, { "entropy": 0.5617481274530292, "epoch": 0.9117033310615695, "grad_norm": 0.2020190805196762, "learning_rate": 0.0002, "loss": 0.6656, "mean_token_accuracy": 0.8127567563205957, "num_tokens": 694935434.0, "step": 12165 }, { "entropy": 0.5743256289511919, "epoch": 0.9120780549954214, "grad_norm": 0.19820092618465424, "learning_rate": 0.0002, "loss": 0.6887, "mean_token_accuracy": 0.8092280052602291, "num_tokens": 696378048.0, "step": 12170 }, { "entropy": 0.5761588973924517, "epoch": 0.9124527789292732, "grad_norm": 0.20258378982543945, "learning_rate": 0.0002, "loss": 0.6834, "mean_token_accuracy": 0.8131879515945911, "num_tokens": 697852208.0, "step": 12175 }, { "entropy": 0.5658290615305305, "epoch": 0.9128275028631251, "grad_norm": 0.32215937972068787, "learning_rate": 0.0002, "loss": 0.6696, "mean_token_accuracy": 0.8136026106774807, "num_tokens": 699339189.0, "step": 12180 }, { "entropy": 0.5674375986680389, "epoch": 0.9132022267969769, "grad_norm": 0.20513631403446198, "learning_rate": 0.0002, "loss": 0.6708, "mean_token_accuracy": 0.8100145012140274, "num_tokens": 700802477.0, "step": 12185 }, { "entropy": 0.5666063608601689, "epoch": 0.9135769507308288, "grad_norm": 0.19756998121738434, "learning_rate": 0.0002, "loss": 0.6679, "mean_token_accuracy": 0.8170455671846867, "num_tokens": 702310838.0, "step": 12190 }, { "entropy": 0.578651905618608, "epoch": 0.9139516746646806, "grad_norm": 0.21555855870246887, "learning_rate": 0.0002, "loss": 0.6968, "mean_token_accuracy": 0.8085058625787497, "num_tokens": 703802067.0, "step": 12195 }, { "entropy": 0.5655263911932706, "epoch": 0.9143263985985325, "grad_norm": 0.21932834386825562, "learning_rate": 0.0002, "loss": 0.6694, "mean_token_accuracy": 0.8116340421140193, "num_tokens": 705309292.0, "step": 12200 }, { "entropy": 0.5711003691889346, "epoch": 0.9147011225323843, "grad_norm": 0.21871615946292877, "learning_rate": 0.0002, "loss": 0.6839, "mean_token_accuracy": 0.8096773453056813, "num_tokens": 706752123.0, "step": 12205 }, { "entropy": 0.5673079324886203, "epoch": 0.9150758464662362, "grad_norm": 0.20576317608356476, "learning_rate": 0.0002, "loss": 0.6775, "mean_token_accuracy": 0.8142023034393787, "num_tokens": 708247503.0, "step": 12210 }, { "entropy": 0.5535855470225215, "epoch": 0.915450570400088, "grad_norm": 0.22272135317325592, "learning_rate": 0.0002, "loss": 0.655, "mean_token_accuracy": 0.8178841196000576, "num_tokens": 709683412.0, "step": 12215 }, { "entropy": 0.5593578115105629, "epoch": 0.9158252943339399, "grad_norm": 0.2167985886335373, "learning_rate": 0.0002, "loss": 0.6615, "mean_token_accuracy": 0.8138112470507621, "num_tokens": 711144314.0, "step": 12220 }, { "entropy": 0.5627798357978463, "epoch": 0.9162000182677917, "grad_norm": 0.22920159995555878, "learning_rate": 0.0002, "loss": 0.6715, "mean_token_accuracy": 0.8151688046753407, "num_tokens": 712638108.0, "step": 12225 }, { "entropy": 0.5462125847116113, "epoch": 0.9165747422016436, "grad_norm": 0.20637913048267365, "learning_rate": 0.0002, "loss": 0.6414, "mean_token_accuracy": 0.821863292530179, "num_tokens": 714077930.0, "step": 12230 }, { "entropy": 0.5732831591740251, "epoch": 0.9169494661354954, "grad_norm": 0.18777243793010712, "learning_rate": 0.0002, "loss": 0.6746, "mean_token_accuracy": 0.8118248291313648, "num_tokens": 715560323.0, "step": 12235 }, { "entropy": 0.5679226772859692, "epoch": 0.9173241900693474, "grad_norm": 0.20450469851493835, "learning_rate": 0.0002, "loss": 0.6655, "mean_token_accuracy": 0.8141183894127607, "num_tokens": 716997905.0, "step": 12240 }, { "entropy": 0.569209401961416, "epoch": 0.9176989140031993, "grad_norm": 0.3131488859653473, "learning_rate": 0.0002, "loss": 0.668, "mean_token_accuracy": 0.8148143716156483, "num_tokens": 718486908.0, "step": 12245 }, { "entropy": 0.5780976762995124, "epoch": 0.9180736379370511, "grad_norm": 0.19122380018234253, "learning_rate": 0.0002, "loss": 0.6793, "mean_token_accuracy": 0.8127533368766308, "num_tokens": 719973567.0, "step": 12250 }, { "entropy": 0.5605756286531687, "epoch": 0.918448361870903, "grad_norm": 0.1853559911251068, "learning_rate": 0.0002, "loss": 0.6574, "mean_token_accuracy": 0.8154073312878609, "num_tokens": 721431423.0, "step": 12255 }, { "entropy": 0.5757056329399347, "epoch": 0.9188230858047548, "grad_norm": 0.18604841828346252, "learning_rate": 0.0002, "loss": 0.6792, "mean_token_accuracy": 0.8121386148035526, "num_tokens": 722908437.0, "step": 12260 }, { "entropy": 0.5611631942912936, "epoch": 0.9191978097386067, "grad_norm": 0.20036020874977112, "learning_rate": 0.0002, "loss": 0.6588, "mean_token_accuracy": 0.8136757157742978, "num_tokens": 724397027.0, "step": 12265 }, { "entropy": 0.5588284013792872, "epoch": 0.9195725336724585, "grad_norm": 0.20439276099205017, "learning_rate": 0.0002, "loss": 0.6708, "mean_token_accuracy": 0.8157665476202964, "num_tokens": 725854288.0, "step": 12270 }, { "entropy": 0.5564582625404, "epoch": 0.9199472576063104, "grad_norm": 0.19996315240859985, "learning_rate": 0.0002, "loss": 0.6549, "mean_token_accuracy": 0.816352317109704, "num_tokens": 727295808.0, "step": 12275 }, { "entropy": 0.5639039011672139, "epoch": 0.9203219815401622, "grad_norm": 0.19216454029083252, "learning_rate": 0.0002, "loss": 0.6681, "mean_token_accuracy": 0.813323438167572, "num_tokens": 728785285.0, "step": 12280 }, { "entropy": 0.5519219262525439, "epoch": 0.920696705474014, "grad_norm": 0.20258741080760956, "learning_rate": 0.0002, "loss": 0.6644, "mean_token_accuracy": 0.8155202638357878, "num_tokens": 730243401.0, "step": 12285 }, { "entropy": 0.5567178133875131, "epoch": 0.9210714294078659, "grad_norm": 0.18655098974704742, "learning_rate": 0.0002, "loss": 0.6785, "mean_token_accuracy": 0.8134421553462744, "num_tokens": 731729261.0, "step": 12290 }, { "entropy": 0.5550903286784887, "epoch": 0.9214461533417178, "grad_norm": 0.2015630453824997, "learning_rate": 0.0002, "loss": 0.6759, "mean_token_accuracy": 0.8111619133502245, "num_tokens": 733228586.0, "step": 12295 }, { "entropy": 0.5461276788264513, "epoch": 0.9218208772755696, "grad_norm": 0.21184906363487244, "learning_rate": 0.0002, "loss": 0.676, "mean_token_accuracy": 0.8127993289381266, "num_tokens": 734671417.0, "step": 12300 }, { "entropy": 0.550888734869659, "epoch": 0.9221956012094215, "grad_norm": 0.19727087020874023, "learning_rate": 0.0002, "loss": 0.6807, "mean_token_accuracy": 0.815062865242362, "num_tokens": 736156864.0, "step": 12305 }, { "entropy": 0.5325752613134682, "epoch": 0.9225703251432733, "grad_norm": 0.18858005106449127, "learning_rate": 0.0002, "loss": 0.6556, "mean_token_accuracy": 0.8156998682767153, "num_tokens": 737613076.0, "step": 12310 }, { "entropy": 0.551955153606832, "epoch": 0.9229450490771252, "grad_norm": 0.21590662002563477, "learning_rate": 0.0002, "loss": 0.6779, "mean_token_accuracy": 0.812455664947629, "num_tokens": 739091314.0, "step": 12315 }, { "entropy": 0.5477531131356954, "epoch": 0.9233197730109771, "grad_norm": 0.20747271180152893, "learning_rate": 0.0002, "loss": 0.6685, "mean_token_accuracy": 0.8169231034815312, "num_tokens": 740520784.0, "step": 12320 }, { "entropy": 0.5549230605363846, "epoch": 0.923694496944829, "grad_norm": 0.21333572268486023, "learning_rate": 0.0002, "loss": 0.6703, "mean_token_accuracy": 0.8109515909105539, "num_tokens": 742028435.0, "step": 12325 }, { "entropy": 0.5513356290757656, "epoch": 0.9240692208786808, "grad_norm": 0.320273220539093, "learning_rate": 0.0002, "loss": 0.671, "mean_token_accuracy": 0.8126253370195627, "num_tokens": 743518859.0, "step": 12330 }, { "entropy": 0.5373350143432617, "epoch": 0.9244439448125327, "grad_norm": 0.21278025209903717, "learning_rate": 0.0002, "loss": 0.6506, "mean_token_accuracy": 0.8173446096479893, "num_tokens": 745005752.0, "step": 12335 }, { "entropy": 0.5539275064133108, "epoch": 0.9248186687463845, "grad_norm": 0.2187536507844925, "learning_rate": 0.0002, "loss": 0.6709, "mean_token_accuracy": 0.8124414272606373, "num_tokens": 746499010.0, "step": 12340 }, { "entropy": 0.5311268099583686, "epoch": 0.9251933926802364, "grad_norm": 0.2176695019006729, "learning_rate": 0.0002, "loss": 0.6518, "mean_token_accuracy": 0.8177968058735132, "num_tokens": 747918879.0, "step": 12345 }, { "entropy": 0.5361072313040495, "epoch": 0.9255681166140882, "grad_norm": 0.23093253374099731, "learning_rate": 0.0002, "loss": 0.6596, "mean_token_accuracy": 0.8132055316120386, "num_tokens": 749352481.0, "step": 12350 }, { "entropy": 0.5303133811801672, "epoch": 0.9259428405479401, "grad_norm": 0.22227707505226135, "learning_rate": 0.0002, "loss": 0.6542, "mean_token_accuracy": 0.8172001507133245, "num_tokens": 750815406.0, "step": 12355 }, { "entropy": 0.5422395190224052, "epoch": 0.9263175644817919, "grad_norm": 0.19365723431110382, "learning_rate": 0.0002, "loss": 0.6631, "mean_token_accuracy": 0.8147412203252316, "num_tokens": 752297745.0, "step": 12360 }, { "entropy": 0.5504207225516439, "epoch": 0.9266922884156438, "grad_norm": 0.2707286775112152, "learning_rate": 0.0002, "loss": 0.6827, "mean_token_accuracy": 0.8129703335464, "num_tokens": 753780011.0, "step": 12365 }, { "entropy": 0.5505195426754653, "epoch": 0.9270670123494956, "grad_norm": 0.20037464797496796, "learning_rate": 0.0002, "loss": 0.6646, "mean_token_accuracy": 0.8148729588836432, "num_tokens": 755298079.0, "step": 12370 }, { "entropy": 0.5480726342648268, "epoch": 0.9274417362833475, "grad_norm": 0.2007346749305725, "learning_rate": 0.0002, "loss": 0.679, "mean_token_accuracy": 0.8133962914347649, "num_tokens": 756746193.0, "step": 12375 }, { "entropy": 0.5431828700006008, "epoch": 0.9278164602171993, "grad_norm": 0.3170406222343445, "learning_rate": 0.0002, "loss": 0.6661, "mean_token_accuracy": 0.8150538794696331, "num_tokens": 758205707.0, "step": 12380 }, { "entropy": 0.5585792239755392, "epoch": 0.9281911841510512, "grad_norm": 0.22421041131019592, "learning_rate": 0.0002, "loss": 0.6783, "mean_token_accuracy": 0.8132659930735826, "num_tokens": 759689136.0, "step": 12385 }, { "entropy": 0.5658832946792245, "epoch": 0.928565908084903, "grad_norm": 0.20813287794589996, "learning_rate": 0.0002, "loss": 0.6826, "mean_token_accuracy": 0.8101058218628168, "num_tokens": 761175171.0, "step": 12390 }, { "entropy": 0.5643184887245297, "epoch": 0.928940632018755, "grad_norm": 0.20824414491653442, "learning_rate": 0.0002, "loss": 0.6776, "mean_token_accuracy": 0.8137592103332281, "num_tokens": 762616989.0, "step": 12395 }, { "entropy": 0.56043014023453, "epoch": 0.9293153559526068, "grad_norm": 0.2103734016418457, "learning_rate": 0.0002, "loss": 0.6703, "mean_token_accuracy": 0.8115991674363613, "num_tokens": 764072580.0, "step": 12400 }, { "entropy": 0.552270788513124, "epoch": 0.9296900798864587, "grad_norm": 0.20201247930526733, "learning_rate": 0.0002, "loss": 0.6689, "mean_token_accuracy": 0.8173358239233494, "num_tokens": 765537709.0, "step": 12405 }, { "entropy": 0.5486791051924229, "epoch": 0.9300648038203105, "grad_norm": 0.201764315366745, "learning_rate": 0.0002, "loss": 0.6573, "mean_token_accuracy": 0.8159384813159705, "num_tokens": 767019916.0, "step": 12410 }, { "entropy": 0.5585592292249203, "epoch": 0.9304395277541624, "grad_norm": 0.20411252975463867, "learning_rate": 0.0002, "loss": 0.6753, "mean_token_accuracy": 0.8121982507407666, "num_tokens": 768530262.0, "step": 12415 }, { "entropy": 0.5599917961284518, "epoch": 0.9308142516880142, "grad_norm": 0.21511486172676086, "learning_rate": 0.0002, "loss": 0.6722, "mean_token_accuracy": 0.8115589339286089, "num_tokens": 770014172.0, "step": 12420 }, { "entropy": 0.5426170915365219, "epoch": 0.9311889756218661, "grad_norm": 0.20876361429691315, "learning_rate": 0.0002, "loss": 0.6476, "mean_token_accuracy": 0.8170652825385332, "num_tokens": 771469462.0, "step": 12425 }, { "entropy": 0.5718210689723492, "epoch": 0.9315636995557179, "grad_norm": 0.20802250504493713, "learning_rate": 0.0002, "loss": 0.6844, "mean_token_accuracy": 0.8115608975291252, "num_tokens": 772958631.0, "step": 12430 }, { "entropy": 0.5594635389745235, "epoch": 0.9319384234895698, "grad_norm": 0.20630578696727753, "learning_rate": 0.0002, "loss": 0.6653, "mean_token_accuracy": 0.8145258728414774, "num_tokens": 774433928.0, "step": 12435 }, { "entropy": 0.5782641598954796, "epoch": 0.9323131474234216, "grad_norm": 0.2269352525472641, "learning_rate": 0.0002, "loss": 0.6921, "mean_token_accuracy": 0.8101483289152384, "num_tokens": 775887724.0, "step": 12440 }, { "entropy": 0.5614480348303914, "epoch": 0.9326878713572735, "grad_norm": 0.20753180980682373, "learning_rate": 0.0002, "loss": 0.6638, "mean_token_accuracy": 0.813705462962389, "num_tokens": 777386097.0, "step": 12445 }, { "entropy": 0.5600265098735691, "epoch": 0.9330625952911253, "grad_norm": 0.2228650450706482, "learning_rate": 0.0002, "loss": 0.6714, "mean_token_accuracy": 0.8153579916805029, "num_tokens": 778836443.0, "step": 12450 }, { "entropy": 0.5561413869261742, "epoch": 0.9334373192249772, "grad_norm": 0.21573081612586975, "learning_rate": 0.0002, "loss": 0.6593, "mean_token_accuracy": 0.8162534050643444, "num_tokens": 780292140.0, "step": 12455 }, { "entropy": 0.5455635841935873, "epoch": 0.933812043158829, "grad_norm": 0.2081994116306305, "learning_rate": 0.0002, "loss": 0.65, "mean_token_accuracy": 0.8126038692891597, "num_tokens": 781767412.0, "step": 12460 }, { "entropy": 0.5614634165540338, "epoch": 0.9341867670926809, "grad_norm": 0.20929698646068573, "learning_rate": 0.0002, "loss": 0.6731, "mean_token_accuracy": 0.8138424377888441, "num_tokens": 783205252.0, "step": 12465 }, { "entropy": 0.5569948549382389, "epoch": 0.9345614910265327, "grad_norm": 0.2138264775276184, "learning_rate": 0.0002, "loss": 0.67, "mean_token_accuracy": 0.8128887474536896, "num_tokens": 784703234.0, "step": 12470 }, { "entropy": 0.5652584039606154, "epoch": 0.9349362149603847, "grad_norm": 0.2183125615119934, "learning_rate": 0.0002, "loss": 0.6754, "mean_token_accuracy": 0.8131437141448259, "num_tokens": 786142178.0, "step": 12475 }, { "entropy": 0.5443750312551856, "epoch": 0.9353109388942366, "grad_norm": 0.28651607036590576, "learning_rate": 0.0002, "loss": 0.6656, "mean_token_accuracy": 0.8136166080832481, "num_tokens": 787569233.0, "step": 12480 }, { "entropy": 0.5520842498168349, "epoch": 0.9356856628280884, "grad_norm": 0.22240249812602997, "learning_rate": 0.0002, "loss": 0.667, "mean_token_accuracy": 0.8138417907059192, "num_tokens": 789050094.0, "step": 12485 }, { "entropy": 0.5482198817655444, "epoch": 0.9360603867619403, "grad_norm": 0.19237354397773743, "learning_rate": 0.0002, "loss": 0.6648, "mean_token_accuracy": 0.8134395025670529, "num_tokens": 790499881.0, "step": 12490 }, { "entropy": 0.5477802477777004, "epoch": 0.9364351106957921, "grad_norm": 0.1929321587085724, "learning_rate": 0.0002, "loss": 0.6634, "mean_token_accuracy": 0.8135205004364252, "num_tokens": 791966092.0, "step": 12495 }, { "entropy": 0.5451323909685015, "epoch": 0.936809834629644, "grad_norm": 0.21149785816669464, "learning_rate": 0.0002, "loss": 0.6653, "mean_token_accuracy": 0.8159629620611668, "num_tokens": 793436737.0, "step": 12500 }, { "entropy": 0.5653162380680442, "epoch": 0.9371845585634958, "grad_norm": 0.20638427138328552, "learning_rate": 0.0002, "loss": 0.6707, "mean_token_accuracy": 0.8108672693371772, "num_tokens": 794950264.0, "step": 12505 }, { "entropy": 0.5501329215243459, "epoch": 0.9375592824973477, "grad_norm": 0.20342224836349487, "learning_rate": 0.0002, "loss": 0.6649, "mean_token_accuracy": 0.8158083617687225, "num_tokens": 796394630.0, "step": 12510 }, { "entropy": 0.5485154358670116, "epoch": 0.9379340064311995, "grad_norm": 0.23386266827583313, "learning_rate": 0.0002, "loss": 0.663, "mean_token_accuracy": 0.8149451334029436, "num_tokens": 797816266.0, "step": 12515 }, { "entropy": 0.5702336495742202, "epoch": 0.9383087303650514, "grad_norm": 0.21608901023864746, "learning_rate": 0.0002, "loss": 0.6715, "mean_token_accuracy": 0.8131005324423313, "num_tokens": 799294888.0, "step": 12520 }, { "entropy": 0.5542077027261257, "epoch": 0.9386834542989032, "grad_norm": 0.21092617511749268, "learning_rate": 0.0002, "loss": 0.6517, "mean_token_accuracy": 0.8187702532857657, "num_tokens": 800779458.0, "step": 12525 }, { "entropy": 0.5878037866204977, "epoch": 0.9390581782327551, "grad_norm": 0.22019603848457336, "learning_rate": 0.0002, "loss": 0.6885, "mean_token_accuracy": 0.8123368103057146, "num_tokens": 802278666.0, "step": 12530 }, { "entropy": 0.554361772723496, "epoch": 0.9394329021666069, "grad_norm": 0.20995476841926575, "learning_rate": 0.0002, "loss": 0.651, "mean_token_accuracy": 0.8183128882199526, "num_tokens": 803750549.0, "step": 12535 }, { "entropy": 0.5625690829008818, "epoch": 0.9398076261004588, "grad_norm": 0.20126652717590332, "learning_rate": 0.0002, "loss": 0.6705, "mean_token_accuracy": 0.8154655799269677, "num_tokens": 805216255.0, "step": 12540 }, { "entropy": 0.564863795787096, "epoch": 0.9401823500343106, "grad_norm": 0.19835667312145233, "learning_rate": 0.0002, "loss": 0.6762, "mean_token_accuracy": 0.8113276790827513, "num_tokens": 806729285.0, "step": 12545 }, { "entropy": 0.5540885731577874, "epoch": 0.9405570739681626, "grad_norm": 0.219755157828331, "learning_rate": 0.0002, "loss": 0.6617, "mean_token_accuracy": 0.8154292959719897, "num_tokens": 808207314.0, "step": 12550 }, { "entropy": 0.5626806078478694, "epoch": 0.9409317979020144, "grad_norm": 0.21230286359786987, "learning_rate": 0.0002, "loss": 0.6693, "mean_token_accuracy": 0.8159732587635518, "num_tokens": 809715870.0, "step": 12555 }, { "entropy": 0.5612182654440403, "epoch": 0.9413065218358663, "grad_norm": 0.2051982581615448, "learning_rate": 0.0002, "loss": 0.6688, "mean_token_accuracy": 0.8137799471616745, "num_tokens": 811206992.0, "step": 12560 }, { "entropy": 0.5471382137387991, "epoch": 0.9416812457697181, "grad_norm": 0.18740731477737427, "learning_rate": 0.0002, "loss": 0.6516, "mean_token_accuracy": 0.8148497615009547, "num_tokens": 812656366.0, "step": 12565 }, { "entropy": 0.5551805222406984, "epoch": 0.94205596970357, "grad_norm": 0.21295149624347687, "learning_rate": 0.0002, "loss": 0.6779, "mean_token_accuracy": 0.8118157353252172, "num_tokens": 814123847.0, "step": 12570 }, { "entropy": 0.5516741391271353, "epoch": 0.9424306936374218, "grad_norm": 0.20782411098480225, "learning_rate": 0.0002, "loss": 0.6661, "mean_token_accuracy": 0.814837334677577, "num_tokens": 815581641.0, "step": 12575 }, { "entropy": 0.5515983166173101, "epoch": 0.9428054175712737, "grad_norm": 0.19471372663974762, "learning_rate": 0.0002, "loss": 0.6705, "mean_token_accuracy": 0.8146083120256662, "num_tokens": 817047304.0, "step": 12580 }, { "entropy": 0.5308997271582484, "epoch": 0.9431801415051255, "grad_norm": 0.20412659645080566, "learning_rate": 0.0002, "loss": 0.6479, "mean_token_accuracy": 0.8196380980312824, "num_tokens": 818498258.0, "step": 12585 }, { "entropy": 0.5493884751573205, "epoch": 0.9435548654389774, "grad_norm": 0.21175460517406464, "learning_rate": 0.0002, "loss": 0.6725, "mean_token_accuracy": 0.815691027417779, "num_tokens": 819946614.0, "step": 12590 }, { "entropy": 0.5426402750425041, "epoch": 0.9439295893728292, "grad_norm": 0.20292870700359344, "learning_rate": 0.0002, "loss": 0.6634, "mean_token_accuracy": 0.8175998654216527, "num_tokens": 821388132.0, "step": 12595 }, { "entropy": 0.5515389958396554, "epoch": 0.9443043133066811, "grad_norm": 0.20209290087223053, "learning_rate": 0.0002, "loss": 0.665, "mean_token_accuracy": 0.8139783166348934, "num_tokens": 822846688.0, "step": 12600 }, { "entropy": 0.5466427143663168, "epoch": 0.9446790372405329, "grad_norm": 0.19882844388484955, "learning_rate": 0.0002, "loss": 0.6581, "mean_token_accuracy": 0.8151650954037905, "num_tokens": 824312984.0, "step": 12605 }, { "entropy": 0.5512930963188409, "epoch": 0.9450537611743848, "grad_norm": 0.22141388058662415, "learning_rate": 0.0002, "loss": 0.6491, "mean_token_accuracy": 0.815999548882246, "num_tokens": 825758053.0, "step": 12610 }, { "entropy": 0.556481066532433, "epoch": 0.9454284851082366, "grad_norm": 0.21113231778144836, "learning_rate": 0.0002, "loss": 0.6477, "mean_token_accuracy": 0.8174844078719616, "num_tokens": 827206450.0, "step": 12615 }, { "entropy": 0.5834262434393167, "epoch": 0.9458032090420885, "grad_norm": 0.21327804028987885, "learning_rate": 0.0002, "loss": 0.6851, "mean_token_accuracy": 0.8099524788558483, "num_tokens": 828668545.0, "step": 12620 }, { "entropy": 0.5680903296917676, "epoch": 0.9461779329759403, "grad_norm": 0.21272988617420197, "learning_rate": 0.0002, "loss": 0.6681, "mean_token_accuracy": 0.8141823090612889, "num_tokens": 830146626.0, "step": 12625 }, { "entropy": 0.5647014291957021, "epoch": 0.9465526569097923, "grad_norm": 0.20406588912010193, "learning_rate": 0.0002, "loss": 0.673, "mean_token_accuracy": 0.8139936074614524, "num_tokens": 831636620.0, "step": 12630 }, { "entropy": 0.5827916042879224, "epoch": 0.9469273808436441, "grad_norm": 0.2328009456396103, "learning_rate": 0.0002, "loss": 0.6829, "mean_token_accuracy": 0.8138948820531369, "num_tokens": 833138207.0, "step": 12635 }, { "entropy": 0.5662512792274356, "epoch": 0.947302104777496, "grad_norm": 0.2236436903476715, "learning_rate": 0.0002, "loss": 0.67, "mean_token_accuracy": 0.812908037006855, "num_tokens": 834611542.0, "step": 12640 }, { "entropy": 0.557795231230557, "epoch": 0.9476768287113478, "grad_norm": 0.25502532720565796, "learning_rate": 0.0002, "loss": 0.6602, "mean_token_accuracy": 0.8163386035710574, "num_tokens": 836078453.0, "step": 12645 }, { "entropy": 0.5537591334432364, "epoch": 0.9480515526451997, "grad_norm": 0.2042321115732193, "learning_rate": 0.0002, "loss": 0.6492, "mean_token_accuracy": 0.8173806976526976, "num_tokens": 837558148.0, "step": 12650 }, { "entropy": 0.5691071886569261, "epoch": 0.9484262765790515, "grad_norm": 0.2101443111896515, "learning_rate": 0.0002, "loss": 0.6739, "mean_token_accuracy": 0.8139301612973213, "num_tokens": 839045051.0, "step": 12655 }, { "entropy": 0.5611317234113813, "epoch": 0.9488010005129034, "grad_norm": 0.19826821982860565, "learning_rate": 0.0002, "loss": 0.6619, "mean_token_accuracy": 0.8169221263378859, "num_tokens": 840501845.0, "step": 12660 }, { "entropy": 0.5579316806979477, "epoch": 0.9491757244467552, "grad_norm": 0.2403637319803238, "learning_rate": 0.0002, "loss": 0.6507, "mean_token_accuracy": 0.814688178896904, "num_tokens": 841961443.0, "step": 12665 }, { "entropy": 0.5612250886857509, "epoch": 0.9495504483806071, "grad_norm": 0.19554288685321808, "learning_rate": 0.0002, "loss": 0.6593, "mean_token_accuracy": 0.815507584810257, "num_tokens": 843437124.0, "step": 12670 }, { "entropy": 0.5703104794025421, "epoch": 0.9499251723144589, "grad_norm": 0.22316426038742065, "learning_rate": 0.0002, "loss": 0.6783, "mean_token_accuracy": 0.8140679500997067, "num_tokens": 844897278.0, "step": 12675 }, { "entropy": 0.5586856590583921, "epoch": 0.9502998962483108, "grad_norm": 0.20828357338905334, "learning_rate": 0.0002, "loss": 0.6626, "mean_token_accuracy": 0.8160583816468716, "num_tokens": 846382759.0, "step": 12680 }, { "entropy": 0.5698930146172643, "epoch": 0.9506746201821626, "grad_norm": 0.19522154331207275, "learning_rate": 0.0002, "loss": 0.6695, "mean_token_accuracy": 0.8135406613349915, "num_tokens": 847872976.0, "step": 12685 }, { "entropy": 0.5527913952246308, "epoch": 0.9510493441160145, "grad_norm": 0.2049010843038559, "learning_rate": 0.0002, "loss": 0.6622, "mean_token_accuracy": 0.8162548635154963, "num_tokens": 849341099.0, "step": 12690 }, { "entropy": 0.5772928256541491, "epoch": 0.9514240680498663, "grad_norm": 0.21015316247940063, "learning_rate": 0.0002, "loss": 0.676, "mean_token_accuracy": 0.8150422718375921, "num_tokens": 850824731.0, "step": 12695 }, { "entropy": 0.5934185341000557, "epoch": 0.9517987919837182, "grad_norm": 0.5227614641189575, "learning_rate": 0.0002, "loss": 0.6837, "mean_token_accuracy": 0.812971805781126, "num_tokens": 852299042.0, "step": 12700 }, { "entropy": 0.5857105921953917, "epoch": 0.9521735159175702, "grad_norm": 0.21350540220737457, "learning_rate": 0.0002, "loss": 0.6718, "mean_token_accuracy": 0.8119775436818599, "num_tokens": 853797620.0, "step": 12705 }, { "entropy": 0.5735279070213437, "epoch": 0.952548239851422, "grad_norm": 0.22480328381061554, "learning_rate": 0.0002, "loss": 0.6695, "mean_token_accuracy": 0.8120702024549246, "num_tokens": 855323967.0, "step": 12710 }, { "entropy": 0.5844613704830408, "epoch": 0.9529229637852739, "grad_norm": 0.20789723098278046, "learning_rate": 0.0002, "loss": 0.6895, "mean_token_accuracy": 0.8084907602518797, "num_tokens": 856815823.0, "step": 12715 }, { "entropy": 0.5734851716086269, "epoch": 0.9532976877191257, "grad_norm": 0.19834363460540771, "learning_rate": 0.0002, "loss": 0.676, "mean_token_accuracy": 0.8095919787883759, "num_tokens": 858314502.0, "step": 12720 }, { "entropy": 0.5636423919349909, "epoch": 0.9536724116529776, "grad_norm": 0.20657879114151, "learning_rate": 0.0002, "loss": 0.6677, "mean_token_accuracy": 0.8132237013429403, "num_tokens": 859769021.0, "step": 12725 }, { "entropy": 0.5544786535203456, "epoch": 0.9540471355868294, "grad_norm": 0.21045804023742676, "learning_rate": 0.0002, "loss": 0.6571, "mean_token_accuracy": 0.8149882454425097, "num_tokens": 861235776.0, "step": 12730 }, { "entropy": 0.570038141682744, "epoch": 0.9544218595206813, "grad_norm": 0.19934339821338654, "learning_rate": 0.0002, "loss": 0.6791, "mean_token_accuracy": 0.8135756377130747, "num_tokens": 862717194.0, "step": 12735 }, { "entropy": 0.5509682754054666, "epoch": 0.9547965834545331, "grad_norm": 0.2081587016582489, "learning_rate": 0.0002, "loss": 0.6604, "mean_token_accuracy": 0.8154924206435681, "num_tokens": 864179548.0, "step": 12740 }, { "entropy": 0.544762059301138, "epoch": 0.955171307388385, "grad_norm": 0.20466198027133942, "learning_rate": 0.0002, "loss": 0.655, "mean_token_accuracy": 0.8173180691897869, "num_tokens": 865641224.0, "step": 12745 }, { "entropy": 0.5666981559246779, "epoch": 0.9555460313222368, "grad_norm": 0.21506790816783905, "learning_rate": 0.0002, "loss": 0.6806, "mean_token_accuracy": 0.8114119518548251, "num_tokens": 867111879.0, "step": 12750 }, { "entropy": 0.5730675650760532, "epoch": 0.9559207552560887, "grad_norm": 0.20369748771190643, "learning_rate": 0.0002, "loss": 0.6834, "mean_token_accuracy": 0.8125497426837682, "num_tokens": 868548087.0, "step": 12755 }, { "entropy": 0.5692602680996061, "epoch": 0.9562954791899405, "grad_norm": 0.22701658308506012, "learning_rate": 0.0002, "loss": 0.6795, "mean_token_accuracy": 0.8123677246272564, "num_tokens": 870018739.0, "step": 12760 }, { "entropy": 0.551702019572258, "epoch": 0.9566702031237924, "grad_norm": 0.21959680318832397, "learning_rate": 0.0002, "loss": 0.6627, "mean_token_accuracy": 0.8155603129416704, "num_tokens": 871407281.0, "step": 12765 }, { "entropy": 0.5559759307652712, "epoch": 0.9570449270576442, "grad_norm": 0.23116391897201538, "learning_rate": 0.0002, "loss": 0.6604, "mean_token_accuracy": 0.815514237806201, "num_tokens": 872848661.0, "step": 12770 }, { "entropy": 0.5738682067021728, "epoch": 0.9574196509914961, "grad_norm": 0.2060251384973526, "learning_rate": 0.0002, "loss": 0.6724, "mean_token_accuracy": 0.8137315142899751, "num_tokens": 874313611.0, "step": 12775 }, { "entropy": 0.5702512865886092, "epoch": 0.9577943749253479, "grad_norm": 0.2136348932981491, "learning_rate": 0.0002, "loss": 0.6638, "mean_token_accuracy": 0.8124465685337782, "num_tokens": 875780417.0, "step": 12780 }, { "entropy": 0.5758954150602221, "epoch": 0.9581690988591999, "grad_norm": 0.2281310111284256, "learning_rate": 0.0002, "loss": 0.6729, "mean_token_accuracy": 0.8160949110984802, "num_tokens": 877262376.0, "step": 12785 }, { "entropy": 0.5754949107766152, "epoch": 0.9585438227930517, "grad_norm": 0.30507248640060425, "learning_rate": 0.0002, "loss": 0.6756, "mean_token_accuracy": 0.812525536864996, "num_tokens": 878766056.0, "step": 12790 }, { "entropy": 0.574837750941515, "epoch": 0.9589185467269036, "grad_norm": 0.23272405564785004, "learning_rate": 0.0002, "loss": 0.6795, "mean_token_accuracy": 0.8141110725700855, "num_tokens": 880237781.0, "step": 12795 }, { "entropy": 0.558625815436244, "epoch": 0.9592932706607554, "grad_norm": 0.20420391857624054, "learning_rate": 0.0002, "loss": 0.6688, "mean_token_accuracy": 0.8122241836041212, "num_tokens": 881693252.0, "step": 12800 }, { "entropy": 0.5634847074747086, "epoch": 0.9596679945946073, "grad_norm": 0.2384355068206787, "learning_rate": 0.0002, "loss": 0.6658, "mean_token_accuracy": 0.8159219294786453, "num_tokens": 883160648.0, "step": 12805 }, { "entropy": 0.5763133378699422, "epoch": 0.9600427185284591, "grad_norm": 0.2189222276210785, "learning_rate": 0.0002, "loss": 0.6684, "mean_token_accuracy": 0.8150244109332562, "num_tokens": 884649340.0, "step": 12810 }, { "entropy": 0.5594903767108917, "epoch": 0.960417442462311, "grad_norm": 0.19910532236099243, "learning_rate": 0.0002, "loss": 0.6578, "mean_token_accuracy": 0.8149992637336254, "num_tokens": 886092710.0, "step": 12815 }, { "entropy": 0.5649473613128067, "epoch": 0.9607921663961628, "grad_norm": 0.20374615490436554, "learning_rate": 0.0002, "loss": 0.6603, "mean_token_accuracy": 0.8167868394404649, "num_tokens": 887620965.0, "step": 12820 }, { "entropy": 0.5646779878996313, "epoch": 0.9611668903300147, "grad_norm": 0.20565621554851532, "learning_rate": 0.0002, "loss": 0.6566, "mean_token_accuracy": 0.8182167284190655, "num_tokens": 889117957.0, "step": 12825 }, { "entropy": 0.5605320233851672, "epoch": 0.9615416142638665, "grad_norm": 0.19491076469421387, "learning_rate": 0.0002, "loss": 0.6621, "mean_token_accuracy": 0.8146092433482408, "num_tokens": 890596733.0, "step": 12830 }, { "entropy": 0.570744504686445, "epoch": 0.9619163381977184, "grad_norm": 0.19864122569561005, "learning_rate": 0.0002, "loss": 0.6765, "mean_token_accuracy": 0.8127065435051918, "num_tokens": 892087683.0, "step": 12835 }, { "entropy": 0.5465060948394239, "epoch": 0.9622910621315702, "grad_norm": 0.21188689768314362, "learning_rate": 0.0002, "loss": 0.6527, "mean_token_accuracy": 0.8202849250286818, "num_tokens": 893536975.0, "step": 12840 }, { "entropy": 0.5586321666836739, "epoch": 0.9626657860654221, "grad_norm": 0.2113392949104309, "learning_rate": 0.0002, "loss": 0.6602, "mean_token_accuracy": 0.8139969538897276, "num_tokens": 894992118.0, "step": 12845 }, { "entropy": 0.5653804456815124, "epoch": 0.9630405099992739, "grad_norm": 0.20759445428848267, "learning_rate": 0.0002, "loss": 0.6748, "mean_token_accuracy": 0.8144591689109802, "num_tokens": 896440994.0, "step": 12850 }, { "entropy": 0.5703974746167659, "epoch": 0.9634152339331258, "grad_norm": 0.21035423874855042, "learning_rate": 0.0002, "loss": 0.6702, "mean_token_accuracy": 0.8138319618999958, "num_tokens": 897886926.0, "step": 12855 }, { "entropy": 0.5747872576117515, "epoch": 0.9637899578669777, "grad_norm": 0.2026612013578415, "learning_rate": 0.0002, "loss": 0.6689, "mean_token_accuracy": 0.8140293344855308, "num_tokens": 899334292.0, "step": 12860 }, { "entropy": 0.5523668190464377, "epoch": 0.9641646818008296, "grad_norm": 0.2025534063577652, "learning_rate": 0.0002, "loss": 0.6522, "mean_token_accuracy": 0.8200998838990927, "num_tokens": 900822505.0, "step": 12865 }, { "entropy": 0.5726272471249103, "epoch": 0.9645394057346814, "grad_norm": 0.22572532296180725, "learning_rate": 0.0002, "loss": 0.6636, "mean_token_accuracy": 0.813432602956891, "num_tokens": 902277717.0, "step": 12870 }, { "entropy": 0.5906345915049315, "epoch": 0.9649141296685333, "grad_norm": 0.2159872204065323, "learning_rate": 0.0002, "loss": 0.6855, "mean_token_accuracy": 0.811395612731576, "num_tokens": 903718154.0, "step": 12875 }, { "entropy": 0.5645722446963191, "epoch": 0.9652888536023851, "grad_norm": 0.21969087421894073, "learning_rate": 0.0002, "loss": 0.6561, "mean_token_accuracy": 0.8138607628643513, "num_tokens": 905160389.0, "step": 12880 }, { "entropy": 0.5837958442047239, "epoch": 0.965663577536237, "grad_norm": 0.20357733964920044, "learning_rate": 0.0002, "loss": 0.6642, "mean_token_accuracy": 0.8141440790146589, "num_tokens": 906651473.0, "step": 12885 }, { "entropy": 0.5783320873975754, "epoch": 0.9660383014700888, "grad_norm": 0.20419342815876007, "learning_rate": 0.0002, "loss": 0.6649, "mean_token_accuracy": 0.8157842416316271, "num_tokens": 908121824.0, "step": 12890 }, { "entropy": 0.593322379514575, "epoch": 0.9664130254039407, "grad_norm": 0.19633270800113678, "learning_rate": 0.0002, "loss": 0.6762, "mean_token_accuracy": 0.8117150742560625, "num_tokens": 909604830.0, "step": 12895 }, { "entropy": 0.5855795068666338, "epoch": 0.9667877493377925, "grad_norm": 0.2301403433084488, "learning_rate": 0.0002, "loss": 0.6492, "mean_token_accuracy": 0.8195457570254803, "num_tokens": 911064255.0, "step": 12900 }, { "entropy": 0.5816622911021113, "epoch": 0.9671624732716444, "grad_norm": 0.20744577050209045, "learning_rate": 0.0002, "loss": 0.6583, "mean_token_accuracy": 0.8145992752164603, "num_tokens": 912485688.0, "step": 12905 }, { "entropy": 0.589862029440701, "epoch": 0.9675371972054962, "grad_norm": 0.2061813324689865, "learning_rate": 0.0002, "loss": 0.6662, "mean_token_accuracy": 0.8132513519376516, "num_tokens": 913977784.0, "step": 12910 }, { "entropy": 0.5859868755564094, "epoch": 0.9679119211393481, "grad_norm": 0.20686021447181702, "learning_rate": 0.0002, "loss": 0.6663, "mean_token_accuracy": 0.8164731822907925, "num_tokens": 915468155.0, "step": 12915 }, { "entropy": 0.5802725333720445, "epoch": 0.9682866450732, "grad_norm": 0.20143793523311615, "learning_rate": 0.0002, "loss": 0.6523, "mean_token_accuracy": 0.8157601963728667, "num_tokens": 916947521.0, "step": 12920 }, { "entropy": 0.5749590986408293, "epoch": 0.9686613690070518, "grad_norm": 0.21950264275074005, "learning_rate": 0.0002, "loss": 0.6625, "mean_token_accuracy": 0.8163549967110157, "num_tokens": 918439317.0, "step": 12925 }, { "entropy": 0.5690308786928654, "epoch": 0.9690360929409036, "grad_norm": 0.22358205914497375, "learning_rate": 0.0002, "loss": 0.6609, "mean_token_accuracy": 0.8154520720243454, "num_tokens": 919891756.0, "step": 12930 }, { "entropy": 0.5838442765176296, "epoch": 0.9694108168747555, "grad_norm": 0.21096336841583252, "learning_rate": 0.0002, "loss": 0.6755, "mean_token_accuracy": 0.8134933583438396, "num_tokens": 921386965.0, "step": 12935 }, { "entropy": 0.5728674484416842, "epoch": 0.9697855408086075, "grad_norm": 0.22067062556743622, "learning_rate": 0.0002, "loss": 0.6697, "mean_token_accuracy": 0.8151553105562925, "num_tokens": 922850147.0, "step": 12940 }, { "entropy": 0.5727013714611531, "epoch": 0.9701602647424593, "grad_norm": 0.3230602741241455, "learning_rate": 0.0002, "loss": 0.6537, "mean_token_accuracy": 0.8155403953045607, "num_tokens": 924297582.0, "step": 12945 }, { "entropy": 0.5803218428045511, "epoch": 0.9705349886763112, "grad_norm": 0.19109274446964264, "learning_rate": 0.0002, "loss": 0.6606, "mean_token_accuracy": 0.8143868692219257, "num_tokens": 925739482.0, "step": 12950 }, { "entropy": 0.5844746053218841, "epoch": 0.970909712610163, "grad_norm": 0.1983853131532669, "learning_rate": 0.0002, "loss": 0.6783, "mean_token_accuracy": 0.8119211226701737, "num_tokens": 927210837.0, "step": 12955 }, { "entropy": 0.5700020954012871, "epoch": 0.9712844365440149, "grad_norm": 0.2030099630355835, "learning_rate": 0.0002, "loss": 0.668, "mean_token_accuracy": 0.8148712415248156, "num_tokens": 928725738.0, "step": 12960 }, { "entropy": 0.5844048479571938, "epoch": 0.9716591604778667, "grad_norm": 0.2006724327802658, "learning_rate": 0.0002, "loss": 0.6854, "mean_token_accuracy": 0.8095999006181955, "num_tokens": 930220194.0, "step": 12965 }, { "entropy": 0.5695305690169334, "epoch": 0.9720338844117186, "grad_norm": 0.2237401008605957, "learning_rate": 0.0002, "loss": 0.6576, "mean_token_accuracy": 0.8175805009901523, "num_tokens": 931682801.0, "step": 12970 }, { "entropy": 0.570660313591361, "epoch": 0.9724086083455704, "grad_norm": 0.21512740850448608, "learning_rate": 0.0002, "loss": 0.6635, "mean_token_accuracy": 0.8155352771282196, "num_tokens": 933191107.0, "step": 12975 }, { "entropy": 0.5812296349555254, "epoch": 0.9727833322794223, "grad_norm": 0.20150317251682281, "learning_rate": 0.0002, "loss": 0.6708, "mean_token_accuracy": 0.8147213775664568, "num_tokens": 934661830.0, "step": 12980 }, { "entropy": 0.5596358938142657, "epoch": 0.9731580562132741, "grad_norm": 0.18615546822547913, "learning_rate": 0.0002, "loss": 0.6396, "mean_token_accuracy": 0.8178843222558498, "num_tokens": 936109122.0, "step": 12985 }, { "entropy": 0.5748804191127419, "epoch": 0.973532780147126, "grad_norm": 0.21624380350112915, "learning_rate": 0.0002, "loss": 0.668, "mean_token_accuracy": 0.8152858026325702, "num_tokens": 937614002.0, "step": 12990 }, { "entropy": 0.5659370372071862, "epoch": 0.9739075040809778, "grad_norm": 0.2741355299949646, "learning_rate": 0.0002, "loss": 0.658, "mean_token_accuracy": 0.8150327015668154, "num_tokens": 939090114.0, "step": 12995 }, { "entropy": 0.5669656068086624, "epoch": 0.9742822280148297, "grad_norm": 0.19624865055084229, "learning_rate": 0.0002, "loss": 0.6578, "mean_token_accuracy": 0.8162862177938223, "num_tokens": 940618203.0, "step": 13000 }, { "entropy": 0.5698457499966025, "epoch": 0.9746569519486815, "grad_norm": 0.17721126973628998, "learning_rate": 0.0002, "loss": 0.6487, "mean_token_accuracy": 0.8166402459144593, "num_tokens": 942077457.0, "step": 13005 }, { "entropy": 0.5800151145085692, "epoch": 0.9750316758825334, "grad_norm": 0.20702897012233734, "learning_rate": 0.0002, "loss": 0.6635, "mean_token_accuracy": 0.8135156478732825, "num_tokens": 943573628.0, "step": 13010 }, { "entropy": 0.5678585244342684, "epoch": 0.9754063998163853, "grad_norm": 0.2033298760652542, "learning_rate": 0.0002, "loss": 0.6437, "mean_token_accuracy": 0.8209440983831883, "num_tokens": 945057261.0, "step": 13015 }, { "entropy": 0.5780060125514865, "epoch": 0.9757811237502372, "grad_norm": 0.2260199934244156, "learning_rate": 0.0002, "loss": 0.6571, "mean_token_accuracy": 0.8187740840017795, "num_tokens": 946536224.0, "step": 13020 }, { "entropy": 0.5669079508632422, "epoch": 0.976155847684089, "grad_norm": 0.2121814787387848, "learning_rate": 0.0002, "loss": 0.6491, "mean_token_accuracy": 0.8188730258494615, "num_tokens": 947971699.0, "step": 13025 }, { "entropy": 0.5833846798166633, "epoch": 0.9765305716179409, "grad_norm": 0.1986205279827118, "learning_rate": 0.0002, "loss": 0.681, "mean_token_accuracy": 0.8135996893048286, "num_tokens": 949467810.0, "step": 13030 }, { "entropy": 0.5671905959025025, "epoch": 0.9769052955517927, "grad_norm": 0.2152293622493744, "learning_rate": 0.0002, "loss": 0.6527, "mean_token_accuracy": 0.8162636697292328, "num_tokens": 950934956.0, "step": 13035 }, { "entropy": 0.5689193030819297, "epoch": 0.9772800194856446, "grad_norm": 0.23653985559940338, "learning_rate": 0.0002, "loss": 0.6685, "mean_token_accuracy": 0.8150889247655868, "num_tokens": 952416055.0, "step": 13040 }, { "entropy": 0.580013482645154, "epoch": 0.9776547434194964, "grad_norm": 0.19600163400173187, "learning_rate": 0.0002, "loss": 0.6807, "mean_token_accuracy": 0.8127873972058296, "num_tokens": 953931743.0, "step": 13045 }, { "entropy": 0.5630410097539424, "epoch": 0.9780294673533483, "grad_norm": 0.22857338190078735, "learning_rate": 0.0002, "loss": 0.6557, "mean_token_accuracy": 0.8153727058321237, "num_tokens": 955409307.0, "step": 13050 }, { "entropy": 0.5726382462307811, "epoch": 0.9784041912872001, "grad_norm": 0.21807147562503815, "learning_rate": 0.0002, "loss": 0.6667, "mean_token_accuracy": 0.8136952426284552, "num_tokens": 956911960.0, "step": 13055 }, { "entropy": 0.5653065349906683, "epoch": 0.978778915221052, "grad_norm": 0.2242017686367035, "learning_rate": 0.0002, "loss": 0.6576, "mean_token_accuracy": 0.8183182816952467, "num_tokens": 958381867.0, "step": 13060 }, { "entropy": 0.579177295602858, "epoch": 0.9791536391549038, "grad_norm": 0.20291084051132202, "learning_rate": 0.0002, "loss": 0.6751, "mean_token_accuracy": 0.8133789986371994, "num_tokens": 959849171.0, "step": 13065 }, { "entropy": 0.5765824070200324, "epoch": 0.9795283630887557, "grad_norm": 0.2015385925769806, "learning_rate": 0.0002, "loss": 0.6555, "mean_token_accuracy": 0.8170374095439911, "num_tokens": 961330798.0, "step": 13070 }, { "entropy": 0.5790429763495922, "epoch": 0.9799030870226075, "grad_norm": 0.20047686994075775, "learning_rate": 0.0002, "loss": 0.6715, "mean_token_accuracy": 0.811693924292922, "num_tokens": 962816532.0, "step": 13075 }, { "entropy": 0.5773708786815405, "epoch": 0.9802778109564594, "grad_norm": 0.22117409110069275, "learning_rate": 0.0002, "loss": 0.6569, "mean_token_accuracy": 0.8156194299459457, "num_tokens": 964303563.0, "step": 13080 }, { "entropy": 0.5839019972831011, "epoch": 0.9806525348903112, "grad_norm": 0.2111828625202179, "learning_rate": 0.0002, "loss": 0.6638, "mean_token_accuracy": 0.8128108404576778, "num_tokens": 965788615.0, "step": 13085 }, { "entropy": 0.5815574979409576, "epoch": 0.9810272588241631, "grad_norm": 0.2005719691514969, "learning_rate": 0.0002, "loss": 0.6675, "mean_token_accuracy": 0.8142055686563253, "num_tokens": 967297136.0, "step": 13090 }, { "entropy": 0.5694840140640736, "epoch": 0.981401982758015, "grad_norm": 0.2249336689710617, "learning_rate": 0.0002, "loss": 0.6519, "mean_token_accuracy": 0.8163001745939255, "num_tokens": 968780909.0, "step": 13095 }, { "entropy": 0.563676985539496, "epoch": 0.9817767066918669, "grad_norm": 0.19320835173130035, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.8150955334305763, "num_tokens": 970231481.0, "step": 13100 }, { "entropy": 0.5608404707163572, "epoch": 0.9821514306257187, "grad_norm": 0.22896131873130798, "learning_rate": 0.0002, "loss": 0.6483, "mean_token_accuracy": 0.8167359355837107, "num_tokens": 971710894.0, "step": 13105 }, { "entropy": 0.5767851466313004, "epoch": 0.9825261545595706, "grad_norm": 0.19943970441818237, "learning_rate": 0.0002, "loss": 0.6749, "mean_token_accuracy": 0.81459686383605, "num_tokens": 973207470.0, "step": 13110 }, { "entropy": 0.5656792829744518, "epoch": 0.9829008784934224, "grad_norm": 0.1979246884584427, "learning_rate": 0.0002, "loss": 0.6623, "mean_token_accuracy": 0.8161165460944175, "num_tokens": 974701451.0, "step": 13115 }, { "entropy": 0.5655198512598872, "epoch": 0.9832756024272743, "grad_norm": 0.23041890561580658, "learning_rate": 0.0002, "loss": 0.6626, "mean_token_accuracy": 0.8142582293599844, "num_tokens": 976242183.0, "step": 13120 }, { "entropy": 0.5569721277803182, "epoch": 0.9836503263611261, "grad_norm": 0.2241164892911911, "learning_rate": 0.0002, "loss": 0.6603, "mean_token_accuracy": 0.8119801744818688, "num_tokens": 977728687.0, "step": 13125 }, { "entropy": 0.5526668893173337, "epoch": 0.984025050294978, "grad_norm": 0.21148476004600525, "learning_rate": 0.0002, "loss": 0.6591, "mean_token_accuracy": 0.8146421402692795, "num_tokens": 979189366.0, "step": 13130 }, { "entropy": 0.5714912870898843, "epoch": 0.9843997742288298, "grad_norm": 0.20355193316936493, "learning_rate": 0.0002, "loss": 0.6708, "mean_token_accuracy": 0.8137362092733383, "num_tokens": 980640520.0, "step": 13135 }, { "entropy": 0.5545145621523261, "epoch": 0.9847744981626817, "grad_norm": 0.2065795361995697, "learning_rate": 0.0002, "loss": 0.668, "mean_token_accuracy": 0.8151166249066591, "num_tokens": 982120913.0, "step": 13140 }, { "entropy": 0.5625961693935096, "epoch": 0.9851492220965336, "grad_norm": 0.267681360244751, "learning_rate": 0.0002, "loss": 0.6652, "mean_token_accuracy": 0.8136616218835115, "num_tokens": 983556126.0, "step": 13145 }, { "entropy": 0.5703600771725178, "epoch": 0.9855239460303854, "grad_norm": 0.23342107236385345, "learning_rate": 0.0002, "loss": 0.6734, "mean_token_accuracy": 0.8137156341224909, "num_tokens": 985058201.0, "step": 13150 }, { "entropy": 0.5586272010579705, "epoch": 0.9858986699642373, "grad_norm": 0.21615466475486755, "learning_rate": 0.0002, "loss": 0.6628, "mean_token_accuracy": 0.8163065824657678, "num_tokens": 986475080.0, "step": 13155 }, { "entropy": 0.5671999013051391, "epoch": 0.9862733938980891, "grad_norm": 0.26553958654403687, "learning_rate": 0.0002, "loss": 0.6706, "mean_token_accuracy": 0.8153316579759121, "num_tokens": 987940526.0, "step": 13160 }, { "entropy": 0.5385115783661604, "epoch": 0.986648117831941, "grad_norm": 0.22182565927505493, "learning_rate": 0.0002, "loss": 0.6451, "mean_token_accuracy": 0.8194454420357943, "num_tokens": 989376205.0, "step": 13165 }, { "entropy": 0.5569973514415324, "epoch": 0.9870228417657928, "grad_norm": 0.19352135062217712, "learning_rate": 0.0002, "loss": 0.6649, "mean_token_accuracy": 0.8165531657636166, "num_tokens": 990855619.0, "step": 13170 }, { "entropy": 0.5395429201424122, "epoch": 0.9873975656996448, "grad_norm": 0.19390468299388885, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8187099892646075, "num_tokens": 992323084.0, "step": 13175 }, { "entropy": 0.57224848177284, "epoch": 0.9877722896334966, "grad_norm": 0.20994660258293152, "learning_rate": 0.0002, "loss": 0.6725, "mean_token_accuracy": 0.8092429738491773, "num_tokens": 993848973.0, "step": 13180 }, { "entropy": 0.5817715203389525, "epoch": 0.9881470135673485, "grad_norm": 0.22380901873111725, "learning_rate": 0.0002, "loss": 0.6813, "mean_token_accuracy": 0.8117261193692684, "num_tokens": 995347401.0, "step": 13185 }, { "entropy": 0.5528323138132691, "epoch": 0.9885217375012003, "grad_norm": 0.20016345381736755, "learning_rate": 0.0002, "loss": 0.6507, "mean_token_accuracy": 0.8182956714183092, "num_tokens": 996807717.0, "step": 13190 }, { "entropy": 0.5717337794601918, "epoch": 0.9888964614350522, "grad_norm": 0.19618608057498932, "learning_rate": 0.0002, "loss": 0.6731, "mean_token_accuracy": 0.8133057009428739, "num_tokens": 998235958.0, "step": 13195 }, { "entropy": 0.5663468629121781, "epoch": 0.989271185368904, "grad_norm": 0.2025354504585266, "learning_rate": 0.0002, "loss": 0.6559, "mean_token_accuracy": 0.8159435164183378, "num_tokens": 999670170.0, "step": 13200 }, { "entropy": 0.5622261522337795, "epoch": 0.9896459093027559, "grad_norm": 0.20090973377227783, "learning_rate": 0.0002, "loss": 0.6545, "mean_token_accuracy": 0.8163201984018087, "num_tokens": 1001089504.0, "step": 13205 }, { "entropy": 0.5609770493581892, "epoch": 0.9900206332366077, "grad_norm": 0.21360591053962708, "learning_rate": 0.0002, "loss": 0.6615, "mean_token_accuracy": 0.8185532353818417, "num_tokens": 1002582839.0, "step": 13210 }, { "entropy": 0.5749189592897892, "epoch": 0.9903953571704596, "grad_norm": 0.2045973837375641, "learning_rate": 0.0002, "loss": 0.6645, "mean_token_accuracy": 0.813238549232483, "num_tokens": 1004064933.0, "step": 13215 }, { "entropy": 0.5593414356932044, "epoch": 0.9907700811043114, "grad_norm": 0.1969960629940033, "learning_rate": 0.0002, "loss": 0.6433, "mean_token_accuracy": 0.8182937275618315, "num_tokens": 1005572798.0, "step": 13220 }, { "entropy": 0.575691481307149, "epoch": 0.9911448050381633, "grad_norm": 0.21554593741893768, "learning_rate": 0.0002, "loss": 0.669, "mean_token_accuracy": 0.8176323339343071, "num_tokens": 1007059966.0, "step": 13225 }, { "entropy": 0.5681387577205896, "epoch": 0.9915195289720151, "grad_norm": 0.19393298029899597, "learning_rate": 0.0002, "loss": 0.6597, "mean_token_accuracy": 0.815880537033081, "num_tokens": 1008537477.0, "step": 13230 }, { "entropy": 0.5774783544242382, "epoch": 0.991894252905867, "grad_norm": 0.21246804296970367, "learning_rate": 0.0002, "loss": 0.6731, "mean_token_accuracy": 0.8145314510911703, "num_tokens": 1009999545.0, "step": 13235 }, { "entropy": 0.5769321337342262, "epoch": 0.9922689768397188, "grad_norm": 0.21293999254703522, "learning_rate": 0.0002, "loss": 0.672, "mean_token_accuracy": 0.8139773298054933, "num_tokens": 1011450045.0, "step": 13240 }, { "entropy": 0.5793784961104393, "epoch": 0.9926437007735707, "grad_norm": 0.22281725704669952, "learning_rate": 0.0002, "loss": 0.666, "mean_token_accuracy": 0.8142654843628406, "num_tokens": 1012908705.0, "step": 13245 }, { "entropy": 0.5725551825016737, "epoch": 0.9930184247074226, "grad_norm": 0.20366115868091583, "learning_rate": 0.0002, "loss": 0.651, "mean_token_accuracy": 0.8193461898714304, "num_tokens": 1014332477.0, "step": 13250 }, { "entropy": 0.5822045454755426, "epoch": 0.9933931486412745, "grad_norm": 0.2220202386379242, "learning_rate": 0.0002, "loss": 0.6584, "mean_token_accuracy": 0.8154862731695175, "num_tokens": 1015802315.0, "step": 13255 }, { "entropy": 0.5682889658957719, "epoch": 0.9937678725751263, "grad_norm": 0.2263236939907074, "learning_rate": 0.0002, "loss": 0.651, "mean_token_accuracy": 0.8181657381355762, "num_tokens": 1017229671.0, "step": 13260 }, { "entropy": 0.568637091293931, "epoch": 0.9941425965089782, "grad_norm": 0.19027848541736603, "learning_rate": 0.0002, "loss": 0.6493, "mean_token_accuracy": 0.8142800901085139, "num_tokens": 1018729273.0, "step": 13265 }, { "entropy": 0.5651841720566153, "epoch": 0.99451732044283, "grad_norm": 0.19631797075271606, "learning_rate": 0.0002, "loss": 0.6568, "mean_token_accuracy": 0.8168729145079852, "num_tokens": 1020225073.0, "step": 13270 }, { "entropy": 0.5630476774647832, "epoch": 0.9948920443766819, "grad_norm": 0.20887739956378937, "learning_rate": 0.0002, "loss": 0.6592, "mean_token_accuracy": 0.8149261984974145, "num_tokens": 1021663770.0, "step": 13275 }, { "entropy": 0.5595613026991486, "epoch": 0.9952667683105337, "grad_norm": 0.21156615018844604, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8199994459748268, "num_tokens": 1023135318.0, "step": 13280 }, { "entropy": 0.5784286187961698, "epoch": 0.9956414922443856, "grad_norm": 0.20079748332500458, "learning_rate": 0.0002, "loss": 0.6616, "mean_token_accuracy": 0.813157568871975, "num_tokens": 1024626454.0, "step": 13285 }, { "entropy": 0.5822808327153325, "epoch": 0.9960162161782374, "grad_norm": 0.21406885981559753, "learning_rate": 0.0002, "loss": 0.6733, "mean_token_accuracy": 0.8142529666423798, "num_tokens": 1026098499.0, "step": 13290 }, { "entropy": 0.590917869284749, "epoch": 0.9963909401120893, "grad_norm": 0.22326456010341644, "learning_rate": 0.0002, "loss": 0.6788, "mean_token_accuracy": 0.8154041115194559, "num_tokens": 1027576945.0, "step": 13295 }, { "entropy": 0.5755999501794576, "epoch": 0.9967656640459411, "grad_norm": 0.21767482161521912, "learning_rate": 0.0002, "loss": 0.6658, "mean_token_accuracy": 0.8168368000537157, "num_tokens": 1029072210.0, "step": 13300 }, { "entropy": 0.5632402503862977, "epoch": 0.997140387979793, "grad_norm": 0.21650893986225128, "learning_rate": 0.0002, "loss": 0.6725, "mean_token_accuracy": 0.8162445668131113, "num_tokens": 1030502402.0, "step": 13305 }, { "entropy": 0.5351851515471935, "epoch": 0.9975151119136448, "grad_norm": 0.20206283032894135, "learning_rate": 0.0002, "loss": 0.6361, "mean_token_accuracy": 0.8175457578152419, "num_tokens": 1031908274.0, "step": 13310 }, { "entropy": 0.5375327175483108, "epoch": 0.9978898358474967, "grad_norm": 0.20425638556480408, "learning_rate": 0.0002, "loss": 0.6479, "mean_token_accuracy": 0.8195033606141806, "num_tokens": 1033316924.0, "step": 13315 }, { "entropy": 0.5722145035862922, "epoch": 0.9982645597813485, "grad_norm": 0.20198458433151245, "learning_rate": 0.0002, "loss": 0.6764, "mean_token_accuracy": 0.814577754586935, "num_tokens": 1034803615.0, "step": 13320 }, { "entropy": 0.5635342447087168, "epoch": 0.9986392837152004, "grad_norm": 0.19901719689369202, "learning_rate": 0.0002, "loss": 0.6654, "mean_token_accuracy": 0.8147738471627235, "num_tokens": 1036274789.0, "step": 13325 }, { "entropy": 0.5634700935333967, "epoch": 0.9990140076490523, "grad_norm": 0.21045193076133728, "learning_rate": 0.0002, "loss": 0.6706, "mean_token_accuracy": 0.8139341175556183, "num_tokens": 1037733695.0, "step": 13330 }, { "entropy": 0.5561646612361073, "epoch": 0.9993887315829042, "grad_norm": 0.2025044560432434, "learning_rate": 0.0002, "loss": 0.6531, "mean_token_accuracy": 0.8180426202714444, "num_tokens": 1039204645.0, "step": 13335 }, { "entropy": 0.5680373730137944, "epoch": 0.999763455516756, "grad_norm": 0.20707763731479645, "learning_rate": 0.0002, "loss": 0.6739, "mean_token_accuracy": 0.8136212192475796, "num_tokens": 1040741329.0, "step": 13340 }, { "entropy": 0.5704774858808159, "epoch": 1.0000749447867703, "grad_norm": 0.23888227343559265, "learning_rate": 0.0002, "loss": 0.6756, "mean_token_accuracy": 0.8116118782445004, "num_tokens": 1041997407.0, "step": 13345 }, { "entropy": 0.5544480582699179, "epoch": 1.0004496687206221, "grad_norm": 0.22763951122760773, "learning_rate": 0.0002, "loss": 0.6586, "mean_token_accuracy": 0.8145304068922996, "num_tokens": 1043498261.0, "step": 13350 }, { "entropy": 0.5372144363820552, "epoch": 1.000824392654474, "grad_norm": 0.2265525758266449, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.8194984208792448, "num_tokens": 1044983262.0, "step": 13355 }, { "entropy": 0.5389224618673325, "epoch": 1.001199116588326, "grad_norm": 0.19962912797927856, "learning_rate": 0.0002, "loss": 0.6346, "mean_token_accuracy": 0.818070724979043, "num_tokens": 1046488361.0, "step": 13360 }, { "entropy": 0.5652135690674186, "epoch": 1.001573840522178, "grad_norm": 0.1960865557193756, "learning_rate": 0.0002, "loss": 0.6757, "mean_token_accuracy": 0.8133301183581352, "num_tokens": 1048003238.0, "step": 13365 }, { "entropy": 0.5503858853131532, "epoch": 1.0019485644560298, "grad_norm": 0.2276962846517563, "learning_rate": 0.0002, "loss": 0.6396, "mean_token_accuracy": 0.8197258736938238, "num_tokens": 1049521739.0, "step": 13370 }, { "entropy": 0.568610892444849, "epoch": 1.0023232883898816, "grad_norm": 0.2069069892168045, "learning_rate": 0.0002, "loss": 0.6558, "mean_token_accuracy": 0.8126921683549881, "num_tokens": 1050997116.0, "step": 13375 }, { "entropy": 0.5593573981896043, "epoch": 1.0026980123237335, "grad_norm": 0.20987297594547272, "learning_rate": 0.0002, "loss": 0.6536, "mean_token_accuracy": 0.8148490987718106, "num_tokens": 1052458972.0, "step": 13380 }, { "entropy": 0.5530819501727819, "epoch": 1.0030727362575853, "grad_norm": 0.22456198930740356, "learning_rate": 0.0002, "loss": 0.6441, "mean_token_accuracy": 0.8206599984318018, "num_tokens": 1053952964.0, "step": 13385 }, { "entropy": 0.5425474854186177, "epoch": 1.0034474601914372, "grad_norm": 0.27258366346359253, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.8205607268959284, "num_tokens": 1055410504.0, "step": 13390 }, { "entropy": 0.5575100403279066, "epoch": 1.003822184125289, "grad_norm": 0.21709521114826202, "learning_rate": 0.0002, "loss": 0.6505, "mean_token_accuracy": 0.8177127711474895, "num_tokens": 1056877499.0, "step": 13395 }, { "entropy": 0.5613157331943512, "epoch": 1.0041969080591409, "grad_norm": 0.21034689247608185, "learning_rate": 0.0002, "loss": 0.6475, "mean_token_accuracy": 0.8202341321855784, "num_tokens": 1058345214.0, "step": 13400 }, { "entropy": 0.5387067030183971, "epoch": 1.0045716319929927, "grad_norm": 0.20405533909797668, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.8195809874683618, "num_tokens": 1059812215.0, "step": 13405 }, { "entropy": 0.5508530275896192, "epoch": 1.0049463559268446, "grad_norm": 0.22116421163082123, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.820882324129343, "num_tokens": 1061259061.0, "step": 13410 }, { "entropy": 0.5468695843592286, "epoch": 1.0053210798606964, "grad_norm": 0.2249743938446045, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8208653554320335, "num_tokens": 1062686058.0, "step": 13415 }, { "entropy": 0.5581013986840844, "epoch": 1.0056958037945483, "grad_norm": 0.19291743636131287, "learning_rate": 0.0002, "loss": 0.6502, "mean_token_accuracy": 0.8172530326992273, "num_tokens": 1064183822.0, "step": 13420 }, { "entropy": 0.5365384167991578, "epoch": 1.0060705277284, "grad_norm": 0.2164762169122696, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.820368318259716, "num_tokens": 1065637183.0, "step": 13425 }, { "entropy": 0.5359046983532607, "epoch": 1.006445251662252, "grad_norm": 0.23381651937961578, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.8203916072845459, "num_tokens": 1067101113.0, "step": 13430 }, { "entropy": 0.5479911878705025, "epoch": 1.0068199755961038, "grad_norm": 0.20853877067565918, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.8178746398538351, "num_tokens": 1068562754.0, "step": 13435 }, { "entropy": 0.5295617327094078, "epoch": 1.0071946995299557, "grad_norm": 0.20131933689117432, "learning_rate": 0.0002, "loss": 0.6371, "mean_token_accuracy": 0.81977069824934, "num_tokens": 1070012240.0, "step": 13440 }, { "entropy": 0.5448067680001258, "epoch": 1.0075694234638075, "grad_norm": 0.21150639653205872, "learning_rate": 0.0002, "loss": 0.6586, "mean_token_accuracy": 0.8181868594139814, "num_tokens": 1071442121.0, "step": 13445 }, { "entropy": 0.552656932733953, "epoch": 1.0079441473976594, "grad_norm": 0.2586424648761749, "learning_rate": 0.0002, "loss": 0.6638, "mean_token_accuracy": 0.813650781288743, "num_tokens": 1072935156.0, "step": 13450 }, { "entropy": 0.5501045618206263, "epoch": 1.0083188713315112, "grad_norm": 0.21051499247550964, "learning_rate": 0.0002, "loss": 0.6649, "mean_token_accuracy": 0.8137607529759407, "num_tokens": 1074419540.0, "step": 13455 }, { "entropy": 0.5226931858807802, "epoch": 1.008693595265363, "grad_norm": 0.21687842905521393, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8232077028602361, "num_tokens": 1075880186.0, "step": 13460 }, { "entropy": 0.5429589124396443, "epoch": 1.009068319199215, "grad_norm": 0.2100352793931961, "learning_rate": 0.0002, "loss": 0.6457, "mean_token_accuracy": 0.816456712037325, "num_tokens": 1077364700.0, "step": 13465 }, { "entropy": 0.5305429249070585, "epoch": 1.0094430431330668, "grad_norm": 0.2244836986064911, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8179162722080946, "num_tokens": 1078853042.0, "step": 13470 }, { "entropy": 0.5270028730854392, "epoch": 1.0098177670669186, "grad_norm": 0.21047739684581757, "learning_rate": 0.0002, "loss": 0.6407, "mean_token_accuracy": 0.8164708465337753, "num_tokens": 1080243099.0, "step": 13475 }, { "entropy": 0.5367447946220636, "epoch": 1.0101924910007705, "grad_norm": 0.20641915500164032, "learning_rate": 0.0002, "loss": 0.6493, "mean_token_accuracy": 0.81763203330338, "num_tokens": 1081742834.0, "step": 13480 }, { "entropy": 0.5289525466971099, "epoch": 1.0105672149346223, "grad_norm": 0.22761289775371552, "learning_rate": 0.0002, "loss": 0.6463, "mean_token_accuracy": 0.8186418693512678, "num_tokens": 1083206931.0, "step": 13485 }, { "entropy": 0.5387507075443864, "epoch": 1.0109419388684742, "grad_norm": 0.22678446769714355, "learning_rate": 0.0002, "loss": 0.6493, "mean_token_accuracy": 0.815402440726757, "num_tokens": 1084699457.0, "step": 13490 }, { "entropy": 0.550604453869164, "epoch": 1.011316662802326, "grad_norm": 0.21238316595554352, "learning_rate": 0.0002, "loss": 0.6626, "mean_token_accuracy": 0.8136143799871206, "num_tokens": 1086154735.0, "step": 13495 }, { "entropy": 0.5440594777464867, "epoch": 1.0116913867361779, "grad_norm": 0.26308631896972656, "learning_rate": 0.0002, "loss": 0.6654, "mean_token_accuracy": 0.8158131998032332, "num_tokens": 1087627959.0, "step": 13500 }, { "entropy": 0.5255320688709617, "epoch": 1.0120661106700297, "grad_norm": 0.21043869853019714, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.8180657085031271, "num_tokens": 1089089981.0, "step": 13505 }, { "entropy": 0.5357323385775089, "epoch": 1.0124408346038816, "grad_norm": 0.2127910852432251, "learning_rate": 0.0002, "loss": 0.6407, "mean_token_accuracy": 0.8195008158683776, "num_tokens": 1090526677.0, "step": 13510 }, { "entropy": 0.5312947027385235, "epoch": 1.0128155585377336, "grad_norm": 0.2129354178905487, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.8208299368619919, "num_tokens": 1091959979.0, "step": 13515 }, { "entropy": 0.5544145632535219, "epoch": 1.0131902824715855, "grad_norm": 0.2048136293888092, "learning_rate": 0.0002, "loss": 0.6465, "mean_token_accuracy": 0.8174325630068779, "num_tokens": 1093486988.0, "step": 13520 }, { "entropy": 0.5442559283226729, "epoch": 1.0135650064054373, "grad_norm": 0.20069143176078796, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8200355343520641, "num_tokens": 1094956446.0, "step": 13525 }, { "entropy": 0.5465480878949165, "epoch": 1.0139397303392892, "grad_norm": 0.21077750623226166, "learning_rate": 0.0002, "loss": 0.6454, "mean_token_accuracy": 0.819757928699255, "num_tokens": 1096379177.0, "step": 13530 }, { "entropy": 0.5375741792842745, "epoch": 1.014314454273141, "grad_norm": 0.20960313081741333, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.8202731005847455, "num_tokens": 1097855409.0, "step": 13535 }, { "entropy": 0.5718679727986455, "epoch": 1.014689178206993, "grad_norm": 0.22005704045295715, "learning_rate": 0.0002, "loss": 0.6677, "mean_token_accuracy": 0.8130039222538471, "num_tokens": 1099297541.0, "step": 13540 }, { "entropy": 0.5540409391745925, "epoch": 1.0150639021408447, "grad_norm": 0.21335415542125702, "learning_rate": 0.0002, "loss": 0.644, "mean_token_accuracy": 0.8195872116833925, "num_tokens": 1100761505.0, "step": 13545 }, { "entropy": 0.555188874155283, "epoch": 1.0154386260746966, "grad_norm": 0.2089000642299652, "learning_rate": 0.0002, "loss": 0.6411, "mean_token_accuracy": 0.8184108957648277, "num_tokens": 1102245123.0, "step": 13550 }, { "entropy": 0.5646426325663925, "epoch": 1.0158133500085484, "grad_norm": 0.22337964177131653, "learning_rate": 0.0002, "loss": 0.6527, "mean_token_accuracy": 0.8153816226869821, "num_tokens": 1103708602.0, "step": 13555 }, { "entropy": 0.5657408850267529, "epoch": 1.0161880739424003, "grad_norm": 0.20785339176654816, "learning_rate": 0.0002, "loss": 0.6609, "mean_token_accuracy": 0.8166480753570795, "num_tokens": 1105199264.0, "step": 13560 }, { "entropy": 0.5507932208478451, "epoch": 1.0165627978762521, "grad_norm": 0.20917333662509918, "learning_rate": 0.0002, "loss": 0.6421, "mean_token_accuracy": 0.8196190588176251, "num_tokens": 1106657504.0, "step": 13565 }, { "entropy": 0.5604894163087011, "epoch": 1.016937521810104, "grad_norm": 0.207265704870224, "learning_rate": 0.0002, "loss": 0.6488, "mean_token_accuracy": 0.8176998764276504, "num_tokens": 1108132509.0, "step": 13570 }, { "entropy": 0.5509708171710372, "epoch": 1.0173122457439558, "grad_norm": 0.20224548876285553, "learning_rate": 0.0002, "loss": 0.6394, "mean_token_accuracy": 0.8202374599874019, "num_tokens": 1109540794.0, "step": 13575 }, { "entropy": 0.5540878664702177, "epoch": 1.0176869696778077, "grad_norm": 0.2203056812286377, "learning_rate": 0.0002, "loss": 0.6372, "mean_token_accuracy": 0.8188724484294653, "num_tokens": 1111036350.0, "step": 13580 }, { "entropy": 0.5664244530722499, "epoch": 1.0180616936116595, "grad_norm": 0.20701731741428375, "learning_rate": 0.0002, "loss": 0.6566, "mean_token_accuracy": 0.8106844272464514, "num_tokens": 1112502821.0, "step": 13585 }, { "entropy": 0.5536012457683682, "epoch": 1.0184364175455114, "grad_norm": 0.1930079609155655, "learning_rate": 0.0002, "loss": 0.6424, "mean_token_accuracy": 0.816598292067647, "num_tokens": 1113991339.0, "step": 13590 }, { "entropy": 0.5631848525255918, "epoch": 1.0188111414793632, "grad_norm": 0.2058492749929428, "learning_rate": 0.0002, "loss": 0.6502, "mean_token_accuracy": 0.8158687185496092, "num_tokens": 1115487593.0, "step": 13595 }, { "entropy": 0.5500465082004666, "epoch": 1.019185865413215, "grad_norm": 0.33993053436279297, "learning_rate": 0.0002, "loss": 0.6451, "mean_token_accuracy": 0.8191346928477288, "num_tokens": 1116923813.0, "step": 13600 }, { "entropy": 0.55192916020751, "epoch": 1.019560589347067, "grad_norm": 0.21494804322719574, "learning_rate": 0.0002, "loss": 0.6472, "mean_token_accuracy": 0.816361527144909, "num_tokens": 1118385795.0, "step": 13605 }, { "entropy": 0.5608910420909524, "epoch": 1.0199353132809188, "grad_norm": 0.20857791602611542, "learning_rate": 0.0002, "loss": 0.6572, "mean_token_accuracy": 0.8140493728220463, "num_tokens": 1119853058.0, "step": 13610 }, { "entropy": 0.5326402531936765, "epoch": 1.0203100372147706, "grad_norm": 0.24870117008686066, "learning_rate": 0.0002, "loss": 0.6387, "mean_token_accuracy": 0.8199037168174982, "num_tokens": 1121321655.0, "step": 13615 }, { "entropy": 0.5387020887807011, "epoch": 1.0206847611486225, "grad_norm": 0.21357281506061554, "learning_rate": 0.0002, "loss": 0.6325, "mean_token_accuracy": 0.8180718187242746, "num_tokens": 1122857307.0, "step": 13620 }, { "entropy": 0.557958241738379, "epoch": 1.0210594850824743, "grad_norm": 0.20926907658576965, "learning_rate": 0.0002, "loss": 0.6548, "mean_token_accuracy": 0.8132552798837424, "num_tokens": 1124364272.0, "step": 13625 }, { "entropy": 0.5559332819655538, "epoch": 1.0214342090163262, "grad_norm": 0.20632781088352203, "learning_rate": 0.0002, "loss": 0.6598, "mean_token_accuracy": 0.8136483181267977, "num_tokens": 1125857557.0, "step": 13630 }, { "entropy": 0.5419613784179091, "epoch": 1.021808932950178, "grad_norm": 0.2059336155653, "learning_rate": 0.0002, "loss": 0.6374, "mean_token_accuracy": 0.8207571640610695, "num_tokens": 1127310312.0, "step": 13635 }, { "entropy": 0.5336447928100825, "epoch": 1.02218365688403, "grad_norm": 0.21281923353672028, "learning_rate": 0.0002, "loss": 0.63, "mean_token_accuracy": 0.8211156204342842, "num_tokens": 1128757883.0, "step": 13640 }, { "entropy": 0.547108486481011, "epoch": 1.0225583808178818, "grad_norm": 0.2164224237203598, "learning_rate": 0.0002, "loss": 0.6455, "mean_token_accuracy": 0.8219338107854128, "num_tokens": 1130248458.0, "step": 13645 }, { "entropy": 0.5485926058143378, "epoch": 1.0229331047517336, "grad_norm": 0.2187340408563614, "learning_rate": 0.0002, "loss": 0.6482, "mean_token_accuracy": 0.817500127106905, "num_tokens": 1131677042.0, "step": 13650 }, { "entropy": 0.5600590197369456, "epoch": 1.0233078286855855, "grad_norm": 0.21414542198181152, "learning_rate": 0.0002, "loss": 0.6589, "mean_token_accuracy": 0.8164660926908255, "num_tokens": 1133148346.0, "step": 13655 }, { "entropy": 0.5579775681719183, "epoch": 1.0236825526194373, "grad_norm": 0.22363044321537018, "learning_rate": 0.0002, "loss": 0.6632, "mean_token_accuracy": 0.815651997923851, "num_tokens": 1134617738.0, "step": 13660 }, { "entropy": 0.5490597145631909, "epoch": 1.0240572765532892, "grad_norm": 0.21783940494060516, "learning_rate": 0.0002, "loss": 0.6418, "mean_token_accuracy": 0.8162145778536797, "num_tokens": 1136098124.0, "step": 13665 }, { "entropy": 0.5625691983848811, "epoch": 1.024432000487141, "grad_norm": 0.21045398712158203, "learning_rate": 0.0002, "loss": 0.655, "mean_token_accuracy": 0.8149533897638321, "num_tokens": 1137597462.0, "step": 13670 }, { "entropy": 0.548613048531115, "epoch": 1.024806724420993, "grad_norm": 0.2103690803050995, "learning_rate": 0.0002, "loss": 0.6426, "mean_token_accuracy": 0.8182279288768768, "num_tokens": 1139074439.0, "step": 13675 }, { "entropy": 0.5559936903417111, "epoch": 1.025181448354845, "grad_norm": 0.22345782816410065, "learning_rate": 0.0002, "loss": 0.6378, "mean_token_accuracy": 0.8194551575928927, "num_tokens": 1140540919.0, "step": 13680 }, { "entropy": 0.5586344020441174, "epoch": 1.0255561722886968, "grad_norm": 0.2046053260564804, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8194023180752993, "num_tokens": 1142048557.0, "step": 13685 }, { "entropy": 0.5644229838624597, "epoch": 1.0259308962225486, "grad_norm": 0.219741553068161, "learning_rate": 0.0002, "loss": 0.6546, "mean_token_accuracy": 0.817897629365325, "num_tokens": 1143486795.0, "step": 13690 }, { "entropy": 0.5648827714845538, "epoch": 1.0263056201564005, "grad_norm": 0.22470282018184662, "learning_rate": 0.0002, "loss": 0.6502, "mean_token_accuracy": 0.8166111197322607, "num_tokens": 1144976180.0, "step": 13695 }, { "entropy": 0.5640271883457899, "epoch": 1.0266803440902523, "grad_norm": 0.22776705026626587, "learning_rate": 0.0002, "loss": 0.6533, "mean_token_accuracy": 0.8164833296090365, "num_tokens": 1146453061.0, "step": 13700 }, { "entropy": 0.5597106443718076, "epoch": 1.0270550680241042, "grad_norm": 0.21001674234867096, "learning_rate": 0.0002, "loss": 0.6483, "mean_token_accuracy": 0.8165923710912466, "num_tokens": 1147900250.0, "step": 13705 }, { "entropy": 0.5596444111317396, "epoch": 1.027429791957956, "grad_norm": 0.20678788423538208, "learning_rate": 0.0002, "loss": 0.6546, "mean_token_accuracy": 0.815880037471652, "num_tokens": 1149326727.0, "step": 13710 }, { "entropy": 0.5553151808679104, "epoch": 1.0278045158918079, "grad_norm": 0.20818282663822174, "learning_rate": 0.0002, "loss": 0.649, "mean_token_accuracy": 0.8139134012162685, "num_tokens": 1150831133.0, "step": 13715 }, { "entropy": 0.5738635996356607, "epoch": 1.0281792398256597, "grad_norm": 0.22757691144943237, "learning_rate": 0.0002, "loss": 0.6773, "mean_token_accuracy": 0.8131483819335699, "num_tokens": 1152286470.0, "step": 13720 }, { "entropy": 0.5627703378908336, "epoch": 1.0285539637595116, "grad_norm": 0.19538114964962006, "learning_rate": 0.0002, "loss": 0.6528, "mean_token_accuracy": 0.8158556133508682, "num_tokens": 1153771148.0, "step": 13725 }, { "entropy": 0.5620070425793529, "epoch": 1.0289286876933634, "grad_norm": 0.2086997777223587, "learning_rate": 0.0002, "loss": 0.6513, "mean_token_accuracy": 0.8139421485364438, "num_tokens": 1155279898.0, "step": 13730 }, { "entropy": 0.5552151279523969, "epoch": 1.0293034116272153, "grad_norm": 0.23440290987491608, "learning_rate": 0.0002, "loss": 0.6488, "mean_token_accuracy": 0.8172969322651624, "num_tokens": 1156707941.0, "step": 13735 }, { "entropy": 0.5433262992650271, "epoch": 1.0296781355610671, "grad_norm": 0.20533017814159393, "learning_rate": 0.0002, "loss": 0.6425, "mean_token_accuracy": 0.8172902755439282, "num_tokens": 1158173665.0, "step": 13740 }, { "entropy": 0.5477021114900709, "epoch": 1.030052859494919, "grad_norm": 0.21434618532657623, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.8197133004665375, "num_tokens": 1159659817.0, "step": 13745 }, { "entropy": 0.5534700077027083, "epoch": 1.0304275834287708, "grad_norm": 0.21364128589630127, "learning_rate": 0.0002, "loss": 0.6551, "mean_token_accuracy": 0.814370883256197, "num_tokens": 1161128879.0, "step": 13750 }, { "entropy": 0.5428920645266772, "epoch": 1.0308023073626227, "grad_norm": 0.19682316482067108, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8222768869251013, "num_tokens": 1162614148.0, "step": 13755 }, { "entropy": 0.5357734605669975, "epoch": 1.0311770312964745, "grad_norm": 0.21383368968963623, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.8197727914899587, "num_tokens": 1164060670.0, "step": 13760 }, { "entropy": 0.5506071435287595, "epoch": 1.0315517552303264, "grad_norm": 0.20062293112277985, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.8181455235928297, "num_tokens": 1165519587.0, "step": 13765 }, { "entropy": 0.5383080210536718, "epoch": 1.0319264791641782, "grad_norm": 0.2005176693201065, "learning_rate": 0.0002, "loss": 0.6387, "mean_token_accuracy": 0.8175833124667407, "num_tokens": 1166958655.0, "step": 13770 }, { "entropy": 0.5354182329028845, "epoch": 1.03230120309803, "grad_norm": 0.2000667005777359, "learning_rate": 0.0002, "loss": 0.6333, "mean_token_accuracy": 0.8187642276287079, "num_tokens": 1168407610.0, "step": 13775 }, { "entropy": 0.5606068760156632, "epoch": 1.032675927031882, "grad_norm": 0.2065376341342926, "learning_rate": 0.0002, "loss": 0.6595, "mean_token_accuracy": 0.8161816123872996, "num_tokens": 1169867412.0, "step": 13780 }, { "entropy": 0.5561245963908732, "epoch": 1.0330506509657338, "grad_norm": 0.19273658096790314, "learning_rate": 0.0002, "loss": 0.6517, "mean_token_accuracy": 0.8173061527311802, "num_tokens": 1171327474.0, "step": 13785 }, { "entropy": 0.5484752852469683, "epoch": 1.0334253748995856, "grad_norm": 0.20412792265415192, "learning_rate": 0.0002, "loss": 0.6427, "mean_token_accuracy": 0.820511419698596, "num_tokens": 1172777150.0, "step": 13790 }, { "entropy": 0.5471178790554404, "epoch": 1.0338000988334375, "grad_norm": 0.18547458946704865, "learning_rate": 0.0002, "loss": 0.6365, "mean_token_accuracy": 0.8212733063846827, "num_tokens": 1174255958.0, "step": 13795 }, { "entropy": 0.554989006742835, "epoch": 1.0341748227672893, "grad_norm": 0.22284233570098877, "learning_rate": 0.0002, "loss": 0.6521, "mean_token_accuracy": 0.8156580083072186, "num_tokens": 1175709392.0, "step": 13800 }, { "entropy": 0.557241278141737, "epoch": 1.0345495467011412, "grad_norm": 0.2148411124944687, "learning_rate": 0.0002, "loss": 0.6344, "mean_token_accuracy": 0.8182643469423055, "num_tokens": 1177207338.0, "step": 13805 }, { "entropy": 0.5481248755007982, "epoch": 1.034924270634993, "grad_norm": 0.18916599452495575, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.8170689661055803, "num_tokens": 1178682088.0, "step": 13810 }, { "entropy": 0.5536113854497671, "epoch": 1.0352989945688449, "grad_norm": 0.20699015259742737, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.817990354821086, "num_tokens": 1180157058.0, "step": 13815 }, { "entropy": 0.5671579901129007, "epoch": 1.0356737185026967, "grad_norm": 0.2209988385438919, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.818561515212059, "num_tokens": 1181628411.0, "step": 13820 }, { "entropy": 0.5639125846326352, "epoch": 1.0360484424365488, "grad_norm": 0.21287108957767487, "learning_rate": 0.0002, "loss": 0.642, "mean_token_accuracy": 0.8182295713573694, "num_tokens": 1183101120.0, "step": 13825 }, { "entropy": 0.5551279731094837, "epoch": 1.0364231663704007, "grad_norm": 0.2023453414440155, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8217967320233583, "num_tokens": 1184600137.0, "step": 13830 }, { "entropy": 0.5591248098760844, "epoch": 1.0367978903042525, "grad_norm": 0.2174273133277893, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.8177340246737004, "num_tokens": 1186015170.0, "step": 13835 }, { "entropy": 0.5704630756750703, "epoch": 1.0371726142381044, "grad_norm": 0.207612082362175, "learning_rate": 0.0002, "loss": 0.6516, "mean_token_accuracy": 0.8178269181400537, "num_tokens": 1187475823.0, "step": 13840 }, { "entropy": 0.5731261389330029, "epoch": 1.0375473381719562, "grad_norm": 0.2144336998462677, "learning_rate": 0.0002, "loss": 0.6566, "mean_token_accuracy": 0.819228184223175, "num_tokens": 1188937636.0, "step": 13845 }, { "entropy": 0.5680343970656395, "epoch": 1.037922062105808, "grad_norm": 0.2656528651714325, "learning_rate": 0.0002, "loss": 0.6611, "mean_token_accuracy": 0.8126110319048166, "num_tokens": 1190381627.0, "step": 13850 }, { "entropy": 0.5589640544727444, "epoch": 1.03829678603966, "grad_norm": 0.2282530963420868, "learning_rate": 0.0002, "loss": 0.6523, "mean_token_accuracy": 0.8179284483194351, "num_tokens": 1191835639.0, "step": 13855 }, { "entropy": 0.5551448684185744, "epoch": 1.0386715099735118, "grad_norm": 0.22049622237682343, "learning_rate": 0.0002, "loss": 0.6439, "mean_token_accuracy": 0.8158007830381393, "num_tokens": 1193283151.0, "step": 13860 }, { "entropy": 0.5420742591843009, "epoch": 1.0390462339073636, "grad_norm": 0.19827347993850708, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.819159310683608, "num_tokens": 1194715663.0, "step": 13865 }, { "entropy": 0.5477863867767155, "epoch": 1.0394209578412155, "grad_norm": 0.2076900601387024, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8198650278151035, "num_tokens": 1196183172.0, "step": 13870 }, { "entropy": 0.5557606751099229, "epoch": 1.0397956817750673, "grad_norm": 0.1856156289577484, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.818281278386712, "num_tokens": 1197661787.0, "step": 13875 }, { "entropy": 0.5662225807085633, "epoch": 1.0401704057089192, "grad_norm": 0.20865291357040405, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.8153215013444424, "num_tokens": 1199161252.0, "step": 13880 }, { "entropy": 0.5582923419773579, "epoch": 1.040545129642771, "grad_norm": 0.21316960453987122, "learning_rate": 0.0002, "loss": 0.6487, "mean_token_accuracy": 0.8156984407454729, "num_tokens": 1200627806.0, "step": 13885 }, { "entropy": 0.5455300338566303, "epoch": 1.0409198535766229, "grad_norm": 0.2052599936723709, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.8209010343998671, "num_tokens": 1202104872.0, "step": 13890 }, { "entropy": 0.5621199642308057, "epoch": 1.0412945775104747, "grad_norm": 0.20259185135364532, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.8164517723023892, "num_tokens": 1203546430.0, "step": 13895 }, { "entropy": 0.5694496136158704, "epoch": 1.0416693014443266, "grad_norm": 0.2185339480638504, "learning_rate": 0.0002, "loss": 0.6529, "mean_token_accuracy": 0.8168646156787872, "num_tokens": 1205038980.0, "step": 13900 }, { "entropy": 0.5716031141579151, "epoch": 1.0420440253781784, "grad_norm": 0.2112453132867813, "learning_rate": 0.0002, "loss": 0.6435, "mean_token_accuracy": 0.815698953717947, "num_tokens": 1206508868.0, "step": 13905 }, { "entropy": 0.5645261937752366, "epoch": 1.0424187493120303, "grad_norm": 0.22424089908599854, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.8197371929883956, "num_tokens": 1207970040.0, "step": 13910 }, { "entropy": 0.5560080404393375, "epoch": 1.0427934732458821, "grad_norm": 0.21003586053848267, "learning_rate": 0.0002, "loss": 0.6333, "mean_token_accuracy": 0.817926823347807, "num_tokens": 1209438443.0, "step": 13915 }, { "entropy": 0.5729775883257389, "epoch": 1.043168197179734, "grad_norm": 0.23040422797203064, "learning_rate": 0.0002, "loss": 0.6524, "mean_token_accuracy": 0.8137695003300905, "num_tokens": 1210939749.0, "step": 13920 }, { "entropy": 0.5522454937919974, "epoch": 1.0435429211135858, "grad_norm": 0.2212659865617752, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8199192520231009, "num_tokens": 1212424382.0, "step": 13925 }, { "entropy": 0.5596079641953111, "epoch": 1.0439176450474377, "grad_norm": 0.2121303677558899, "learning_rate": 0.0002, "loss": 0.6463, "mean_token_accuracy": 0.8188553918153048, "num_tokens": 1213888928.0, "step": 13930 }, { "entropy": 0.5709986487403512, "epoch": 1.0442923689812895, "grad_norm": 0.20581014454364777, "learning_rate": 0.0002, "loss": 0.672, "mean_token_accuracy": 0.81243283636868, "num_tokens": 1215363452.0, "step": 13935 }, { "entropy": 0.5570866394788027, "epoch": 1.0446670929151414, "grad_norm": 0.20889747142791748, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.8212460316717625, "num_tokens": 1216797949.0, "step": 13940 }, { "entropy": 0.5511658631265164, "epoch": 1.0450418168489932, "grad_norm": 0.22981171309947968, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.8193134550005198, "num_tokens": 1218220474.0, "step": 13945 }, { "entropy": 0.5518514582887292, "epoch": 1.045416540782845, "grad_norm": 0.182744562625885, "learning_rate": 0.0002, "loss": 0.6337, "mean_token_accuracy": 0.8178678449243307, "num_tokens": 1219697607.0, "step": 13950 }, { "entropy": 0.5472762187942862, "epoch": 1.045791264716697, "grad_norm": 0.2044219672679901, "learning_rate": 0.0002, "loss": 0.6414, "mean_token_accuracy": 0.8181291073560715, "num_tokens": 1221122423.0, "step": 13955 }, { "entropy": 0.533134818635881, "epoch": 1.0461659886505488, "grad_norm": 0.2325223982334137, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.8235277384519577, "num_tokens": 1222571795.0, "step": 13960 }, { "entropy": 0.5593807393684983, "epoch": 1.0465407125844006, "grad_norm": 0.21184106171131134, "learning_rate": 0.0002, "loss": 0.6379, "mean_token_accuracy": 0.8187566425651311, "num_tokens": 1224017506.0, "step": 13965 }, { "entropy": 0.5557482151314617, "epoch": 1.0469154365182525, "grad_norm": 0.19741491973400116, "learning_rate": 0.0002, "loss": 0.6378, "mean_token_accuracy": 0.8161979001015425, "num_tokens": 1225471048.0, "step": 13970 }, { "entropy": 0.5668150183744729, "epoch": 1.0472901604521043, "grad_norm": 0.21168802678585052, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.8169676434248686, "num_tokens": 1226952535.0, "step": 13975 }, { "entropy": 0.5612282514572143, "epoch": 1.0476648843859562, "grad_norm": 0.2154131531715393, "learning_rate": 0.0002, "loss": 0.639, "mean_token_accuracy": 0.8166254557669163, "num_tokens": 1228437629.0, "step": 13980 }, { "entropy": 0.5813772145658731, "epoch": 1.0480396083198082, "grad_norm": 0.2146705836057663, "learning_rate": 0.0002, "loss": 0.6732, "mean_token_accuracy": 0.8130089402198791, "num_tokens": 1229965168.0, "step": 13985 }, { "entropy": 0.5640939937904477, "epoch": 1.04841433225366, "grad_norm": 0.2074461728334427, "learning_rate": 0.0002, "loss": 0.6498, "mean_token_accuracy": 0.8153648421168327, "num_tokens": 1231433163.0, "step": 13990 }, { "entropy": 0.5531702831387519, "epoch": 1.048789056187512, "grad_norm": 0.2192113697528839, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8216249205172061, "num_tokens": 1232906761.0, "step": 13995 }, { "entropy": 0.5544008046388627, "epoch": 1.0491637801213638, "grad_norm": 0.2197006344795227, "learning_rate": 0.0002, "loss": 0.6372, "mean_token_accuracy": 0.8208246409893036, "num_tokens": 1234409045.0, "step": 14000 }, { "entropy": 0.5613484159111977, "epoch": 1.0495385040552156, "grad_norm": 0.2141319364309311, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.8165672466158866, "num_tokens": 1235888512.0, "step": 14005 }, { "entropy": 0.5721476120874286, "epoch": 1.0499132279890675, "grad_norm": 0.21851566433906555, "learning_rate": 0.0002, "loss": 0.6555, "mean_token_accuracy": 0.8153749354183674, "num_tokens": 1237380049.0, "step": 14010 }, { "entropy": 0.5684736374765634, "epoch": 1.0502879519229193, "grad_norm": 0.22105741500854492, "learning_rate": 0.0002, "loss": 0.6402, "mean_token_accuracy": 0.8152392864227295, "num_tokens": 1238840091.0, "step": 14015 }, { "entropy": 0.5821529755368828, "epoch": 1.0506626758567712, "grad_norm": 0.207941934466362, "learning_rate": 0.0002, "loss": 0.6555, "mean_token_accuracy": 0.814956695958972, "num_tokens": 1240356877.0, "step": 14020 }, { "entropy": 0.5695428784936667, "epoch": 1.051037399790623, "grad_norm": 0.21848659217357635, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.8212268579751253, "num_tokens": 1241818176.0, "step": 14025 }, { "entropy": 0.5685325184836983, "epoch": 1.051412123724475, "grad_norm": 0.20367638766765594, "learning_rate": 0.0002, "loss": 0.6472, "mean_token_accuracy": 0.816910257935524, "num_tokens": 1243312338.0, "step": 14030 }, { "entropy": 0.5608844324946404, "epoch": 1.0517868476583268, "grad_norm": 0.21263721585273743, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.8183060999959707, "num_tokens": 1244777812.0, "step": 14035 }, { "entropy": 0.5720531409606338, "epoch": 1.0521615715921786, "grad_norm": 0.20272274315357208, "learning_rate": 0.0002, "loss": 0.6539, "mean_token_accuracy": 0.816294415667653, "num_tokens": 1246242301.0, "step": 14040 }, { "entropy": 0.5600530132651329, "epoch": 1.0525362955260305, "grad_norm": 0.22493864595890045, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.8147773906588555, "num_tokens": 1247708635.0, "step": 14045 }, { "entropy": 0.5654571662656963, "epoch": 1.0529110194598823, "grad_norm": 0.2105301469564438, "learning_rate": 0.0002, "loss": 0.6395, "mean_token_accuracy": 0.8164419446140527, "num_tokens": 1249192933.0, "step": 14050 }, { "entropy": 0.5620817450806499, "epoch": 1.0532857433937342, "grad_norm": 0.2319408655166626, "learning_rate": 0.0002, "loss": 0.649, "mean_token_accuracy": 0.8181632086634636, "num_tokens": 1250628650.0, "step": 14055 }, { "entropy": 0.5730578620918095, "epoch": 1.053660467327586, "grad_norm": 0.20568105578422546, "learning_rate": 0.0002, "loss": 0.658, "mean_token_accuracy": 0.8140109531581402, "num_tokens": 1252084454.0, "step": 14060 }, { "entropy": 0.5513176122680307, "epoch": 1.0540351912614379, "grad_norm": 0.19155000150203705, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.8227332171052695, "num_tokens": 1253538797.0, "step": 14065 }, { "entropy": 0.5654538435861468, "epoch": 1.0544099151952897, "grad_norm": 0.19677037000656128, "learning_rate": 0.0002, "loss": 0.6533, "mean_token_accuracy": 0.8172522373497486, "num_tokens": 1254975062.0, "step": 14070 }, { "entropy": 0.5529757417738438, "epoch": 1.0547846391291416, "grad_norm": 0.20158308744430542, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.8189672388136386, "num_tokens": 1256468210.0, "step": 14075 }, { "entropy": 0.5489785043522716, "epoch": 1.0551593630629934, "grad_norm": 0.20226599276065826, "learning_rate": 0.0002, "loss": 0.6326, "mean_token_accuracy": 0.8206412028521299, "num_tokens": 1257893185.0, "step": 14080 }, { "entropy": 0.555027278047055, "epoch": 1.0555340869968453, "grad_norm": 0.21500717103481293, "learning_rate": 0.0002, "loss": 0.634, "mean_token_accuracy": 0.818440904468298, "num_tokens": 1259377583.0, "step": 14085 }, { "entropy": 0.5627778122201562, "epoch": 1.055908810930697, "grad_norm": 0.21609298884868622, "learning_rate": 0.0002, "loss": 0.6583, "mean_token_accuracy": 0.8177864912897348, "num_tokens": 1260895539.0, "step": 14090 }, { "entropy": 0.5541865829378366, "epoch": 1.056283534864549, "grad_norm": 0.20647010207176208, "learning_rate": 0.0002, "loss": 0.6356, "mean_token_accuracy": 0.8196590796113015, "num_tokens": 1262366999.0, "step": 14095 }, { "entropy": 0.5570920526981353, "epoch": 1.0566582587984008, "grad_norm": 0.2401854246854782, "learning_rate": 0.0002, "loss": 0.655, "mean_token_accuracy": 0.8203567400574684, "num_tokens": 1263800405.0, "step": 14100 }, { "entropy": 0.5589959762990475, "epoch": 1.0570329827322527, "grad_norm": 0.31048354506492615, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.8179750978946686, "num_tokens": 1265279646.0, "step": 14105 }, { "entropy": 0.575203112512827, "epoch": 1.0574077066661045, "grad_norm": 0.2112141251564026, "learning_rate": 0.0002, "loss": 0.668, "mean_token_accuracy": 0.816099289804697, "num_tokens": 1266706313.0, "step": 14110 }, { "entropy": 0.5613065453246235, "epoch": 1.0577824305999564, "grad_norm": 0.2305564284324646, "learning_rate": 0.0002, "loss": 0.6413, "mean_token_accuracy": 0.8165291499346494, "num_tokens": 1268215814.0, "step": 14115 }, { "entropy": 0.5703224185854197, "epoch": 1.0581571545338082, "grad_norm": 0.38328343629837036, "learning_rate": 0.0002, "loss": 0.6442, "mean_token_accuracy": 0.8170697875320911, "num_tokens": 1269723760.0, "step": 14120 }, { "entropy": 0.5600090183317661, "epoch": 1.05853187846766, "grad_norm": 0.21183371543884277, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.8209174051880836, "num_tokens": 1271247139.0, "step": 14125 }, { "entropy": 0.5664785023778677, "epoch": 1.058906602401512, "grad_norm": 0.21546359360218048, "learning_rate": 0.0002, "loss": 0.6553, "mean_token_accuracy": 0.8165877234190703, "num_tokens": 1272719646.0, "step": 14130 }, { "entropy": 0.5575527219101787, "epoch": 1.059281326335364, "grad_norm": 0.21488754451274872, "learning_rate": 0.0002, "loss": 0.6397, "mean_token_accuracy": 0.8171676002442837, "num_tokens": 1274152473.0, "step": 14135 }, { "entropy": 0.5602867631241679, "epoch": 1.0596560502692158, "grad_norm": 0.20190273225307465, "learning_rate": 0.0002, "loss": 0.651, "mean_token_accuracy": 0.8176744986325503, "num_tokens": 1275627587.0, "step": 14140 }, { "entropy": 0.570088030025363, "epoch": 1.0600307742030677, "grad_norm": 0.2836175858974457, "learning_rate": 0.0002, "loss": 0.6653, "mean_token_accuracy": 0.8158501543104648, "num_tokens": 1277087825.0, "step": 14145 }, { "entropy": 0.5814148407429457, "epoch": 1.0604054981369195, "grad_norm": 0.24438579380512238, "learning_rate": 0.0002, "loss": 0.6699, "mean_token_accuracy": 0.8130599491298198, "num_tokens": 1278564221.0, "step": 14150 }, { "entropy": 0.563639284297824, "epoch": 1.0607802220707714, "grad_norm": 0.1845695823431015, "learning_rate": 0.0002, "loss": 0.6516, "mean_token_accuracy": 0.8120465833693743, "num_tokens": 1280069152.0, "step": 14155 }, { "entropy": 0.5757838167250157, "epoch": 1.0611549460046232, "grad_norm": 0.1924816220998764, "learning_rate": 0.0002, "loss": 0.6667, "mean_token_accuracy": 0.8154888466000557, "num_tokens": 1281583904.0, "step": 14160 }, { "entropy": 0.5566907558590174, "epoch": 1.061529669938475, "grad_norm": 0.2219758778810501, "learning_rate": 0.0002, "loss": 0.6464, "mean_token_accuracy": 0.8191106084734201, "num_tokens": 1283085232.0, "step": 14165 }, { "entropy": 0.5451113302260637, "epoch": 1.061904393872327, "grad_norm": 0.20403257012367249, "learning_rate": 0.0002, "loss": 0.6346, "mean_token_accuracy": 0.8205039639025926, "num_tokens": 1284546317.0, "step": 14170 }, { "entropy": 0.5534085089340806, "epoch": 1.0622791178061788, "grad_norm": 0.2027537226676941, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8184224329888821, "num_tokens": 1286032577.0, "step": 14175 }, { "entropy": 0.555477587506175, "epoch": 1.0626538417400306, "grad_norm": 0.29775020480155945, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.8162977129220963, "num_tokens": 1287504257.0, "step": 14180 }, { "entropy": 0.5712028082460165, "epoch": 1.0630285656738825, "grad_norm": 0.19931486248970032, "learning_rate": 0.0002, "loss": 0.6653, "mean_token_accuracy": 0.8125255983322859, "num_tokens": 1289017682.0, "step": 14185 }, { "entropy": 0.539735471457243, "epoch": 1.0634032896077343, "grad_norm": 0.2135552316904068, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8217212654650211, "num_tokens": 1290451803.0, "step": 14190 }, { "entropy": 0.5515747703611851, "epoch": 1.0637780135415862, "grad_norm": 0.20768721401691437, "learning_rate": 0.0002, "loss": 0.6447, "mean_token_accuracy": 0.8184287700802088, "num_tokens": 1291879223.0, "step": 14195 }, { "entropy": 0.5533385263755918, "epoch": 1.064152737475438, "grad_norm": 0.20638494193553925, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.8194895390421152, "num_tokens": 1293334027.0, "step": 14200 }, { "entropy": 0.560462125390768, "epoch": 1.0645274614092899, "grad_norm": 0.20202073454856873, "learning_rate": 0.0002, "loss": 0.6475, "mean_token_accuracy": 0.8177651729434728, "num_tokens": 1294797604.0, "step": 14205 }, { "entropy": 0.5642263501882553, "epoch": 1.0649021853431417, "grad_norm": 0.1962054967880249, "learning_rate": 0.0002, "loss": 0.6559, "mean_token_accuracy": 0.8185315068811179, "num_tokens": 1296283766.0, "step": 14210 }, { "entropy": 0.5573183873668313, "epoch": 1.0652769092769936, "grad_norm": 0.20536957681179047, "learning_rate": 0.0002, "loss": 0.6491, "mean_token_accuracy": 0.8157585963606835, "num_tokens": 1297792173.0, "step": 14215 }, { "entropy": 0.5465939735993743, "epoch": 1.0656516332108454, "grad_norm": 0.22633327543735504, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.8188821285963058, "num_tokens": 1299223849.0, "step": 14220 }, { "entropy": 0.5499215621501208, "epoch": 1.0660263571446973, "grad_norm": 0.2178889662027359, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.8187105424702168, "num_tokens": 1300729471.0, "step": 14225 }, { "entropy": 0.5458719506859779, "epoch": 1.0664010810785491, "grad_norm": 0.21128103137016296, "learning_rate": 0.0002, "loss": 0.6422, "mean_token_accuracy": 0.819160358607769, "num_tokens": 1302219844.0, "step": 14230 }, { "entropy": 0.542955395579338, "epoch": 1.066775805012401, "grad_norm": 0.22953549027442932, "learning_rate": 0.0002, "loss": 0.6406, "mean_token_accuracy": 0.8174291029572487, "num_tokens": 1303677442.0, "step": 14235 }, { "entropy": 0.5441418857313692, "epoch": 1.0671505289462528, "grad_norm": 0.21089895069599152, "learning_rate": 0.0002, "loss": 0.639, "mean_token_accuracy": 0.8189295388758182, "num_tokens": 1305167268.0, "step": 14240 }, { "entropy": 0.5360679161734879, "epoch": 1.0675252528801047, "grad_norm": 0.19916948676109314, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8193850941956043, "num_tokens": 1306672512.0, "step": 14245 }, { "entropy": 0.5380268787965179, "epoch": 1.0678999768139565, "grad_norm": 0.19755899906158447, "learning_rate": 0.0002, "loss": 0.6451, "mean_token_accuracy": 0.8182165902107954, "num_tokens": 1308139059.0, "step": 14250 }, { "entropy": 0.5351174412295222, "epoch": 1.0682747007478084, "grad_norm": 0.20959709584712982, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.8200914625078439, "num_tokens": 1309637017.0, "step": 14255 }, { "entropy": 0.5431426813825965, "epoch": 1.0686494246816602, "grad_norm": 0.20737148821353912, "learning_rate": 0.0002, "loss": 0.6497, "mean_token_accuracy": 0.8193588588386774, "num_tokens": 1311074616.0, "step": 14260 }, { "entropy": 0.5408327220007777, "epoch": 1.069024148615512, "grad_norm": 0.20942322909832, "learning_rate": 0.0002, "loss": 0.6444, "mean_token_accuracy": 0.8164362445473671, "num_tokens": 1312567609.0, "step": 14265 }, { "entropy": 0.5570562783628702, "epoch": 1.069398872549364, "grad_norm": 0.21881625056266785, "learning_rate": 0.0002, "loss": 0.6572, "mean_token_accuracy": 0.811422635987401, "num_tokens": 1314030940.0, "step": 14270 }, { "entropy": 0.5476596999913454, "epoch": 1.0697735964832158, "grad_norm": 0.23503342270851135, "learning_rate": 0.0002, "loss": 0.6464, "mean_token_accuracy": 0.8179935641586781, "num_tokens": 1315470920.0, "step": 14275 }, { "entropy": 0.5490819009952247, "epoch": 1.0701483204170676, "grad_norm": 0.22142134606838226, "learning_rate": 0.0002, "loss": 0.6498, "mean_token_accuracy": 0.8152019791305065, "num_tokens": 1316920161.0, "step": 14280 }, { "entropy": 0.5502522695809603, "epoch": 1.0705230443509195, "grad_norm": 0.21432560682296753, "learning_rate": 0.0002, "loss": 0.6431, "mean_token_accuracy": 0.819145691767335, "num_tokens": 1318375865.0, "step": 14285 }, { "entropy": 0.549450122192502, "epoch": 1.0708977682847713, "grad_norm": 0.23622123897075653, "learning_rate": 0.0002, "loss": 0.6419, "mean_token_accuracy": 0.817825835943222, "num_tokens": 1319850538.0, "step": 14290 }, { "entropy": 0.5486967684701085, "epoch": 1.0712724922186232, "grad_norm": 0.18896709382534027, "learning_rate": 0.0002, "loss": 0.6451, "mean_token_accuracy": 0.8188548725098371, "num_tokens": 1321318684.0, "step": 14295 }, { "entropy": 0.5498185228556395, "epoch": 1.0716472161524753, "grad_norm": 0.1898205578327179, "learning_rate": 0.0002, "loss": 0.6461, "mean_token_accuracy": 0.8203062552958726, "num_tokens": 1322803927.0, "step": 14300 }, { "entropy": 0.5507346617989242, "epoch": 1.0720219400863271, "grad_norm": 0.20855365693569183, "learning_rate": 0.0002, "loss": 0.6483, "mean_token_accuracy": 0.8165951158851386, "num_tokens": 1324257137.0, "step": 14305 }, { "entropy": 0.5421549029648304, "epoch": 1.072396664020179, "grad_norm": 0.23384708166122437, "learning_rate": 0.0002, "loss": 0.6332, "mean_token_accuracy": 0.8239199228584766, "num_tokens": 1325698961.0, "step": 14310 }, { "entropy": 0.5636294217780232, "epoch": 1.0727713879540308, "grad_norm": 0.19174060225486755, "learning_rate": 0.0002, "loss": 0.656, "mean_token_accuracy": 0.8177919596433639, "num_tokens": 1327164894.0, "step": 14315 }, { "entropy": 0.5616766514256597, "epoch": 1.0731461118878827, "grad_norm": 0.20339781045913696, "learning_rate": 0.0002, "loss": 0.6493, "mean_token_accuracy": 0.8186924520879983, "num_tokens": 1328703305.0, "step": 14320 }, { "entropy": 0.5455830671824515, "epoch": 1.0735208358217345, "grad_norm": 0.19829705357551575, "learning_rate": 0.0002, "loss": 0.6464, "mean_token_accuracy": 0.8196020990610122, "num_tokens": 1330180217.0, "step": 14325 }, { "entropy": 0.5542356688529253, "epoch": 1.0738955597555864, "grad_norm": 0.20508478581905365, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8176984772086143, "num_tokens": 1331676575.0, "step": 14330 }, { "entropy": 0.5479736657813191, "epoch": 1.0742702836894382, "grad_norm": 0.207711860537529, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.818901851773262, "num_tokens": 1333131593.0, "step": 14335 }, { "entropy": 0.5529484322294593, "epoch": 1.07464500762329, "grad_norm": 0.20764140784740448, "learning_rate": 0.0002, "loss": 0.6426, "mean_token_accuracy": 0.8185974098742008, "num_tokens": 1334583375.0, "step": 14340 }, { "entropy": 0.5686480497941375, "epoch": 1.075019731557142, "grad_norm": 0.20830576121807098, "learning_rate": 0.0002, "loss": 0.66, "mean_token_accuracy": 0.818693982064724, "num_tokens": 1336124869.0, "step": 14345 }, { "entropy": 0.5478971088305116, "epoch": 1.0753944554909938, "grad_norm": 0.22434015572071075, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8212144307792186, "num_tokens": 1337605924.0, "step": 14350 }, { "entropy": 0.5672066308557987, "epoch": 1.0757691794248456, "grad_norm": 0.19558052718639374, "learning_rate": 0.0002, "loss": 0.6578, "mean_token_accuracy": 0.8156053058803081, "num_tokens": 1339110485.0, "step": 14355 }, { "entropy": 0.572470030374825, "epoch": 1.0761439033586975, "grad_norm": 0.2034010887145996, "learning_rate": 0.0002, "loss": 0.6629, "mean_token_accuracy": 0.8161498684436083, "num_tokens": 1340592093.0, "step": 14360 }, { "entropy": 0.5715172357857228, "epoch": 1.0765186272925493, "grad_norm": 0.20820733904838562, "learning_rate": 0.0002, "loss": 0.6592, "mean_token_accuracy": 0.8136607971042394, "num_tokens": 1342078259.0, "step": 14365 }, { "entropy": 0.5433515522629022, "epoch": 1.0768933512264012, "grad_norm": 0.21964353322982788, "learning_rate": 0.0002, "loss": 0.6412, "mean_token_accuracy": 0.8197503108531237, "num_tokens": 1343550796.0, "step": 14370 }, { "entropy": 0.562806349620223, "epoch": 1.077268075160253, "grad_norm": 0.2166871428489685, "learning_rate": 0.0002, "loss": 0.6586, "mean_token_accuracy": 0.8159527897834777, "num_tokens": 1345010944.0, "step": 14375 }, { "entropy": 0.547766887024045, "epoch": 1.0776427990941049, "grad_norm": 0.2227579802274704, "learning_rate": 0.0002, "loss": 0.6374, "mean_token_accuracy": 0.8208282109349966, "num_tokens": 1346464356.0, "step": 14380 }, { "entropy": 0.5574960955418646, "epoch": 1.0780175230279567, "grad_norm": 0.21027816832065582, "learning_rate": 0.0002, "loss": 0.6446, "mean_token_accuracy": 0.8153850357979536, "num_tokens": 1347970539.0, "step": 14385 }, { "entropy": 0.5616593698039651, "epoch": 1.0783922469618086, "grad_norm": 0.22172768414020538, "learning_rate": 0.0002, "loss": 0.6515, "mean_token_accuracy": 0.8168713696300983, "num_tokens": 1349433300.0, "step": 14390 }, { "entropy": 0.5580646267160774, "epoch": 1.0787669708956604, "grad_norm": 0.20444919168949127, "learning_rate": 0.0002, "loss": 0.643, "mean_token_accuracy": 0.8195420574396849, "num_tokens": 1350930209.0, "step": 14395 }, { "entropy": 0.5541564859449863, "epoch": 1.0791416948295123, "grad_norm": 0.5927976965904236, "learning_rate": 0.0002, "loss": 0.6384, "mean_token_accuracy": 0.8181661035865545, "num_tokens": 1352429402.0, "step": 14400 }, { "entropy": 0.558811766281724, "epoch": 1.0795164187633641, "grad_norm": 0.22098568081855774, "learning_rate": 0.0002, "loss": 0.6466, "mean_token_accuracy": 0.8200198858976364, "num_tokens": 1353896161.0, "step": 14405 }, { "entropy": 0.5531332101672888, "epoch": 1.079891142697216, "grad_norm": 0.19680900871753693, "learning_rate": 0.0002, "loss": 0.6359, "mean_token_accuracy": 0.8227821465581655, "num_tokens": 1355367642.0, "step": 14410 }, { "entropy": 0.5448419226333499, "epoch": 1.0802658666310678, "grad_norm": 0.2095474749803543, "learning_rate": 0.0002, "loss": 0.6407, "mean_token_accuracy": 0.8175718661397695, "num_tokens": 1356786362.0, "step": 14415 }, { "entropy": 0.5587983937934041, "epoch": 1.0806405905649197, "grad_norm": 0.2048654854297638, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.820356871187687, "num_tokens": 1358255556.0, "step": 14420 }, { "entropy": 0.5435066288337111, "epoch": 1.0810153144987715, "grad_norm": 0.19405938684940338, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8207910478115081, "num_tokens": 1359711274.0, "step": 14425 }, { "entropy": 0.5601586785167456, "epoch": 1.0813900384326234, "grad_norm": 0.21361596882343292, "learning_rate": 0.0002, "loss": 0.6425, "mean_token_accuracy": 0.816763536259532, "num_tokens": 1361223689.0, "step": 14430 }, { "entropy": 0.5660548163577914, "epoch": 1.0817647623664752, "grad_norm": 0.21371150016784668, "learning_rate": 0.0002, "loss": 0.6622, "mean_token_accuracy": 0.8155920412391424, "num_tokens": 1362668620.0, "step": 14435 }, { "entropy": 0.5820588778704405, "epoch": 1.082139486300327, "grad_norm": 0.22493095695972443, "learning_rate": 0.0002, "loss": 0.6674, "mean_token_accuracy": 0.8186745997518301, "num_tokens": 1364163032.0, "step": 14440 }, { "entropy": 0.5664988227188588, "epoch": 1.0825142102341792, "grad_norm": 0.2276047319173813, "learning_rate": 0.0002, "loss": 0.6545, "mean_token_accuracy": 0.8167638391256332, "num_tokens": 1365590601.0, "step": 14445 }, { "entropy": 0.5940882552415132, "epoch": 1.082888934168031, "grad_norm": 0.22673571109771729, "learning_rate": 0.0002, "loss": 0.6605, "mean_token_accuracy": 0.8132938623428345, "num_tokens": 1367054524.0, "step": 14450 }, { "entropy": 0.5845117578282952, "epoch": 1.0832636581018829, "grad_norm": 0.19931462407112122, "learning_rate": 0.0002, "loss": 0.6607, "mean_token_accuracy": 0.8165157873183488, "num_tokens": 1368539264.0, "step": 14455 }, { "entropy": 0.5692393199540675, "epoch": 1.0836383820357347, "grad_norm": 0.23077161610126495, "learning_rate": 0.0002, "loss": 0.6449, "mean_token_accuracy": 0.8187655001878739, "num_tokens": 1370014684.0, "step": 14460 }, { "entropy": 0.5868778048083186, "epoch": 1.0840131059695866, "grad_norm": 0.22228261828422546, "learning_rate": 0.0002, "loss": 0.6669, "mean_token_accuracy": 0.8135294191539287, "num_tokens": 1371489760.0, "step": 14465 }, { "entropy": 0.5730668265372515, "epoch": 1.0843878299034384, "grad_norm": 0.22933676838874817, "learning_rate": 0.0002, "loss": 0.6417, "mean_token_accuracy": 0.8163329575210809, "num_tokens": 1372955077.0, "step": 14470 }, { "entropy": 0.5686996793374419, "epoch": 1.0847625538372903, "grad_norm": 0.20923930406570435, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.8191742621362209, "num_tokens": 1374454355.0, "step": 14475 }, { "entropy": 0.5756376454606652, "epoch": 1.085137277771142, "grad_norm": 0.1968061327934265, "learning_rate": 0.0002, "loss": 0.6546, "mean_token_accuracy": 0.8163150414824486, "num_tokens": 1375920337.0, "step": 14480 }, { "entropy": 0.5699464915320277, "epoch": 1.085512001704994, "grad_norm": 0.20298771560192108, "learning_rate": 0.0002, "loss": 0.6477, "mean_token_accuracy": 0.8161188084632158, "num_tokens": 1377374868.0, "step": 14485 }, { "entropy": 0.5633311269804835, "epoch": 1.0858867256388458, "grad_norm": 0.21228422224521637, "learning_rate": 0.0002, "loss": 0.645, "mean_token_accuracy": 0.8177403412759304, "num_tokens": 1378875312.0, "step": 14490 }, { "entropy": 0.5413269909098745, "epoch": 1.0862614495726977, "grad_norm": 0.21592408418655396, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.8227482292801142, "num_tokens": 1380335148.0, "step": 14495 }, { "entropy": 0.5569645971059799, "epoch": 1.0866361735065495, "grad_norm": 0.2086167335510254, "learning_rate": 0.0002, "loss": 0.6476, "mean_token_accuracy": 0.8172466512769461, "num_tokens": 1381821379.0, "step": 14500 }, { "entropy": 0.5621790179982782, "epoch": 1.0870108974404014, "grad_norm": 0.1996023803949356, "learning_rate": 0.0002, "loss": 0.6533, "mean_token_accuracy": 0.8193729147315025, "num_tokens": 1383292766.0, "step": 14505 }, { "entropy": 0.56741834115237, "epoch": 1.0873856213742532, "grad_norm": 0.20725977420806885, "learning_rate": 0.0002, "loss": 0.6461, "mean_token_accuracy": 0.8171950347721577, "num_tokens": 1384768892.0, "step": 14510 }, { "entropy": 0.578390720486641, "epoch": 1.087760345308105, "grad_norm": 0.20192888379096985, "learning_rate": 0.0002, "loss": 0.6465, "mean_token_accuracy": 0.8159347232431173, "num_tokens": 1386257661.0, "step": 14515 }, { "entropy": 0.574539989605546, "epoch": 1.088135069241957, "grad_norm": 0.20999546349048615, "learning_rate": 0.0002, "loss": 0.6422, "mean_token_accuracy": 0.8168788406997919, "num_tokens": 1387729649.0, "step": 14520 }, { "entropy": 0.5722511410713196, "epoch": 1.0885097931758088, "grad_norm": 0.2016642689704895, "learning_rate": 0.0002, "loss": 0.6443, "mean_token_accuracy": 0.8170844167470932, "num_tokens": 1389206752.0, "step": 14525 }, { "entropy": 0.5802591988816858, "epoch": 1.0888845171096606, "grad_norm": 0.22019062936306, "learning_rate": 0.0002, "loss": 0.6565, "mean_token_accuracy": 0.8176533866673708, "num_tokens": 1390684506.0, "step": 14530 }, { "entropy": 0.5677760127931833, "epoch": 1.0892592410435125, "grad_norm": 0.2141028344631195, "learning_rate": 0.0002, "loss": 0.6412, "mean_token_accuracy": 0.8185866706073284, "num_tokens": 1392103454.0, "step": 14535 }, { "entropy": 0.5583299769088625, "epoch": 1.0896339649773643, "grad_norm": 0.20533251762390137, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8223818901926279, "num_tokens": 1393566968.0, "step": 14540 }, { "entropy": 0.576409487426281, "epoch": 1.0900086889112162, "grad_norm": 0.19597476720809937, "learning_rate": 0.0002, "loss": 0.6468, "mean_token_accuracy": 0.817090941593051, "num_tokens": 1395044731.0, "step": 14545 }, { "entropy": 0.5723375486209988, "epoch": 1.090383412845068, "grad_norm": 0.20907948911190033, "learning_rate": 0.0002, "loss": 0.6434, "mean_token_accuracy": 0.8158726688474417, "num_tokens": 1396577657.0, "step": 14550 }, { "entropy": 0.5615502120926976, "epoch": 1.0907581367789199, "grad_norm": 0.23160810768604279, "learning_rate": 0.0002, "loss": 0.6424, "mean_token_accuracy": 0.8168245054781437, "num_tokens": 1398066831.0, "step": 14555 }, { "entropy": 0.5499845574609935, "epoch": 1.0911328607127717, "grad_norm": 0.21422863006591797, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8209096200764179, "num_tokens": 1399534512.0, "step": 14560 }, { "entropy": 0.5595056446269154, "epoch": 1.0915075846466236, "grad_norm": 0.21484994888305664, "learning_rate": 0.0002, "loss": 0.6424, "mean_token_accuracy": 0.816362938657403, "num_tokens": 1401032703.0, "step": 14565 }, { "entropy": 0.5330275157466531, "epoch": 1.0918823085804754, "grad_norm": 0.19083140790462494, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8215924661606551, "num_tokens": 1402476222.0, "step": 14570 }, { "entropy": 0.5540244832634926, "epoch": 1.0922570325143273, "grad_norm": 0.21051527559757233, "learning_rate": 0.0002, "loss": 0.6518, "mean_token_accuracy": 0.8170733373612166, "num_tokens": 1403973405.0, "step": 14575 }, { "entropy": 0.5582658721134066, "epoch": 1.0926317564481791, "grad_norm": 0.22157996892929077, "learning_rate": 0.0002, "loss": 0.6483, "mean_token_accuracy": 0.8165531378239393, "num_tokens": 1405395003.0, "step": 14580 }, { "entropy": 0.5628325901925564, "epoch": 1.093006480382031, "grad_norm": 0.20201030373573303, "learning_rate": 0.0002, "loss": 0.6629, "mean_token_accuracy": 0.8154622178524733, "num_tokens": 1406895436.0, "step": 14585 }, { "entropy": 0.5517022963613272, "epoch": 1.0933812043158828, "grad_norm": 0.19965042173862457, "learning_rate": 0.0002, "loss": 0.6504, "mean_token_accuracy": 0.8182616714388132, "num_tokens": 1408349548.0, "step": 14590 }, { "entropy": 0.5434647757560015, "epoch": 1.0937559282497347, "grad_norm": 0.20694397389888763, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.8174669589847327, "num_tokens": 1409814115.0, "step": 14595 }, { "entropy": 0.5606957523152232, "epoch": 1.0941306521835865, "grad_norm": 0.23209165036678314, "learning_rate": 0.0002, "loss": 0.6581, "mean_token_accuracy": 0.8136917654424906, "num_tokens": 1411250439.0, "step": 14600 }, { "entropy": 0.5359558730386198, "epoch": 1.0945053761174384, "grad_norm": 0.20811906456947327, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8206360705196858, "num_tokens": 1412692885.0, "step": 14605 }, { "entropy": 0.5392879160121083, "epoch": 1.0948801000512904, "grad_norm": 0.21380050480365753, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8245377156883478, "num_tokens": 1414173023.0, "step": 14610 }, { "entropy": 0.5679772522300481, "epoch": 1.0952548239851423, "grad_norm": 0.20947645604610443, "learning_rate": 0.0002, "loss": 0.6621, "mean_token_accuracy": 0.8152570880949497, "num_tokens": 1415644389.0, "step": 14615 }, { "entropy": 0.5525785779580474, "epoch": 1.0956295479189941, "grad_norm": 0.2101106196641922, "learning_rate": 0.0002, "loss": 0.6346, "mean_token_accuracy": 0.8168964393436908, "num_tokens": 1417107094.0, "step": 14620 }, { "entropy": 0.5591574445366859, "epoch": 1.096004271852846, "grad_norm": 0.19889377057552338, "learning_rate": 0.0002, "loss": 0.6501, "mean_token_accuracy": 0.8168968711048364, "num_tokens": 1418608338.0, "step": 14625 }, { "entropy": 0.5549933098256588, "epoch": 1.0963789957866978, "grad_norm": 0.20702975988388062, "learning_rate": 0.0002, "loss": 0.6396, "mean_token_accuracy": 0.8196501903235912, "num_tokens": 1420061752.0, "step": 14630 }, { "entropy": 0.5653560036793351, "epoch": 1.0967537197205497, "grad_norm": 0.2175339013338089, "learning_rate": 0.0002, "loss": 0.6694, "mean_token_accuracy": 0.8164181958884, "num_tokens": 1421458647.0, "step": 14635 }, { "entropy": 0.5528178961947561, "epoch": 1.0971284436544015, "grad_norm": 0.2010267972946167, "learning_rate": 0.0002, "loss": 0.6455, "mean_token_accuracy": 0.8185416907072067, "num_tokens": 1422904214.0, "step": 14640 }, { "entropy": 0.5443570051342249, "epoch": 1.0975031675882534, "grad_norm": 0.21130333840847015, "learning_rate": 0.0002, "loss": 0.6311, "mean_token_accuracy": 0.8209510628134012, "num_tokens": 1424347392.0, "step": 14645 }, { "entropy": 0.5462319273501635, "epoch": 1.0978778915221052, "grad_norm": 0.1921384036540985, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.8178047496825457, "num_tokens": 1425832590.0, "step": 14650 }, { "entropy": 0.5637107415124774, "epoch": 1.098252615455957, "grad_norm": 0.21760612726211548, "learning_rate": 0.0002, "loss": 0.6579, "mean_token_accuracy": 0.8166199825704098, "num_tokens": 1427352976.0, "step": 14655 }, { "entropy": 0.5727406905964016, "epoch": 1.098627339389809, "grad_norm": 0.19752588868141174, "learning_rate": 0.0002, "loss": 0.6654, "mean_token_accuracy": 0.8125888898968696, "num_tokens": 1428823038.0, "step": 14660 }, { "entropy": 0.565270409733057, "epoch": 1.0990020633236608, "grad_norm": 0.19834263622760773, "learning_rate": 0.0002, "loss": 0.6554, "mean_token_accuracy": 0.8178054757416249, "num_tokens": 1430305101.0, "step": 14665 }, { "entropy": 0.5637985253706574, "epoch": 1.0993767872575126, "grad_norm": 0.1981198489665985, "learning_rate": 0.0002, "loss": 0.6546, "mean_token_accuracy": 0.8174145169556141, "num_tokens": 1431779198.0, "step": 14670 }, { "entropy": 0.5597010465338826, "epoch": 1.0997515111913645, "grad_norm": 0.2185635268688202, "learning_rate": 0.0002, "loss": 0.6525, "mean_token_accuracy": 0.817577588185668, "num_tokens": 1433281164.0, "step": 14675 }, { "entropy": 0.5414143836125731, "epoch": 1.1001262351252163, "grad_norm": 0.1894814521074295, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8184645250439644, "num_tokens": 1434763287.0, "step": 14680 }, { "entropy": 0.5650319555774331, "epoch": 1.1005009590590682, "grad_norm": 0.20218965411186218, "learning_rate": 0.0002, "loss": 0.6545, "mean_token_accuracy": 0.815010153874755, "num_tokens": 1436305081.0, "step": 14685 }, { "entropy": 0.549727319739759, "epoch": 1.10087568299292, "grad_norm": 0.2178458720445633, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.8199173636734486, "num_tokens": 1437791883.0, "step": 14690 }, { "entropy": 0.5535659283399582, "epoch": 1.101250406926772, "grad_norm": 0.23916444182395935, "learning_rate": 0.0002, "loss": 0.6504, "mean_token_accuracy": 0.8186033859848976, "num_tokens": 1439290998.0, "step": 14695 }, { "entropy": 0.5557127747684717, "epoch": 1.1016251308606237, "grad_norm": 0.2209153026342392, "learning_rate": 0.0002, "loss": 0.6569, "mean_token_accuracy": 0.8162944246083498, "num_tokens": 1440744208.0, "step": 14700 }, { "entropy": 0.5537536526098847, "epoch": 1.1019998547944756, "grad_norm": 0.22863277792930603, "learning_rate": 0.0002, "loss": 0.6406, "mean_token_accuracy": 0.8169410135596991, "num_tokens": 1442231807.0, "step": 14705 }, { "entropy": 0.5586177807301282, "epoch": 1.1023745787283274, "grad_norm": 0.20290504395961761, "learning_rate": 0.0002, "loss": 0.6501, "mean_token_accuracy": 0.8185081388801336, "num_tokens": 1443725061.0, "step": 14710 }, { "entropy": 0.5455389238893986, "epoch": 1.1027493026621793, "grad_norm": 0.22175060212612152, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.8199361026287079, "num_tokens": 1445204035.0, "step": 14715 }, { "entropy": 0.5732291333377362, "epoch": 1.1031240265960311, "grad_norm": 0.23629455268383026, "learning_rate": 0.0002, "loss": 0.6555, "mean_token_accuracy": 0.815789956599474, "num_tokens": 1446680097.0, "step": 14720 }, { "entropy": 0.5497912948019803, "epoch": 1.103498750529883, "grad_norm": 0.19025161862373352, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.8194325771182775, "num_tokens": 1448140781.0, "step": 14725 }, { "entropy": 0.5385194549337029, "epoch": 1.1038734744637348, "grad_norm": 0.20320604741573334, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8207718297839165, "num_tokens": 1449578401.0, "step": 14730 }, { "entropy": 0.5653954019770027, "epoch": 1.1042481983975867, "grad_norm": 0.20300932228565216, "learning_rate": 0.0002, "loss": 0.6541, "mean_token_accuracy": 0.8145166650414467, "num_tokens": 1451031997.0, "step": 14735 }, { "entropy": 0.5491691565141081, "epoch": 1.1046229223314386, "grad_norm": 0.21850232779979706, "learning_rate": 0.0002, "loss": 0.6438, "mean_token_accuracy": 0.8189163222908974, "num_tokens": 1452519812.0, "step": 14740 }, { "entropy": 0.5356855606660247, "epoch": 1.1049976462652904, "grad_norm": 0.2276773452758789, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8235014181584119, "num_tokens": 1453939721.0, "step": 14745 }, { "entropy": 0.5425517324358224, "epoch": 1.1053723701991423, "grad_norm": 0.21804597973823547, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.8203769609332084, "num_tokens": 1455420174.0, "step": 14750 }, { "entropy": 0.5398848893120884, "epoch": 1.1057470941329943, "grad_norm": 0.19716916978359222, "learning_rate": 0.0002, "loss": 0.6337, "mean_token_accuracy": 0.8205219298601151, "num_tokens": 1456888832.0, "step": 14755 }, { "entropy": 0.5399526622146368, "epoch": 1.1061218180668462, "grad_norm": 0.19735552370548248, "learning_rate": 0.0002, "loss": 0.6422, "mean_token_accuracy": 0.8190649956464767, "num_tokens": 1458339795.0, "step": 14760 }, { "entropy": 0.5224553542211652, "epoch": 1.106496542000698, "grad_norm": 0.21237017214298248, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.8250315766781569, "num_tokens": 1459772327.0, "step": 14765 }, { "entropy": 0.5334797080606222, "epoch": 1.1068712659345499, "grad_norm": 0.19843757152557373, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8231952745467425, "num_tokens": 1461221830.0, "step": 14770 }, { "entropy": 0.5454361714422703, "epoch": 1.1072459898684017, "grad_norm": 0.21404814720153809, "learning_rate": 0.0002, "loss": 0.6481, "mean_token_accuracy": 0.8155914843082428, "num_tokens": 1462703751.0, "step": 14775 }, { "entropy": 0.570506426692009, "epoch": 1.1076207138022536, "grad_norm": 0.21922531723976135, "learning_rate": 0.0002, "loss": 0.663, "mean_token_accuracy": 0.8122997187077999, "num_tokens": 1464221803.0, "step": 14780 }, { "entropy": 0.5489460557699204, "epoch": 1.1079954377361054, "grad_norm": 0.20810508728027344, "learning_rate": 0.0002, "loss": 0.6443, "mean_token_accuracy": 0.8188811756670475, "num_tokens": 1465680132.0, "step": 14785 }, { "entropy": 0.5357613189145922, "epoch": 1.1083701616699573, "grad_norm": 0.2018997073173523, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8219130292534829, "num_tokens": 1467100029.0, "step": 14790 }, { "entropy": 0.5519137032330036, "epoch": 1.1087448856038091, "grad_norm": 0.227107971906662, "learning_rate": 0.0002, "loss": 0.6465, "mean_token_accuracy": 0.8170615509152412, "num_tokens": 1468557811.0, "step": 14795 }, { "entropy": 0.5562187919393182, "epoch": 1.109119609537661, "grad_norm": 0.22004340589046478, "learning_rate": 0.0002, "loss": 0.6526, "mean_token_accuracy": 0.8166040249168873, "num_tokens": 1470009089.0, "step": 14800 }, { "entropy": 0.5474573960527778, "epoch": 1.1094943334715128, "grad_norm": 0.20387764275074005, "learning_rate": 0.0002, "loss": 0.63, "mean_token_accuracy": 0.8181474443525076, "num_tokens": 1484077.0, "step": 14805 }, { "entropy": 0.556044804584235, "epoch": 1.1098690574053647, "grad_norm": 0.23942138254642487, "learning_rate": 0.0002, "loss": 0.6445, "mean_token_accuracy": 0.814752047508955, "num_tokens": 2959956.0, "step": 14810 }, { "entropy": 0.5425976568832993, "epoch": 1.1102437813392165, "grad_norm": 0.18736052513122559, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.8237411011010408, "num_tokens": 4423671.0, "step": 14815 }, { "entropy": 0.5484885392710567, "epoch": 1.1106185052730684, "grad_norm": 0.2026047259569168, "learning_rate": 0.0002, "loss": 0.644, "mean_token_accuracy": 0.8162102837115526, "num_tokens": 5893790.0, "step": 14820 }, { "entropy": 0.5322427884675562, "epoch": 1.1109932292069202, "grad_norm": 0.21216827630996704, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8200423959642649, "num_tokens": 7391210.0, "step": 14825 }, { "entropy": 0.5465132679790259, "epoch": 1.111367953140772, "grad_norm": 0.21283237636089325, "learning_rate": 0.0002, "loss": 0.6407, "mean_token_accuracy": 0.8173400431871414, "num_tokens": 8867261.0, "step": 14830 }, { "entropy": 0.550072530284524, "epoch": 1.111742677074624, "grad_norm": 0.19471752643585205, "learning_rate": 0.0002, "loss": 0.6552, "mean_token_accuracy": 0.8147846799343824, "num_tokens": 10358901.0, "step": 14835 }, { "entropy": 0.5372474988922477, "epoch": 1.1121174010084758, "grad_norm": 0.20327116549015045, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.817377121001482, "num_tokens": 11851727.0, "step": 14840 }, { "entropy": 0.5289364499971271, "epoch": 1.1124921249423276, "grad_norm": 0.20660020411014557, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8196778658777475, "num_tokens": 13319491.0, "step": 14845 }, { "entropy": 0.5373098151758313, "epoch": 1.1128668488761795, "grad_norm": 0.20050138235092163, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8204697832465172, "num_tokens": 14818292.0, "step": 14850 }, { "entropy": 0.540898559615016, "epoch": 1.1132415728100313, "grad_norm": 0.19836848974227905, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8182242877781392, "num_tokens": 16308359.0, "step": 14855 }, { "entropy": 0.5569497594609857, "epoch": 1.1136162967438832, "grad_norm": 0.21619990468025208, "learning_rate": 0.0002, "loss": 0.6473, "mean_token_accuracy": 0.8166425008326769, "num_tokens": 17739108.0, "step": 14860 }, { "entropy": 0.5330268120393157, "epoch": 1.113991020677735, "grad_norm": 0.18731294572353363, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.8200934492051601, "num_tokens": 19176926.0, "step": 14865 }, { "entropy": 0.5441520493477583, "epoch": 1.1143657446115869, "grad_norm": 0.21327519416809082, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.8200631435960531, "num_tokens": 20654215.0, "step": 14870 }, { "entropy": 0.54577805288136, "epoch": 1.1147404685454387, "grad_norm": 0.2203986793756485, "learning_rate": 0.0002, "loss": 0.6399, "mean_token_accuracy": 0.8190083306282758, "num_tokens": 22131519.0, "step": 14875 }, { "entropy": 0.5494268979877234, "epoch": 1.1151151924792906, "grad_norm": 0.186459481716156, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.8186204295605422, "num_tokens": 23634013.0, "step": 14880 }, { "entropy": 0.543712162785232, "epoch": 1.1154899164131424, "grad_norm": 0.19724096357822418, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.8226990599185229, "num_tokens": 25124684.0, "step": 14885 }, { "entropy": 0.5460860475897789, "epoch": 1.1158646403469943, "grad_norm": 0.21103477478027344, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8219281803816557, "num_tokens": 26578550.0, "step": 14890 }, { "entropy": 0.5614618854597211, "epoch": 1.1162393642808461, "grad_norm": 0.20315633714199066, "learning_rate": 0.0002, "loss": 0.639, "mean_token_accuracy": 0.8162618570029736, "num_tokens": 28021776.0, "step": 14895 }, { "entropy": 0.5552337417379022, "epoch": 1.116614088214698, "grad_norm": 0.21092790365219116, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.820710777118802, "num_tokens": 29498788.0, "step": 14900 }, { "entropy": 0.5626045827753842, "epoch": 1.1169888121485498, "grad_norm": 0.21158558130264282, "learning_rate": 0.0002, "loss": 0.6433, "mean_token_accuracy": 0.8163279384374619, "num_tokens": 31027520.0, "step": 14905 }, { "entropy": 0.561036404967308, "epoch": 1.1173635360824017, "grad_norm": 0.2226509153842926, "learning_rate": 0.0002, "loss": 0.6437, "mean_token_accuracy": 0.8180532094091177, "num_tokens": 32500928.0, "step": 14910 }, { "entropy": 0.5528013514354825, "epoch": 1.1177382600162535, "grad_norm": 0.21430087089538574, "learning_rate": 0.0002, "loss": 0.6365, "mean_token_accuracy": 0.8213927179574967, "num_tokens": 33992108.0, "step": 14915 }, { "entropy": 0.5507874872535468, "epoch": 1.1181129839501056, "grad_norm": 0.20876987278461456, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.819342116639018, "num_tokens": 35432390.0, "step": 14920 }, { "entropy": 0.5488847121596336, "epoch": 1.1184877078839575, "grad_norm": 0.255197674036026, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.8198106847703457, "num_tokens": 36886587.0, "step": 14925 }, { "entropy": 0.551796723343432, "epoch": 1.1188624318178093, "grad_norm": 0.20266348123550415, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.8183411210775375, "num_tokens": 38358203.0, "step": 14930 }, { "entropy": 0.5414902349933982, "epoch": 1.1192371557516612, "grad_norm": 0.22656649351119995, "learning_rate": 0.0002, "loss": 0.6288, "mean_token_accuracy": 0.8218413092195987, "num_tokens": 39813016.0, "step": 14935 }, { "entropy": 0.5494578950107097, "epoch": 1.119611879685513, "grad_norm": 0.21518121659755707, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.8179727055132389, "num_tokens": 41261267.0, "step": 14940 }, { "entropy": 0.5453797774389386, "epoch": 1.1199866036193649, "grad_norm": 0.2005666196346283, "learning_rate": 0.0002, "loss": 0.6365, "mean_token_accuracy": 0.8179960772395134, "num_tokens": 42739551.0, "step": 14945 }, { "entropy": 0.54025037586689, "epoch": 1.1203613275532167, "grad_norm": 0.2431768774986267, "learning_rate": 0.0002, "loss": 0.6359, "mean_token_accuracy": 0.8198318403214216, "num_tokens": 44214246.0, "step": 14950 }, { "entropy": 0.5510345470160246, "epoch": 1.1207360514870686, "grad_norm": 0.20082350075244904, "learning_rate": 0.0002, "loss": 0.6426, "mean_token_accuracy": 0.8154282800853252, "num_tokens": 45712280.0, "step": 14955 }, { "entropy": 0.571021698601544, "epoch": 1.1211107754209204, "grad_norm": 0.20842529833316803, "learning_rate": 0.0002, "loss": 0.6552, "mean_token_accuracy": 0.8150297958403826, "num_tokens": 47212024.0, "step": 14960 }, { "entropy": 0.5547617430798709, "epoch": 1.1214854993547723, "grad_norm": 0.21200403571128845, "learning_rate": 0.0002, "loss": 0.6454, "mean_token_accuracy": 0.815925657376647, "num_tokens": 48684625.0, "step": 14965 }, { "entropy": 0.5315323358401656, "epoch": 1.1218602232886241, "grad_norm": 0.2021137773990631, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.826298189163208, "num_tokens": 50102412.0, "step": 14970 }, { "entropy": 0.552380215190351, "epoch": 1.122234947222476, "grad_norm": 0.22960692644119263, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.8201796863228082, "num_tokens": 51591023.0, "step": 14975 }, { "entropy": 0.5430444543249905, "epoch": 1.1226096711563278, "grad_norm": 0.21587666869163513, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.8202737364917994, "num_tokens": 53037111.0, "step": 14980 }, { "entropy": 0.55807802695781, "epoch": 1.1229843950901797, "grad_norm": 0.19542017579078674, "learning_rate": 0.0002, "loss": 0.6403, "mean_token_accuracy": 0.8201699826866389, "num_tokens": 54520694.0, "step": 14985 }, { "entropy": 0.551482470612973, "epoch": 1.1233591190240315, "grad_norm": 0.21429021656513214, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.8176700871437788, "num_tokens": 55992801.0, "step": 14990 }, { "entropy": 0.5379483132623136, "epoch": 1.1237338429578834, "grad_norm": 0.21866148710250854, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8223099123686552, "num_tokens": 57447665.0, "step": 14995 }, { "entropy": 0.5620814247056842, "epoch": 1.1241085668917352, "grad_norm": 0.1970924586057663, "learning_rate": 0.0002, "loss": 0.6549, "mean_token_accuracy": 0.8142241407185793, "num_tokens": 58916762.0, "step": 15000 }, { "entropy": 0.5351644292473793, "epoch": 1.124483290825587, "grad_norm": 0.26855477690696716, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.8226510360836983, "num_tokens": 60381019.0, "step": 15005 }, { "entropy": 0.5406121973879635, "epoch": 1.124858014759439, "grad_norm": 0.23005647957324982, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.8204865045845509, "num_tokens": 61886094.0, "step": 15010 }, { "entropy": 0.5415942702442408, "epoch": 1.1252327386932908, "grad_norm": 0.2085975855588913, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.8203611563891172, "num_tokens": 63379895.0, "step": 15015 }, { "entropy": 0.5361521638929844, "epoch": 1.1256074626271426, "grad_norm": 0.21844066679477692, "learning_rate": 0.0002, "loss": 0.6395, "mean_token_accuracy": 0.8191454242914915, "num_tokens": 64819733.0, "step": 15020 }, { "entropy": 0.5429855862632393, "epoch": 1.1259821865609945, "grad_norm": 0.2261941134929657, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.816282944008708, "num_tokens": 66281884.0, "step": 15025 }, { "entropy": 0.5563916793093086, "epoch": 1.1263569104948463, "grad_norm": 0.2153894156217575, "learning_rate": 0.0002, "loss": 0.6408, "mean_token_accuracy": 0.8176049686968326, "num_tokens": 67772637.0, "step": 15030 }, { "entropy": 0.5470106821507216, "epoch": 1.1267316344286982, "grad_norm": 0.21306081116199493, "learning_rate": 0.0002, "loss": 0.6333, "mean_token_accuracy": 0.8200824432075023, "num_tokens": 69240734.0, "step": 15035 }, { "entropy": 0.5540011115372181, "epoch": 1.12710635836255, "grad_norm": 0.20908890664577484, "learning_rate": 0.0002, "loss": 0.6478, "mean_token_accuracy": 0.8175500728189945, "num_tokens": 70736914.0, "step": 15040 }, { "entropy": 0.5624153625220061, "epoch": 1.1274810822964019, "grad_norm": 0.21872106194496155, "learning_rate": 0.0002, "loss": 0.6658, "mean_token_accuracy": 0.8123480468988419, "num_tokens": 72201130.0, "step": 15045 }, { "entropy": 0.5372987704351544, "epoch": 1.1278558062302537, "grad_norm": 0.20366184413433075, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.8196090873330831, "num_tokens": 73664916.0, "step": 15050 }, { "entropy": 0.532637276314199, "epoch": 1.1282305301641056, "grad_norm": 0.20841847360134125, "learning_rate": 0.0002, "loss": 0.6349, "mean_token_accuracy": 0.8179108157753945, "num_tokens": 75139073.0, "step": 15055 }, { "entropy": 0.5452793214470149, "epoch": 1.1286052540979576, "grad_norm": 0.20390678942203522, "learning_rate": 0.0002, "loss": 0.6509, "mean_token_accuracy": 0.8136857025325298, "num_tokens": 76624549.0, "step": 15060 }, { "entropy": 0.5456259002909064, "epoch": 1.1289799780318095, "grad_norm": 0.2105683535337448, "learning_rate": 0.0002, "loss": 0.6422, "mean_token_accuracy": 0.8148301642388105, "num_tokens": 78070562.0, "step": 15065 }, { "entropy": 0.5450002003461123, "epoch": 1.1293547019656613, "grad_norm": 0.2179940938949585, "learning_rate": 0.0002, "loss": 0.6419, "mean_token_accuracy": 0.817929546162486, "num_tokens": 79564790.0, "step": 15070 }, { "entropy": 0.5441508572548628, "epoch": 1.1297294258995132, "grad_norm": 0.19746467471122742, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.8197846952825785, "num_tokens": 81000858.0, "step": 15075 }, { "entropy": 0.532742383889854, "epoch": 1.130104149833365, "grad_norm": 0.2034115344285965, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8207056909799576, "num_tokens": 82449380.0, "step": 15080 }, { "entropy": 0.5359939884394407, "epoch": 1.130478873767217, "grad_norm": 0.2301296442747116, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.8212064549326896, "num_tokens": 83911056.0, "step": 15085 }, { "entropy": 0.5422849178314209, "epoch": 1.1308535977010687, "grad_norm": 0.2035750448703766, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8177955690771341, "num_tokens": 85326132.0, "step": 15090 }, { "entropy": 0.5528631404042244, "epoch": 1.1312283216349206, "grad_norm": 0.19331449270248413, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.8175492972135544, "num_tokens": 86800591.0, "step": 15095 }, { "entropy": 0.5325910503044724, "epoch": 1.1316030455687724, "grad_norm": 0.2019367665052414, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.8196745984256267, "num_tokens": 88261879.0, "step": 15100 }, { "entropy": 0.5351886788383127, "epoch": 1.1319777695026243, "grad_norm": 0.2116633653640747, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8193454083055258, "num_tokens": 89724287.0, "step": 15105 }, { "entropy": 0.5296231569722295, "epoch": 1.1323524934364761, "grad_norm": 0.21775460243225098, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.8215009238570928, "num_tokens": 91156627.0, "step": 15110 }, { "entropy": 0.5401787761598825, "epoch": 1.132727217370328, "grad_norm": 0.2273242175579071, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.8201185207813978, "num_tokens": 92597095.0, "step": 15115 }, { "entropy": 0.5304280422627926, "epoch": 1.1331019413041798, "grad_norm": 0.22146892547607422, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8218138754367829, "num_tokens": 94036814.0, "step": 15120 }, { "entropy": 0.5337586468085647, "epoch": 1.1334766652380317, "grad_norm": 0.20946064591407776, "learning_rate": 0.0002, "loss": 0.6352, "mean_token_accuracy": 0.8199427973479032, "num_tokens": 95481782.0, "step": 15125 }, { "entropy": 0.5321562437340617, "epoch": 1.1338513891718836, "grad_norm": 0.24882495403289795, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.820560235530138, "num_tokens": 96962272.0, "step": 15130 }, { "entropy": 0.5447107352316379, "epoch": 1.1342261131057354, "grad_norm": 0.21527352929115295, "learning_rate": 0.0002, "loss": 0.6434, "mean_token_accuracy": 0.8159884605556726, "num_tokens": 98430759.0, "step": 15135 }, { "entropy": 0.5321358142420649, "epoch": 1.1346008370395873, "grad_norm": 0.20559605956077576, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.8191003751009702, "num_tokens": 99903150.0, "step": 15140 }, { "entropy": 0.5316575452685356, "epoch": 1.134975560973439, "grad_norm": 0.21442915499210358, "learning_rate": 0.0002, "loss": 0.639, "mean_token_accuracy": 0.819050346314907, "num_tokens": 101386339.0, "step": 15145 }, { "entropy": 0.5193843960762023, "epoch": 1.135350284907291, "grad_norm": 0.21290303766727448, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8190418004989624, "num_tokens": 102823668.0, "step": 15150 }, { "entropy": 0.5275376584380865, "epoch": 1.1357250088411428, "grad_norm": 0.21151678264141083, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.821742407605052, "num_tokens": 104303553.0, "step": 15155 }, { "entropy": 0.5330424633808434, "epoch": 1.1360997327749947, "grad_norm": 0.2144862562417984, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8181955683976412, "num_tokens": 105773408.0, "step": 15160 }, { "entropy": 0.5217539303004741, "epoch": 1.1364744567088465, "grad_norm": 0.20025098323822021, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8213744677603245, "num_tokens": 107276051.0, "step": 15165 }, { "entropy": 0.5293426116928458, "epoch": 1.1368491806426984, "grad_norm": 0.21238376200199127, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.8199860133230686, "num_tokens": 108753274.0, "step": 15170 }, { "entropy": 0.5332580766640603, "epoch": 1.1372239045765502, "grad_norm": 0.20353296399116516, "learning_rate": 0.0002, "loss": 0.6421, "mean_token_accuracy": 0.8174023393541574, "num_tokens": 110197941.0, "step": 15175 }, { "entropy": 0.5311019914224744, "epoch": 1.137598628510402, "grad_norm": 0.20914459228515625, "learning_rate": 0.0002, "loss": 0.6367, "mean_token_accuracy": 0.8210623104125261, "num_tokens": 111694051.0, "step": 15180 }, { "entropy": 0.5099160736426711, "epoch": 1.137973352444254, "grad_norm": 0.19842371344566345, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.8229954529553651, "num_tokens": 113156825.0, "step": 15185 }, { "entropy": 0.5399016436189413, "epoch": 1.1383480763781058, "grad_norm": 0.21859969198703766, "learning_rate": 0.0002, "loss": 0.6514, "mean_token_accuracy": 0.8169912490993738, "num_tokens": 114624328.0, "step": 15190 }, { "entropy": 0.5260954646393656, "epoch": 1.1387228003119576, "grad_norm": 0.3734649419784546, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8192448571324349, "num_tokens": 116098511.0, "step": 15195 }, { "entropy": 0.5141025102697313, "epoch": 1.1390975242458095, "grad_norm": 0.2287501096725464, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.8238578084856272, "num_tokens": 117563315.0, "step": 15200 }, { "entropy": 0.522826767154038, "epoch": 1.1394722481796613, "grad_norm": 0.3100949823856354, "learning_rate": 0.0002, "loss": 0.6401, "mean_token_accuracy": 0.8175498560070992, "num_tokens": 119016469.0, "step": 15205 }, { "entropy": 0.5305403196252882, "epoch": 1.1398469721135132, "grad_norm": 0.20985595881938934, "learning_rate": 0.0002, "loss": 0.6415, "mean_token_accuracy": 0.8178046740591526, "num_tokens": 120500950.0, "step": 15210 }, { "entropy": 0.5367335379123688, "epoch": 1.140221696047365, "grad_norm": 0.2131592035293579, "learning_rate": 0.0002, "loss": 0.65, "mean_token_accuracy": 0.8144775282591581, "num_tokens": 121995552.0, "step": 15215 }, { "entropy": 0.525459230877459, "epoch": 1.1405964199812169, "grad_norm": 0.2190428376197815, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.8194598857313394, "num_tokens": 123467787.0, "step": 15220 }, { "entropy": 0.5255183830857277, "epoch": 1.1409711439150687, "grad_norm": 0.19596190750598907, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.8225025944411755, "num_tokens": 124906096.0, "step": 15225 }, { "entropy": 0.5114963557571173, "epoch": 1.1413458678489206, "grad_norm": 0.22511574625968933, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.8265996146947145, "num_tokens": 126359143.0, "step": 15230 }, { "entropy": 0.5343926470726729, "epoch": 1.1417205917827726, "grad_norm": 0.21172299981117249, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8182851213961839, "num_tokens": 127845175.0, "step": 15235 }, { "entropy": 0.5329928023740649, "epoch": 1.1420953157166245, "grad_norm": 0.20532119274139404, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.8186819586902857, "num_tokens": 129294849.0, "step": 15240 }, { "entropy": 0.5394835380837322, "epoch": 1.1424700396504763, "grad_norm": 0.21265849471092224, "learning_rate": 0.0002, "loss": 0.6448, "mean_token_accuracy": 0.8157801706343889, "num_tokens": 130756660.0, "step": 15245 }, { "entropy": 0.5474838815629482, "epoch": 1.1428447635843282, "grad_norm": 0.20046614110469818, "learning_rate": 0.0002, "loss": 0.65, "mean_token_accuracy": 0.8181062605232, "num_tokens": 132234447.0, "step": 15250 }, { "entropy": 0.5453504051081837, "epoch": 1.14321948751818, "grad_norm": 0.21078066527843475, "learning_rate": 0.0002, "loss": 0.6526, "mean_token_accuracy": 0.8161810170859098, "num_tokens": 133735272.0, "step": 15255 }, { "entropy": 0.5330930143594742, "epoch": 1.1435942114520319, "grad_norm": 0.2182839959859848, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.8247507981956005, "num_tokens": 135201402.0, "step": 15260 }, { "entropy": 0.5321950618177652, "epoch": 1.1439689353858837, "grad_norm": 0.2534467279911041, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8230598010122776, "num_tokens": 136637320.0, "step": 15265 }, { "entropy": 0.5281602436676621, "epoch": 1.1443436593197356, "grad_norm": 0.19628117978572845, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.8211598090827466, "num_tokens": 138101246.0, "step": 15270 }, { "entropy": 0.5302605951204896, "epoch": 1.1447183832535874, "grad_norm": 0.22090424597263336, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8186950672417879, "num_tokens": 139565429.0, "step": 15275 }, { "entropy": 0.5398764222860336, "epoch": 1.1450931071874393, "grad_norm": 0.24044984579086304, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.8193018190562725, "num_tokens": 140986263.0, "step": 15280 }, { "entropy": 0.5512881882488727, "epoch": 1.1454678311212911, "grad_norm": 0.22060343623161316, "learning_rate": 0.0002, "loss": 0.6473, "mean_token_accuracy": 0.8170196540653706, "num_tokens": 142443570.0, "step": 15285 }, { "entropy": 0.5598591832444072, "epoch": 1.145842555055143, "grad_norm": 0.2157810777425766, "learning_rate": 0.0002, "loss": 0.6449, "mean_token_accuracy": 0.8189783461391926, "num_tokens": 143894734.0, "step": 15290 }, { "entropy": 0.5462834438309073, "epoch": 1.1462172789889948, "grad_norm": 0.20989978313446045, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8191843900829554, "num_tokens": 145333549.0, "step": 15295 }, { "entropy": 0.5316857943311334, "epoch": 1.1465920029228467, "grad_norm": 0.20955568552017212, "learning_rate": 0.0002, "loss": 0.6155, "mean_token_accuracy": 0.8236037280410529, "num_tokens": 146812994.0, "step": 15300 }, { "entropy": 0.5417523615062236, "epoch": 1.1469667268566985, "grad_norm": 0.2763894200325012, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8223990291357041, "num_tokens": 148263370.0, "step": 15305 }, { "entropy": 0.5430322857573628, "epoch": 1.1473414507905504, "grad_norm": 0.1940237581729889, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.8206913940608501, "num_tokens": 149714093.0, "step": 15310 }, { "entropy": 0.5507472092285752, "epoch": 1.1477161747244022, "grad_norm": 0.20448075234889984, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.8159052774310112, "num_tokens": 151169075.0, "step": 15315 }, { "entropy": 0.5485115339979529, "epoch": 1.148090898658254, "grad_norm": 0.36480745673179626, "learning_rate": 0.0002, "loss": 0.6485, "mean_token_accuracy": 0.8158110536634922, "num_tokens": 152617577.0, "step": 15320 }, { "entropy": 0.5190546916797757, "epoch": 1.148465622592106, "grad_norm": 0.19891002774238586, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.8205499835312366, "num_tokens": 154125863.0, "step": 15325 }, { "entropy": 0.5017066974192858, "epoch": 1.1488403465259578, "grad_norm": 0.2155945599079132, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.822806203365326, "num_tokens": 155576369.0, "step": 15330 }, { "entropy": 0.5241659317165613, "epoch": 1.1492150704598096, "grad_norm": 0.19727657735347748, "learning_rate": 0.0002, "loss": 0.6392, "mean_token_accuracy": 0.8186828888952732, "num_tokens": 157031244.0, "step": 15335 }, { "entropy": 0.528219484910369, "epoch": 1.1495897943936615, "grad_norm": 0.21721024811267853, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8179951127618551, "num_tokens": 158528905.0, "step": 15340 }, { "entropy": 0.5300562996417284, "epoch": 1.1499645183275133, "grad_norm": 0.21147538721561432, "learning_rate": 0.0002, "loss": 0.6407, "mean_token_accuracy": 0.8204207874834537, "num_tokens": 159969943.0, "step": 15345 }, { "entropy": 0.5340174475684762, "epoch": 1.1503392422613652, "grad_norm": 0.22343291342258453, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.820684676989913, "num_tokens": 161428602.0, "step": 15350 }, { "entropy": 0.5252409229055047, "epoch": 1.150713966195217, "grad_norm": 0.22462525963783264, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.8223578620702028, "num_tokens": 162852841.0, "step": 15355 }, { "entropy": 0.5400621496140957, "epoch": 1.151088690129069, "grad_norm": 0.21711023151874542, "learning_rate": 0.0002, "loss": 0.6413, "mean_token_accuracy": 0.8178456269204617, "num_tokens": 164313272.0, "step": 15360 }, { "entropy": 0.526811120286584, "epoch": 1.1514634140629207, "grad_norm": 0.21423554420471191, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8202658675611019, "num_tokens": 165754564.0, "step": 15365 }, { "entropy": 0.5372701779007911, "epoch": 1.1518381379967726, "grad_norm": 0.21755488216876984, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8194625250995159, "num_tokens": 167241875.0, "step": 15370 }, { "entropy": 0.5361608905717731, "epoch": 1.1522128619306247, "grad_norm": 0.1967310905456543, "learning_rate": 0.0002, "loss": 0.6379, "mean_token_accuracy": 0.8186418302357197, "num_tokens": 168712553.0, "step": 15375 }, { "entropy": 0.544328686222434, "epoch": 1.1525875858644765, "grad_norm": 0.22029869258403778, "learning_rate": 0.0002, "loss": 0.6482, "mean_token_accuracy": 0.8167981572449208, "num_tokens": 170144238.0, "step": 15380 }, { "entropy": 0.5518952203914523, "epoch": 1.1529623097983284, "grad_norm": 0.20084990561008453, "learning_rate": 0.0002, "loss": 0.6467, "mean_token_accuracy": 0.8165990721434355, "num_tokens": 171605748.0, "step": 15385 }, { "entropy": 0.5382013913244009, "epoch": 1.1533370337321802, "grad_norm": 0.1941797286272049, "learning_rate": 0.0002, "loss": 0.6311, "mean_token_accuracy": 0.8209801077842712, "num_tokens": 173080454.0, "step": 15390 }, { "entropy": 0.5402020651847124, "epoch": 1.153711757666032, "grad_norm": 0.21004441380500793, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8197187792509795, "num_tokens": 174556740.0, "step": 15395 }, { "entropy": 0.5398525016382336, "epoch": 1.154086481599884, "grad_norm": 0.22164317965507507, "learning_rate": 0.0002, "loss": 0.6355, "mean_token_accuracy": 0.819812186434865, "num_tokens": 176051016.0, "step": 15400 }, { "entropy": 0.5255119660869241, "epoch": 1.1544612055337358, "grad_norm": 0.2328360229730606, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.8240885999053716, "num_tokens": 177497104.0, "step": 15405 }, { "entropy": 0.5415921236388386, "epoch": 1.1548359294675876, "grad_norm": 0.22052066028118134, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.8169951695948839, "num_tokens": 178997288.0, "step": 15410 }, { "entropy": 0.5517549853771925, "epoch": 1.1552106534014395, "grad_norm": 0.19578568637371063, "learning_rate": 0.0002, "loss": 0.6546, "mean_token_accuracy": 0.8143110040575265, "num_tokens": 180504664.0, "step": 15415 }, { "entropy": 0.5365440970286727, "epoch": 1.1555853773352913, "grad_norm": 0.2045038491487503, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.8221670623868704, "num_tokens": 181951072.0, "step": 15420 }, { "entropy": 0.5387387973256409, "epoch": 1.1559601012691432, "grad_norm": 0.2148345410823822, "learning_rate": 0.0002, "loss": 0.6332, "mean_token_accuracy": 0.816723246127367, "num_tokens": 183394879.0, "step": 15425 }, { "entropy": 0.5348810847848654, "epoch": 1.156334825202995, "grad_norm": 0.2187887728214264, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.8214800741523505, "num_tokens": 184853595.0, "step": 15430 }, { "entropy": 0.5395804699510336, "epoch": 1.1567095491368469, "grad_norm": 0.23868702352046967, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8204024005681276, "num_tokens": 186328033.0, "step": 15435 }, { "entropy": 0.5270075459033251, "epoch": 1.1570842730706987, "grad_norm": 0.19961439073085785, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8209889769554138, "num_tokens": 187828503.0, "step": 15440 }, { "entropy": 0.538202129676938, "epoch": 1.1574589970045506, "grad_norm": 0.2016802281141281, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.8224097162485122, "num_tokens": 189303709.0, "step": 15445 }, { "entropy": 0.5310984263196588, "epoch": 1.1578337209384024, "grad_norm": 0.21407707035541534, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8204619154334069, "num_tokens": 190783002.0, "step": 15450 }, { "entropy": 0.5413730069994926, "epoch": 1.1582084448722543, "grad_norm": 0.20956501364707947, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.8177123624831438, "num_tokens": 192284149.0, "step": 15455 }, { "entropy": 0.5404916737228632, "epoch": 1.1585831688061061, "grad_norm": 0.22518739104270935, "learning_rate": 0.0002, "loss": 0.6377, "mean_token_accuracy": 0.8183807257562876, "num_tokens": 193750910.0, "step": 15460 }, { "entropy": 0.5159136233851314, "epoch": 1.158957892739958, "grad_norm": 0.2303938865661621, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.8268997676670551, "num_tokens": 195189887.0, "step": 15465 }, { "entropy": 0.5326991154812276, "epoch": 1.1593326166738098, "grad_norm": 0.2188839614391327, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8201083291321993, "num_tokens": 196652402.0, "step": 15470 }, { "entropy": 0.53614745689556, "epoch": 1.1597073406076617, "grad_norm": 0.21910086274147034, "learning_rate": 0.0002, "loss": 0.6317, "mean_token_accuracy": 0.8197275221347808, "num_tokens": 198138425.0, "step": 15475 }, { "entropy": 0.5437615728005767, "epoch": 1.1600820645415135, "grad_norm": 0.19537511467933655, "learning_rate": 0.0002, "loss": 0.637, "mean_token_accuracy": 0.8188305478543043, "num_tokens": 199582334.0, "step": 15480 }, { "entropy": 0.5539979325607419, "epoch": 1.1604567884753654, "grad_norm": 0.2070092111825943, "learning_rate": 0.0002, "loss": 0.6496, "mean_token_accuracy": 0.8166499048471451, "num_tokens": 201058289.0, "step": 15485 }, { "entropy": 0.5458498351275921, "epoch": 1.1608315124092172, "grad_norm": 0.20136721432209015, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.81850645840168, "num_tokens": 202493185.0, "step": 15490 }, { "entropy": 0.5534692088142037, "epoch": 1.161206236343069, "grad_norm": 0.20873668789863586, "learning_rate": 0.0002, "loss": 0.6446, "mean_token_accuracy": 0.8143421687185765, "num_tokens": 203994664.0, "step": 15495 }, { "entropy": 0.5605338108725846, "epoch": 1.161580960276921, "grad_norm": 0.2203768938779831, "learning_rate": 0.0002, "loss": 0.6479, "mean_token_accuracy": 0.8146719306707382, "num_tokens": 205458311.0, "step": 15500 }, { "entropy": 0.5433472720906138, "epoch": 1.1619556842107728, "grad_norm": 0.2078513354063034, "learning_rate": 0.0002, "loss": 0.6311, "mean_token_accuracy": 0.8185877475887537, "num_tokens": 206903413.0, "step": 15505 }, { "entropy": 0.5573907129466533, "epoch": 1.1623304081446246, "grad_norm": 0.21332654356956482, "learning_rate": 0.0002, "loss": 0.6437, "mean_token_accuracy": 0.8177033387124538, "num_tokens": 208391111.0, "step": 15510 }, { "entropy": 0.5428014675155282, "epoch": 1.1627051320784765, "grad_norm": 0.21726427972316742, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.8190674386918545, "num_tokens": 209846016.0, "step": 15515 }, { "entropy": 0.5559645196422934, "epoch": 1.1630798560123283, "grad_norm": 0.20993110537528992, "learning_rate": 0.0002, "loss": 0.6424, "mean_token_accuracy": 0.8155396573245526, "num_tokens": 211365805.0, "step": 15520 }, { "entropy": 0.5354180200025439, "epoch": 1.1634545799461802, "grad_norm": 0.24249009788036346, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.8216710325330496, "num_tokens": 212834433.0, "step": 15525 }, { "entropy": 0.5309402983635664, "epoch": 1.163829303880032, "grad_norm": 0.20715078711509705, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.8241958968341351, "num_tokens": 214323469.0, "step": 15530 }, { "entropy": 0.5483285965397954, "epoch": 1.1642040278138839, "grad_norm": 0.21144481003284454, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.8180387418717145, "num_tokens": 215792687.0, "step": 15535 }, { "entropy": 0.5481603149324655, "epoch": 1.1645787517477357, "grad_norm": 0.21006131172180176, "learning_rate": 0.0002, "loss": 0.65, "mean_token_accuracy": 0.819357018917799, "num_tokens": 217251229.0, "step": 15540 }, { "entropy": 0.5470278142020106, "epoch": 1.1649534756815878, "grad_norm": 0.21808145940303802, "learning_rate": 0.0002, "loss": 0.648, "mean_token_accuracy": 0.8167155593633652, "num_tokens": 218700911.0, "step": 15545 }, { "entropy": 0.5573697770014405, "epoch": 1.1653281996154397, "grad_norm": 0.2358187437057495, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.8156570788472891, "num_tokens": 220170243.0, "step": 15550 }, { "entropy": 0.5220856046304106, "epoch": 1.1657029235492915, "grad_norm": 0.20753911137580872, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.8215243589133024, "num_tokens": 221612235.0, "step": 15555 }, { "entropy": 0.5435627460479736, "epoch": 1.1660776474831434, "grad_norm": 0.22171235084533691, "learning_rate": 0.0002, "loss": 0.6371, "mean_token_accuracy": 0.8176844537258148, "num_tokens": 223090251.0, "step": 15560 }, { "entropy": 0.5398770313709974, "epoch": 1.1664523714169952, "grad_norm": 0.2790617346763611, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8247738528996706, "num_tokens": 224568754.0, "step": 15565 }, { "entropy": 0.5564304480329156, "epoch": 1.166827095350847, "grad_norm": 0.21659108996391296, "learning_rate": 0.0002, "loss": 0.6414, "mean_token_accuracy": 0.816242978721857, "num_tokens": 226078258.0, "step": 15570 }, { "entropy": 0.5562395635992289, "epoch": 1.167201819284699, "grad_norm": 0.21683292090892792, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8184576023370027, "num_tokens": 227557944.0, "step": 15575 }, { "entropy": 0.5559782991185784, "epoch": 1.1675765432185508, "grad_norm": 0.21921786665916443, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8174417153000831, "num_tokens": 228974409.0, "step": 15580 }, { "entropy": 0.5625852836295963, "epoch": 1.1679512671524026, "grad_norm": 0.2026546448469162, "learning_rate": 0.0002, "loss": 0.6412, "mean_token_accuracy": 0.8211329702287913, "num_tokens": 230426679.0, "step": 15585 }, { "entropy": 0.5384157553315163, "epoch": 1.1683259910862545, "grad_norm": 0.19031989574432373, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.8210568785667419, "num_tokens": 231893353.0, "step": 15590 }, { "entropy": 0.5475016308948397, "epoch": 1.1687007150201063, "grad_norm": 0.214310884475708, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8168384753167629, "num_tokens": 233392148.0, "step": 15595 }, { "entropy": 0.5480816052295268, "epoch": 1.1690754389539582, "grad_norm": 0.22403517365455627, "learning_rate": 0.0002, "loss": 0.6388, "mean_token_accuracy": 0.8208647321909666, "num_tokens": 234886612.0, "step": 15600 }, { "entropy": 0.542025501281023, "epoch": 1.16945016288781, "grad_norm": 0.21326294541358948, "learning_rate": 0.0002, "loss": 0.6411, "mean_token_accuracy": 0.8181887954473496, "num_tokens": 236337161.0, "step": 15605 }, { "entropy": 0.5527969089336693, "epoch": 1.1698248868216619, "grad_norm": 0.22686174511909485, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.8181416552513838, "num_tokens": 237834971.0, "step": 15610 }, { "entropy": 0.5647339763119816, "epoch": 1.1701996107555137, "grad_norm": 0.20381702482700348, "learning_rate": 0.0002, "loss": 0.6499, "mean_token_accuracy": 0.8164297133684159, "num_tokens": 239333177.0, "step": 15615 }, { "entropy": 0.5363212353549898, "epoch": 1.1705743346893656, "grad_norm": 0.19540278613567352, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.8209344778209925, "num_tokens": 240849723.0, "step": 15620 }, { "entropy": 0.5406083885580302, "epoch": 1.1709490586232174, "grad_norm": 0.22689451277256012, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8192193541675806, "num_tokens": 242307013.0, "step": 15625 }, { "entropy": 0.5326429000124335, "epoch": 1.1713237825570693, "grad_norm": 0.19441914558410645, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8200698867440224, "num_tokens": 243773985.0, "step": 15630 }, { "entropy": 0.5334110245108604, "epoch": 1.171698506490921, "grad_norm": 0.2105536162853241, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.8241670280694962, "num_tokens": 245229264.0, "step": 15635 }, { "entropy": 0.5392092062160373, "epoch": 1.172073230424773, "grad_norm": 0.2516617774963379, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.8203759532421827, "num_tokens": 246703778.0, "step": 15640 }, { "entropy": 0.5451305215246975, "epoch": 1.1724479543586248, "grad_norm": 0.2040332704782486, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.8192308589816093, "num_tokens": 248141898.0, "step": 15645 }, { "entropy": 0.55029900893569, "epoch": 1.1728226782924767, "grad_norm": 0.21324849128723145, "learning_rate": 0.0002, "loss": 0.6332, "mean_token_accuracy": 0.8174972482025623, "num_tokens": 249611285.0, "step": 15650 }, { "entropy": 0.5642278654500842, "epoch": 1.1731974022263285, "grad_norm": 0.1984548717737198, "learning_rate": 0.0002, "loss": 0.6459, "mean_token_accuracy": 0.8182731907814741, "num_tokens": 251110829.0, "step": 15655 }, { "entropy": 0.5515925656072795, "epoch": 1.1735721261601804, "grad_norm": 0.23628295958042145, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8205199602991342, "num_tokens": 252579607.0, "step": 15660 }, { "entropy": 0.5589025454595685, "epoch": 1.1739468500940322, "grad_norm": 0.2087000012397766, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8214136876165867, "num_tokens": 254046016.0, "step": 15665 }, { "entropy": 0.568957757204771, "epoch": 1.174321574027884, "grad_norm": 0.20907878875732422, "learning_rate": 0.0002, "loss": 0.6403, "mean_token_accuracy": 0.8195033933967352, "num_tokens": 255530377.0, "step": 15670 }, { "entropy": 0.5815182905644178, "epoch": 1.174696297961736, "grad_norm": 0.20662961900234222, "learning_rate": 0.0002, "loss": 0.6494, "mean_token_accuracy": 0.8189668171107769, "num_tokens": 257031733.0, "step": 15675 }, { "entropy": 0.5663510909304023, "epoch": 1.1750710218955878, "grad_norm": 0.19948314130306244, "learning_rate": 0.0002, "loss": 0.6421, "mean_token_accuracy": 0.8186105601489544, "num_tokens": 258515035.0, "step": 15680 }, { "entropy": 0.5339896624907852, "epoch": 1.1754457458294398, "grad_norm": 0.20650537312030792, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.826084217429161, "num_tokens": 259986807.0, "step": 15685 }, { "entropy": 0.555138418264687, "epoch": 1.1758204697632917, "grad_norm": 0.21611273288726807, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8202888533473015, "num_tokens": 261495752.0, "step": 15690 }, { "entropy": 0.5688262324780226, "epoch": 1.1761951936971435, "grad_norm": 0.19569611549377441, "learning_rate": 0.0002, "loss": 0.641, "mean_token_accuracy": 0.8205148797482252, "num_tokens": 262979566.0, "step": 15695 }, { "entropy": 0.5596666960045695, "epoch": 1.1765699176309954, "grad_norm": 0.2141554355621338, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.8189047433435916, "num_tokens": 264414828.0, "step": 15700 }, { "entropy": 0.5627059841528534, "epoch": 1.1769446415648472, "grad_norm": 0.2144629806280136, "learning_rate": 0.0002, "loss": 0.6374, "mean_token_accuracy": 0.8181395318359137, "num_tokens": 265902644.0, "step": 15705 }, { "entropy": 0.564560879766941, "epoch": 1.177319365498699, "grad_norm": 0.234091654419899, "learning_rate": 0.0002, "loss": 0.6486, "mean_token_accuracy": 0.816453255712986, "num_tokens": 267388248.0, "step": 15710 }, { "entropy": 0.5580318549647927, "epoch": 1.177694089432551, "grad_norm": 0.2001536339521408, "learning_rate": 0.0002, "loss": 0.6413, "mean_token_accuracy": 0.8176109362393618, "num_tokens": 268866627.0, "step": 15715 }, { "entropy": 0.5448968028649688, "epoch": 1.1780688133664028, "grad_norm": 0.22129973769187927, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.8207923617213965, "num_tokens": 270328455.0, "step": 15720 }, { "entropy": 0.5386389346793294, "epoch": 1.1784435373002546, "grad_norm": 0.193189337849617, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.8211586512625217, "num_tokens": 271784371.0, "step": 15725 }, { "entropy": 0.5424343606457114, "epoch": 1.1788182612341065, "grad_norm": 0.21259663999080658, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.8204525269567966, "num_tokens": 273265680.0, "step": 15730 }, { "entropy": 0.5488002516329289, "epoch": 1.1791929851679583, "grad_norm": 0.2207118570804596, "learning_rate": 0.0002, "loss": 0.6372, "mean_token_accuracy": 0.8182060658931732, "num_tokens": 274732644.0, "step": 15735 }, { "entropy": 0.5518508333712816, "epoch": 1.1795677091018102, "grad_norm": 0.29898571968078613, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8192952636629343, "num_tokens": 276215442.0, "step": 15740 }, { "entropy": 0.5408744651824235, "epoch": 1.179942433035662, "grad_norm": 0.21727903187274933, "learning_rate": 0.0002, "loss": 0.637, "mean_token_accuracy": 0.8181692454963923, "num_tokens": 277691861.0, "step": 15745 }, { "entropy": 0.5400547128170728, "epoch": 1.180317156969514, "grad_norm": 0.22648996114730835, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.8196690931916237, "num_tokens": 279188699.0, "step": 15750 }, { "entropy": 0.5520823320373893, "epoch": 1.1806918809033657, "grad_norm": 0.21730203926563263, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.8206083320081234, "num_tokens": 280666556.0, "step": 15755 }, { "entropy": 0.5348713636398316, "epoch": 1.1810666048372176, "grad_norm": 0.25026923418045044, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.8196978617459536, "num_tokens": 282130170.0, "step": 15760 }, { "entropy": 0.5507078228518367, "epoch": 1.1814413287710694, "grad_norm": 0.21302105486392975, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8198738295584918, "num_tokens": 283571931.0, "step": 15765 }, { "entropy": 0.5532035553827882, "epoch": 1.1818160527049213, "grad_norm": 0.22440044581890106, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.822601592540741, "num_tokens": 285052437.0, "step": 15770 }, { "entropy": 0.5613814620301127, "epoch": 1.1821907766387731, "grad_norm": 0.19988325238227844, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.8176384013146162, "num_tokens": 286570532.0, "step": 15775 }, { "entropy": 0.5490282801911235, "epoch": 1.182565500572625, "grad_norm": 0.2406320869922638, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.8220926683396101, "num_tokens": 287951116.0, "step": 15780 }, { "entropy": 0.5368426701053977, "epoch": 1.1829402245064768, "grad_norm": 0.2515767216682434, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.825744142010808, "num_tokens": 289380825.0, "step": 15785 }, { "entropy": 0.5762920379638672, "epoch": 1.1833149484403287, "grad_norm": 0.21567979454994202, "learning_rate": 0.0002, "loss": 0.6532, "mean_token_accuracy": 0.8170733753591776, "num_tokens": 290874123.0, "step": 15790 }, { "entropy": 0.5618729570880532, "epoch": 1.1836896723741805, "grad_norm": 0.21933582425117493, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.8203139711171389, "num_tokens": 292360676.0, "step": 15795 }, { "entropy": 0.5531045898795128, "epoch": 1.1840643963080324, "grad_norm": 0.20935362577438354, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.8230548977851868, "num_tokens": 293801772.0, "step": 15800 }, { "entropy": 0.5451251054182649, "epoch": 1.1844391202418842, "grad_norm": 0.21271122992038727, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8224934607744216, "num_tokens": 295262048.0, "step": 15805 }, { "entropy": 0.5455452216789126, "epoch": 1.184813844175736, "grad_norm": 0.21479955315589905, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.8204073447734117, "num_tokens": 296691773.0, "step": 15810 }, { "entropy": 0.5490612523630262, "epoch": 1.185188568109588, "grad_norm": 0.20872144401073456, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8184580247849226, "num_tokens": 298154531.0, "step": 15815 }, { "entropy": 0.568153737206012, "epoch": 1.1855632920434398, "grad_norm": 0.21482737362384796, "learning_rate": 0.0002, "loss": 0.6518, "mean_token_accuracy": 0.8157465234398842, "num_tokens": 299634137.0, "step": 15820 }, { "entropy": 0.5482039192691446, "epoch": 1.1859380159772916, "grad_norm": 0.2217259258031845, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.8183993712067604, "num_tokens": 301085444.0, "step": 15825 }, { "entropy": 0.5694194626063108, "epoch": 1.1863127399111435, "grad_norm": 0.2051529735326767, "learning_rate": 0.0002, "loss": 0.655, "mean_token_accuracy": 0.8155386734753847, "num_tokens": 302557996.0, "step": 15830 }, { "entropy": 0.568057070299983, "epoch": 1.1866874638449953, "grad_norm": 0.21838122606277466, "learning_rate": 0.0002, "loss": 0.6482, "mean_token_accuracy": 0.8161941315978766, "num_tokens": 304047765.0, "step": 15835 }, { "entropy": 0.5604499012231827, "epoch": 1.1870621877788472, "grad_norm": 0.2108950912952423, "learning_rate": 0.0002, "loss": 0.6372, "mean_token_accuracy": 0.8172587636858225, "num_tokens": 305541863.0, "step": 15840 }, { "entropy": 0.5709219595417381, "epoch": 1.187436911712699, "grad_norm": 0.22775912284851074, "learning_rate": 0.0002, "loss": 0.6492, "mean_token_accuracy": 0.8151153843849898, "num_tokens": 307010942.0, "step": 15845 }, { "entropy": 0.5559785844758153, "epoch": 1.187811635646551, "grad_norm": 0.21514509618282318, "learning_rate": 0.0002, "loss": 0.6346, "mean_token_accuracy": 0.8226537246257066, "num_tokens": 308465269.0, "step": 15850 }, { "entropy": 0.556635432317853, "epoch": 1.1881863595804028, "grad_norm": 0.24835966527462006, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.8194680139422417, "num_tokens": 309936118.0, "step": 15855 }, { "entropy": 0.554649998806417, "epoch": 1.1885610835142548, "grad_norm": 0.2204464077949524, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.819863598421216, "num_tokens": 311402765.0, "step": 15860 }, { "entropy": 0.5519942246377468, "epoch": 1.1889358074481067, "grad_norm": 0.20727533102035522, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8208970680832863, "num_tokens": 312852199.0, "step": 15865 }, { "entropy": 0.5639481995254755, "epoch": 1.1893105313819585, "grad_norm": 0.2029898762702942, "learning_rate": 0.0002, "loss": 0.6406, "mean_token_accuracy": 0.8154022473841905, "num_tokens": 314319855.0, "step": 15870 }, { "entropy": 0.5500311387702823, "epoch": 1.1896852553158104, "grad_norm": 0.23906545341014862, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.8234933137893676, "num_tokens": 315718551.0, "step": 15875 }, { "entropy": 0.5607378934510052, "epoch": 1.1900599792496622, "grad_norm": 0.21106642484664917, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8183411449193955, "num_tokens": 317179891.0, "step": 15880 }, { "entropy": 0.5674335801973939, "epoch": 1.190434703183514, "grad_norm": 0.20076721906661987, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.8224584870040417, "num_tokens": 318635917.0, "step": 15885 }, { "entropy": 0.5615517290309071, "epoch": 1.190809427117366, "grad_norm": 0.19698385894298553, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.8173488304018974, "num_tokens": 320116858.0, "step": 15890 }, { "entropy": 0.5904022643342615, "epoch": 1.1911841510512178, "grad_norm": 0.2246994525194168, "learning_rate": 0.0002, "loss": 0.6697, "mean_token_accuracy": 0.8179963883012533, "num_tokens": 321579424.0, "step": 15895 }, { "entropy": 0.5713071178644895, "epoch": 1.1915588749850696, "grad_norm": 0.1962047964334488, "learning_rate": 0.0002, "loss": 0.6383, "mean_token_accuracy": 0.8197877235710621, "num_tokens": 323110685.0, "step": 15900 }, { "entropy": 0.5557500526309014, "epoch": 1.1919335989189215, "grad_norm": 0.21567818522453308, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.8207397181540728, "num_tokens": 324558159.0, "step": 15905 }, { "entropy": 0.5501130310818553, "epoch": 1.1923083228527733, "grad_norm": 0.2157336175441742, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.8211126040667296, "num_tokens": 326029047.0, "step": 15910 }, { "entropy": 0.5549915492534637, "epoch": 1.1926830467866252, "grad_norm": 0.20630492269992828, "learning_rate": 0.0002, "loss": 0.6356, "mean_token_accuracy": 0.8172483894973993, "num_tokens": 327498618.0, "step": 15915 }, { "entropy": 0.5575497567653656, "epoch": 1.193057770720477, "grad_norm": 0.22262641787528992, "learning_rate": 0.0002, "loss": 0.6402, "mean_token_accuracy": 0.8194199059158563, "num_tokens": 328965900.0, "step": 15920 }, { "entropy": 0.5476757378317416, "epoch": 1.1934324946543289, "grad_norm": 0.2063734084367752, "learning_rate": 0.0002, "loss": 0.6396, "mean_token_accuracy": 0.8178101647645235, "num_tokens": 330392968.0, "step": 15925 }, { "entropy": 0.5439882572740317, "epoch": 1.1938072185881807, "grad_norm": 0.21485531330108643, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8238768022507429, "num_tokens": 331841718.0, "step": 15930 }, { "entropy": 0.5503868538886308, "epoch": 1.1941819425220326, "grad_norm": 0.22280661761760712, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.8202629804611206, "num_tokens": 333300265.0, "step": 15935 }, { "entropy": 0.5523954572156071, "epoch": 1.1945566664558844, "grad_norm": 0.2154044508934021, "learning_rate": 0.0002, "loss": 0.6379, "mean_token_accuracy": 0.8182112440466881, "num_tokens": 334797942.0, "step": 15940 }, { "entropy": 0.5487131331115961, "epoch": 1.1949313903897363, "grad_norm": 0.2210586965084076, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8194796901196242, "num_tokens": 336275905.0, "step": 15945 }, { "entropy": 0.5457633748650551, "epoch": 1.1953061143235881, "grad_norm": 0.21038688719272614, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.8239422667771578, "num_tokens": 337725070.0, "step": 15950 }, { "entropy": 0.5595749573782086, "epoch": 1.19568083825744, "grad_norm": 0.21593697369098663, "learning_rate": 0.0002, "loss": 0.6437, "mean_token_accuracy": 0.8184035025537014, "num_tokens": 339181362.0, "step": 15955 }, { "entropy": 0.5523030603304505, "epoch": 1.1960555621912918, "grad_norm": 0.25279563665390015, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8203826025128365, "num_tokens": 340670296.0, "step": 15960 }, { "entropy": 0.5477837707847357, "epoch": 1.1964302861251437, "grad_norm": 0.21138380467891693, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8201215293258428, "num_tokens": 342130048.0, "step": 15965 }, { "entropy": 0.561730389483273, "epoch": 1.1968050100589955, "grad_norm": 0.20245297253131866, "learning_rate": 0.0002, "loss": 0.6386, "mean_token_accuracy": 0.8193085175007582, "num_tokens": 343612934.0, "step": 15970 }, { "entropy": 0.5405843811109662, "epoch": 1.1971797339928474, "grad_norm": 0.21498899161815643, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.8201549652963877, "num_tokens": 345087868.0, "step": 15975 }, { "entropy": 0.5462759140878916, "epoch": 1.1975544579266992, "grad_norm": 0.22310534119606018, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8210481543093919, "num_tokens": 346551799.0, "step": 15980 }, { "entropy": 0.555598528496921, "epoch": 1.197929181860551, "grad_norm": 0.214724600315094, "learning_rate": 0.0002, "loss": 0.6367, "mean_token_accuracy": 0.8191270772367716, "num_tokens": 347995142.0, "step": 15985 }, { "entropy": 0.5514026794582605, "epoch": 1.198303905794403, "grad_norm": 0.22287656366825104, "learning_rate": 0.0002, "loss": 0.6419, "mean_token_accuracy": 0.8184296168386936, "num_tokens": 349453012.0, "step": 15990 }, { "entropy": 0.5306168496608734, "epoch": 1.198678629728255, "grad_norm": 0.2255207598209381, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.8230748377740383, "num_tokens": 350896683.0, "step": 15995 }, { "entropy": 0.5392391895875335, "epoch": 1.1990533536621069, "grad_norm": 0.21876011788845062, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8195833459496498, "num_tokens": 352366466.0, "step": 16000 }, { "entropy": 0.5675320439040661, "epoch": 1.1994280775959587, "grad_norm": 0.23144634068012238, "learning_rate": 0.0002, "loss": 0.6676, "mean_token_accuracy": 0.8119930487126112, "num_tokens": 353855120.0, "step": 16005 }, { "entropy": 0.5512169744819403, "epoch": 1.1998028015298106, "grad_norm": 0.22463606297969818, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8187019370496273, "num_tokens": 355330196.0, "step": 16010 }, { "entropy": 0.5471936814486981, "epoch": 1.2001775254636624, "grad_norm": 0.20801855623722076, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.8189894165843725, "num_tokens": 356794323.0, "step": 16015 }, { "entropy": 0.5455999588593841, "epoch": 1.2005522493975143, "grad_norm": 0.2312791794538498, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8212532259523868, "num_tokens": 358298717.0, "step": 16020 }, { "entropy": 0.5315550839528441, "epoch": 1.200926973331366, "grad_norm": 0.21841691434383392, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.8241531159728765, "num_tokens": 359805444.0, "step": 16025 }, { "entropy": 0.5286845628172159, "epoch": 1.201301697265218, "grad_norm": 0.21820439398288727, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.8240635566413402, "num_tokens": 361254938.0, "step": 16030 }, { "entropy": 0.5656525328755379, "epoch": 1.2016764211990698, "grad_norm": 0.2178415060043335, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.8175923187285662, "num_tokens": 362732199.0, "step": 16035 }, { "entropy": 0.5646429920569063, "epoch": 1.2020511451329217, "grad_norm": 0.20455855131149292, "learning_rate": 0.0002, "loss": 0.6547, "mean_token_accuracy": 0.8146360430866479, "num_tokens": 364212444.0, "step": 16040 }, { "entropy": 0.5417776295915246, "epoch": 1.2024258690667735, "grad_norm": 0.20008884370326996, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.8193420134484768, "num_tokens": 365674533.0, "step": 16045 }, { "entropy": 0.54986906722188, "epoch": 1.2028005930006254, "grad_norm": 0.21747969090938568, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8207195859402419, "num_tokens": 367180684.0, "step": 16050 }, { "entropy": 0.5503818430006504, "epoch": 1.2031753169344772, "grad_norm": 0.21724915504455566, "learning_rate": 0.0002, "loss": 0.6397, "mean_token_accuracy": 0.8218812573701143, "num_tokens": 368668040.0, "step": 16055 }, { "entropy": 0.5495991084724665, "epoch": 1.203550040868329, "grad_norm": 0.19731052219867706, "learning_rate": 0.0002, "loss": 0.6367, "mean_token_accuracy": 0.8193798039108515, "num_tokens": 370132912.0, "step": 16060 }, { "entropy": 0.5434444773942232, "epoch": 1.203924764802181, "grad_norm": 0.20202571153640747, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8164689764380455, "num_tokens": 371649110.0, "step": 16065 }, { "entropy": 0.5433717202395201, "epoch": 1.2042994887360328, "grad_norm": 0.21862001717090607, "learning_rate": 0.0002, "loss": 0.6445, "mean_token_accuracy": 0.8163364116102457, "num_tokens": 373136604.0, "step": 16070 }, { "entropy": 0.5275349114090204, "epoch": 1.2046742126698846, "grad_norm": 0.2202070951461792, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8217236962169409, "num_tokens": 374637571.0, "step": 16075 }, { "entropy": 0.5280767848715187, "epoch": 1.2050489366037365, "grad_norm": 0.22334010899066925, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.8188487239181995, "num_tokens": 376165364.0, "step": 16080 }, { "entropy": 0.5292877845466137, "epoch": 1.2054236605375883, "grad_norm": 0.288803368806839, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.8204879853874445, "num_tokens": 377658324.0, "step": 16085 }, { "entropy": 0.5140156263485551, "epoch": 1.2057983844714402, "grad_norm": 0.2149018794298172, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8196202699095011, "num_tokens": 379109784.0, "step": 16090 }, { "entropy": 0.5096120232716203, "epoch": 1.206173108405292, "grad_norm": 0.21781012415885925, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.8233054716140031, "num_tokens": 380593450.0, "step": 16095 }, { "entropy": 0.5219666756689548, "epoch": 1.2065478323391439, "grad_norm": 0.21577198803424835, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.8194414962083101, "num_tokens": 382066359.0, "step": 16100 }, { "entropy": 0.5277005167677998, "epoch": 1.2069225562729957, "grad_norm": 0.22512517869472504, "learning_rate": 0.0002, "loss": 0.6417, "mean_token_accuracy": 0.8164571680128574, "num_tokens": 383528515.0, "step": 16105 }, { "entropy": 0.5153439512476325, "epoch": 1.2072972802068476, "grad_norm": 0.20002730190753937, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8220693912357092, "num_tokens": 384964940.0, "step": 16110 }, { "entropy": 0.5231467574834824, "epoch": 1.2076720041406994, "grad_norm": 0.22380492091178894, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8223538666963577, "num_tokens": 386423180.0, "step": 16115 }, { "entropy": 0.5226429726928472, "epoch": 1.2080467280745513, "grad_norm": 0.22022588551044464, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.8198374375700951, "num_tokens": 387895377.0, "step": 16120 }, { "entropy": 0.5274552814662457, "epoch": 1.2084214520084031, "grad_norm": 0.2035389244556427, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8177408751100301, "num_tokens": 389351091.0, "step": 16125 }, { "entropy": 0.5463793149217964, "epoch": 1.208796175942255, "grad_norm": 0.21360920369625092, "learning_rate": 0.0002, "loss": 0.6451, "mean_token_accuracy": 0.8169288951903582, "num_tokens": 390840533.0, "step": 16130 }, { "entropy": 0.5412166332826018, "epoch": 1.2091708998761068, "grad_norm": 0.2037932276725769, "learning_rate": 0.0002, "loss": 0.6365, "mean_token_accuracy": 0.8179454434663057, "num_tokens": 392327636.0, "step": 16135 }, { "entropy": 0.5267756023444236, "epoch": 1.2095456238099587, "grad_norm": 0.2049456089735031, "learning_rate": 0.0002, "loss": 0.6288, "mean_token_accuracy": 0.8226088408380747, "num_tokens": 393796509.0, "step": 16140 }, { "entropy": 0.5475874835625291, "epoch": 1.2099203477438105, "grad_norm": 0.2076299786567688, "learning_rate": 0.0002, "loss": 0.645, "mean_token_accuracy": 0.8182990442961454, "num_tokens": 395278213.0, "step": 16145 }, { "entropy": 0.5296475045382977, "epoch": 1.2102950716776624, "grad_norm": 0.19146060943603516, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.8229767017066478, "num_tokens": 396744409.0, "step": 16150 }, { "entropy": 0.5409910948947072, "epoch": 1.2106697956115142, "grad_norm": 0.2076360434293747, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.8197078894823789, "num_tokens": 398231041.0, "step": 16155 }, { "entropy": 0.5506492113694549, "epoch": 1.211044519545366, "grad_norm": 0.20549114048480988, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8187351670116186, "num_tokens": 399741737.0, "step": 16160 }, { "entropy": 0.5356888761743903, "epoch": 1.211419243479218, "grad_norm": 0.19212877750396729, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.8199533555656672, "num_tokens": 401204941.0, "step": 16165 }, { "entropy": 0.5418312272056938, "epoch": 1.21179396741307, "grad_norm": 0.2138081192970276, "learning_rate": 0.0002, "loss": 0.6324, "mean_token_accuracy": 0.8199380382895469, "num_tokens": 402670714.0, "step": 16170 }, { "entropy": 0.5395627392455935, "epoch": 1.2121686913469218, "grad_norm": 0.2153218686580658, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.8195312682539224, "num_tokens": 404095800.0, "step": 16175 }, { "entropy": 0.5424290619790554, "epoch": 1.2125434152807737, "grad_norm": 0.21201859414577484, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.8195981923490763, "num_tokens": 405600577.0, "step": 16180 }, { "entropy": 0.5452271308749914, "epoch": 1.2129181392146255, "grad_norm": 0.22899776697158813, "learning_rate": 0.0002, "loss": 0.6283, "mean_token_accuracy": 0.8210695389658212, "num_tokens": 407064651.0, "step": 16185 }, { "entropy": 0.5341171238571405, "epoch": 1.2132928631484774, "grad_norm": 0.1996448040008545, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.8242120616137981, "num_tokens": 408508910.0, "step": 16190 }, { "entropy": 0.5443948972970247, "epoch": 1.2136675870823292, "grad_norm": 0.23843424022197723, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.8206396590918302, "num_tokens": 409972596.0, "step": 16195 }, { "entropy": 0.5305875607766211, "epoch": 1.214042311016181, "grad_norm": 0.21319329738616943, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.8279986478388309, "num_tokens": 411469330.0, "step": 16200 }, { "entropy": 0.5383191723376513, "epoch": 1.214417034950033, "grad_norm": 0.20644855499267578, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8210528120398521, "num_tokens": 412950909.0, "step": 16205 }, { "entropy": 0.5286838210187852, "epoch": 1.2147917588838848, "grad_norm": 0.21568213403224945, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.8231611341238022, "num_tokens": 414419324.0, "step": 16210 }, { "entropy": 0.5324889168143272, "epoch": 1.2151664828177366, "grad_norm": 0.24467332661151886, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.8216703422367573, "num_tokens": 415903791.0, "step": 16215 }, { "entropy": 0.5528705962002277, "epoch": 1.2155412067515885, "grad_norm": 0.2077530175447464, "learning_rate": 0.0002, "loss": 0.6398, "mean_token_accuracy": 0.8173885583877564, "num_tokens": 417416033.0, "step": 16220 }, { "entropy": 0.5183630205690861, "epoch": 1.2159159306854403, "grad_norm": 0.22132150828838348, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.8224145997315645, "num_tokens": 418883539.0, "step": 16225 }, { "entropy": 0.5430596983060241, "epoch": 1.2162906546192922, "grad_norm": 0.22899331152439117, "learning_rate": 0.0002, "loss": 0.6384, "mean_token_accuracy": 0.8185259126126766, "num_tokens": 420378168.0, "step": 16230 }, { "entropy": 0.5445303177461028, "epoch": 1.216665378553144, "grad_norm": 0.2111872434616089, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.8187392547726631, "num_tokens": 421846726.0, "step": 16235 }, { "entropy": 0.5453444682061672, "epoch": 1.217040102486996, "grad_norm": 0.21204857528209686, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.8167982466518879, "num_tokens": 423323098.0, "step": 16240 }, { "entropy": 0.5471349191851914, "epoch": 1.2174148264208478, "grad_norm": 0.2205110341310501, "learning_rate": 0.0002, "loss": 0.6492, "mean_token_accuracy": 0.8177007168531418, "num_tokens": 424793790.0, "step": 16245 }, { "entropy": 0.5262664820998907, "epoch": 1.2177895503546996, "grad_norm": 0.3569920063018799, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8189272835850716, "num_tokens": 426223337.0, "step": 16250 }, { "entropy": 0.5467920441180467, "epoch": 1.2181642742885515, "grad_norm": 0.23700927197933197, "learning_rate": 0.0002, "loss": 0.6419, "mean_token_accuracy": 0.8176676675677299, "num_tokens": 427699192.0, "step": 16255 }, { "entropy": 0.5413308058865368, "epoch": 1.2185389982224033, "grad_norm": 0.24733874201774597, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.818879334256053, "num_tokens": 429150716.0, "step": 16260 }, { "entropy": 0.5471871251240372, "epoch": 1.2189137221562552, "grad_norm": 0.22648848593235016, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.8178978227078915, "num_tokens": 430641028.0, "step": 16265 }, { "entropy": 0.5480229833163321, "epoch": 1.219288446090107, "grad_norm": 0.20126816630363464, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.8182559363543987, "num_tokens": 432135664.0, "step": 16270 }, { "entropy": 0.5525619942694903, "epoch": 1.2196631700239589, "grad_norm": 0.20779937505722046, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.8189459171146154, "num_tokens": 433582467.0, "step": 16275 }, { "entropy": 0.5231747729703784, "epoch": 1.2200378939578107, "grad_norm": 0.202646404504776, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.8261198360472918, "num_tokens": 435006407.0, "step": 16280 }, { "entropy": 0.5467716474086046, "epoch": 1.2204126178916626, "grad_norm": 0.21149758994579315, "learning_rate": 0.0002, "loss": 0.6328, "mean_token_accuracy": 0.8199656844139099, "num_tokens": 436495758.0, "step": 16285 }, { "entropy": 0.5445891520008445, "epoch": 1.2207873418255144, "grad_norm": 0.19867508113384247, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8222383603453636, "num_tokens": 437969653.0, "step": 16290 }, { "entropy": 0.5475120754912496, "epoch": 1.2211620657593663, "grad_norm": 0.21429981291294098, "learning_rate": 0.0002, "loss": 0.6326, "mean_token_accuracy": 0.8220420330762863, "num_tokens": 439442767.0, "step": 16295 }, { "entropy": 0.5263394819572568, "epoch": 1.221536789693218, "grad_norm": 0.30293434858322144, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.8260350544005632, "num_tokens": 440883750.0, "step": 16300 }, { "entropy": 0.5265619801357388, "epoch": 1.2219115136270702, "grad_norm": 0.3465881943702698, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.8253807693719863, "num_tokens": 442288282.0, "step": 16305 }, { "entropy": 0.5497499389573932, "epoch": 1.222286237560922, "grad_norm": 0.22028760612010956, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.8191885691136122, "num_tokens": 443758359.0, "step": 16310 }, { "entropy": 0.5487025134265423, "epoch": 1.2226609614947739, "grad_norm": 0.2139016091823578, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.8199478380382061, "num_tokens": 445241108.0, "step": 16315 }, { "entropy": 0.5543011704459786, "epoch": 1.2230356854286257, "grad_norm": 0.20873506367206573, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8207632292062044, "num_tokens": 446683295.0, "step": 16320 }, { "entropy": 0.5465412773191929, "epoch": 1.2234104093624776, "grad_norm": 0.20824699103832245, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.8214564640074968, "num_tokens": 448142977.0, "step": 16325 }, { "entropy": 0.554194874688983, "epoch": 1.2237851332963294, "grad_norm": 0.2261955440044403, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8194256596267223, "num_tokens": 449577376.0, "step": 16330 }, { "entropy": 0.5616237426176667, "epoch": 1.2241598572301813, "grad_norm": 0.19997355341911316, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.8207548346370459, "num_tokens": 451055934.0, "step": 16335 }, { "entropy": 0.5693289568647742, "epoch": 1.2245345811640331, "grad_norm": 0.20291699469089508, "learning_rate": 0.0002, "loss": 0.6383, "mean_token_accuracy": 0.8190573811531067, "num_tokens": 452542061.0, "step": 16340 }, { "entropy": 0.5632130894809961, "epoch": 1.224909305097885, "grad_norm": 0.21321269869804382, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8223172847181559, "num_tokens": 454034789.0, "step": 16345 }, { "entropy": 0.5673366302624345, "epoch": 1.2252840290317368, "grad_norm": 0.20254789292812347, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.817766486108303, "num_tokens": 455523750.0, "step": 16350 }, { "entropy": 0.5322730045765638, "epoch": 1.2256587529655887, "grad_norm": 0.21072612702846527, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.8269950412213802, "num_tokens": 456994941.0, "step": 16355 }, { "entropy": 0.5508893949910998, "epoch": 1.2260334768994405, "grad_norm": 0.22641071677207947, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.8198867846280337, "num_tokens": 458457198.0, "step": 16360 }, { "entropy": 0.5448153208941221, "epoch": 1.2264082008332924, "grad_norm": 0.22859734296798706, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.8206405807286501, "num_tokens": 459921664.0, "step": 16365 }, { "entropy": 0.5540199983865023, "epoch": 1.2267829247671442, "grad_norm": 0.21558703482151031, "learning_rate": 0.0002, "loss": 0.6368, "mean_token_accuracy": 0.8191712304949761, "num_tokens": 461356067.0, "step": 16370 }, { "entropy": 0.5421827109530568, "epoch": 1.227157648700996, "grad_norm": 0.2174033373594284, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.8231872983276844, "num_tokens": 462833551.0, "step": 16375 }, { "entropy": 0.5303646799176931, "epoch": 1.227532372634848, "grad_norm": 0.21454934775829315, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.821143987774849, "num_tokens": 464271166.0, "step": 16380 }, { "entropy": 0.5084880117326975, "epoch": 1.2279070965686998, "grad_norm": 0.1993628442287445, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8208059936761856, "num_tokens": 465764430.0, "step": 16385 }, { "entropy": 0.513422014657408, "epoch": 1.2282818205025516, "grad_norm": 0.20404501259326935, "learning_rate": 0.0002, "loss": 0.6431, "mean_token_accuracy": 0.820822112262249, "num_tokens": 467231192.0, "step": 16390 }, { "entropy": 0.5016472352668643, "epoch": 1.2286565444364035, "grad_norm": 0.22348876297473907, "learning_rate": 0.0002, "loss": 0.6325, "mean_token_accuracy": 0.8165706429630518, "num_tokens": 468667964.0, "step": 16395 }, { "entropy": 0.5142116837203503, "epoch": 1.2290312683702553, "grad_norm": 0.20711906254291534, "learning_rate": 0.0002, "loss": 0.6413, "mean_token_accuracy": 0.8185458306223155, "num_tokens": 470170243.0, "step": 16400 }, { "entropy": 0.5033261952921748, "epoch": 1.2294059923041072, "grad_norm": 0.2018640637397766, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8184519365429879, "num_tokens": 471637890.0, "step": 16405 }, { "entropy": 0.4931364109739661, "epoch": 1.229780716237959, "grad_norm": 0.23615524172782898, "learning_rate": 0.0002, "loss": 0.6168, "mean_token_accuracy": 0.8233429159969091, "num_tokens": 473112151.0, "step": 16410 }, { "entropy": 0.5316375168040395, "epoch": 1.2301554401718109, "grad_norm": 0.22151191532611847, "learning_rate": 0.0002, "loss": 0.6472, "mean_token_accuracy": 0.8199020780622959, "num_tokens": 474603152.0, "step": 16415 }, { "entropy": 0.5117734322324395, "epoch": 1.2305301641056627, "grad_norm": 0.1931844800710678, "learning_rate": 0.0002, "loss": 0.6378, "mean_token_accuracy": 0.82004848793149, "num_tokens": 476067442.0, "step": 16420 }, { "entropy": 0.5195559300482273, "epoch": 1.2309048880395146, "grad_norm": 0.22654393315315247, "learning_rate": 0.0002, "loss": 0.6326, "mean_token_accuracy": 0.8182774014770985, "num_tokens": 477564042.0, "step": 16425 }, { "entropy": 0.5166875401511788, "epoch": 1.2312796119733664, "grad_norm": 0.22653630375862122, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.8211624503135682, "num_tokens": 479007470.0, "step": 16430 }, { "entropy": 0.5200740764848888, "epoch": 1.2316543359072183, "grad_norm": 0.21077914535999298, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8187624599784613, "num_tokens": 480473791.0, "step": 16435 }, { "entropy": 0.5260989579372108, "epoch": 1.2320290598410701, "grad_norm": 0.1983373761177063, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.8200287710875273, "num_tokens": 481951149.0, "step": 16440 }, { "entropy": 0.5444825425744056, "epoch": 1.232403783774922, "grad_norm": 0.205426424741745, "learning_rate": 0.0002, "loss": 0.6475, "mean_token_accuracy": 0.8202877257019281, "num_tokens": 483431541.0, "step": 16445 }, { "entropy": 0.5296711201779545, "epoch": 1.2327785077087738, "grad_norm": 0.20679841935634613, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.8186856359243393, "num_tokens": 484924417.0, "step": 16450 }, { "entropy": 0.522282784152776, "epoch": 1.2331532316426257, "grad_norm": 0.2558625042438507, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.8192424606531858, "num_tokens": 486408865.0, "step": 16455 }, { "entropy": 0.5188041539862752, "epoch": 1.2335279555764775, "grad_norm": 0.2450280338525772, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8215386729687453, "num_tokens": 487899215.0, "step": 16460 }, { "entropy": 0.5224698189646005, "epoch": 1.2339026795103294, "grad_norm": 0.23388825356960297, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8213137347251177, "num_tokens": 489341625.0, "step": 16465 }, { "entropy": 0.5293797533027828, "epoch": 1.2342774034441812, "grad_norm": 0.23135529458522797, "learning_rate": 0.0002, "loss": 0.6359, "mean_token_accuracy": 0.818126768246293, "num_tokens": 490841894.0, "step": 16470 }, { "entropy": 0.5432607930153608, "epoch": 1.234652127378033, "grad_norm": 0.2083522081375122, "learning_rate": 0.0002, "loss": 0.6437, "mean_token_accuracy": 0.8138182990252971, "num_tokens": 492341935.0, "step": 16475 }, { "entropy": 0.5378286808729171, "epoch": 1.2350268513118852, "grad_norm": 0.20683827996253967, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.8183946885168553, "num_tokens": 493865936.0, "step": 16480 }, { "entropy": 0.5103029450401664, "epoch": 1.235401575245737, "grad_norm": 0.20600149035453796, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.824104243144393, "num_tokens": 495305297.0, "step": 16485 }, { "entropy": 0.5262394644320011, "epoch": 1.2357762991795889, "grad_norm": 0.22533144056797028, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.8185561642050743, "num_tokens": 496773711.0, "step": 16490 }, { "entropy": 0.5323695092462003, "epoch": 1.2361510231134407, "grad_norm": 0.19825634360313416, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.8188108690083027, "num_tokens": 498266935.0, "step": 16495 }, { "entropy": 0.5198686759918928, "epoch": 1.2365257470472926, "grad_norm": 0.2161991447210312, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.8263437490910291, "num_tokens": 499772618.0, "step": 16500 }, { "entropy": 0.5320072427392006, "epoch": 1.2369004709811444, "grad_norm": 0.2193775326013565, "learning_rate": 0.0002, "loss": 0.6325, "mean_token_accuracy": 0.8167643558233977, "num_tokens": 501260788.0, "step": 16505 }, { "entropy": 0.532121155038476, "epoch": 1.2372751949149963, "grad_norm": 0.21462078392505646, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.818549707531929, "num_tokens": 502715845.0, "step": 16510 }, { "entropy": 0.5198448483832181, "epoch": 1.2376499188488481, "grad_norm": 0.22433564066886902, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.8243482660502195, "num_tokens": 504174398.0, "step": 16515 }, { "entropy": 0.5243266897276044, "epoch": 1.2380246427827, "grad_norm": 0.2019008845090866, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.8222447033971548, "num_tokens": 505653187.0, "step": 16520 }, { "entropy": 0.5310024835169316, "epoch": 1.2383993667165518, "grad_norm": 0.220550999045372, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8196980282664299, "num_tokens": 507112680.0, "step": 16525 }, { "entropy": 0.5291682727634907, "epoch": 1.2387740906504037, "grad_norm": 0.21218353509902954, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8201417803764344, "num_tokens": 508591859.0, "step": 16530 }, { "entropy": 0.5169583970680833, "epoch": 1.2391488145842555, "grad_norm": 0.20378537476062775, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.8237829484045506, "num_tokens": 510065529.0, "step": 16535 }, { "entropy": 0.5275073204189539, "epoch": 1.2395235385181074, "grad_norm": 0.2252926081418991, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.8209170766174794, "num_tokens": 511510387.0, "step": 16540 }, { "entropy": 0.5460548736155033, "epoch": 1.2398982624519592, "grad_norm": 0.21371012926101685, "learning_rate": 0.0002, "loss": 0.6549, "mean_token_accuracy": 0.8150759857147932, "num_tokens": 512953758.0, "step": 16545 }, { "entropy": 0.5348939960822463, "epoch": 1.240272986385811, "grad_norm": 0.22601179778575897, "learning_rate": 0.0002, "loss": 0.6333, "mean_token_accuracy": 0.8200484201312065, "num_tokens": 514399926.0, "step": 16550 }, { "entropy": 0.5377839103341102, "epoch": 1.240647710319663, "grad_norm": 0.19987767934799194, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8194968674331904, "num_tokens": 515865046.0, "step": 16555 }, { "entropy": 0.5402515921741724, "epoch": 1.2410224342535148, "grad_norm": 0.2027144432067871, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.8192650683224201, "num_tokens": 517349389.0, "step": 16560 }, { "entropy": 0.5429001552984118, "epoch": 1.2413971581873666, "grad_norm": 0.2073775678873062, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.8178004186600447, "num_tokens": 518853398.0, "step": 16565 }, { "entropy": 0.543116653058678, "epoch": 1.2417718821212185, "grad_norm": 0.21826891601085663, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8195703323930502, "num_tokens": 520311982.0, "step": 16570 }, { "entropy": 0.5223147880285979, "epoch": 1.2421466060550703, "grad_norm": 0.20596879720687866, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.8243097428232431, "num_tokens": 521746283.0, "step": 16575 }, { "entropy": 0.5325337471440434, "epoch": 1.2425213299889222, "grad_norm": 0.2090800553560257, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8214882098138332, "num_tokens": 523184978.0, "step": 16580 }, { "entropy": 0.5451926976442337, "epoch": 1.242896053922774, "grad_norm": 0.2344151735305786, "learning_rate": 0.0002, "loss": 0.6387, "mean_token_accuracy": 0.8200867801904679, "num_tokens": 524634744.0, "step": 16585 }, { "entropy": 0.5513688078150153, "epoch": 1.2432707778566259, "grad_norm": 0.22870230674743652, "learning_rate": 0.0002, "loss": 0.6431, "mean_token_accuracy": 0.8166988678276539, "num_tokens": 526127057.0, "step": 16590 }, { "entropy": 0.559462333843112, "epoch": 1.2436455017904777, "grad_norm": 0.20669420063495636, "learning_rate": 0.0002, "loss": 0.6474, "mean_token_accuracy": 0.8154083523899317, "num_tokens": 527605278.0, "step": 16595 }, { "entropy": 0.5552646221593023, "epoch": 1.2440202257243296, "grad_norm": 0.23327700793743134, "learning_rate": 0.0002, "loss": 0.6398, "mean_token_accuracy": 0.8171249646693468, "num_tokens": 529120256.0, "step": 16600 }, { "entropy": 0.5376246856525541, "epoch": 1.2443949496581814, "grad_norm": 0.21520425379276276, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8190373290330172, "num_tokens": 530621643.0, "step": 16605 }, { "entropy": 0.5436696406453848, "epoch": 1.2447696735920333, "grad_norm": 0.21787701547145844, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8186636488884688, "num_tokens": 532095187.0, "step": 16610 }, { "entropy": 0.5359068954363465, "epoch": 1.2451443975258853, "grad_norm": 0.2188592106103897, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8239819221198559, "num_tokens": 533548364.0, "step": 16615 }, { "entropy": 0.5341380467638374, "epoch": 1.2455191214597372, "grad_norm": 0.2582451403141022, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.8248764652758837, "num_tokens": 534973611.0, "step": 16620 }, { "entropy": 0.5365978600457311, "epoch": 1.245893845393589, "grad_norm": 0.21712706983089447, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.8243703737854957, "num_tokens": 536423722.0, "step": 16625 }, { "entropy": 0.55458012111485, "epoch": 1.246268569327441, "grad_norm": 0.21260778605937958, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.819046164676547, "num_tokens": 537920813.0, "step": 16630 }, { "entropy": 0.5499800961464644, "epoch": 1.2466432932612928, "grad_norm": 0.2049102634191513, "learning_rate": 0.0002, "loss": 0.635, "mean_token_accuracy": 0.8187438141554594, "num_tokens": 539402429.0, "step": 16635 }, { "entropy": 0.544678608700633, "epoch": 1.2470180171951446, "grad_norm": 0.22512531280517578, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.8219233434647322, "num_tokens": 540894335.0, "step": 16640 }, { "entropy": 0.5479793136939406, "epoch": 1.2473927411289965, "grad_norm": 0.20512574911117554, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.8190116144716739, "num_tokens": 542398192.0, "step": 16645 }, { "entropy": 0.5391479057259858, "epoch": 1.2477674650628483, "grad_norm": 0.21243461966514587, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.8196938600391149, "num_tokens": 543809726.0, "step": 16650 }, { "entropy": 0.5387997280806303, "epoch": 1.2481421889967002, "grad_norm": 0.20903274416923523, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.8191237583756447, "num_tokens": 545322275.0, "step": 16655 }, { "entropy": 0.5431662291288376, "epoch": 1.248516912930552, "grad_norm": 0.25539296865463257, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.8206224743276834, "num_tokens": 546797897.0, "step": 16660 }, { "entropy": 0.5276119155809283, "epoch": 1.2488916368644039, "grad_norm": 0.48246124386787415, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.822123583778739, "num_tokens": 548279888.0, "step": 16665 }, { "entropy": 0.5254561686888337, "epoch": 1.2492663607982557, "grad_norm": 0.23879727721214294, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.8226583231240511, "num_tokens": 549721502.0, "step": 16670 }, { "entropy": 0.5334939325228334, "epoch": 1.2496410847321076, "grad_norm": 0.22770531475543976, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.8192650113254786, "num_tokens": 551204046.0, "step": 16675 }, { "entropy": 0.5331218712031841, "epoch": 1.2500158086659594, "grad_norm": 0.2129017412662506, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.8182093784213066, "num_tokens": 552635118.0, "step": 16680 }, { "entropy": 0.5406099494546652, "epoch": 1.2503905325998113, "grad_norm": 0.2269551008939743, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.8212612744420766, "num_tokens": 554132906.0, "step": 16685 }, { "entropy": 0.5201677974313498, "epoch": 1.250765256533663, "grad_norm": 0.22526618838310242, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8252065822482109, "num_tokens": 555570911.0, "step": 16690 }, { "entropy": 0.5406639372929931, "epoch": 1.251139980467515, "grad_norm": 0.20317810773849487, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8159836634993554, "num_tokens": 557079243.0, "step": 16695 }, { "entropy": 0.5305283708497882, "epoch": 1.2515147044013668, "grad_norm": 0.22196096181869507, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.8213914345949889, "num_tokens": 558561831.0, "step": 16700 }, { "entropy": 0.5396667955443263, "epoch": 1.2518894283352187, "grad_norm": 0.20039886236190796, "learning_rate": 0.0002, "loss": 0.6401, "mean_token_accuracy": 0.8196812197566032, "num_tokens": 560015552.0, "step": 16705 }, { "entropy": 0.5325476423837244, "epoch": 1.2522641522690705, "grad_norm": 0.20185157656669617, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.8192272856831551, "num_tokens": 561490033.0, "step": 16710 }, { "entropy": 0.5240982516668737, "epoch": 1.2526388762029224, "grad_norm": 0.22236528992652893, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.8242736250162125, "num_tokens": 562943419.0, "step": 16715 }, { "entropy": 0.5290748873725534, "epoch": 1.2530136001367742, "grad_norm": 0.19042439758777618, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.823859540373087, "num_tokens": 564401785.0, "step": 16720 }, { "entropy": 0.5473877063021064, "epoch": 1.253388324070626, "grad_norm": 0.2083514928817749, "learning_rate": 0.0002, "loss": 0.6388, "mean_token_accuracy": 0.8185325413942337, "num_tokens": 565859350.0, "step": 16725 }, { "entropy": 0.5398637644946576, "epoch": 1.253763048004478, "grad_norm": 0.20228421688079834, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8228220414370299, "num_tokens": 567296534.0, "step": 16730 }, { "entropy": 0.5250434272922575, "epoch": 1.2541377719383298, "grad_norm": 0.21615761518478394, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.82152896001935, "num_tokens": 568746950.0, "step": 16735 }, { "entropy": 0.5392712542787195, "epoch": 1.2545124958721816, "grad_norm": 0.23222339153289795, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.823460103198886, "num_tokens": 570223556.0, "step": 16740 }, { "entropy": 0.5393241422250867, "epoch": 1.2548872198060335, "grad_norm": 0.24326719343662262, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8201018072664737, "num_tokens": 571713984.0, "step": 16745 }, { "entropy": 0.5490091701969504, "epoch": 1.2552619437398853, "grad_norm": 0.18782995641231537, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8181841131299734, "num_tokens": 573220043.0, "step": 16750 }, { "entropy": 0.5451405242085456, "epoch": 1.2556366676737372, "grad_norm": 0.21211466193199158, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.8239660702645779, "num_tokens": 574666018.0, "step": 16755 }, { "entropy": 0.5590244997292757, "epoch": 1.256011391607589, "grad_norm": 0.2392389476299286, "learning_rate": 0.0002, "loss": 0.6429, "mean_token_accuracy": 0.8163200449198484, "num_tokens": 576155675.0, "step": 16760 }, { "entropy": 0.5478190646506846, "epoch": 1.2563861155414409, "grad_norm": 0.2175014168024063, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.8203384533524514, "num_tokens": 577613567.0, "step": 16765 }, { "entropy": 0.5336928766220808, "epoch": 1.2567608394752927, "grad_norm": 0.22348858416080475, "learning_rate": 0.0002, "loss": 0.6074, "mean_token_accuracy": 0.8252603605389595, "num_tokens": 579150157.0, "step": 16770 }, { "entropy": 0.5429612537845969, "epoch": 1.2571355634091446, "grad_norm": 0.20704281330108643, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8202266275882721, "num_tokens": 580667998.0, "step": 16775 }, { "entropy": 0.537352636270225, "epoch": 1.2575102873429964, "grad_norm": 0.21111160516738892, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.8221554297953844, "num_tokens": 582153072.0, "step": 16780 }, { "entropy": 0.5370260832831264, "epoch": 1.2578850112768483, "grad_norm": 0.20790456235408783, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.8195533879101277, "num_tokens": 583603313.0, "step": 16785 }, { "entropy": 0.5285095036029815, "epoch": 1.2582597352107001, "grad_norm": 0.21695636212825775, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.8213756624609232, "num_tokens": 585050861.0, "step": 16790 }, { "entropy": 0.5213481526821852, "epoch": 1.258634459144552, "grad_norm": 0.2239443063735962, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.821231983602047, "num_tokens": 586489714.0, "step": 16795 }, { "entropy": 0.5309294413775205, "epoch": 1.259009183078404, "grad_norm": 0.21686334908008575, "learning_rate": 0.0002, "loss": 0.6365, "mean_token_accuracy": 0.8228697460144758, "num_tokens": 587952237.0, "step": 16800 }, { "entropy": 0.5299536236561835, "epoch": 1.2593839070122559, "grad_norm": 0.20717909932136536, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.8206814810633659, "num_tokens": 589478008.0, "step": 16805 }, { "entropy": 0.5385773732326925, "epoch": 1.2597586309461077, "grad_norm": 0.2248837947845459, "learning_rate": 0.0002, "loss": 0.6443, "mean_token_accuracy": 0.8190208043903112, "num_tokens": 590955990.0, "step": 16810 }, { "entropy": 0.5292088152840734, "epoch": 1.2601333548799596, "grad_norm": 0.22515517473220825, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8212131500244141, "num_tokens": 592414306.0, "step": 16815 }, { "entropy": 0.5213922979310155, "epoch": 1.2605080788138114, "grad_norm": 0.2919764518737793, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8220275063067675, "num_tokens": 593872719.0, "step": 16820 }, { "entropy": 0.5377862673252821, "epoch": 1.2608828027476633, "grad_norm": 0.21326279640197754, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.8158270739018917, "num_tokens": 595395681.0, "step": 16825 }, { "entropy": 0.533869045600295, "epoch": 1.2612575266815151, "grad_norm": 0.1973363757133484, "learning_rate": 0.0002, "loss": 0.6408, "mean_token_accuracy": 0.8187405206263065, "num_tokens": 596864025.0, "step": 16830 }, { "entropy": 0.5281696000136435, "epoch": 1.261632250615367, "grad_norm": 0.19884520769119263, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.8207439843565225, "num_tokens": 598350233.0, "step": 16835 }, { "entropy": 0.5353417733684183, "epoch": 1.2620069745492188, "grad_norm": 0.20837999880313873, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.820575799793005, "num_tokens": 599814335.0, "step": 16840 }, { "entropy": 0.5451984511688351, "epoch": 1.2623816984830707, "grad_norm": 0.20728212594985962, "learning_rate": 0.0002, "loss": 0.6464, "mean_token_accuracy": 0.8186267659068107, "num_tokens": 601310571.0, "step": 16845 }, { "entropy": 0.5267370012588799, "epoch": 1.2627564224169225, "grad_norm": 0.21523337066173553, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8238344702869653, "num_tokens": 602756529.0, "step": 16850 }, { "entropy": 0.5270962400361896, "epoch": 1.2631311463507744, "grad_norm": 0.22263507544994354, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8226651281118393, "num_tokens": 604225557.0, "step": 16855 }, { "entropy": 0.537558664008975, "epoch": 1.2635058702846262, "grad_norm": 0.2032202184200287, "learning_rate": 0.0002, "loss": 0.6319, "mean_token_accuracy": 0.8208839680999518, "num_tokens": 605650744.0, "step": 16860 }, { "entropy": 0.5286545457318426, "epoch": 1.263880594218478, "grad_norm": 0.23980504274368286, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8236931376159191, "num_tokens": 607106798.0, "step": 16865 }, { "entropy": 0.530906262807548, "epoch": 1.26425531815233, "grad_norm": 0.22016169130802155, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.8245333261787892, "num_tokens": 608527159.0, "step": 16870 }, { "entropy": 0.5438356258906424, "epoch": 1.2646300420861818, "grad_norm": 0.2061275839805603, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.8221923775970936, "num_tokens": 610028934.0, "step": 16875 }, { "entropy": 0.5317008272744715, "epoch": 1.2650047660200336, "grad_norm": 0.2110389620065689, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.821918336302042, "num_tokens": 611492214.0, "step": 16880 }, { "entropy": 0.5391458285972476, "epoch": 1.2653794899538855, "grad_norm": 0.22440768778324127, "learning_rate": 0.0002, "loss": 0.6206, "mean_token_accuracy": 0.8228310581296683, "num_tokens": 612973108.0, "step": 16885 }, { "entropy": 0.549751996807754, "epoch": 1.2657542138877373, "grad_norm": 0.2128918617963791, "learning_rate": 0.0002, "loss": 0.641, "mean_token_accuracy": 0.8210071891546249, "num_tokens": 614473323.0, "step": 16890 }, { "entropy": 0.5370402748696506, "epoch": 1.2661289378215892, "grad_norm": 0.22646121680736542, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.824086532369256, "num_tokens": 615954412.0, "step": 16895 }, { "entropy": 0.5555910643190145, "epoch": 1.266503661755441, "grad_norm": 0.21005554497241974, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.8210885718464851, "num_tokens": 617417118.0, "step": 16900 }, { "entropy": 0.5516836190596223, "epoch": 1.266878385689293, "grad_norm": 0.2382601797580719, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.8207450214773416, "num_tokens": 618899121.0, "step": 16905 }, { "entropy": 0.5481962393969297, "epoch": 1.2672531096231447, "grad_norm": 0.21586988866329193, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.8195462174713611, "num_tokens": 620366721.0, "step": 16910 }, { "entropy": 0.5389401912689209, "epoch": 1.2676278335569968, "grad_norm": 0.22817257046699524, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8194390919059515, "num_tokens": 621818779.0, "step": 16915 }, { "entropy": 0.541199448145926, "epoch": 1.2680025574908487, "grad_norm": 0.2391708791255951, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.8202446859329939, "num_tokens": 623266085.0, "step": 16920 }, { "entropy": 0.5349443513900042, "epoch": 1.2683772814247005, "grad_norm": 0.22916051745414734, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.8217108082026243, "num_tokens": 624728544.0, "step": 16925 }, { "entropy": 0.538243212364614, "epoch": 1.2687520053585524, "grad_norm": 0.23479923605918884, "learning_rate": 0.0002, "loss": 0.6371, "mean_token_accuracy": 0.8172284677624703, "num_tokens": 626117274.0, "step": 16930 }, { "entropy": 0.5352308485656977, "epoch": 1.2691267292924042, "grad_norm": 0.23018990457057953, "learning_rate": 0.0002, "loss": 0.6368, "mean_token_accuracy": 0.8224647145718336, "num_tokens": 627580305.0, "step": 16935 }, { "entropy": 0.5415761534124612, "epoch": 1.269501453226256, "grad_norm": 0.21793270111083984, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.8216560300439596, "num_tokens": 629021149.0, "step": 16940 }, { "entropy": 0.5394600750878453, "epoch": 1.269876177160108, "grad_norm": 0.2047475278377533, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8219139780849218, "num_tokens": 630487355.0, "step": 16945 }, { "entropy": 0.531169188208878, "epoch": 1.2702509010939598, "grad_norm": 0.2042086124420166, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8230738364160061, "num_tokens": 631979021.0, "step": 16950 }, { "entropy": 0.5591348100453615, "epoch": 1.2706256250278116, "grad_norm": 0.21973219513893127, "learning_rate": 0.0002, "loss": 0.6537, "mean_token_accuracy": 0.8155770603567362, "num_tokens": 633458021.0, "step": 16955 }, { "entropy": 0.5495317630469799, "epoch": 1.2710003489616635, "grad_norm": 0.20351719856262207, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.8207841582596302, "num_tokens": 635010175.0, "step": 16960 }, { "entropy": 0.5389039866626263, "epoch": 1.2713750728955153, "grad_norm": 0.20443078875541687, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8185348954051733, "num_tokens": 636496926.0, "step": 16965 }, { "entropy": 0.533154666237533, "epoch": 1.2717497968293672, "grad_norm": 0.21231751143932343, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8254693672060966, "num_tokens": 637943699.0, "step": 16970 }, { "entropy": 0.5342582738026976, "epoch": 1.272124520763219, "grad_norm": 0.20274066925048828, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.8234259992837906, "num_tokens": 639389919.0, "step": 16975 }, { "entropy": 0.5380094958469271, "epoch": 1.2724992446970709, "grad_norm": 0.20689763128757477, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.820160761475563, "num_tokens": 640868366.0, "step": 16980 }, { "entropy": 0.5401715039275586, "epoch": 1.2728739686309227, "grad_norm": 0.20314128696918488, "learning_rate": 0.0002, "loss": 0.6373, "mean_token_accuracy": 0.8176679342985154, "num_tokens": 642340510.0, "step": 16985 }, { "entropy": 0.5385355591773987, "epoch": 1.2732486925647746, "grad_norm": 0.22718952596187592, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.819409528747201, "num_tokens": 643864985.0, "step": 16990 }, { "entropy": 0.5285898154601455, "epoch": 1.2736234164986264, "grad_norm": 0.20902763307094574, "learning_rate": 0.0002, "loss": 0.6264, "mean_token_accuracy": 0.8216732636094093, "num_tokens": 645346580.0, "step": 16995 }, { "entropy": 0.5298345517367125, "epoch": 1.2739981404324783, "grad_norm": 0.20656847953796387, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8195371583104134, "num_tokens": 646849801.0, "step": 17000 }, { "entropy": 0.5305873000994324, "epoch": 1.2743728643663301, "grad_norm": 0.21664506196975708, "learning_rate": 0.0002, "loss": 0.6328, "mean_token_accuracy": 0.821063569188118, "num_tokens": 648320840.0, "step": 17005 }, { "entropy": 0.5353246251121163, "epoch": 1.274747588300182, "grad_norm": 0.22235289216041565, "learning_rate": 0.0002, "loss": 0.6388, "mean_token_accuracy": 0.8185301035642624, "num_tokens": 649781344.0, "step": 17010 }, { "entropy": 0.5202114275656641, "epoch": 1.2751223122340338, "grad_norm": 0.21328343451023102, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8209323357790709, "num_tokens": 651271093.0, "step": 17015 }, { "entropy": 0.5192944388836622, "epoch": 1.2754970361678857, "grad_norm": 0.23347991704940796, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8195499375462532, "num_tokens": 652763486.0, "step": 17020 }, { "entropy": 0.5309138514101506, "epoch": 1.2758717601017375, "grad_norm": 0.3120865821838379, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.819993282482028, "num_tokens": 654267937.0, "step": 17025 }, { "entropy": 0.5209495315328241, "epoch": 1.2762464840355894, "grad_norm": 0.2146662473678589, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.8214613050222397, "num_tokens": 655733526.0, "step": 17030 }, { "entropy": 0.5065406957641244, "epoch": 1.2766212079694412, "grad_norm": 0.23770207166671753, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8234106089919806, "num_tokens": 657181842.0, "step": 17035 }, { "entropy": 0.5234216986224055, "epoch": 1.276995931903293, "grad_norm": 0.3393148183822632, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.8191783390939236, "num_tokens": 658671621.0, "step": 17040 }, { "entropy": 0.5288121491670609, "epoch": 1.277370655837145, "grad_norm": 0.19828484952449799, "learning_rate": 0.0002, "loss": 0.6428, "mean_token_accuracy": 0.8194995060563087, "num_tokens": 660158201.0, "step": 17045 }, { "entropy": 0.5233664374798537, "epoch": 1.2777453797709968, "grad_norm": 0.21012860536575317, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.8167076922953129, "num_tokens": 661659052.0, "step": 17050 }, { "entropy": 0.5089860804378986, "epoch": 1.2781201037048486, "grad_norm": 0.22381040453910828, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8213963117450476, "num_tokens": 663146457.0, "step": 17055 }, { "entropy": 0.4985718633979559, "epoch": 1.2784948276387005, "grad_norm": 0.21353240311145782, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.8214501060545445, "num_tokens": 664597571.0, "step": 17060 }, { "entropy": 0.5100520422682167, "epoch": 1.2788695515725523, "grad_norm": 0.21984361112117767, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.8218725521117449, "num_tokens": 666066457.0, "step": 17065 }, { "entropy": 0.507599832303822, "epoch": 1.2792442755064042, "grad_norm": 0.20086050033569336, "learning_rate": 0.0002, "loss": 0.6325, "mean_token_accuracy": 0.8197536122053861, "num_tokens": 667563130.0, "step": 17070 }, { "entropy": 0.5120520976372063, "epoch": 1.279618999440256, "grad_norm": 0.22539326548576355, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.8193456325680017, "num_tokens": 669023811.0, "step": 17075 }, { "entropy": 0.5178852695971727, "epoch": 1.2799937233741079, "grad_norm": 0.19901728630065918, "learning_rate": 0.0002, "loss": 0.6431, "mean_token_accuracy": 0.8177817042917013, "num_tokens": 670530217.0, "step": 17080 }, { "entropy": 0.507518419623375, "epoch": 1.2803684473079597, "grad_norm": 0.22197756171226501, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.8194750621914864, "num_tokens": 672012464.0, "step": 17085 }, { "entropy": 0.5218072540126741, "epoch": 1.2807431712418116, "grad_norm": 0.20759597420692444, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8203183200210333, "num_tokens": 673507189.0, "step": 17090 }, { "entropy": 0.5158052457496524, "epoch": 1.2811178951756634, "grad_norm": 0.22404776513576508, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8197438064962626, "num_tokens": 674971951.0, "step": 17095 }, { "entropy": 0.5223002905957401, "epoch": 1.2814926191095153, "grad_norm": 0.20573678612709045, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8205224070698023, "num_tokens": 676478709.0, "step": 17100 }, { "entropy": 0.5227289134636521, "epoch": 1.2818673430433671, "grad_norm": 0.20144611597061157, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.8187317416071892, "num_tokens": 677990324.0, "step": 17105 }, { "entropy": 0.5105966243892908, "epoch": 1.2822420669772192, "grad_norm": 0.21003729104995728, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8230297613888979, "num_tokens": 679463227.0, "step": 17110 }, { "entropy": 0.5200748527422547, "epoch": 1.282616790911071, "grad_norm": 0.1963115930557251, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.8191084630787373, "num_tokens": 680897753.0, "step": 17115 }, { "entropy": 0.5240413174033165, "epoch": 1.282991514844923, "grad_norm": 0.20978836715221405, "learning_rate": 0.0002, "loss": 0.6206, "mean_token_accuracy": 0.8215131931006908, "num_tokens": 682365061.0, "step": 17120 }, { "entropy": 0.5277116409502923, "epoch": 1.2833662387787748, "grad_norm": 0.20947258174419403, "learning_rate": 0.0002, "loss": 0.6337, "mean_token_accuracy": 0.8204397805035114, "num_tokens": 683821863.0, "step": 17125 }, { "entropy": 0.5323540883138775, "epoch": 1.2837409627126266, "grad_norm": 0.2554701566696167, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8192365597933531, "num_tokens": 685251975.0, "step": 17130 }, { "entropy": 0.5208651078864932, "epoch": 1.2841156866464785, "grad_norm": 0.2195533961057663, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8233562804758549, "num_tokens": 686706485.0, "step": 17135 }, { "entropy": 0.5044700909405947, "epoch": 1.2844904105803303, "grad_norm": 0.2368374466896057, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.8264242623001337, "num_tokens": 688183099.0, "step": 17140 }, { "entropy": 0.5346355704590678, "epoch": 1.2848651345141822, "grad_norm": 0.2506805658340454, "learning_rate": 0.0002, "loss": 0.6483, "mean_token_accuracy": 0.819443678483367, "num_tokens": 689653024.0, "step": 17145 }, { "entropy": 0.5184297099709511, "epoch": 1.285239858448034, "grad_norm": 0.21513521671295166, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8195681363344193, "num_tokens": 691140958.0, "step": 17150 }, { "entropy": 0.5155875916592777, "epoch": 1.2856145823818859, "grad_norm": 0.21698389947414398, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.8231150403618812, "num_tokens": 692641250.0, "step": 17155 }, { "entropy": 0.5206710144877433, "epoch": 1.2859893063157377, "grad_norm": 0.24860280752182007, "learning_rate": 0.0002, "loss": 0.6115, "mean_token_accuracy": 0.8217848099768161, "num_tokens": 694108056.0, "step": 17160 }, { "entropy": 0.5117846554145217, "epoch": 1.2863640302495896, "grad_norm": 0.22327101230621338, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.8231990102678537, "num_tokens": 695549433.0, "step": 17165 }, { "entropy": 0.5180930987000465, "epoch": 1.2867387541834414, "grad_norm": 0.21186281740665436, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.8234198674559593, "num_tokens": 697016664.0, "step": 17170 }, { "entropy": 0.5249787691980601, "epoch": 1.2871134781172933, "grad_norm": 0.21333947777748108, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8204733591526747, "num_tokens": 698494501.0, "step": 17175 }, { "entropy": 0.5064612792804837, "epoch": 1.2874882020511451, "grad_norm": 0.20890504121780396, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.826535428315401, "num_tokens": 699921338.0, "step": 17180 }, { "entropy": 0.5110140211880208, "epoch": 1.287862925984997, "grad_norm": 0.2072804570198059, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.8231586266309023, "num_tokens": 701424275.0, "step": 17185 }, { "entropy": 0.5261169606819749, "epoch": 1.2882376499188488, "grad_norm": 0.2096855789422989, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.8185027655214071, "num_tokens": 702874082.0, "step": 17190 }, { "entropy": 0.5361701576039195, "epoch": 1.2886123738527007, "grad_norm": 0.20904874801635742, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8184348426759243, "num_tokens": 704336493.0, "step": 17195 }, { "entropy": 0.5150789792649448, "epoch": 1.2889870977865525, "grad_norm": 0.21460388600826263, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.8260656222701073, "num_tokens": 705790629.0, "step": 17200 }, { "entropy": 0.534959532879293, "epoch": 1.2893618217204044, "grad_norm": 0.21361427009105682, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.8222104869782925, "num_tokens": 707230362.0, "step": 17205 }, { "entropy": 0.5454869821667672, "epoch": 1.2897365456542562, "grad_norm": 0.21844170987606049, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.8201423414051533, "num_tokens": 708740564.0, "step": 17210 }, { "entropy": 0.5339908723719418, "epoch": 1.290111269588108, "grad_norm": 0.21880798041820526, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8222176849842071, "num_tokens": 710203551.0, "step": 17215 }, { "entropy": 0.5548035962507128, "epoch": 1.29048599352196, "grad_norm": 0.22766584157943726, "learning_rate": 0.0002, "loss": 0.6481, "mean_token_accuracy": 0.8192840196192265, "num_tokens": 711648319.0, "step": 17220 }, { "entropy": 0.553338922560215, "epoch": 1.2908607174558118, "grad_norm": 0.20509721338748932, "learning_rate": 0.0002, "loss": 0.6361, "mean_token_accuracy": 0.8169158235192299, "num_tokens": 713151704.0, "step": 17225 }, { "entropy": 0.5368502328172327, "epoch": 1.2912354413896638, "grad_norm": 0.21167579293251038, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.8233175646513701, "num_tokens": 714652939.0, "step": 17230 }, { "entropy": 0.5356165884062648, "epoch": 1.2916101653235157, "grad_norm": 0.20701059699058533, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8235039308667182, "num_tokens": 716093343.0, "step": 17235 }, { "entropy": 0.5315627107396722, "epoch": 1.2919848892573675, "grad_norm": 0.24131880700588226, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.8212636303156614, "num_tokens": 717551175.0, "step": 17240 }, { "entropy": 0.5470450518652796, "epoch": 1.2923596131912194, "grad_norm": 0.2021557092666626, "learning_rate": 0.0002, "loss": 0.6361, "mean_token_accuracy": 0.8179654344916344, "num_tokens": 719028764.0, "step": 17245 }, { "entropy": 0.5620049448683858, "epoch": 1.2927343371250712, "grad_norm": 0.21849079430103302, "learning_rate": 0.0002, "loss": 0.6477, "mean_token_accuracy": 0.8161437548696995, "num_tokens": 720510293.0, "step": 17250 }, { "entropy": 0.5405958976596594, "epoch": 1.293109061058923, "grad_norm": 0.23516611754894257, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.8199659328907728, "num_tokens": 721944915.0, "step": 17255 }, { "entropy": 0.5451774053275585, "epoch": 1.293483784992775, "grad_norm": 0.3342348635196686, "learning_rate": 0.0002, "loss": 0.635, "mean_token_accuracy": 0.8192971669137478, "num_tokens": 723421712.0, "step": 17260 }, { "entropy": 0.5280931804329156, "epoch": 1.2938585089266268, "grad_norm": 0.21080633997917175, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8197267040610313, "num_tokens": 724846565.0, "step": 17265 }, { "entropy": 0.550848919339478, "epoch": 1.2942332328604786, "grad_norm": 0.21420209109783173, "learning_rate": 0.0002, "loss": 0.635, "mean_token_accuracy": 0.8197382029145956, "num_tokens": 726287593.0, "step": 17270 }, { "entropy": 0.5370485637336969, "epoch": 1.2946079567943305, "grad_norm": 0.2260882556438446, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8234186727553606, "num_tokens": 727732803.0, "step": 17275 }, { "entropy": 0.5418293914757669, "epoch": 1.2949826807281823, "grad_norm": 0.22854088246822357, "learning_rate": 0.0002, "loss": 0.6377, "mean_token_accuracy": 0.8216699879616499, "num_tokens": 729217305.0, "step": 17280 }, { "entropy": 0.5398891578428447, "epoch": 1.2953574046620342, "grad_norm": 0.21985329687595367, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.817815663665533, "num_tokens": 730663961.0, "step": 17285 }, { "entropy": 0.5390079839155077, "epoch": 1.295732128595886, "grad_norm": 0.20782168209552765, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.8193159081041813, "num_tokens": 732138672.0, "step": 17290 }, { "entropy": 0.5375215394422412, "epoch": 1.296106852529738, "grad_norm": 0.1988496482372284, "learning_rate": 0.0002, "loss": 0.641, "mean_token_accuracy": 0.8186177428811788, "num_tokens": 733585276.0, "step": 17295 }, { "entropy": 0.5454670837149024, "epoch": 1.2964815764635897, "grad_norm": 0.21807949244976044, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.8182588454335928, "num_tokens": 735063535.0, "step": 17300 }, { "entropy": 0.5414889082312584, "epoch": 1.2968563003974416, "grad_norm": 0.22606289386749268, "learning_rate": 0.0002, "loss": 0.6361, "mean_token_accuracy": 0.8195557508617639, "num_tokens": 736539565.0, "step": 17305 }, { "entropy": 0.5246193307451904, "epoch": 1.2972310243312934, "grad_norm": 0.20878779888153076, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.8236125208437443, "num_tokens": 737981547.0, "step": 17310 }, { "entropy": 0.5183373494073749, "epoch": 1.2976057482651453, "grad_norm": 0.22759978473186493, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.8236709479242563, "num_tokens": 739415632.0, "step": 17315 }, { "entropy": 0.5227294424548745, "epoch": 1.2979804721989971, "grad_norm": 0.2292136549949646, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.8209928929805755, "num_tokens": 740894302.0, "step": 17320 }, { "entropy": 0.5126921016722917, "epoch": 1.298355196132849, "grad_norm": 0.2227722406387329, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8275828488171101, "num_tokens": 742321566.0, "step": 17325 }, { "entropy": 0.5481049198657274, "epoch": 1.2987299200667008, "grad_norm": 0.22264952957630157, "learning_rate": 0.0002, "loss": 0.6491, "mean_token_accuracy": 0.817770129814744, "num_tokens": 743804691.0, "step": 17330 }, { "entropy": 0.5379106260836124, "epoch": 1.2991046440005527, "grad_norm": 0.22278501093387604, "learning_rate": 0.0002, "loss": 0.6337, "mean_token_accuracy": 0.8210056032985449, "num_tokens": 745280940.0, "step": 17335 }, { "entropy": 0.5366843635216355, "epoch": 1.2994793679344046, "grad_norm": 0.26381200551986694, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8212919801473617, "num_tokens": 746728257.0, "step": 17340 }, { "entropy": 0.527132474258542, "epoch": 1.2998540918682564, "grad_norm": 0.2226458489894867, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.8266294501721859, "num_tokens": 748164579.0, "step": 17345 }, { "entropy": 0.5222425192594529, "epoch": 1.3002288158021083, "grad_norm": 0.20749379694461823, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8214955799281597, "num_tokens": 749645603.0, "step": 17350 }, { "entropy": 0.5420105457305908, "epoch": 1.30060353973596, "grad_norm": 0.24438029527664185, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.8203112814575434, "num_tokens": 751115088.0, "step": 17355 }, { "entropy": 0.5316960336640477, "epoch": 1.300978263669812, "grad_norm": 0.20822682976722717, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.8219531282782555, "num_tokens": 752609666.0, "step": 17360 }, { "entropy": 0.553174714371562, "epoch": 1.3013529876036638, "grad_norm": 0.20475389063358307, "learning_rate": 0.0002, "loss": 0.6441, "mean_token_accuracy": 0.817405565828085, "num_tokens": 754114048.0, "step": 17365 }, { "entropy": 0.5372793330810964, "epoch": 1.3017277115375157, "grad_norm": 0.23162853717803955, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8234798077493906, "num_tokens": 755604200.0, "step": 17370 }, { "entropy": 0.547293609380722, "epoch": 1.3021024354713675, "grad_norm": 0.20956742763519287, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.8234459523111581, "num_tokens": 757063780.0, "step": 17375 }, { "entropy": 0.5532046610489487, "epoch": 1.3024771594052194, "grad_norm": 0.22961263358592987, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8197630859911442, "num_tokens": 758555634.0, "step": 17380 }, { "entropy": 0.5416607173159719, "epoch": 1.3028518833390712, "grad_norm": 0.20681513845920563, "learning_rate": 0.0002, "loss": 0.6333, "mean_token_accuracy": 0.82232217900455, "num_tokens": 760022521.0, "step": 17385 }, { "entropy": 0.5437560699880123, "epoch": 1.303226607272923, "grad_norm": 0.2030428647994995, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.8221590802073478, "num_tokens": 761489075.0, "step": 17390 }, { "entropy": 0.543852199614048, "epoch": 1.303601331206775, "grad_norm": 0.20205426216125488, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.818587401881814, "num_tokens": 762952810.0, "step": 17395 }, { "entropy": 0.5443169989623129, "epoch": 1.3039760551406268, "grad_norm": 0.20477639138698578, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8195758245885372, "num_tokens": 764418518.0, "step": 17400 }, { "entropy": 0.538553044013679, "epoch": 1.3043507790744786, "grad_norm": 0.22227805852890015, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.8199522096663714, "num_tokens": 765880309.0, "step": 17405 }, { "entropy": 0.5179390113800764, "epoch": 1.3047255030083305, "grad_norm": 0.24460993707180023, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.823204293847084, "num_tokens": 767310495.0, "step": 17410 }, { "entropy": 0.531233268789947, "epoch": 1.3051002269421823, "grad_norm": 0.25029805302619934, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.8234569862484932, "num_tokens": 768798949.0, "step": 17415 }, { "entropy": 0.5440118093043566, "epoch": 1.3054749508760344, "grad_norm": 0.2607663869857788, "learning_rate": 0.0002, "loss": 0.6347, "mean_token_accuracy": 0.818590147048235, "num_tokens": 770253844.0, "step": 17420 }, { "entropy": 0.5326594546437263, "epoch": 1.3058496748098862, "grad_norm": 0.2082897424697876, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.8217617034912109, "num_tokens": 771694833.0, "step": 17425 }, { "entropy": 0.5346418498083949, "epoch": 1.306224398743738, "grad_norm": 0.21928709745407104, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.8224576126784087, "num_tokens": 773145366.0, "step": 17430 }, { "entropy": 0.5285726796835661, "epoch": 1.30659912267759, "grad_norm": 0.21405820548534393, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.8242175277322531, "num_tokens": 774582837.0, "step": 17435 }, { "entropy": 0.5264638332650066, "epoch": 1.3069738466114418, "grad_norm": 0.22501301765441895, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8215315535664558, "num_tokens": 776024992.0, "step": 17440 }, { "entropy": 0.5379922619089484, "epoch": 1.3073485705452936, "grad_norm": 0.2657085955142975, "learning_rate": 0.0002, "loss": 0.6325, "mean_token_accuracy": 0.8211179126054049, "num_tokens": 777501275.0, "step": 17445 }, { "entropy": 0.5243445988744497, "epoch": 1.3077232944791455, "grad_norm": 0.2614968419075012, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.8228670038282871, "num_tokens": 778943922.0, "step": 17450 }, { "entropy": 0.5368192547932267, "epoch": 1.3080980184129973, "grad_norm": 0.22541674971580505, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8215113896876574, "num_tokens": 780401205.0, "step": 17455 }, { "entropy": 0.5397374615073204, "epoch": 1.3084727423468492, "grad_norm": 0.21375879645347595, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8239296849817037, "num_tokens": 781862919.0, "step": 17460 }, { "entropy": 0.5231396615505218, "epoch": 1.308847466280701, "grad_norm": 0.22655734419822693, "learning_rate": 0.0002, "loss": 0.6061, "mean_token_accuracy": 0.8241812579333783, "num_tokens": 783251625.0, "step": 17465 }, { "entropy": 0.5528624692931772, "epoch": 1.3092221902145529, "grad_norm": 0.21265527606010437, "learning_rate": 0.0002, "loss": 0.6355, "mean_token_accuracy": 0.8207762971520424, "num_tokens": 784782156.0, "step": 17470 }, { "entropy": 0.540797651000321, "epoch": 1.3095969141484047, "grad_norm": 0.24233439564704895, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8218001145869493, "num_tokens": 786247758.0, "step": 17475 }, { "entropy": 0.5615922179073095, "epoch": 1.3099716380822566, "grad_norm": 0.23054450750350952, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8194231737405062, "num_tokens": 787759371.0, "step": 17480 }, { "entropy": 0.56099095325917, "epoch": 1.3103463620161084, "grad_norm": 0.24855558574199677, "learning_rate": 0.0002, "loss": 0.643, "mean_token_accuracy": 0.8189569842070341, "num_tokens": 789182095.0, "step": 17485 }, { "entropy": 0.5509254392236471, "epoch": 1.3107210859499603, "grad_norm": 0.2279113233089447, "learning_rate": 0.0002, "loss": 0.6346, "mean_token_accuracy": 0.8206875581294298, "num_tokens": 790668108.0, "step": 17490 }, { "entropy": 0.5527985988184809, "epoch": 1.3110958098838121, "grad_norm": 0.24060417711734772, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.8202047664672136, "num_tokens": 792148976.0, "step": 17495 }, { "entropy": 0.5399865990504622, "epoch": 1.311470533817664, "grad_norm": 0.21801364421844482, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8270933695137501, "num_tokens": 793636605.0, "step": 17500 }, { "entropy": 0.5454795371741057, "epoch": 1.3118452577515158, "grad_norm": 0.23203474283218384, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.821748485788703, "num_tokens": 795096836.0, "step": 17505 }, { "entropy": 0.5515100402757526, "epoch": 1.3122199816853677, "grad_norm": 0.24137303233146667, "learning_rate": 0.0002, "loss": 0.634, "mean_token_accuracy": 0.8219695508480072, "num_tokens": 796550256.0, "step": 17510 }, { "entropy": 0.5670457163825631, "epoch": 1.3125947056192195, "grad_norm": 0.21356798708438873, "learning_rate": 0.0002, "loss": 0.645, "mean_token_accuracy": 0.8180795099586249, "num_tokens": 798040606.0, "step": 17515 }, { "entropy": 0.558624649234116, "epoch": 1.3129694295530714, "grad_norm": 0.2532499134540558, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.8194018132984638, "num_tokens": 799512793.0, "step": 17520 }, { "entropy": 0.5622690727934241, "epoch": 1.3133441534869232, "grad_norm": 0.20423638820648193, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8252154242247343, "num_tokens": 800979282.0, "step": 17525 }, { "entropy": 0.5614699147641659, "epoch": 1.313718877420775, "grad_norm": 0.21467970311641693, "learning_rate": 0.0002, "loss": 0.6392, "mean_token_accuracy": 0.8196811195462942, "num_tokens": 802422193.0, "step": 17530 }, { "entropy": 0.5521905492059886, "epoch": 1.314093601354627, "grad_norm": 0.20672135055065155, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.8231099769473076, "num_tokens": 803924529.0, "step": 17535 }, { "entropy": 0.5583566414192319, "epoch": 1.314468325288479, "grad_norm": 0.2146211713552475, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8224378179758787, "num_tokens": 805424388.0, "step": 17540 }, { "entropy": 0.5604377094656229, "epoch": 1.3148430492223309, "grad_norm": 0.21781402826309204, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8210178509354591, "num_tokens": 806891130.0, "step": 17545 }, { "entropy": 0.5366901215165854, "epoch": 1.3152177731561827, "grad_norm": 0.22350671887397766, "learning_rate": 0.0002, "loss": 0.607, "mean_token_accuracy": 0.8231243789196014, "num_tokens": 808343563.0, "step": 17550 }, { "entropy": 0.5849927948787809, "epoch": 1.3155924970900346, "grad_norm": 0.21940159797668457, "learning_rate": 0.0002, "loss": 0.6655, "mean_token_accuracy": 0.8151740599423647, "num_tokens": 809858525.0, "step": 17555 }, { "entropy": 0.5428789053112268, "epoch": 1.3159672210238864, "grad_norm": 0.23359976708889008, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.823007534071803, "num_tokens": 811285311.0, "step": 17560 }, { "entropy": 0.5574202343821526, "epoch": 1.3163419449577383, "grad_norm": 0.21507695317268372, "learning_rate": 0.0002, "loss": 0.6405, "mean_token_accuracy": 0.8201352518051863, "num_tokens": 812763615.0, "step": 17565 }, { "entropy": 0.5597101047635078, "epoch": 1.3167166688915901, "grad_norm": 0.25034087896347046, "learning_rate": 0.0002, "loss": 0.6418, "mean_token_accuracy": 0.8212351754307747, "num_tokens": 814209763.0, "step": 17570 }, { "entropy": 0.5368046149611473, "epoch": 1.317091392825442, "grad_norm": 0.20713376998901367, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8214513968676329, "num_tokens": 815631574.0, "step": 17575 }, { "entropy": 0.5449250131845474, "epoch": 1.3174661167592938, "grad_norm": 0.2822185754776001, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8250657908618451, "num_tokens": 817070949.0, "step": 17580 }, { "entropy": 0.5329844910651446, "epoch": 1.3178408406931457, "grad_norm": 0.21409665048122406, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.8219449143856764, "num_tokens": 818544022.0, "step": 17585 }, { "entropy": 0.54858289193362, "epoch": 1.3182155646269975, "grad_norm": 0.2186567485332489, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.8195188630372285, "num_tokens": 819997967.0, "step": 17590 }, { "entropy": 0.5359257504343986, "epoch": 1.3185902885608494, "grad_norm": 0.2652932107448578, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.8236122529953718, "num_tokens": 821493406.0, "step": 17595 }, { "entropy": 0.5303835654631257, "epoch": 1.3189650124947012, "grad_norm": 0.22228623926639557, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.826771168410778, "num_tokens": 822944762.0, "step": 17600 }, { "entropy": 0.5420504376292229, "epoch": 1.319339736428553, "grad_norm": 0.23615828156471252, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8194649156183005, "num_tokens": 824408002.0, "step": 17605 }, { "entropy": 0.5617237845435739, "epoch": 1.319714460362405, "grad_norm": 0.25488874316215515, "learning_rate": 0.0002, "loss": 0.6501, "mean_token_accuracy": 0.8191965352743864, "num_tokens": 825890341.0, "step": 17610 }, { "entropy": 0.5296256627887488, "epoch": 1.3200891842962568, "grad_norm": 0.2203402817249298, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8220796294510364, "num_tokens": 827358273.0, "step": 17615 }, { "entropy": 0.5491172222420573, "epoch": 1.3204639082301086, "grad_norm": 0.21198458969593048, "learning_rate": 0.0002, "loss": 0.6488, "mean_token_accuracy": 0.8180796630680561, "num_tokens": 828835537.0, "step": 17620 }, { "entropy": 0.5348482466302812, "epoch": 1.3208386321639605, "grad_norm": 0.21448568999767303, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.8225814215838909, "num_tokens": 830304712.0, "step": 17625 }, { "entropy": 0.5172547748312354, "epoch": 1.3212133560978123, "grad_norm": 0.2316848486661911, "learning_rate": 0.0002, "loss": 0.6173, "mean_token_accuracy": 0.8245551742613315, "num_tokens": 831731138.0, "step": 17630 }, { "entropy": 0.5465498005971312, "epoch": 1.3215880800316642, "grad_norm": 0.2244601845741272, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.8222098253667355, "num_tokens": 833197158.0, "step": 17635 }, { "entropy": 0.5404886838048697, "epoch": 1.321962803965516, "grad_norm": 0.2093910425901413, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8204608224332333, "num_tokens": 834655690.0, "step": 17640 }, { "entropy": 0.5575552005320787, "epoch": 1.3223375278993679, "grad_norm": 0.2376975566148758, "learning_rate": 0.0002, "loss": 0.6519, "mean_token_accuracy": 0.8168488632887602, "num_tokens": 836146511.0, "step": 17645 }, { "entropy": 0.5413928881287575, "epoch": 1.3227122518332197, "grad_norm": 0.2486421763896942, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8210004981607198, "num_tokens": 837597962.0, "step": 17650 }, { "entropy": 0.5406007070094347, "epoch": 1.3230869757670716, "grad_norm": 0.21882736682891846, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8223793618381023, "num_tokens": 839057438.0, "step": 17655 }, { "entropy": 0.5451417371630669, "epoch": 1.3234616997009234, "grad_norm": 0.22247906029224396, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.821582218259573, "num_tokens": 840535432.0, "step": 17660 }, { "entropy": 0.5304272249341011, "epoch": 1.3238364236347753, "grad_norm": 0.20675411820411682, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.8268608879297972, "num_tokens": 841950828.0, "step": 17665 }, { "entropy": 0.5567441035062075, "epoch": 1.3242111475686271, "grad_norm": 0.22035101056098938, "learning_rate": 0.0002, "loss": 0.6433, "mean_token_accuracy": 0.8185348723083734, "num_tokens": 843408636.0, "step": 17670 }, { "entropy": 0.5483869049698115, "epoch": 1.324585871502479, "grad_norm": 0.22631044685840607, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.820292453467846, "num_tokens": 844834783.0, "step": 17675 }, { "entropy": 0.5409329552203417, "epoch": 1.3249605954363308, "grad_norm": 0.22864392399787903, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.8231065858155489, "num_tokens": 846318061.0, "step": 17680 }, { "entropy": 0.5454140659421682, "epoch": 1.3253353193701827, "grad_norm": 0.24892160296440125, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.8199608419090509, "num_tokens": 847796040.0, "step": 17685 }, { "entropy": 0.5412671187892556, "epoch": 1.3257100433040345, "grad_norm": 0.21696867048740387, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.8199212826788426, "num_tokens": 849293060.0, "step": 17690 }, { "entropy": 0.5407728392630815, "epoch": 1.3260847672378864, "grad_norm": 0.3433847725391388, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.8211193900555372, "num_tokens": 850728812.0, "step": 17695 }, { "entropy": 0.530192113481462, "epoch": 1.3264594911717382, "grad_norm": 0.2254306823015213, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8203370485454797, "num_tokens": 852196006.0, "step": 17700 }, { "entropy": 0.5355081584304571, "epoch": 1.32683421510559, "grad_norm": 0.24593380093574524, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.8226981207728385, "num_tokens": 853643297.0, "step": 17705 }, { "entropy": 0.5283860152587294, "epoch": 1.327208939039442, "grad_norm": 0.2188597172498703, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8217986986041069, "num_tokens": 855120042.0, "step": 17710 }, { "entropy": 0.5378985082730651, "epoch": 1.3275836629732938, "grad_norm": 0.23069947957992554, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8219518940895796, "num_tokens": 856638017.0, "step": 17715 }, { "entropy": 0.5582750685513019, "epoch": 1.3279583869071456, "grad_norm": 0.20868946611881256, "learning_rate": 0.0002, "loss": 0.6384, "mean_token_accuracy": 0.8211423311382532, "num_tokens": 858134950.0, "step": 17720 }, { "entropy": 0.559979428909719, "epoch": 1.3283331108409975, "grad_norm": 0.2084360271692276, "learning_rate": 0.0002, "loss": 0.6397, "mean_token_accuracy": 0.8179454483091831, "num_tokens": 859614651.0, "step": 17725 }, { "entropy": 0.5672563431784511, "epoch": 1.3287078347748493, "grad_norm": 0.23230630159378052, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.8164463926106691, "num_tokens": 861106433.0, "step": 17730 }, { "entropy": 0.5548114685341716, "epoch": 1.3290825587087014, "grad_norm": 0.23224420845508575, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.8206192072480917, "num_tokens": 862602693.0, "step": 17735 }, { "entropy": 0.5624751562252641, "epoch": 1.3294572826425533, "grad_norm": 0.22672180831432343, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8227516319602728, "num_tokens": 864042514.0, "step": 17740 }, { "entropy": 0.543137633614242, "epoch": 1.329832006576405, "grad_norm": 0.2399083822965622, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8278282057493925, "num_tokens": 865542221.0, "step": 17745 }, { "entropy": 0.5696317918598652, "epoch": 1.330206730510257, "grad_norm": 0.23603801429271698, "learning_rate": 0.0002, "loss": 0.6461, "mean_token_accuracy": 0.8177858047187329, "num_tokens": 867030743.0, "step": 17750 }, { "entropy": 0.5584361447021365, "epoch": 1.3305814544441088, "grad_norm": 0.23322899639606476, "learning_rate": 0.0002, "loss": 0.6325, "mean_token_accuracy": 0.8221356485038995, "num_tokens": 868454121.0, "step": 17755 }, { "entropy": 0.5626211581751704, "epoch": 1.3309561783779607, "grad_norm": 0.21945670247077942, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8208626046776771, "num_tokens": 869932725.0, "step": 17760 }, { "entropy": 0.5571235354989768, "epoch": 1.3313309023118125, "grad_norm": 0.29756200313568115, "learning_rate": 0.0002, "loss": 0.6256, "mean_token_accuracy": 0.8195165548473596, "num_tokens": 871380633.0, "step": 17765 }, { "entropy": 0.5554000619798899, "epoch": 1.3317056262456644, "grad_norm": 0.22725294530391693, "learning_rate": 0.0002, "loss": 0.6378, "mean_token_accuracy": 0.8211889054626227, "num_tokens": 872880678.0, "step": 17770 }, { "entropy": 0.5398020448163152, "epoch": 1.3320803501795162, "grad_norm": 0.26646772027015686, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8259808529168368, "num_tokens": 874305661.0, "step": 17775 }, { "entropy": 0.5585712511092424, "epoch": 1.332455074113368, "grad_norm": 0.23664230108261108, "learning_rate": 0.0002, "loss": 0.6409, "mean_token_accuracy": 0.8185956381261349, "num_tokens": 875804250.0, "step": 17780 }, { "entropy": 0.5536795808002353, "epoch": 1.33282979804722, "grad_norm": 0.22362346947193146, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8182799849659205, "num_tokens": 877249613.0, "step": 17785 }, { "entropy": 0.5561751630157232, "epoch": 1.3332045219810718, "grad_norm": 0.21185460686683655, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8260241452604532, "num_tokens": 878698180.0, "step": 17790 }, { "entropy": 0.5890220759436489, "epoch": 1.3335792459149236, "grad_norm": 0.21571014821529388, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.8198904979974031, "num_tokens": 880165895.0, "step": 17795 }, { "entropy": 0.5742599910125137, "epoch": 1.3339539698487755, "grad_norm": 0.2349720448255539, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.8240859813988208, "num_tokens": 881619916.0, "step": 17800 }, { "entropy": 0.589608321711421, "epoch": 1.3343286937826273, "grad_norm": 0.2299390733242035, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8217944856733084, "num_tokens": 883095483.0, "step": 17805 }, { "entropy": 0.5797021185979248, "epoch": 1.3347034177164792, "grad_norm": 0.26087260246276855, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8247719895094633, "num_tokens": 884566220.0, "step": 17810 }, { "entropy": 0.5746922615915537, "epoch": 1.335078141650331, "grad_norm": 0.23083044588565826, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8204670879989863, "num_tokens": 886039359.0, "step": 17815 }, { "entropy": 0.5705863896757364, "epoch": 1.3354528655841829, "grad_norm": 0.24022340774536133, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.8252066139131784, "num_tokens": 887501494.0, "step": 17820 }, { "entropy": 0.5751467308029532, "epoch": 1.3358275895180347, "grad_norm": 0.2471996396780014, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.8194509129971266, "num_tokens": 888964425.0, "step": 17825 }, { "entropy": 0.558689720183611, "epoch": 1.3362023134518866, "grad_norm": 0.2325272113084793, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.827184122055769, "num_tokens": 890390250.0, "step": 17830 }, { "entropy": 0.5882437724620104, "epoch": 1.3365770373857384, "grad_norm": 0.23697136342525482, "learning_rate": 0.0002, "loss": 0.6412, "mean_token_accuracy": 0.8186381127685308, "num_tokens": 891842365.0, "step": 17835 }, { "entropy": 0.5959819184616209, "epoch": 1.3369517613195903, "grad_norm": 0.24027122557163239, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.8202252976596356, "num_tokens": 893280860.0, "step": 17840 }, { "entropy": 0.5895187748596072, "epoch": 1.337326485253442, "grad_norm": 0.22941774129867554, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.821789626404643, "num_tokens": 894755862.0, "step": 17845 }, { "entropy": 0.5855564516037702, "epoch": 1.3377012091872942, "grad_norm": 0.21882864832878113, "learning_rate": 0.0002, "loss": 0.6395, "mean_token_accuracy": 0.8170117124915123, "num_tokens": 896216378.0, "step": 17850 }, { "entropy": 0.5888376707211137, "epoch": 1.338075933121146, "grad_norm": 0.277959406375885, "learning_rate": 0.0002, "loss": 0.6426, "mean_token_accuracy": 0.8181861381977796, "num_tokens": 897709022.0, "step": 17855 }, { "entropy": 0.5915840970352292, "epoch": 1.3384506570549979, "grad_norm": 0.2764458656311035, "learning_rate": 0.0002, "loss": 0.6484, "mean_token_accuracy": 0.8212389770895243, "num_tokens": 899222048.0, "step": 17860 }, { "entropy": 0.5559072891250253, "epoch": 1.3388253809888497, "grad_norm": 0.23035010695457458, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8281966757029295, "num_tokens": 900674356.0, "step": 17865 }, { "entropy": 0.5573692753911018, "epoch": 1.3392001049227016, "grad_norm": 0.22218777239322662, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8233879633247853, "num_tokens": 902127674.0, "step": 17870 }, { "entropy": 0.5706610701978206, "epoch": 1.3395748288565534, "grad_norm": 0.21127350628376007, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.8232976485043764, "num_tokens": 903584294.0, "step": 17875 }, { "entropy": 0.5775508346036077, "epoch": 1.3399495527904053, "grad_norm": 0.21973899006843567, "learning_rate": 0.0002, "loss": 0.634, "mean_token_accuracy": 0.8186805572360754, "num_tokens": 905087079.0, "step": 17880 }, { "entropy": 0.5799379454925656, "epoch": 1.3403242767242571, "grad_norm": 0.2078147679567337, "learning_rate": 0.0002, "loss": 0.6355, "mean_token_accuracy": 0.8196911115199328, "num_tokens": 906580467.0, "step": 17885 }, { "entropy": 0.5644422736018896, "epoch": 1.340699000658109, "grad_norm": 0.22149717807769775, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8229425124824047, "num_tokens": 908085401.0, "step": 17890 }, { "entropy": 0.5668319625779986, "epoch": 1.3410737245919608, "grad_norm": 0.22632823884487152, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8246711194515228, "num_tokens": 909520760.0, "step": 17895 }, { "entropy": 0.5544188222847879, "epoch": 1.3414484485258127, "grad_norm": 0.21042124927043915, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.8267438150942326, "num_tokens": 910983028.0, "step": 17900 }, { "entropy": 0.584031174518168, "epoch": 1.3418231724596645, "grad_norm": 0.20787611603736877, "learning_rate": 0.0002, "loss": 0.6352, "mean_token_accuracy": 0.8192367155104876, "num_tokens": 912463088.0, "step": 17905 }, { "entropy": 0.5734268635511398, "epoch": 1.3421978963935164, "grad_norm": 0.2756321430206299, "learning_rate": 0.0002, "loss": 0.615, "mean_token_accuracy": 0.8214787974953651, "num_tokens": 913925739.0, "step": 17910 }, { "entropy": 0.5757097624242306, "epoch": 1.3425726203273682, "grad_norm": 0.23728011548519135, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8215411297976971, "num_tokens": 915383163.0, "step": 17915 }, { "entropy": 0.5507190031930804, "epoch": 1.34294734426122, "grad_norm": 0.291176438331604, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.8268277332186699, "num_tokens": 916806607.0, "step": 17920 }, { "entropy": 0.5881087703630328, "epoch": 1.343322068195072, "grad_norm": 0.2450311928987503, "learning_rate": 0.0002, "loss": 0.6368, "mean_token_accuracy": 0.8197608202695846, "num_tokens": 918284199.0, "step": 17925 }, { "entropy": 0.593874522857368, "epoch": 1.3436967921289238, "grad_norm": 0.23487530648708344, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.8205621775239706, "num_tokens": 919792762.0, "step": 17930 }, { "entropy": 0.5870503777638078, "epoch": 1.3440715160627756, "grad_norm": 0.26131710410118103, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8240756895393133, "num_tokens": 921312179.0, "step": 17935 }, { "entropy": 0.5972501032054425, "epoch": 1.3444462399966275, "grad_norm": 0.24855034053325653, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8228021614253521, "num_tokens": 922777461.0, "step": 17940 }, { "entropy": 0.6003371838480234, "epoch": 1.3448209639304793, "grad_norm": 0.2270355075597763, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.8200649447739125, "num_tokens": 924242039.0, "step": 17945 }, { "entropy": 0.6018992638215422, "epoch": 1.3451956878643312, "grad_norm": 0.22523024678230286, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.8203369088470935, "num_tokens": 925736164.0, "step": 17950 }, { "entropy": 0.6194056816399097, "epoch": 1.345570411798183, "grad_norm": 0.35695528984069824, "learning_rate": 0.0002, "loss": 0.6425, "mean_token_accuracy": 0.8198686067014933, "num_tokens": 927175444.0, "step": 17955 }, { "entropy": 0.6152585530653596, "epoch": 1.345945135732035, "grad_norm": 0.2881467044353485, "learning_rate": 0.0002, "loss": 0.656, "mean_token_accuracy": 0.8179379727691412, "num_tokens": 928690203.0, "step": 17960 }, { "entropy": 0.5871551614254713, "epoch": 1.3463198596658867, "grad_norm": 0.2165948748588562, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.8228157985955477, "num_tokens": 930138061.0, "step": 17965 }, { "entropy": 0.5972222004085779, "epoch": 1.3466945835997386, "grad_norm": 0.22747640311717987, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.8183544117957353, "num_tokens": 931625174.0, "step": 17970 }, { "entropy": 0.5927743997424841, "epoch": 1.3470693075335904, "grad_norm": 0.22077062726020813, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.820287612080574, "num_tokens": 933128799.0, "step": 17975 }, { "entropy": 0.5877466889098286, "epoch": 1.3474440314674423, "grad_norm": 0.21509265899658203, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8207967542111874, "num_tokens": 934607693.0, "step": 17980 }, { "entropy": 0.576451699435711, "epoch": 1.3478187554012941, "grad_norm": 0.2402019500732422, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.8236986566334963, "num_tokens": 936093825.0, "step": 17985 }, { "entropy": 0.5788020845502615, "epoch": 1.348193479335146, "grad_norm": 0.32519248127937317, "learning_rate": 0.0002, "loss": 0.6258, "mean_token_accuracy": 0.8222233328968287, "num_tokens": 937535889.0, "step": 17990 }, { "entropy": 0.5758497627452016, "epoch": 1.3485682032689978, "grad_norm": 0.2210431545972824, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8218017090111971, "num_tokens": 938983705.0, "step": 17995 }, { "entropy": 0.5637328715994954, "epoch": 1.3489429272028497, "grad_norm": 0.20543842017650604, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.8182935982942581, "num_tokens": 940461384.0, "step": 18000 }, { "entropy": 0.5666754767298698, "epoch": 1.3493176511367015, "grad_norm": 0.2559449374675751, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.8226029574871063, "num_tokens": 941930897.0, "step": 18005 }, { "entropy": 0.5767115505412221, "epoch": 1.3496923750705534, "grad_norm": 0.21684059500694275, "learning_rate": 0.0002, "loss": 0.6461, "mean_token_accuracy": 0.8199824716895818, "num_tokens": 943415494.0, "step": 18010 }, { "entropy": 0.5709831949323416, "epoch": 1.3500670990044052, "grad_norm": 0.20668111741542816, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.8240838915109634, "num_tokens": 944866243.0, "step": 18015 }, { "entropy": 0.572556396573782, "epoch": 1.350441822938257, "grad_norm": 0.21488027274608612, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.8214759942144155, "num_tokens": 946351477.0, "step": 18020 }, { "entropy": 0.5622000264003872, "epoch": 1.350816546872109, "grad_norm": 0.27013155817985535, "learning_rate": 0.0002, "loss": 0.6264, "mean_token_accuracy": 0.8220796182751655, "num_tokens": 947806420.0, "step": 18025 }, { "entropy": 0.5659977102652192, "epoch": 1.3511912708059608, "grad_norm": 0.2328888475894928, "learning_rate": 0.0002, "loss": 0.637, "mean_token_accuracy": 0.8185396410524846, "num_tokens": 949286069.0, "step": 18030 }, { "entropy": 0.5630664718337357, "epoch": 1.3515659947398126, "grad_norm": 0.21763215959072113, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8204310931265354, "num_tokens": 950790459.0, "step": 18035 }, { "entropy": 0.5543124567717314, "epoch": 1.3519407186736645, "grad_norm": 0.22594785690307617, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8218416236341, "num_tokens": 952216416.0, "step": 18040 }, { "entropy": 0.5490172090008855, "epoch": 1.3523154426075166, "grad_norm": 0.21014204621315002, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8206403780728578, "num_tokens": 953674269.0, "step": 18045 }, { "entropy": 0.5661069264635443, "epoch": 1.3526901665413684, "grad_norm": 0.23427146673202515, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.8190776210278272, "num_tokens": 955156977.0, "step": 18050 }, { "entropy": 0.5630312696099281, "epoch": 1.3530648904752203, "grad_norm": 0.21917615830898285, "learning_rate": 0.0002, "loss": 0.6344, "mean_token_accuracy": 0.8179536502808332, "num_tokens": 956625760.0, "step": 18055 }, { "entropy": 0.5645365132018924, "epoch": 1.3534396144090721, "grad_norm": 0.1969015747308731, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.8238758448511362, "num_tokens": 958091708.0, "step": 18060 }, { "entropy": 0.5693597773090004, "epoch": 1.353814338342924, "grad_norm": 0.2493468075990677, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.8194833166897297, "num_tokens": 959509568.0, "step": 18065 }, { "entropy": 0.566785623319447, "epoch": 1.3541890622767758, "grad_norm": 0.21395009756088257, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8219267226755619, "num_tokens": 960959627.0, "step": 18070 }, { "entropy": 0.5766250848770141, "epoch": 1.3545637862106277, "grad_norm": 0.2097131311893463, "learning_rate": 0.0002, "loss": 0.6415, "mean_token_accuracy": 0.8196017701178789, "num_tokens": 962453732.0, "step": 18075 }, { "entropy": 0.5614686084911227, "epoch": 1.3549385101444795, "grad_norm": 0.21399284899234772, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.8245466597378254, "num_tokens": 963956237.0, "step": 18080 }, { "entropy": 0.5548467461019755, "epoch": 1.3553132340783314, "grad_norm": 0.2722587287425995, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.82245620675385, "num_tokens": 965412743.0, "step": 18085 }, { "entropy": 0.5625940734520555, "epoch": 1.3556879580121832, "grad_norm": 0.22424522042274475, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.8236600685864687, "num_tokens": 966902475.0, "step": 18090 }, { "entropy": 0.5665921218693256, "epoch": 1.356062681946035, "grad_norm": 0.24043454229831696, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.820020056143403, "num_tokens": 968400311.0, "step": 18095 }, { "entropy": 0.5621429406106472, "epoch": 1.356437405879887, "grad_norm": 0.2265780121088028, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.82280835211277, "num_tokens": 969907750.0, "step": 18100 }, { "entropy": 0.5509029248729348, "epoch": 1.3568121298137388, "grad_norm": 0.2152925580739975, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8213773593306541, "num_tokens": 971365104.0, "step": 18105 }, { "entropy": 0.5630975618958474, "epoch": 1.3571868537475906, "grad_norm": 0.26661866903305054, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.8217326741665602, "num_tokens": 972828786.0, "step": 18110 }, { "entropy": 0.5777286786586047, "epoch": 1.3575615776814425, "grad_norm": 0.4371023178100586, "learning_rate": 0.0002, "loss": 0.6496, "mean_token_accuracy": 0.8211116630584001, "num_tokens": 974315162.0, "step": 18115 }, { "entropy": 0.5631518095731736, "epoch": 1.3579363016152943, "grad_norm": 0.2685871720314026, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8213995400816202, "num_tokens": 975810358.0, "step": 18120 }, { "entropy": 0.5645429778844118, "epoch": 1.3583110255491462, "grad_norm": 0.2392674684524536, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.8204347167164088, "num_tokens": 977286066.0, "step": 18125 }, { "entropy": 0.571475968696177, "epoch": 1.358685749482998, "grad_norm": 0.23980017006397247, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.8210402671247721, "num_tokens": 978726381.0, "step": 18130 }, { "entropy": 0.5543508842587471, "epoch": 1.3590604734168499, "grad_norm": 0.21928958594799042, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.827364120259881, "num_tokens": 980156549.0, "step": 18135 }, { "entropy": 0.5559250941500068, "epoch": 1.3594351973507017, "grad_norm": 0.2336840182542801, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8205518178641796, "num_tokens": 981634169.0, "step": 18140 }, { "entropy": 0.5551105908118188, "epoch": 1.3598099212845536, "grad_norm": 0.23478230834007263, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8204372759908438, "num_tokens": 983106854.0, "step": 18145 }, { "entropy": 0.5504019167274237, "epoch": 1.3601846452184054, "grad_norm": 0.20871368050575256, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.8200558613985777, "num_tokens": 984516869.0, "step": 18150 }, { "entropy": 0.5451722176745534, "epoch": 1.3605593691522573, "grad_norm": 0.21740886569023132, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8225495424121618, "num_tokens": 986001074.0, "step": 18155 }, { "entropy": 0.5801249977201224, "epoch": 1.3609340930861094, "grad_norm": 0.2472684383392334, "learning_rate": 0.0002, "loss": 0.6472, "mean_token_accuracy": 0.8174821887165308, "num_tokens": 987471477.0, "step": 18160 }, { "entropy": 0.5672120662406087, "epoch": 1.3613088170199612, "grad_norm": 0.22788691520690918, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.8249508246779442, "num_tokens": 988942275.0, "step": 18165 }, { "entropy": 0.5840094489976764, "epoch": 1.361683540953813, "grad_norm": 0.21979138255119324, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8191066857427358, "num_tokens": 990442451.0, "step": 18170 }, { "entropy": 0.5857772625982761, "epoch": 1.362058264887665, "grad_norm": 0.21395424008369446, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.8214696202427149, "num_tokens": 991952755.0, "step": 18175 }, { "entropy": 0.5885281605646014, "epoch": 1.3624329888215168, "grad_norm": 0.22245430946350098, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8202929444611072, "num_tokens": 993416097.0, "step": 18180 }, { "entropy": 0.5823771873489022, "epoch": 1.3628077127553686, "grad_norm": 0.2515993118286133, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.8227174568921327, "num_tokens": 994909661.0, "step": 18185 }, { "entropy": 0.5865157600492239, "epoch": 1.3631824366892205, "grad_norm": 0.2341364622116089, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.822108068689704, "num_tokens": 996402474.0, "step": 18190 }, { "entropy": 0.5864778887480497, "epoch": 1.3635571606230723, "grad_norm": 0.2170407623052597, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.8212906543165446, "num_tokens": 997920503.0, "step": 18195 }, { "entropy": 0.5889681141823531, "epoch": 1.3639318845569242, "grad_norm": 0.23946231603622437, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8263656832277775, "num_tokens": 999405716.0, "step": 18200 }, { "entropy": 0.5949306959286332, "epoch": 1.364306608490776, "grad_norm": 0.24722427129745483, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.8215917341411114, "num_tokens": 1000846104.0, "step": 18205 }, { "entropy": 0.5699998104944826, "epoch": 1.3646813324246279, "grad_norm": 0.22359608113765717, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.8252595458179712, "num_tokens": 1002307211.0, "step": 18210 }, { "entropy": 0.5741207964718342, "epoch": 1.3650560563584797, "grad_norm": 0.2315215915441513, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.8229114506393671, "num_tokens": 1003746636.0, "step": 18215 }, { "entropy": 0.5836867436766624, "epoch": 1.3654307802923316, "grad_norm": 0.2277286797761917, "learning_rate": 0.0002, "loss": 0.6408, "mean_token_accuracy": 0.8197540018707514, "num_tokens": 1005194193.0, "step": 18220 }, { "entropy": 0.578705750592053, "epoch": 1.3658055042261834, "grad_norm": 0.24314001202583313, "learning_rate": 0.0002, "loss": 0.6365, "mean_token_accuracy": 0.8194390095770359, "num_tokens": 1006690288.0, "step": 18225 }, { "entropy": 0.5582138542085886, "epoch": 1.3661802281600353, "grad_norm": 0.24106276035308838, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8242759473621846, "num_tokens": 1008084385.0, "step": 18230 }, { "entropy": 0.5458089776337147, "epoch": 1.366554952093887, "grad_norm": 0.2231868952512741, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.8224149618297816, "num_tokens": 1009548848.0, "step": 18235 }, { "entropy": 0.5503210786730051, "epoch": 1.366929676027739, "grad_norm": 0.20743148028850555, "learning_rate": 0.0002, "loss": 0.6387, "mean_token_accuracy": 0.8178976319730282, "num_tokens": 1011056237.0, "step": 18240 }, { "entropy": 0.5334518225863576, "epoch": 1.3673043999615908, "grad_norm": 0.21914580464363098, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.8218322411179543, "num_tokens": 1012484793.0, "step": 18245 }, { "entropy": 0.5372772807255387, "epoch": 1.3676791238954427, "grad_norm": 0.24957624077796936, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.8187699899077415, "num_tokens": 1013923207.0, "step": 18250 }, { "entropy": 0.5447768725454807, "epoch": 1.3680538478292945, "grad_norm": 0.2276151180267334, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.8200451869517564, "num_tokens": 1015422917.0, "step": 18255 }, { "entropy": 0.5387387845665217, "epoch": 1.3684285717631464, "grad_norm": 0.2137243002653122, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8208126626908779, "num_tokens": 1016940202.0, "step": 18260 }, { "entropy": 0.5288257241249085, "epoch": 1.3688032956969982, "grad_norm": 0.2300540655851364, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8220745723694562, "num_tokens": 1018398261.0, "step": 18265 }, { "entropy": 0.530083230137825, "epoch": 1.36917801963085, "grad_norm": 0.2456589937210083, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.824040137603879, "num_tokens": 1019821177.0, "step": 18270 }, { "entropy": 0.5396040005609393, "epoch": 1.369552743564702, "grad_norm": 0.2192789614200592, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.8219284519553185, "num_tokens": 1021332597.0, "step": 18275 }, { "entropy": 0.5257559388875961, "epoch": 1.3699274674985538, "grad_norm": 0.2086258977651596, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.8250248689204455, "num_tokens": 1022811235.0, "step": 18280 }, { "entropy": 0.5288458190858364, "epoch": 1.3703021914324056, "grad_norm": 0.24850381910800934, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8246965192258358, "num_tokens": 1024249153.0, "step": 18285 }, { "entropy": 0.5427810544148087, "epoch": 1.3706769153662575, "grad_norm": 0.20781059563159943, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8222648367285729, "num_tokens": 1025777169.0, "step": 18290 }, { "entropy": 0.5569302345626056, "epoch": 1.3710516393001093, "grad_norm": 0.2626103162765503, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8196043573319912, "num_tokens": 1027247051.0, "step": 18295 }, { "entropy": 0.5465465361252427, "epoch": 1.3714263632339612, "grad_norm": 0.21610867977142334, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8244879085570573, "num_tokens": 1028705774.0, "step": 18300 }, { "entropy": 0.560505392961204, "epoch": 1.371801087167813, "grad_norm": 0.29104524850845337, "learning_rate": 0.0002, "loss": 0.6439, "mean_token_accuracy": 0.8181528758257628, "num_tokens": 1030130582.0, "step": 18305 }, { "entropy": 0.5581400020048022, "epoch": 1.3721758111016649, "grad_norm": 0.25810614228248596, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8214272558689117, "num_tokens": 1031602849.0, "step": 18310 }, { "entropy": 0.5576566237956285, "epoch": 1.3725505350355167, "grad_norm": 0.2173190414905548, "learning_rate": 0.0002, "loss": 0.6256, "mean_token_accuracy": 0.8198186963796615, "num_tokens": 1033076943.0, "step": 18315 }, { "entropy": 0.548386987671256, "epoch": 1.3729252589693686, "grad_norm": 0.2605094611644745, "learning_rate": 0.0002, "loss": 0.6328, "mean_token_accuracy": 0.8182971362024546, "num_tokens": 1034545255.0, "step": 18320 }, { "entropy": 0.5516075978055597, "epoch": 1.3732999829032204, "grad_norm": 0.2460758090019226, "learning_rate": 0.0002, "loss": 0.6392, "mean_token_accuracy": 0.8181282863020897, "num_tokens": 1036024088.0, "step": 18325 }, { "entropy": 0.5531412683427334, "epoch": 1.3736747068370723, "grad_norm": 0.23022013902664185, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.8215122666209936, "num_tokens": 1037527273.0, "step": 18330 }, { "entropy": 0.54238186981529, "epoch": 1.3740494307709241, "grad_norm": 0.21942618489265442, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8256227221339941, "num_tokens": 1038968903.0, "step": 18335 }, { "entropy": 0.5366449248045683, "epoch": 1.374424154704776, "grad_norm": 0.2623274326324463, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8240233283489943, "num_tokens": 1040419737.0, "step": 18340 }, { "entropy": 0.5507963381707668, "epoch": 1.3747988786386278, "grad_norm": 0.2236330807209015, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.824906875193119, "num_tokens": 1041894266.0, "step": 18345 }, { "entropy": 0.5469110976904631, "epoch": 1.3751736025724797, "grad_norm": 0.19235193729400635, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8242540940642357, "num_tokens": 1043391577.0, "step": 18350 }, { "entropy": 0.5442771666683257, "epoch": 1.3755483265063317, "grad_norm": 0.2039245367050171, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.8222804289311171, "num_tokens": 1044853524.0, "step": 18355 }, { "entropy": 0.5612956013530492, "epoch": 1.3759230504401836, "grad_norm": 0.21401634812355042, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.8221006382256746, "num_tokens": 1046367208.0, "step": 18360 }, { "entropy": 0.5334699437022209, "epoch": 1.3762977743740354, "grad_norm": 0.24333925545215607, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.8255235962569714, "num_tokens": 1047836942.0, "step": 18365 }, { "entropy": 0.5546488244086504, "epoch": 1.3766724983078873, "grad_norm": 0.20502927899360657, "learning_rate": 0.0002, "loss": 0.641, "mean_token_accuracy": 0.8203952625393868, "num_tokens": 1049319384.0, "step": 18370 }, { "entropy": 0.5605382073670626, "epoch": 1.3770472222417391, "grad_norm": 0.23459112644195557, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.8224697951227427, "num_tokens": 1050820457.0, "step": 18375 }, { "entropy": 0.5496962477453053, "epoch": 1.377421946175591, "grad_norm": 0.22746768593788147, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8245832521468401, "num_tokens": 1052283198.0, "step": 18380 }, { "entropy": 0.5577428692951798, "epoch": 1.3777966701094428, "grad_norm": 0.2117253839969635, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8209700711071491, "num_tokens": 1053768814.0, "step": 18385 }, { "entropy": 0.5668215043842793, "epoch": 1.3781713940432947, "grad_norm": 0.22880196571350098, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.8183470249176026, "num_tokens": 1055247428.0, "step": 18390 }, { "entropy": 0.5356684857979417, "epoch": 1.3785461179771465, "grad_norm": 0.24749255180358887, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.821983776986599, "num_tokens": 1056720337.0, "step": 18395 }, { "entropy": 0.5523131404072046, "epoch": 1.3789208419109984, "grad_norm": 0.2316475808620453, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8170494623482227, "num_tokens": 1058210483.0, "step": 18400 }, { "entropy": 0.5416146355681122, "epoch": 1.3792955658448502, "grad_norm": 0.30038779973983765, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.8228722054511308, "num_tokens": 1059660896.0, "step": 18405 }, { "entropy": 0.5456227155402302, "epoch": 1.379670289778702, "grad_norm": 0.25820720195770264, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.8214825719594956, "num_tokens": 1061165530.0, "step": 18410 }, { "entropy": 0.5622442556545139, "epoch": 1.380045013712554, "grad_norm": 0.24697713553905487, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.820214482024312, "num_tokens": 1062677691.0, "step": 18415 }, { "entropy": 0.5485614709556103, "epoch": 1.3804197376464058, "grad_norm": 0.22250179946422577, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.8232830803841352, "num_tokens": 1064151191.0, "step": 18420 }, { "entropy": 0.5524286661297083, "epoch": 1.3807944615802576, "grad_norm": 0.23516233265399933, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.8200383298099041, "num_tokens": 1065623690.0, "step": 18425 }, { "entropy": 0.5520048736594617, "epoch": 1.3811691855141095, "grad_norm": 0.27949002385139465, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.8202568642795086, "num_tokens": 1067075822.0, "step": 18430 }, { "entropy": 0.5595643315464258, "epoch": 1.3815439094479613, "grad_norm": 0.2236587405204773, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.819087152555585, "num_tokens": 1068556952.0, "step": 18435 }, { "entropy": 0.5614064196124673, "epoch": 1.3819186333818132, "grad_norm": 0.21828778088092804, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.820584874600172, "num_tokens": 1070028982.0, "step": 18440 }, { "entropy": 0.5511232584714889, "epoch": 1.382293357315665, "grad_norm": 0.26494160294532776, "learning_rate": 0.0002, "loss": 0.6288, "mean_token_accuracy": 0.8218063767999411, "num_tokens": 1071530233.0, "step": 18445 }, { "entropy": 0.5527107698842884, "epoch": 1.382668081249517, "grad_norm": 0.22567731142044067, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.8250757265836001, "num_tokens": 1072981011.0, "step": 18450 }, { "entropy": 0.5595062049105763, "epoch": 1.3830428051833688, "grad_norm": 0.25008586049079895, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8230026978999376, "num_tokens": 1074449993.0, "step": 18455 }, { "entropy": 0.5509400978684426, "epoch": 1.3834175291172206, "grad_norm": 0.2599448263645172, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.8221063550561667, "num_tokens": 1075922830.0, "step": 18460 }, { "entropy": 0.5711018837988376, "epoch": 1.3837922530510725, "grad_norm": 0.2501108944416046, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.8194646414369344, "num_tokens": 1077363784.0, "step": 18465 }, { "entropy": 0.557955103740096, "epoch": 1.3841669769849245, "grad_norm": 0.20775210857391357, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.825965266302228, "num_tokens": 1078852212.0, "step": 18470 }, { "entropy": 0.5637510405853391, "epoch": 1.3845417009187764, "grad_norm": 0.21883083879947662, "learning_rate": 0.0002, "loss": 0.643, "mean_token_accuracy": 0.8191406022757292, "num_tokens": 1080331201.0, "step": 18475 }, { "entropy": 0.5482432309538126, "epoch": 1.3849164248526282, "grad_norm": 0.23404762148857117, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.82120960354805, "num_tokens": 1081792630.0, "step": 18480 }, { "entropy": 0.5482798378914595, "epoch": 1.38529114878648, "grad_norm": 0.20405954122543335, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8216940924525261, "num_tokens": 1083288544.0, "step": 18485 }, { "entropy": 0.5473147159442305, "epoch": 1.385665872720332, "grad_norm": 0.2243230640888214, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.8222309976816178, "num_tokens": 1084795027.0, "step": 18490 }, { "entropy": 0.5696132572367787, "epoch": 1.3860405966541838, "grad_norm": 0.2671205401420593, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.818552466481924, "num_tokens": 1086271337.0, "step": 18495 }, { "entropy": 0.5717847334221006, "epoch": 1.3864153205880356, "grad_norm": 0.21670936048030853, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8253642924129962, "num_tokens": 1087784877.0, "step": 18500 }, { "entropy": 0.5703766481019557, "epoch": 1.3867900445218875, "grad_norm": 0.23374377191066742, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8231854800134897, "num_tokens": 1089201301.0, "step": 18505 }, { "entropy": 0.5828524151816964, "epoch": 1.3871647684557393, "grad_norm": 0.2509714663028717, "learning_rate": 0.0002, "loss": 0.639, "mean_token_accuracy": 0.8183797702193261, "num_tokens": 1090691721.0, "step": 18510 }, { "entropy": 0.5711704017594457, "epoch": 1.3875394923895912, "grad_norm": 0.22607821226119995, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8244790401309728, "num_tokens": 1092130509.0, "step": 18515 }, { "entropy": 0.5749512078240514, "epoch": 1.387914216323443, "grad_norm": 0.2262139767408371, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8232599109411239, "num_tokens": 1093589298.0, "step": 18520 }, { "entropy": 0.5824462067335844, "epoch": 1.3882889402572949, "grad_norm": 0.2194870412349701, "learning_rate": 0.0002, "loss": 0.6374, "mean_token_accuracy": 0.8179458986967802, "num_tokens": 1095027912.0, "step": 18525 }, { "entropy": 0.5862134786322712, "epoch": 1.3886636641911467, "grad_norm": 0.25815853476524353, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8271223295480012, "num_tokens": 1096451858.0, "step": 18530 }, { "entropy": 0.5651261167600751, "epoch": 1.3890383881249986, "grad_norm": 0.2183762937784195, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8196820247918367, "num_tokens": 1097912165.0, "step": 18535 }, { "entropy": 0.5554743429645896, "epoch": 1.3894131120588504, "grad_norm": 0.26753005385398865, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8257401011884212, "num_tokens": 1099378704.0, "step": 18540 }, { "entropy": 0.5751359997317195, "epoch": 1.3897878359927023, "grad_norm": 0.2211257815361023, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8218805357813835, "num_tokens": 1100922305.0, "step": 18545 }, { "entropy": 0.5625127632170915, "epoch": 1.3901625599265541, "grad_norm": 0.22092179954051971, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.8231497138738633, "num_tokens": 1102419353.0, "step": 18550 }, { "entropy": 0.5736172761768102, "epoch": 1.390537283860406, "grad_norm": 0.2372981458902359, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8240969136357308, "num_tokens": 1103893714.0, "step": 18555 }, { "entropy": 0.5807538764551282, "epoch": 1.3909120077942578, "grad_norm": 0.23887263238430023, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.8212679155170918, "num_tokens": 1105402174.0, "step": 18560 }, { "entropy": 0.5863873057067395, "epoch": 1.3912867317281097, "grad_norm": 0.2232254594564438, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8194823857396841, "num_tokens": 1106919039.0, "step": 18565 }, { "entropy": 0.5708422135561705, "epoch": 1.3916614556619615, "grad_norm": 0.25842973589897156, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8205191884189844, "num_tokens": 1108400992.0, "step": 18570 }, { "entropy": 0.5603873509913683, "epoch": 1.3920361795958134, "grad_norm": 0.22389185428619385, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8231182612478734, "num_tokens": 1109857076.0, "step": 18575 }, { "entropy": 0.5782246734946966, "epoch": 1.3924109035296652, "grad_norm": 0.25444939732551575, "learning_rate": 0.0002, "loss": 0.6412, "mean_token_accuracy": 0.8171570938080549, "num_tokens": 1111379006.0, "step": 18580 }, { "entropy": 0.5647264568135142, "epoch": 1.392785627463517, "grad_norm": 0.20605987310409546, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8228357199579477, "num_tokens": 1112832234.0, "step": 18585 }, { "entropy": 0.5517647432163357, "epoch": 1.393160351397369, "grad_norm": 0.25346750020980835, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.8243238806724549, "num_tokens": 1114276468.0, "step": 18590 }, { "entropy": 0.5541939124464988, "epoch": 1.3935350753312208, "grad_norm": 0.22248968482017517, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8235838044434786, "num_tokens": 1115768575.0, "step": 18595 }, { "entropy": 0.5606519816443324, "epoch": 1.3939097992650726, "grad_norm": 0.26495930552482605, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8212527088820935, "num_tokens": 1117230459.0, "step": 18600 }, { "entropy": 0.567524534650147, "epoch": 1.3942845231989245, "grad_norm": 0.25333088636398315, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.8209421463310719, "num_tokens": 1118722369.0, "step": 18605 }, { "entropy": 0.5552239581942559, "epoch": 1.3946592471327763, "grad_norm": 0.23365218937397003, "learning_rate": 0.0002, "loss": 0.6264, "mean_token_accuracy": 0.8219962667673826, "num_tokens": 1120199357.0, "step": 18610 }, { "entropy": 0.5567063070833683, "epoch": 1.3950339710666282, "grad_norm": 0.22462284564971924, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.8225266531109809, "num_tokens": 1121695911.0, "step": 18615 }, { "entropy": 0.5633408453315496, "epoch": 1.39540869500048, "grad_norm": 0.23349659144878387, "learning_rate": 0.0002, "loss": 0.6206, "mean_token_accuracy": 0.8230632532387971, "num_tokens": 1123148536.0, "step": 18620 }, { "entropy": 0.5662340834736824, "epoch": 1.3957834189343319, "grad_norm": 0.275536447763443, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8209920577704907, "num_tokens": 1124648621.0, "step": 18625 }, { "entropy": 0.5560692908242345, "epoch": 1.3961581428681837, "grad_norm": 0.21273796260356903, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8224771443754435, "num_tokens": 1126170117.0, "step": 18630 }, { "entropy": 0.5614910395815969, "epoch": 1.3965328668020356, "grad_norm": 0.2392098307609558, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8200216066092253, "num_tokens": 1127680573.0, "step": 18635 }, { "entropy": 0.5401725577190518, "epoch": 1.3969075907358874, "grad_norm": 0.25019943714141846, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8233229380100966, "num_tokens": 1129135632.0, "step": 18640 }, { "entropy": 0.5156754547730088, "epoch": 1.3972823146697393, "grad_norm": 0.27111274003982544, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.8298355657607317, "num_tokens": 1130604099.0, "step": 18645 }, { "entropy": 0.5437492123804987, "epoch": 1.3976570386035911, "grad_norm": 0.2786168158054352, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.8233371946960688, "num_tokens": 1132033608.0, "step": 18650 }, { "entropy": 0.557650850713253, "epoch": 1.398031762537443, "grad_norm": 0.20859375596046448, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.82207096144557, "num_tokens": 1133476988.0, "step": 18655 }, { "entropy": 0.5453822400420905, "epoch": 1.3984064864712948, "grad_norm": 0.21926282346248627, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.8194028101861477, "num_tokens": 1134925638.0, "step": 18660 }, { "entropy": 0.532237472385168, "epoch": 1.398781210405147, "grad_norm": 0.2267042100429535, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8214671175926924, "num_tokens": 1136372178.0, "step": 18665 }, { "entropy": 0.5349215425550937, "epoch": 1.3991559343389988, "grad_norm": 0.26684442162513733, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.8221944745630025, "num_tokens": 1137845831.0, "step": 18670 }, { "entropy": 0.5231974041089416, "epoch": 1.3995306582728506, "grad_norm": 0.23060956597328186, "learning_rate": 0.0002, "loss": 0.6168, "mean_token_accuracy": 0.8256002552807331, "num_tokens": 1139302303.0, "step": 18675 }, { "entropy": 0.5149096236564219, "epoch": 1.3999053822067025, "grad_norm": 0.25700464844703674, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8282996572554111, "num_tokens": 1140750172.0, "step": 18680 }, { "entropy": 0.535586335323751, "epoch": 1.4002801061405543, "grad_norm": 0.22656002640724182, "learning_rate": 0.0002, "loss": 0.6383, "mean_token_accuracy": 0.8220539480447769, "num_tokens": 1142221896.0, "step": 18685 }, { "entropy": 0.5254410346038639, "epoch": 1.4006548300744062, "grad_norm": 0.2445056140422821, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.8219348825514317, "num_tokens": 1143701031.0, "step": 18690 }, { "entropy": 0.5223669603466987, "epoch": 1.401029554008258, "grad_norm": 0.25977668166160583, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8252876561135054, "num_tokens": 1145167503.0, "step": 18695 }, { "entropy": 0.5143377227708698, "epoch": 1.4014042779421099, "grad_norm": 0.2306930422782898, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.8250914018601179, "num_tokens": 1146649913.0, "step": 18700 }, { "entropy": 0.5217334251850844, "epoch": 1.4017790018759617, "grad_norm": 0.22536760568618774, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.8234968446195126, "num_tokens": 1148104687.0, "step": 18705 }, { "entropy": 0.5325979420915246, "epoch": 1.4021537258098136, "grad_norm": 0.22935159504413605, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.8202593557536602, "num_tokens": 1149538011.0, "step": 18710 }, { "entropy": 0.5125956842675805, "epoch": 1.4025284497436654, "grad_norm": 0.23811449110507965, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8254248216748238, "num_tokens": 1150977608.0, "step": 18715 }, { "entropy": 0.5299778075888752, "epoch": 1.4029031736775173, "grad_norm": 0.23042581975460052, "learning_rate": 0.0002, "loss": 0.6457, "mean_token_accuracy": 0.8147928953170777, "num_tokens": 1152458100.0, "step": 18720 }, { "entropy": 0.52055480806157, "epoch": 1.4032778976113691, "grad_norm": 0.24137017130851746, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.8207789666950702, "num_tokens": 1153976636.0, "step": 18725 }, { "entropy": 0.5491656413301825, "epoch": 1.403652621545221, "grad_norm": 0.25669756531715393, "learning_rate": 0.0002, "loss": 0.6508, "mean_token_accuracy": 0.8181919883936644, "num_tokens": 1155431629.0, "step": 18730 }, { "entropy": 0.5370110999792814, "epoch": 1.4040273454790728, "grad_norm": 0.22746048867702484, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.8205600094050169, "num_tokens": 1156893788.0, "step": 18735 }, { "entropy": 0.5359577357769012, "epoch": 1.4044020694129247, "grad_norm": 0.2543151378631592, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.8199134837836027, "num_tokens": 1158376705.0, "step": 18740 }, { "entropy": 0.5354679254814982, "epoch": 1.4047767933467765, "grad_norm": 0.259172648191452, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.8221524957567453, "num_tokens": 1159864249.0, "step": 18745 }, { "entropy": 0.5331290148198604, "epoch": 1.4051515172806284, "grad_norm": 0.22546164691448212, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8236270181834697, "num_tokens": 1161328628.0, "step": 18750 }, { "entropy": 0.524688089452684, "epoch": 1.4055262412144802, "grad_norm": 0.23060883581638336, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.8208906516432762, "num_tokens": 1162741190.0, "step": 18755 }, { "entropy": 0.5270368590950966, "epoch": 1.405900965148332, "grad_norm": 0.2227112203836441, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8205534800887108, "num_tokens": 1164247268.0, "step": 18760 }, { "entropy": 0.5354616668075323, "epoch": 1.406275689082184, "grad_norm": 0.2225939780473709, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8232013303786516, "num_tokens": 1165731798.0, "step": 18765 }, { "entropy": 0.5451105687767267, "epoch": 1.4066504130160358, "grad_norm": 0.2156747579574585, "learning_rate": 0.0002, "loss": 0.6394, "mean_token_accuracy": 0.818200221285224, "num_tokens": 1167272723.0, "step": 18770 }, { "entropy": 0.529766402579844, "epoch": 1.4070251369498876, "grad_norm": 0.2447071373462677, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.8222569447010756, "num_tokens": 1168739275.0, "step": 18775 }, { "entropy": 0.5396252844482661, "epoch": 1.4073998608837397, "grad_norm": 0.23818475008010864, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8204953797161579, "num_tokens": 1170198077.0, "step": 18780 }, { "entropy": 0.520787231810391, "epoch": 1.4077745848175915, "grad_norm": 0.3016718924045563, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.823445588350296, "num_tokens": 1171636724.0, "step": 18785 }, { "entropy": 0.5381649883463979, "epoch": 1.4081493087514434, "grad_norm": 0.22053247690200806, "learning_rate": 0.0002, "loss": 0.6333, "mean_token_accuracy": 0.8207871243357658, "num_tokens": 1173101892.0, "step": 18790 }, { "entropy": 0.5325186360627413, "epoch": 1.4085240326852952, "grad_norm": 0.2488698959350586, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8272883966565132, "num_tokens": 1174549590.0, "step": 18795 }, { "entropy": 0.5357080336660147, "epoch": 1.408898756619147, "grad_norm": 0.22968690097332, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.8215065512806177, "num_tokens": 1175969343.0, "step": 18800 }, { "entropy": 0.532409798540175, "epoch": 1.409273480552999, "grad_norm": 0.2546880841255188, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.8239384386688471, "num_tokens": 1177402945.0, "step": 18805 }, { "entropy": 0.5582600492984057, "epoch": 1.4096482044868508, "grad_norm": 0.2362174540758133, "learning_rate": 0.0002, "loss": 0.6327, "mean_token_accuracy": 0.8194426603615284, "num_tokens": 1178891060.0, "step": 18810 }, { "entropy": 0.5333063084632158, "epoch": 1.4100229284207026, "grad_norm": 0.24828386306762695, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8241089154034853, "num_tokens": 1180357196.0, "step": 18815 }, { "entropy": 0.5311594698578119, "epoch": 1.4103976523545545, "grad_norm": 0.21568728983402252, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.824237621575594, "num_tokens": 1181803544.0, "step": 18820 }, { "entropy": 0.5420248804613947, "epoch": 1.4107723762884063, "grad_norm": 0.22301340103149414, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8220763005316257, "num_tokens": 1183300013.0, "step": 18825 }, { "entropy": 0.5568933604285121, "epoch": 1.4111471002222582, "grad_norm": 0.22895291447639465, "learning_rate": 0.0002, "loss": 0.6395, "mean_token_accuracy": 0.8181276626884937, "num_tokens": 1184812163.0, "step": 18830 }, { "entropy": 0.5563088212162256, "epoch": 1.41152182415611, "grad_norm": 0.23814548552036285, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8193807855248452, "num_tokens": 1186242439.0, "step": 18835 }, { "entropy": 0.5583711558952927, "epoch": 1.411896548089962, "grad_norm": 0.2637418806552887, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.8200509883463383, "num_tokens": 1187733492.0, "step": 18840 }, { "entropy": 0.5514092102646828, "epoch": 1.4122712720238138, "grad_norm": 0.2585291564464569, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8229457553476095, "num_tokens": 1189173341.0, "step": 18845 }, { "entropy": 0.5517172241583467, "epoch": 1.4126459959576656, "grad_norm": 0.21637213230133057, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.8248139873147011, "num_tokens": 1190612151.0, "step": 18850 }, { "entropy": 0.5605856901034713, "epoch": 1.4130207198915175, "grad_norm": 0.23080989718437195, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8203129529953003, "num_tokens": 1192114457.0, "step": 18855 }, { "entropy": 0.5497115686535835, "epoch": 1.4133954438253693, "grad_norm": 0.22311235964298248, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8234948258846998, "num_tokens": 1193565707.0, "step": 18860 }, { "entropy": 0.5619072636589408, "epoch": 1.4137701677592212, "grad_norm": 0.2553127706050873, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.821073367446661, "num_tokens": 1195073225.0, "step": 18865 }, { "entropy": 0.5431708509102464, "epoch": 1.414144891693073, "grad_norm": 0.2630697190761566, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8246842689812184, "num_tokens": 1196542878.0, "step": 18870 }, { "entropy": 0.5263404430821538, "epoch": 1.4145196156269249, "grad_norm": 0.22158055007457733, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8222205847501755, "num_tokens": 1198009139.0, "step": 18875 }, { "entropy": 0.5151868866756558, "epoch": 1.4148943395607767, "grad_norm": 0.24100689589977264, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8253208231180906, "num_tokens": 1199466701.0, "step": 18880 }, { "entropy": 0.5343313865363598, "epoch": 1.4152690634946286, "grad_norm": 0.22179919481277466, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8220161706209183, "num_tokens": 1200922083.0, "step": 18885 }, { "entropy": 0.5282410001382232, "epoch": 1.4156437874284804, "grad_norm": 0.2583574950695038, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8218632075935602, "num_tokens": 1202405476.0, "step": 18890 }, { "entropy": 0.5457282720133663, "epoch": 1.4160185113623323, "grad_norm": 0.22964800894260406, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8199547097086907, "num_tokens": 1203909957.0, "step": 18895 }, { "entropy": 0.5362167202867567, "epoch": 1.416393235296184, "grad_norm": 0.22328582406044006, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.819271158054471, "num_tokens": 1205376234.0, "step": 18900 }, { "entropy": 0.5334197923541069, "epoch": 1.416767959230036, "grad_norm": 0.24616730213165283, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8257356252521276, "num_tokens": 1206873793.0, "step": 18905 }, { "entropy": 0.5501300589181483, "epoch": 1.4171426831638878, "grad_norm": 0.2518090009689331, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8208256479352712, "num_tokens": 1208303728.0, "step": 18910 }, { "entropy": 0.5363082638010382, "epoch": 1.4175174070977397, "grad_norm": 0.2205815315246582, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.820676950365305, "num_tokens": 1209803733.0, "step": 18915 }, { "entropy": 0.5434666447341442, "epoch": 1.4178921310315915, "grad_norm": 0.23539963364601135, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.8245674818754196, "num_tokens": 1211250052.0, "step": 18920 }, { "entropy": 0.5408163249492646, "epoch": 1.4182668549654434, "grad_norm": 0.23018628358840942, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.8216217797249555, "num_tokens": 1212763661.0, "step": 18925 }, { "entropy": 0.5487868949770928, "epoch": 1.4186415788992952, "grad_norm": 0.22698885202407837, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8225209407508374, "num_tokens": 1214269627.0, "step": 18930 }, { "entropy": 0.5602793285623193, "epoch": 1.419016302833147, "grad_norm": 0.23453360795974731, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8250121269375086, "num_tokens": 1215716714.0, "step": 18935 }, { "entropy": 0.5616875477135181, "epoch": 1.419391026766999, "grad_norm": 0.23589001595973969, "learning_rate": 0.0002, "loss": 0.6384, "mean_token_accuracy": 0.8194628268480301, "num_tokens": 1217175477.0, "step": 18940 }, { "entropy": 0.5407648177817463, "epoch": 1.4197657507008508, "grad_norm": 0.22720636427402496, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8212438106536866, "num_tokens": 1218645326.0, "step": 18945 }, { "entropy": 0.5397806959226727, "epoch": 1.4201404746347026, "grad_norm": 0.25186142325401306, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8265286237001419, "num_tokens": 1220153455.0, "step": 18950 }, { "entropy": 0.5437822069972753, "epoch": 1.4205151985685545, "grad_norm": 0.22193916141986847, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.8248905301094055, "num_tokens": 1221561777.0, "step": 18955 }, { "entropy": 0.5540471939370036, "epoch": 1.4208899225024063, "grad_norm": 0.2149779349565506, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8193377695977688, "num_tokens": 1223064281.0, "step": 18960 }, { "entropy": 0.558342924900353, "epoch": 1.4212646464362582, "grad_norm": 0.3727540969848633, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.8205117430537939, "num_tokens": 1224487665.0, "step": 18965 }, { "entropy": 0.556677563674748, "epoch": 1.42163937037011, "grad_norm": 0.22802655398845673, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8240555934607983, "num_tokens": 1225916347.0, "step": 18970 }, { "entropy": 0.5761515235528349, "epoch": 1.422014094303962, "grad_norm": 0.23059359192848206, "learning_rate": 0.0002, "loss": 0.6347, "mean_token_accuracy": 0.8216799598187208, "num_tokens": 1227422788.0, "step": 18975 }, { "entropy": 0.5496464647352696, "epoch": 1.422388818237814, "grad_norm": 0.21481898427009583, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.8226465169340372, "num_tokens": 1228836411.0, "step": 18980 }, { "entropy": 0.5639134420081973, "epoch": 1.4227635421716658, "grad_norm": 0.23168499767780304, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8217763863503933, "num_tokens": 1230344236.0, "step": 18985 }, { "entropy": 0.5565074795857072, "epoch": 1.4231382661055176, "grad_norm": 0.2519899904727936, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.8231387551873922, "num_tokens": 1231788608.0, "step": 18990 }, { "entropy": 0.5643080858513713, "epoch": 1.4235129900393695, "grad_norm": 0.2588580548763275, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.8211917135864496, "num_tokens": 1233269427.0, "step": 18995 }, { "entropy": 0.5578394114971161, "epoch": 1.4238877139732213, "grad_norm": 0.23406890034675598, "learning_rate": 0.0002, "loss": 0.615, "mean_token_accuracy": 0.8232614889740943, "num_tokens": 1234717198.0, "step": 19000 }, { "entropy": 0.5514500856399536, "epoch": 1.4242624379070732, "grad_norm": 0.39653751254081726, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.8230737537145615, "num_tokens": 1236168209.0, "step": 19005 }, { "entropy": 0.5686844812706113, "epoch": 1.424637161840925, "grad_norm": 0.21088728308677673, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8236853882670403, "num_tokens": 1237656772.0, "step": 19010 }, { "entropy": 0.5721903506666421, "epoch": 1.4250118857747769, "grad_norm": 0.4238544702529907, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8194561026990413, "num_tokens": 1239157506.0, "step": 19015 }, { "entropy": 0.5590712979435921, "epoch": 1.4253866097086287, "grad_norm": 0.2594963014125824, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8207112435251475, "num_tokens": 1240657751.0, "step": 19020 }, { "entropy": 0.5653750192373991, "epoch": 1.4257613336424806, "grad_norm": 0.3169476389884949, "learning_rate": 0.0002, "loss": 0.6379, "mean_token_accuracy": 0.8184500571340323, "num_tokens": 1242134468.0, "step": 19025 }, { "entropy": 0.5476730093359947, "epoch": 1.4261360575763324, "grad_norm": 0.21256780624389648, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.8255770269781351, "num_tokens": 1243651128.0, "step": 19030 }, { "entropy": 0.5915962494909763, "epoch": 1.4265107815101843, "grad_norm": 0.2599217891693115, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8202731873840093, "num_tokens": 1245113211.0, "step": 19035 }, { "entropy": 0.5764409875497222, "epoch": 1.4268855054440361, "grad_norm": 0.21378976106643677, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8212806653231383, "num_tokens": 1246638748.0, "step": 19040 }, { "entropy": 0.5634601775556802, "epoch": 1.427260229377888, "grad_norm": 0.24045999348163605, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8237349111586809, "num_tokens": 1248092365.0, "step": 19045 }, { "entropy": 0.571114515978843, "epoch": 1.4276349533117398, "grad_norm": 0.2428739368915558, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.8217524036765098, "num_tokens": 1249577131.0, "step": 19050 }, { "entropy": 0.5728783331811428, "epoch": 1.4280096772455917, "grad_norm": 0.19951066374778748, "learning_rate": 0.0002, "loss": 0.6327, "mean_token_accuracy": 0.819800877571106, "num_tokens": 1251074665.0, "step": 19055 }, { "entropy": 0.5706898434087634, "epoch": 1.4283844011794435, "grad_norm": 0.23326659202575684, "learning_rate": 0.0002, "loss": 0.6387, "mean_token_accuracy": 0.8192785274237394, "num_tokens": 1252576670.0, "step": 19060 }, { "entropy": 0.5760914731770754, "epoch": 1.4287591251132954, "grad_norm": 0.22533413767814636, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.8230143763124943, "num_tokens": 1254077674.0, "step": 19065 }, { "entropy": 0.5652869891375303, "epoch": 1.4291338490471472, "grad_norm": 0.2551822364330292, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.8243827998638154, "num_tokens": 1255510957.0, "step": 19070 }, { "entropy": 0.5558964485302568, "epoch": 1.429508572980999, "grad_norm": 0.25105592608451843, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8247590318322182, "num_tokens": 1256953691.0, "step": 19075 }, { "entropy": 0.5835418632254005, "epoch": 1.429883296914851, "grad_norm": 0.3317396640777588, "learning_rate": 0.0002, "loss": 0.6504, "mean_token_accuracy": 0.8152731698006391, "num_tokens": 1258443269.0, "step": 19080 }, { "entropy": 0.5460918186232447, "epoch": 1.4302580208487028, "grad_norm": 0.22870951890945435, "learning_rate": 0.0002, "loss": 0.615, "mean_token_accuracy": 0.8248329881578684, "num_tokens": 1259917662.0, "step": 19085 }, { "entropy": 0.555830005928874, "epoch": 1.4306327447825549, "grad_norm": 0.26090630888938904, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.8207647133618593, "num_tokens": 1261402290.0, "step": 19090 }, { "entropy": 0.5719804273918271, "epoch": 1.4310074687164067, "grad_norm": 0.211478590965271, "learning_rate": 0.0002, "loss": 0.6426, "mean_token_accuracy": 0.817971783131361, "num_tokens": 1262953002.0, "step": 19095 }, { "entropy": 0.6061732642352581, "epoch": 1.4313821926502586, "grad_norm": 0.24595163762569427, "learning_rate": 0.0002, "loss": 0.6436, "mean_token_accuracy": 0.8165035422891378, "num_tokens": 1264421426.0, "step": 19100 }, { "entropy": 0.5821444595232605, "epoch": 1.4317569165841104, "grad_norm": 0.22597411274909973, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8247471705079079, "num_tokens": 1265892488.0, "step": 19105 }, { "entropy": 0.5649775953963398, "epoch": 1.4321316405179623, "grad_norm": 0.2095162272453308, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.8258280087262392, "num_tokens": 1267369254.0, "step": 19110 }, { "entropy": 0.5669946679845452, "epoch": 1.4325063644518141, "grad_norm": 0.211623415350914, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.8180005788803101, "num_tokens": 1268853526.0, "step": 19115 }, { "entropy": 0.5368715977296233, "epoch": 1.432881088385666, "grad_norm": 0.2507268786430359, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.8237901881337166, "num_tokens": 1270341979.0, "step": 19120 }, { "entropy": 0.5360909452661872, "epoch": 1.4332558123195178, "grad_norm": 0.21564021706581116, "learning_rate": 0.0002, "loss": 0.6115, "mean_token_accuracy": 0.8247805204242468, "num_tokens": 1271808006.0, "step": 19125 }, { "entropy": 0.5636698674410582, "epoch": 1.4336305362533697, "grad_norm": 0.25687286257743835, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.8215927679091692, "num_tokens": 1273329914.0, "step": 19130 }, { "entropy": 0.5523317689076066, "epoch": 1.4340052601872215, "grad_norm": 0.21900798380374908, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.8241933077573776, "num_tokens": 1274772514.0, "step": 19135 }, { "entropy": 0.5555432768538594, "epoch": 1.4343799841210734, "grad_norm": 0.3377217650413513, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8234736908227205, "num_tokens": 1276216716.0, "step": 19140 }, { "entropy": 0.5488248914480209, "epoch": 1.4347547080549252, "grad_norm": 0.19692283868789673, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8265394695103169, "num_tokens": 1277687047.0, "step": 19145 }, { "entropy": 0.5784148301929235, "epoch": 1.435129431988777, "grad_norm": 0.277586966753006, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.8198600709438324, "num_tokens": 1279181830.0, "step": 19150 }, { "entropy": 0.5787534637376666, "epoch": 1.435504155922629, "grad_norm": 0.23780320584774017, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8209119189530611, "num_tokens": 1280643228.0, "step": 19155 }, { "entropy": 0.5865323336794972, "epoch": 1.4358788798564808, "grad_norm": 0.21589195728302002, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.8235218293964863, "num_tokens": 1282120532.0, "step": 19160 }, { "entropy": 0.5902181820943951, "epoch": 1.4362536037903326, "grad_norm": 0.28447845578193665, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.8231064695864916, "num_tokens": 1283598594.0, "step": 19165 }, { "entropy": 0.587389362975955, "epoch": 1.4366283277241845, "grad_norm": 0.23993174731731415, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.8268197868019342, "num_tokens": 1285042539.0, "step": 19170 }, { "entropy": 0.5867270123213529, "epoch": 1.4370030516580363, "grad_norm": 0.2478809505701065, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.8179188270121813, "num_tokens": 1286570636.0, "step": 19175 }, { "entropy": 0.5785912567749619, "epoch": 1.4373777755918882, "grad_norm": 0.245832622051239, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.8202082414180041, "num_tokens": 1288057725.0, "step": 19180 }, { "entropy": 0.570228379406035, "epoch": 1.43775249952574, "grad_norm": 0.24494916200637817, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.8229965183883905, "num_tokens": 1289529044.0, "step": 19185 }, { "entropy": 0.5695804171264172, "epoch": 1.4381272234595919, "grad_norm": 0.36480849981307983, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.8218812752515078, "num_tokens": 1291018060.0, "step": 19190 }, { "entropy": 0.5757531592622399, "epoch": 1.4385019473934437, "grad_norm": 0.26062849164009094, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.8192446790635586, "num_tokens": 1292503389.0, "step": 19195 }, { "entropy": 0.555949280038476, "epoch": 1.4388766713272956, "grad_norm": 0.22066988050937653, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.8235489495098591, "num_tokens": 1293962933.0, "step": 19200 }, { "entropy": 0.582937215641141, "epoch": 1.4392513952611474, "grad_norm": 0.26598554849624634, "learning_rate": 0.0002, "loss": 0.6545, "mean_token_accuracy": 0.8165434062480926, "num_tokens": 1295460659.0, "step": 19205 }, { "entropy": 0.5547190202400089, "epoch": 1.4396261191949993, "grad_norm": 0.23682914674282074, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8232695095241069, "num_tokens": 1296909759.0, "step": 19210 }, { "entropy": 0.5714062836021185, "epoch": 1.4400008431288511, "grad_norm": 0.34666645526885986, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8221605636179448, "num_tokens": 1298412780.0, "step": 19215 }, { "entropy": 0.559035368449986, "epoch": 1.440375567062703, "grad_norm": 0.24020199477672577, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8195803955197334, "num_tokens": 1299937502.0, "step": 19220 }, { "entropy": 0.5490152217447758, "epoch": 1.4407502909965548, "grad_norm": 0.20487052202224731, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.8223395738750696, "num_tokens": 1301401596.0, "step": 19225 }, { "entropy": 0.5507574010640383, "epoch": 1.4411250149304067, "grad_norm": 0.22831754386425018, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8236499588936568, "num_tokens": 1302859263.0, "step": 19230 }, { "entropy": 0.5580156229436397, "epoch": 1.4414997388642585, "grad_norm": 0.2319851964712143, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.8251135084778071, "num_tokens": 1304349846.0, "step": 19235 }, { "entropy": 0.5543748052790761, "epoch": 1.4418744627981104, "grad_norm": 0.25037238001823425, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8242061607539654, "num_tokens": 1305827414.0, "step": 19240 }, { "entropy": 0.5661038666963577, "epoch": 1.4422491867319622, "grad_norm": 0.26004666090011597, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8181993678212166, "num_tokens": 1307356089.0, "step": 19245 }, { "entropy": 0.5637737335637212, "epoch": 1.442623910665814, "grad_norm": 0.25076979398727417, "learning_rate": 0.0002, "loss": 0.6557, "mean_token_accuracy": 0.8166159853339195, "num_tokens": 1308830622.0, "step": 19250 }, { "entropy": 0.5591371389105916, "epoch": 1.442998634599666, "grad_norm": 0.31698307394981384, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.8211180198937654, "num_tokens": 1310346215.0, "step": 19255 }, { "entropy": 0.5258070170879364, "epoch": 1.4433733585335178, "grad_norm": 0.2378363162279129, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.8289060611277819, "num_tokens": 1311852406.0, "step": 19260 }, { "entropy": 0.5631128709763289, "epoch": 1.4437480824673696, "grad_norm": 0.25372013449668884, "learning_rate": 0.0002, "loss": 0.6434, "mean_token_accuracy": 0.8168099749833345, "num_tokens": 1313335191.0, "step": 19265 }, { "entropy": 0.5592004077509045, "epoch": 1.4441228064012215, "grad_norm": 0.2342633157968521, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8236234143376351, "num_tokens": 1314843489.0, "step": 19270 }, { "entropy": 0.5708150740712881, "epoch": 1.4444975303350733, "grad_norm": 0.2526068687438965, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8205423347651959, "num_tokens": 1316357956.0, "step": 19275 }, { "entropy": 0.595656112022698, "epoch": 1.4448722542689252, "grad_norm": 0.24093417823314667, "learning_rate": 0.0002, "loss": 0.6542, "mean_token_accuracy": 0.8165625911206007, "num_tokens": 1317881141.0, "step": 19280 }, { "entropy": 0.587020461820066, "epoch": 1.4452469782027773, "grad_norm": 0.24294984340667725, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.8234389767050743, "num_tokens": 1319329924.0, "step": 19285 }, { "entropy": 0.5765674127265811, "epoch": 1.445621702136629, "grad_norm": 0.2226983606815338, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.8207493431866169, "num_tokens": 1320741961.0, "step": 19290 }, { "entropy": 0.5774507272988558, "epoch": 1.445996426070481, "grad_norm": 0.3155275583267212, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8239750795066356, "num_tokens": 1322239823.0, "step": 19295 }, { "entropy": 0.5400104814209044, "epoch": 1.4463711500043328, "grad_norm": 0.22693327069282532, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.8283280331641436, "num_tokens": 1323618608.0, "step": 19300 }, { "entropy": 0.5692817281931639, "epoch": 1.4467458739381847, "grad_norm": 0.22132615745067596, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.8221347764134407, "num_tokens": 1325105109.0, "step": 19305 }, { "entropy": 0.5685860022902489, "epoch": 1.4471205978720365, "grad_norm": 0.22828207910060883, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8226377263665199, "num_tokens": 1326546175.0, "step": 19310 }, { "entropy": 0.544591929577291, "epoch": 1.4474953218058884, "grad_norm": 0.21211867034435272, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.8222242802381515, "num_tokens": 1328033854.0, "step": 19315 }, { "entropy": 0.5530239555984735, "epoch": 1.4478700457397402, "grad_norm": 0.22473865747451782, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8204464934766292, "num_tokens": 1329519746.0, "step": 19320 }, { "entropy": 0.5613622082397342, "epoch": 1.448244769673592, "grad_norm": 0.23641611635684967, "learning_rate": 0.0002, "loss": 0.6409, "mean_token_accuracy": 0.8196219343692064, "num_tokens": 1331024935.0, "step": 19325 }, { "entropy": 0.5640204852446914, "epoch": 1.448619493607444, "grad_norm": 0.23662999272346497, "learning_rate": 0.0002, "loss": 0.6408, "mean_token_accuracy": 0.8206991478800774, "num_tokens": 1332493052.0, "step": 19330 }, { "entropy": 0.5413991948589683, "epoch": 1.4489942175412958, "grad_norm": 0.24259275197982788, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.8278279315680266, "num_tokens": 1333984040.0, "step": 19335 }, { "entropy": 0.5563998382538557, "epoch": 1.4493689414751476, "grad_norm": 0.2273886650800705, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8224132664501667, "num_tokens": 1335457818.0, "step": 19340 }, { "entropy": 0.5576706770807505, "epoch": 1.4497436654089995, "grad_norm": 0.2934775948524475, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8228851407766342, "num_tokens": 1336880475.0, "step": 19345 }, { "entropy": 0.5743835495784879, "epoch": 1.4501183893428513, "grad_norm": 0.27516067028045654, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.8231677625328302, "num_tokens": 1338377489.0, "step": 19350 }, { "entropy": 0.5814660385251045, "epoch": 1.4504931132767032, "grad_norm": 0.230267733335495, "learning_rate": 0.0002, "loss": 0.6311, "mean_token_accuracy": 0.823260223865509, "num_tokens": 1339844243.0, "step": 19355 }, { "entropy": 0.5542345941066742, "epoch": 1.450867837210555, "grad_norm": 0.20622873306274414, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8235101860016585, "num_tokens": 1341282999.0, "step": 19360 }, { "entropy": 0.5624557856470347, "epoch": 1.4512425611444069, "grad_norm": 0.3092283308506012, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8230269819498062, "num_tokens": 1342767343.0, "step": 19365 }, { "entropy": 0.5655486095696688, "epoch": 1.4516172850782587, "grad_norm": 0.2298250049352646, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.823737571388483, "num_tokens": 1344232110.0, "step": 19370 }, { "entropy": 0.5687306191772222, "epoch": 1.4519920090121106, "grad_norm": 0.21044747531414032, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.8211068864911795, "num_tokens": 1345711062.0, "step": 19375 }, { "entropy": 0.5666831035166979, "epoch": 1.4523667329459624, "grad_norm": 0.23877781629562378, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.8211355615407229, "num_tokens": 1347146649.0, "step": 19380 }, { "entropy": 0.5609169602394104, "epoch": 1.4527414568798143, "grad_norm": 0.23265710473060608, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.822397143766284, "num_tokens": 1348607520.0, "step": 19385 }, { "entropy": 0.5467417296022177, "epoch": 1.4531161808136661, "grad_norm": 0.21645672619342804, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.8242469001561403, "num_tokens": 1350070060.0, "step": 19390 }, { "entropy": 0.5585521201603114, "epoch": 1.453490904747518, "grad_norm": 0.28011396527290344, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8225143503397703, "num_tokens": 1351528048.0, "step": 19395 }, { "entropy": 0.550887986831367, "epoch": 1.4538656286813698, "grad_norm": 0.23802204430103302, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8216121960431337, "num_tokens": 1352983916.0, "step": 19400 }, { "entropy": 0.5663235638290643, "epoch": 1.4542403526152219, "grad_norm": 0.21603339910507202, "learning_rate": 0.0002, "loss": 0.6337, "mean_token_accuracy": 0.820636224374175, "num_tokens": 1354472212.0, "step": 19405 }, { "entropy": 0.5761786509305239, "epoch": 1.4546150765490737, "grad_norm": 0.231625035405159, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8193455796688796, "num_tokens": 1355939313.0, "step": 19410 }, { "entropy": 0.595072160102427, "epoch": 1.4549898004829256, "grad_norm": 0.22229991853237152, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.819852203503251, "num_tokens": 1357397701.0, "step": 19415 }, { "entropy": 0.556347362883389, "epoch": 1.4553645244167774, "grad_norm": 0.2396133691072464, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.8248046599328518, "num_tokens": 1358896643.0, "step": 19420 }, { "entropy": 0.5629924882203341, "epoch": 1.4557392483506293, "grad_norm": 0.2367715686559677, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8215083900839091, "num_tokens": 1360377980.0, "step": 19425 }, { "entropy": 0.5489724736660719, "epoch": 1.4561139722844811, "grad_norm": 0.2153671830892563, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8240905463695526, "num_tokens": 1361861881.0, "step": 19430 }, { "entropy": 0.5554329499602317, "epoch": 1.456488696218333, "grad_norm": 0.2868485152721405, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8247199267148971, "num_tokens": 1363325131.0, "step": 19435 }, { "entropy": 0.5601961460895837, "epoch": 1.4568634201521848, "grad_norm": 0.22998583316802979, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.8225098844617605, "num_tokens": 1364783474.0, "step": 19440 }, { "entropy": 0.5596558568999171, "epoch": 1.4572381440860367, "grad_norm": 0.25873127579689026, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8223651960492134, "num_tokens": 1366270275.0, "step": 19445 }, { "entropy": 0.5602040585130453, "epoch": 1.4576128680198885, "grad_norm": 0.24763554334640503, "learning_rate": 0.0002, "loss": 0.6383, "mean_token_accuracy": 0.820197369903326, "num_tokens": 1367741965.0, "step": 19450 }, { "entropy": 0.5399529338814318, "epoch": 1.4579875919537404, "grad_norm": 0.35451653599739075, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8241353400051594, "num_tokens": 1369191100.0, "step": 19455 }, { "entropy": 0.5434348836541176, "epoch": 1.4583623158875922, "grad_norm": 0.2240963578224182, "learning_rate": 0.0002, "loss": 0.6317, "mean_token_accuracy": 0.823080712929368, "num_tokens": 1370617134.0, "step": 19460 }, { "entropy": 0.5482613367959857, "epoch": 1.458737039821444, "grad_norm": 0.21835123002529144, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.8200772169977426, "num_tokens": 1372101698.0, "step": 19465 }, { "entropy": 0.5643340572714806, "epoch": 1.459111763755296, "grad_norm": 0.23905552923679352, "learning_rate": 0.0002, "loss": 0.6522, "mean_token_accuracy": 0.8167980581521987, "num_tokens": 1373601949.0, "step": 19470 }, { "entropy": 0.5549605179578065, "epoch": 1.4594864876891478, "grad_norm": 0.23613402247428894, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.819089787080884, "num_tokens": 1375050719.0, "step": 19475 }, { "entropy": 0.5265695253387094, "epoch": 1.4598612116229996, "grad_norm": 0.24334561824798584, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8217012826353312, "num_tokens": 1376476219.0, "step": 19480 }, { "entropy": 0.5163168154656887, "epoch": 1.4602359355568515, "grad_norm": 0.24353285133838654, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.8262802377343178, "num_tokens": 1377939410.0, "step": 19485 }, { "entropy": 0.5433078203350306, "epoch": 1.4606106594907033, "grad_norm": 0.35857629776000977, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.8218819703906775, "num_tokens": 1379422714.0, "step": 19490 }, { "entropy": 0.5406052545644343, "epoch": 1.4609853834245552, "grad_norm": 0.2198173701763153, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.822224835678935, "num_tokens": 1380925155.0, "step": 19495 }, { "entropy": 0.5567979061976075, "epoch": 1.461360107358407, "grad_norm": 0.257844477891922, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.8231182713061571, "num_tokens": 1382368955.0, "step": 19500 }, { "entropy": 0.5568613169714809, "epoch": 1.461734831292259, "grad_norm": 0.22539548575878143, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.8201289340853691, "num_tokens": 1383858097.0, "step": 19505 }, { "entropy": 0.5580027233809233, "epoch": 1.4621095552261107, "grad_norm": 0.21512599289417267, "learning_rate": 0.0002, "loss": 0.6523, "mean_token_accuracy": 0.8168425939977169, "num_tokens": 1385296907.0, "step": 19510 }, { "entropy": 0.5333580382168293, "epoch": 1.4624842791599626, "grad_norm": 0.2458692491054535, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8272290710359812, "num_tokens": 1386761019.0, "step": 19515 }, { "entropy": 0.547168061695993, "epoch": 1.4628590030938144, "grad_norm": 0.2796861529350281, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.8263355873525142, "num_tokens": 1388226486.0, "step": 19520 }, { "entropy": 0.5564767694100737, "epoch": 1.4632337270276663, "grad_norm": 0.2107221633195877, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.8209413412958384, "num_tokens": 1389722723.0, "step": 19525 }, { "entropy": 0.5414199401624501, "epoch": 1.4636084509615181, "grad_norm": 0.20834900438785553, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.820649205520749, "num_tokens": 1391206817.0, "step": 19530 }, { "entropy": 0.5404414247721434, "epoch": 1.46398317489537, "grad_norm": 0.22935809195041656, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.8245146494358778, "num_tokens": 1392628390.0, "step": 19535 }, { "entropy": 0.5520927192643285, "epoch": 1.4643578988292218, "grad_norm": 0.29433873295783997, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.8228141896426677, "num_tokens": 1394119981.0, "step": 19540 }, { "entropy": 0.5535134384408593, "epoch": 1.4647326227630737, "grad_norm": 0.2424897402524948, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.8208729851990938, "num_tokens": 1395595274.0, "step": 19545 }, { "entropy": 0.5442154368385672, "epoch": 1.4651073466969255, "grad_norm": 0.22218185663223267, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8225799605250359, "num_tokens": 1397098842.0, "step": 19550 }, { "entropy": 0.5453445021063089, "epoch": 1.4654820706307774, "grad_norm": 0.2176305204629898, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8268315989524126, "num_tokens": 1398600619.0, "step": 19555 }, { "entropy": 0.5551393469795585, "epoch": 1.4658567945646293, "grad_norm": 0.2490801215171814, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8193537440150976, "num_tokens": 1400063468.0, "step": 19560 }, { "entropy": 0.5416793663054704, "epoch": 1.466231518498481, "grad_norm": 0.24604660272598267, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8248280741274356, "num_tokens": 1401492120.0, "step": 19565 }, { "entropy": 0.5495038025081158, "epoch": 1.466606242432333, "grad_norm": 0.22556281089782715, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8220455352216959, "num_tokens": 1402980401.0, "step": 19570 }, { "entropy": 0.5572002021595835, "epoch": 1.4669809663661848, "grad_norm": 0.2125856727361679, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8221034731715917, "num_tokens": 1404470125.0, "step": 19575 }, { "entropy": 0.5759591354988516, "epoch": 1.4673556903000367, "grad_norm": 0.22800928354263306, "learning_rate": 0.0002, "loss": 0.6478, "mean_token_accuracy": 0.8196242690086365, "num_tokens": 1405954692.0, "step": 19580 }, { "entropy": 0.5693815175443888, "epoch": 1.4677304142338885, "grad_norm": 0.21971896290779114, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.8232919417321682, "num_tokens": 1407429590.0, "step": 19585 }, { "entropy": 0.5679776724427938, "epoch": 1.4681051381677404, "grad_norm": 0.18775705993175507, "learning_rate": 0.0002, "loss": 0.6349, "mean_token_accuracy": 0.8225914072245359, "num_tokens": 1408878073.0, "step": 19590 }, { "entropy": 0.5572405558079481, "epoch": 1.4684798621015922, "grad_norm": 0.24487869441509247, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.8221172131597996, "num_tokens": 1410372092.0, "step": 19595 }, { "entropy": 0.5531484248116613, "epoch": 1.4688545860354443, "grad_norm": 0.22618305683135986, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.8205090519040823, "num_tokens": 1411855999.0, "step": 19600 }, { "entropy": 0.5568261573091149, "epoch": 1.4692293099692961, "grad_norm": 0.23673179745674133, "learning_rate": 0.0002, "loss": 0.6332, "mean_token_accuracy": 0.8217747673392296, "num_tokens": 1413404917.0, "step": 19605 }, { "entropy": 0.5474659817293286, "epoch": 1.469604033903148, "grad_norm": 0.22348527610301971, "learning_rate": 0.0002, "loss": 0.6325, "mean_token_accuracy": 0.8244498979300261, "num_tokens": 1414849032.0, "step": 19610 }, { "entropy": 0.5481605449691415, "epoch": 1.4699787578369998, "grad_norm": 0.24430811405181885, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.8254472788423299, "num_tokens": 1416364060.0, "step": 19615 }, { "entropy": 0.533713729120791, "epoch": 1.4703534817708517, "grad_norm": 0.22065959870815277, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.8278768409043551, "num_tokens": 1417837468.0, "step": 19620 }, { "entropy": 0.5618507219478488, "epoch": 1.4707282057047035, "grad_norm": 0.24066272377967834, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8224756233394146, "num_tokens": 1419294853.0, "step": 19625 }, { "entropy": 0.5691572675481439, "epoch": 1.4711029296385554, "grad_norm": 0.24201662838459015, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8235446356236935, "num_tokens": 1420771925.0, "step": 19630 }, { "entropy": 0.5590378239750862, "epoch": 1.4714776535724072, "grad_norm": 0.2555638253688812, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8237343281507492, "num_tokens": 1422209504.0, "step": 19635 }, { "entropy": 0.5693544650450348, "epoch": 1.471852377506259, "grad_norm": 0.2246658056974411, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8204261153936386, "num_tokens": 1423668433.0, "step": 19640 }, { "entropy": 0.5433281043544411, "epoch": 1.472227101440111, "grad_norm": 0.21581999957561493, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8252794906497002, "num_tokens": 1425137466.0, "step": 19645 }, { "entropy": 0.5493941847234964, "epoch": 1.4726018253739628, "grad_norm": 0.22493210434913635, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.8227056380361318, "num_tokens": 1426617795.0, "step": 19650 }, { "entropy": 0.5648996410891414, "epoch": 1.4729765493078146, "grad_norm": 0.2134217619895935, "learning_rate": 0.0002, "loss": 0.6377, "mean_token_accuracy": 0.8216128133237361, "num_tokens": 1428045807.0, "step": 19655 }, { "entropy": 0.570400127209723, "epoch": 1.4733512732416665, "grad_norm": 0.24044138193130493, "learning_rate": 0.0002, "loss": 0.6327, "mean_token_accuracy": 0.8222149588167668, "num_tokens": 1429493986.0, "step": 19660 }, { "entropy": 0.5554603544995189, "epoch": 1.4737259971755183, "grad_norm": 0.27484747767448425, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8237431071698665, "num_tokens": 1430956921.0, "step": 19665 }, { "entropy": 0.5628506284207105, "epoch": 1.4741007211093702, "grad_norm": 0.20432016253471375, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.8279044382274151, "num_tokens": 1432407030.0, "step": 19670 }, { "entropy": 0.5631579842418433, "epoch": 1.474475445043222, "grad_norm": 0.218503937125206, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8225922048091888, "num_tokens": 1433897475.0, "step": 19675 }, { "entropy": 0.5530336316674948, "epoch": 1.4748501689770739, "grad_norm": 0.19415292143821716, "learning_rate": 0.0002, "loss": 0.6319, "mean_token_accuracy": 0.8199534166604281, "num_tokens": 1435397522.0, "step": 19680 }, { "entropy": 0.5450195468962192, "epoch": 1.4752248929109257, "grad_norm": 0.3230311870574951, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8211127236485481, "num_tokens": 1436888732.0, "step": 19685 }, { "entropy": 0.5256920598447323, "epoch": 1.4755996168447776, "grad_norm": 0.2581494152545929, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8236809082329273, "num_tokens": 1438383873.0, "step": 19690 }, { "entropy": 0.5142782635986805, "epoch": 1.4759743407786294, "grad_norm": 0.25478827953338623, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.8250999595969916, "num_tokens": 1439795776.0, "step": 19695 }, { "entropy": 0.52295540291816, "epoch": 1.4763490647124813, "grad_norm": 0.21912913024425507, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.8225165471434593, "num_tokens": 1441296672.0, "step": 19700 }, { "entropy": 0.5317225242033601, "epoch": 1.4767237886463331, "grad_norm": 0.2284969538450241, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.8201818376779556, "num_tokens": 1442738429.0, "step": 19705 }, { "entropy": 0.5363255187869072, "epoch": 1.477098512580185, "grad_norm": 0.28254151344299316, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.8220318082720042, "num_tokens": 1444201845.0, "step": 19710 }, { "entropy": 0.5331372620537878, "epoch": 1.477473236514037, "grad_norm": 0.4300522208213806, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8238316748291254, "num_tokens": 1445679032.0, "step": 19715 }, { "entropy": 0.5365743784233927, "epoch": 1.477847960447889, "grad_norm": 0.26892271637916565, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.8258494526147843, "num_tokens": 1447138241.0, "step": 19720 }, { "entropy": 0.52721759211272, "epoch": 1.4782226843817408, "grad_norm": 0.21975532174110413, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.8268551360815763, "num_tokens": 1448589325.0, "step": 19725 }, { "entropy": 0.526377029530704, "epoch": 1.4785974083155926, "grad_norm": 0.2242898941040039, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8271813780069351, "num_tokens": 1450030977.0, "step": 19730 }, { "entropy": 0.5330298410728573, "epoch": 1.4789721322494445, "grad_norm": 0.21311083436012268, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.8219398260116577, "num_tokens": 1451534562.0, "step": 19735 }, { "entropy": 0.5422844756394625, "epoch": 1.4793468561832963, "grad_norm": 0.2184181809425354, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.816503518819809, "num_tokens": 1453002207.0, "step": 19740 }, { "entropy": 0.5427023263648152, "epoch": 1.4797215801171482, "grad_norm": 0.2182038128376007, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8203535996377468, "num_tokens": 1454495524.0, "step": 19745 }, { "entropy": 0.5184312671422958, "epoch": 1.480096304051, "grad_norm": 0.2169715315103531, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.828950522467494, "num_tokens": 1455941201.0, "step": 19750 }, { "entropy": 0.5628787616267801, "epoch": 1.4804710279848519, "grad_norm": 0.29987213015556335, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8207603834569455, "num_tokens": 1457461777.0, "step": 19755 }, { "entropy": 0.5492906609550119, "epoch": 1.4808457519187037, "grad_norm": 0.22984276711940765, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8222549896687269, "num_tokens": 1458947644.0, "step": 19760 }, { "entropy": 0.5437789684161544, "epoch": 1.4812204758525556, "grad_norm": 0.22544139623641968, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8224544003605843, "num_tokens": 1460375381.0, "step": 19765 }, { "entropy": 0.5341801008209586, "epoch": 1.4815951997864074, "grad_norm": 0.3282756209373474, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.8226108994334936, "num_tokens": 1461818300.0, "step": 19770 }, { "entropy": 0.5430355651304126, "epoch": 1.4819699237202593, "grad_norm": 0.2581418454647064, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.8225839301943779, "num_tokens": 1463276403.0, "step": 19775 }, { "entropy": 0.5369512150064111, "epoch": 1.4823446476541111, "grad_norm": 0.20318172872066498, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8230238549411297, "num_tokens": 1464721023.0, "step": 19780 }, { "entropy": 0.5451994134113193, "epoch": 1.482719371587963, "grad_norm": 0.22626708447933197, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8191585160791874, "num_tokens": 1466171715.0, "step": 19785 }, { "entropy": 0.5581385752186179, "epoch": 1.4830940955218148, "grad_norm": 0.22642026841640472, "learning_rate": 0.0002, "loss": 0.6352, "mean_token_accuracy": 0.8213535908609628, "num_tokens": 1467651587.0, "step": 19790 }, { "entropy": 0.5461558765731752, "epoch": 1.4834688194556667, "grad_norm": 0.22985345125198364, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.8217033449560404, "num_tokens": 1469174688.0, "step": 19795 }, { "entropy": 0.5496660685166717, "epoch": 1.4838435433895185, "grad_norm": 0.23291538655757904, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.8217488687485457, "num_tokens": 1470678623.0, "step": 19800 }, { "entropy": 0.532028953358531, "epoch": 1.4842182673233704, "grad_norm": 0.2282027155160904, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8244070839136839, "num_tokens": 1472128457.0, "step": 19805 }, { "entropy": 0.5475883783772588, "epoch": 1.4845929912572222, "grad_norm": 0.24872885644435883, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.8244870439171791, "num_tokens": 1473594080.0, "step": 19810 }, { "entropy": 0.5450018431991339, "epoch": 1.484967715191074, "grad_norm": 0.23129969835281372, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8237869083881378, "num_tokens": 1475054349.0, "step": 19815 }, { "entropy": 0.5464992949739098, "epoch": 1.485342439124926, "grad_norm": 0.24417276680469513, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.8248875673860312, "num_tokens": 1476525200.0, "step": 19820 }, { "entropy": 0.5478458184748888, "epoch": 1.4857171630587778, "grad_norm": 0.24262751638889313, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8252895765006543, "num_tokens": 1478004491.0, "step": 19825 }, { "entropy": 0.5709240091964602, "epoch": 1.4860918869926296, "grad_norm": 0.26900920271873474, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.821166368946433, "num_tokens": 1479501993.0, "step": 19830 }, { "entropy": 0.5564262978732586, "epoch": 1.4864666109264815, "grad_norm": 0.25678956508636475, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.8238665122538805, "num_tokens": 1480991517.0, "step": 19835 }, { "entropy": 0.5544944515451788, "epoch": 1.4868413348603333, "grad_norm": 0.23279373347759247, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8252749659121037, "num_tokens": 1482452370.0, "step": 19840 }, { "entropy": 0.5589861189946532, "epoch": 1.4872160587941852, "grad_norm": 0.219410240650177, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8251717399805785, "num_tokens": 1483918774.0, "step": 19845 }, { "entropy": 0.5628014463931322, "epoch": 1.487590782728037, "grad_norm": 0.25761327147483826, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8252319756895303, "num_tokens": 1485402046.0, "step": 19850 }, { "entropy": 0.5706033181399107, "epoch": 1.4879655066618889, "grad_norm": 0.26599815487861633, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8209669131785631, "num_tokens": 1486869472.0, "step": 19855 }, { "entropy": 0.5558917867951095, "epoch": 1.4883402305957407, "grad_norm": 0.2199234962463379, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8244080182164908, "num_tokens": 1488326981.0, "step": 19860 }, { "entropy": 0.54869170691818, "epoch": 1.4887149545295926, "grad_norm": 0.23623089492321014, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8262884885072708, "num_tokens": 1489832802.0, "step": 19865 }, { "entropy": 0.5500115716829896, "epoch": 1.4890896784634444, "grad_norm": 0.2598571181297302, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8253563594073057, "num_tokens": 1491299672.0, "step": 19870 }, { "entropy": 0.5666880488395691, "epoch": 1.4894644023972963, "grad_norm": 0.23285193741321564, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.8230520479381085, "num_tokens": 1492784689.0, "step": 19875 }, { "entropy": 0.5622377920895815, "epoch": 1.4898391263311481, "grad_norm": 0.23638245463371277, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8283708471804857, "num_tokens": 1494213642.0, "step": 19880 }, { "entropy": 0.5796284735202789, "epoch": 1.490213850265, "grad_norm": 0.2324293553829193, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.8238442711532116, "num_tokens": 1495687776.0, "step": 19885 }, { "entropy": 0.5631801294162869, "epoch": 1.4905885741988518, "grad_norm": 0.2192460149526596, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8222247637808323, "num_tokens": 1497182801.0, "step": 19890 }, { "entropy": 0.5610471311956644, "epoch": 1.4909632981327037, "grad_norm": 0.2693887948989868, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8245943032205105, "num_tokens": 1498638682.0, "step": 19895 }, { "entropy": 0.5573125157505274, "epoch": 1.4913380220665555, "grad_norm": 0.2482662796974182, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8235681612044573, "num_tokens": 1500098200.0, "step": 19900 }, { "entropy": 0.5587534593418241, "epoch": 1.4917127460004074, "grad_norm": 0.25338777899742126, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8247009985148906, "num_tokens": 1501572454.0, "step": 19905 }, { "entropy": 0.5520109440200031, "epoch": 1.4920874699342594, "grad_norm": 0.22233709692955017, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8247436180710792, "num_tokens": 1503018649.0, "step": 19910 }, { "entropy": 0.5692125575616955, "epoch": 1.4924621938681113, "grad_norm": 0.2286013811826706, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.8184423722326756, "num_tokens": 1504531560.0, "step": 19915 }, { "entropy": 0.5632996007800102, "epoch": 1.4928369178019631, "grad_norm": 0.2408769577741623, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8241951402276755, "num_tokens": 1505980118.0, "step": 19920 }, { "entropy": 0.568064440973103, "epoch": 1.493211641735815, "grad_norm": 0.22278352081775665, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.8217969577759504, "num_tokens": 1507453182.0, "step": 19925 }, { "entropy": 0.5550464136525989, "epoch": 1.4935863656696668, "grad_norm": 0.22192637622356415, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.8218515064567328, "num_tokens": 1508971055.0, "step": 19930 }, { "entropy": 0.5645006023347378, "epoch": 1.4939610896035187, "grad_norm": 0.22808852791786194, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.8226878415793181, "num_tokens": 1510414367.0, "step": 19935 }, { "entropy": 0.555482767149806, "epoch": 1.4943358135373705, "grad_norm": 0.2491176724433899, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.8245610989630222, "num_tokens": 1511887583.0, "step": 19940 }, { "entropy": 0.5531242184340954, "epoch": 1.4947105374712224, "grad_norm": 0.25617024302482605, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8257489617913961, "num_tokens": 1513321556.0, "step": 19945 }, { "entropy": 0.5466047754511237, "epoch": 1.4950852614050743, "grad_norm": 0.22406555712223053, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.8247359059751034, "num_tokens": 1514779940.0, "step": 19950 }, { "entropy": 0.576344683766365, "epoch": 1.495459985338926, "grad_norm": 0.24476496875286102, "learning_rate": 0.0002, "loss": 0.6406, "mean_token_accuracy": 0.8185497842729091, "num_tokens": 1516295964.0, "step": 19955 }, { "entropy": 0.5649967456236482, "epoch": 1.495834709272778, "grad_norm": 0.222911536693573, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8244561225175857, "num_tokens": 1517792611.0, "step": 19960 }, { "entropy": 0.551762037537992, "epoch": 1.4962094332066298, "grad_norm": 0.2525043785572052, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8208713416010142, "num_tokens": 1519247379.0, "step": 19965 }, { "entropy": 0.5437554270029068, "epoch": 1.4965841571404817, "grad_norm": 0.2521336078643799, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8243151810020208, "num_tokens": 1520726314.0, "step": 19970 }, { "entropy": 0.5590129263699055, "epoch": 1.4969588810743335, "grad_norm": 0.21889904141426086, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.8226188939064741, "num_tokens": 1522224839.0, "step": 19975 }, { "entropy": 0.5658796285279095, "epoch": 1.4973336050081854, "grad_norm": 0.27196452021598816, "learning_rate": 0.0002, "loss": 0.6409, "mean_token_accuracy": 0.8202904712408781, "num_tokens": 1523703506.0, "step": 19980 }, { "entropy": 0.5447539441287518, "epoch": 1.4977083289420372, "grad_norm": 0.21411141753196716, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8189694087952375, "num_tokens": 1525200898.0, "step": 19985 }, { "entropy": 0.5557889565825462, "epoch": 1.498083052875889, "grad_norm": 0.2987571954727173, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8221585094928742, "num_tokens": 1526688688.0, "step": 19990 }, { "entropy": 0.5519056923687458, "epoch": 1.498457776809741, "grad_norm": 0.24151192605495453, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.8201932352036237, "num_tokens": 1528186006.0, "step": 19995 }, { "entropy": 0.5495951413176954, "epoch": 1.4988325007435928, "grad_norm": 0.2340601533651352, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.8255293659865857, "num_tokens": 1529651958.0, "step": 20000 }, { "entropy": 0.5666819108650089, "epoch": 1.4992072246774446, "grad_norm": 0.24795198440551758, "learning_rate": 0.0002, "loss": 0.6367, "mean_token_accuracy": 0.822637090831995, "num_tokens": 1419753.0, "step": 20005 }, { "entropy": 0.568309024348855, "epoch": 1.4995819486112965, "grad_norm": 0.22668060660362244, "learning_rate": 0.0002, "loss": 0.6441, "mean_token_accuracy": 0.8174381516873837, "num_tokens": 2876645.0, "step": 20010 }, { "entropy": 0.5671294463798404, "epoch": 1.4999566725451483, "grad_norm": 0.20385509729385376, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8216474302113056, "num_tokens": 4351672.0, "step": 20015 }, { "entropy": 0.5669899286702276, "epoch": 1.5003313964790004, "grad_norm": 0.2466505765914917, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8231590542942285, "num_tokens": 5842168.0, "step": 20020 }, { "entropy": 0.5598309624940156, "epoch": 1.5007061204128522, "grad_norm": 0.2086508572101593, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8209812492132187, "num_tokens": 7339350.0, "step": 20025 }, { "entropy": 0.5508539928123355, "epoch": 1.501080844346704, "grad_norm": 0.2169647514820099, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.8236430965363979, "num_tokens": 8798848.0, "step": 20030 }, { "entropy": 0.5517519636079669, "epoch": 1.501455568280556, "grad_norm": 0.2395768016576767, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.8213646821677685, "num_tokens": 10233161.0, "step": 20035 }, { "entropy": 0.5445291962474584, "epoch": 1.5018302922144078, "grad_norm": 0.22209355235099792, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.8230334676802158, "num_tokens": 11701327.0, "step": 20040 }, { "entropy": 0.5392529832199215, "epoch": 1.5022050161482596, "grad_norm": 0.35638120770454407, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8245615191757679, "num_tokens": 13174080.0, "step": 20045 }, { "entropy": 0.5390896135941148, "epoch": 1.5025797400821115, "grad_norm": 0.32770219445228577, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8235674019902944, "num_tokens": 14614082.0, "step": 20050 }, { "entropy": 0.5555062051862478, "epoch": 1.5029544640159633, "grad_norm": 0.2306947261095047, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.8216675776988268, "num_tokens": 16123421.0, "step": 20055 }, { "entropy": 0.5334288541227579, "epoch": 1.5033291879498152, "grad_norm": 0.2327306568622589, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.8288703534752131, "num_tokens": 17590236.0, "step": 20060 }, { "entropy": 0.5534016855992376, "epoch": 1.503703911883667, "grad_norm": 0.3440399169921875, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8212670870125294, "num_tokens": 19084456.0, "step": 20065 }, { "entropy": 0.5688415702432394, "epoch": 1.5040786358175189, "grad_norm": 0.20951496064662933, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.820628110691905, "num_tokens": 20594384.0, "step": 20070 }, { "entropy": 0.5729536276310683, "epoch": 1.5044533597513707, "grad_norm": 0.24060730636119843, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8252086412161589, "num_tokens": 22062515.0, "step": 20075 }, { "entropy": 0.5743923572823405, "epoch": 1.5048280836852226, "grad_norm": 0.25212550163269043, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.8224955666810274, "num_tokens": 23514323.0, "step": 20080 }, { "entropy": 0.5504537073895335, "epoch": 1.5052028076190744, "grad_norm": 0.21997316181659698, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.8236958533525467, "num_tokens": 24941989.0, "step": 20085 }, { "entropy": 0.5669133614748716, "epoch": 1.5055775315529263, "grad_norm": 0.269959956407547, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.8199646010994911, "num_tokens": 26438467.0, "step": 20090 }, { "entropy": 0.5670698186382651, "epoch": 1.5059522554867781, "grad_norm": 0.245058074593544, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8237819142639637, "num_tokens": 27921413.0, "step": 20095 }, { "entropy": 0.5672216968610883, "epoch": 1.50632697942063, "grad_norm": 0.23987343907356262, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8238328132778406, "num_tokens": 29342995.0, "step": 20100 }, { "entropy": 0.5511075470596551, "epoch": 1.5067017033544818, "grad_norm": 0.3127058446407318, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8222802620381117, "num_tokens": 30773350.0, "step": 20105 }, { "entropy": 0.5632163939997554, "epoch": 1.5070764272883337, "grad_norm": 0.23510462045669556, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8263508070260286, "num_tokens": 32238115.0, "step": 20110 }, { "entropy": 0.590528036467731, "epoch": 1.5074511512221855, "grad_norm": 0.23130254447460175, "learning_rate": 0.0002, "loss": 0.6466, "mean_token_accuracy": 0.8176679611206055, "num_tokens": 33715756.0, "step": 20115 }, { "entropy": 0.568102446012199, "epoch": 1.5078258751560374, "grad_norm": 0.23516634106636047, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8249100551009179, "num_tokens": 35197388.0, "step": 20120 }, { "entropy": 0.5669776437804103, "epoch": 1.5082005990898892, "grad_norm": 0.2529248595237732, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.8244698941707611, "num_tokens": 36680794.0, "step": 20125 }, { "entropy": 0.561643011122942, "epoch": 1.508575323023741, "grad_norm": 0.21855749189853668, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.8259745307266713, "num_tokens": 38126170.0, "step": 20130 }, { "entropy": 0.5725094297900796, "epoch": 1.508950046957593, "grad_norm": 0.2267167717218399, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8200452875345945, "num_tokens": 39599530.0, "step": 20135 }, { "entropy": 0.5744919868186116, "epoch": 1.5093247708914448, "grad_norm": 0.2336270809173584, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8287457533180713, "num_tokens": 41065588.0, "step": 20140 }, { "entropy": 0.5740945206955075, "epoch": 1.5096994948252966, "grad_norm": 0.2453644871711731, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8231807969510555, "num_tokens": 42467007.0, "step": 20145 }, { "entropy": 0.5891105987131595, "epoch": 1.5100742187591485, "grad_norm": 0.22994603216648102, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8216390706598758, "num_tokens": 43944728.0, "step": 20150 }, { "entropy": 0.5768379380926489, "epoch": 1.5104489426930003, "grad_norm": 0.21669770777225494, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.822740425914526, "num_tokens": 45424571.0, "step": 20155 }, { "entropy": 0.5706163946539163, "epoch": 1.5108236666268522, "grad_norm": 0.203069269657135, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8267625708132982, "num_tokens": 46922270.0, "step": 20160 }, { "entropy": 0.5672119403257966, "epoch": 1.511198390560704, "grad_norm": 0.22879061102867126, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.828531700745225, "num_tokens": 48372250.0, "step": 20165 }, { "entropy": 0.5768672985956073, "epoch": 1.511573114494556, "grad_norm": 0.22520598769187927, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8241666834801435, "num_tokens": 49815320.0, "step": 20170 }, { "entropy": 0.5690280986949802, "epoch": 1.5119478384284077, "grad_norm": 0.33106210827827454, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8223528224974871, "num_tokens": 51270387.0, "step": 20175 }, { "entropy": 0.5720721191726625, "epoch": 1.5123225623622596, "grad_norm": 0.23276735842227936, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8224912721663713, "num_tokens": 52718380.0, "step": 20180 }, { "entropy": 0.575617759861052, "epoch": 1.5126972862961114, "grad_norm": 0.2308254987001419, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.823317576944828, "num_tokens": 54206523.0, "step": 20185 }, { "entropy": 0.5711833698675036, "epoch": 1.5130720102299633, "grad_norm": 0.22627374529838562, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.8227956518530846, "num_tokens": 55668766.0, "step": 20190 }, { "entropy": 0.5717862542718649, "epoch": 1.5134467341638151, "grad_norm": 0.27472177147865295, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8220719330012798, "num_tokens": 57138223.0, "step": 20195 }, { "entropy": 0.5718523599207401, "epoch": 1.513821458097667, "grad_norm": 0.22294224798679352, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.8242546688765288, "num_tokens": 58570735.0, "step": 20200 }, { "entropy": 0.5638322336599231, "epoch": 1.5141961820315188, "grad_norm": 0.3325378894805908, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8229431785643101, "num_tokens": 60066196.0, "step": 20205 }, { "entropy": 0.5641233280301094, "epoch": 1.5145709059653707, "grad_norm": 0.2233436405658722, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8247791495174169, "num_tokens": 61499576.0, "step": 20210 }, { "entropy": 0.5638808893039823, "epoch": 1.5149456298992225, "grad_norm": 0.21952664852142334, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.8228069908916951, "num_tokens": 62953939.0, "step": 20215 }, { "entropy": 0.5810784554108978, "epoch": 1.5153203538330744, "grad_norm": 0.2073419690132141, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8183174498379231, "num_tokens": 64435474.0, "step": 20220 }, { "entropy": 0.5590831434354187, "epoch": 1.5156950777669262, "grad_norm": 0.23204679787158966, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.8247771847993135, "num_tokens": 65854237.0, "step": 20225 }, { "entropy": 0.5776782765984535, "epoch": 1.516069801700778, "grad_norm": 0.23560261726379395, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8223782379180193, "num_tokens": 67308356.0, "step": 20230 }, { "entropy": 0.5914875935763121, "epoch": 1.51644452563463, "grad_norm": 0.24937696754932404, "learning_rate": 0.0002, "loss": 0.6427, "mean_token_accuracy": 0.8202138043940067, "num_tokens": 68780627.0, "step": 20235 }, { "entropy": 0.5910473903641105, "epoch": 1.516819249568482, "grad_norm": 0.2227206528186798, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.8231250219047069, "num_tokens": 70217281.0, "step": 20240 }, { "entropy": 0.6088151870295405, "epoch": 1.5171939735023339, "grad_norm": 0.2701375484466553, "learning_rate": 0.0002, "loss": 0.6371, "mean_token_accuracy": 0.821181171014905, "num_tokens": 71666858.0, "step": 20245 }, { "entropy": 0.576298513635993, "epoch": 1.5175686974361857, "grad_norm": 0.27079299092292786, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8217102706432342, "num_tokens": 73143732.0, "step": 20250 }, { "entropy": 0.5760546337813139, "epoch": 1.5179434213700376, "grad_norm": 0.23164033889770508, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.8183128233999014, "num_tokens": 74655785.0, "step": 20255 }, { "entropy": 0.5682791753672063, "epoch": 1.5183181453038894, "grad_norm": 0.23360592126846313, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8202068410813809, "num_tokens": 76152251.0, "step": 20260 }, { "entropy": 0.5838618081063032, "epoch": 1.5186928692377413, "grad_norm": 0.2938920557498932, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8216181479394435, "num_tokens": 77614314.0, "step": 20265 }, { "entropy": 0.5781344823539257, "epoch": 1.5190675931715931, "grad_norm": 0.22344741225242615, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8223676033318043, "num_tokens": 79084813.0, "step": 20270 }, { "entropy": 0.5704928429797291, "epoch": 1.519442317105445, "grad_norm": 0.334644079208374, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.8231807868927717, "num_tokens": 80566923.0, "step": 20275 }, { "entropy": 0.5648432364687324, "epoch": 1.5198170410392968, "grad_norm": 0.289732426404953, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.823036552593112, "num_tokens": 82044057.0, "step": 20280 }, { "entropy": 0.5608055433258414, "epoch": 1.5201917649731487, "grad_norm": 0.22522114217281342, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.822149358317256, "num_tokens": 83531969.0, "step": 20285 }, { "entropy": 0.5661415103822947, "epoch": 1.5205664889070005, "grad_norm": 0.2480265498161316, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8218515999615192, "num_tokens": 85019371.0, "step": 20290 }, { "entropy": 0.5698502669110894, "epoch": 1.5209412128408524, "grad_norm": 0.24944041669368744, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.823749853298068, "num_tokens": 86508651.0, "step": 20295 }, { "entropy": 0.5755650894716382, "epoch": 1.5213159367747042, "grad_norm": 0.2423546463251114, "learning_rate": 0.0002, "loss": 0.6264, "mean_token_accuracy": 0.8210805494338274, "num_tokens": 87966709.0, "step": 20300 }, { "entropy": 0.5577608367428184, "epoch": 1.521690660708556, "grad_norm": 0.2338685244321823, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8257548488676548, "num_tokens": 89442486.0, "step": 20305 }, { "entropy": 0.5783713683485985, "epoch": 1.522065384642408, "grad_norm": 0.39196163415908813, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8197678282856942, "num_tokens": 90872794.0, "step": 20310 }, { "entropy": 0.5374946393072605, "epoch": 1.52244010857626, "grad_norm": 0.3478054404258728, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8256216220557689, "num_tokens": 92313067.0, "step": 20315 }, { "entropy": 0.546359372138977, "epoch": 1.5228148325101118, "grad_norm": 0.2345731258392334, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.8231514729559422, "num_tokens": 93780847.0, "step": 20320 }, { "entropy": 0.5776332629844546, "epoch": 1.5231895564439637, "grad_norm": 0.23430296778678894, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.8220543615520001, "num_tokens": 95244966.0, "step": 20325 }, { "entropy": 0.561722275055945, "epoch": 1.5235642803778155, "grad_norm": 0.2343955785036087, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.8260430332273245, "num_tokens": 96721431.0, "step": 20330 }, { "entropy": 0.5702042531222105, "epoch": 1.5239390043116674, "grad_norm": 0.2215549498796463, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.8221329048275947, "num_tokens": 98174873.0, "step": 20335 }, { "entropy": 0.5746491136029362, "epoch": 1.5243137282455193, "grad_norm": 0.2517823874950409, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.8236352801322937, "num_tokens": 99613241.0, "step": 20340 }, { "entropy": 0.5757840357720851, "epoch": 1.524688452179371, "grad_norm": 0.29546844959259033, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8224888253957033, "num_tokens": 101102010.0, "step": 20345 }, { "entropy": 0.5739309696480632, "epoch": 1.525063176113223, "grad_norm": 0.37193912267684937, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.8237185448408126, "num_tokens": 102557642.0, "step": 20350 }, { "entropy": 0.5900311537086964, "epoch": 1.5254379000470748, "grad_norm": 0.25470995903015137, "learning_rate": 0.0002, "loss": 0.6408, "mean_token_accuracy": 0.8197439178824425, "num_tokens": 104021436.0, "step": 20355 }, { "entropy": 0.5673836600035429, "epoch": 1.5258126239809267, "grad_norm": 0.2622065246105194, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8232462164014578, "num_tokens": 105515697.0, "step": 20360 }, { "entropy": 0.5911731949076057, "epoch": 1.5261873479147785, "grad_norm": 0.2980518937110901, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.8210481151938438, "num_tokens": 106989964.0, "step": 20365 }, { "entropy": 0.5868986846879125, "epoch": 1.5265620718486304, "grad_norm": 0.2354855090379715, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.8203052531927824, "num_tokens": 108459935.0, "step": 20370 }, { "entropy": 0.5536989783868194, "epoch": 1.5269367957824822, "grad_norm": 0.22741380333900452, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8254949018359184, "num_tokens": 109932090.0, "step": 20375 }, { "entropy": 0.5522216111421585, "epoch": 1.527311519716334, "grad_norm": 0.2582351267337799, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8247401297092438, "num_tokens": 111408825.0, "step": 20380 }, { "entropy": 0.5679235644638538, "epoch": 1.527686243650186, "grad_norm": 0.2303907573223114, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8233620468527079, "num_tokens": 112873952.0, "step": 20385 }, { "entropy": 0.5828039253130555, "epoch": 1.5280609675840378, "grad_norm": 0.22376340627670288, "learning_rate": 0.0002, "loss": 0.6394, "mean_token_accuracy": 0.8213379606604576, "num_tokens": 114321292.0, "step": 20390 }, { "entropy": 0.5632300585508346, "epoch": 1.5284356915178896, "grad_norm": 0.3843768239021301, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.8235715720802546, "num_tokens": 115740889.0, "step": 20395 }, { "entropy": 0.581216211989522, "epoch": 1.5288104154517415, "grad_norm": 0.23887385427951813, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8206372369080782, "num_tokens": 117169713.0, "step": 20400 }, { "entropy": 0.579658273793757, "epoch": 1.5291851393855933, "grad_norm": 0.2847374975681305, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.8197776664048433, "num_tokens": 118640336.0, "step": 20405 }, { "entropy": 0.5613258108496666, "epoch": 1.5295598633194452, "grad_norm": 0.22935093939304352, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.8214312221854925, "num_tokens": 120095517.0, "step": 20410 }, { "entropy": 0.575257127173245, "epoch": 1.529934587253297, "grad_norm": 0.21549326181411743, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8197788670659065, "num_tokens": 121557360.0, "step": 20415 }, { "entropy": 0.5743454286828638, "epoch": 1.5303093111871489, "grad_norm": 0.2317764014005661, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.822754530981183, "num_tokens": 123030007.0, "step": 20420 }, { "entropy": 0.5872451733797789, "epoch": 1.5306840351210007, "grad_norm": 0.3132995069026947, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.8220872357487679, "num_tokens": 124544784.0, "step": 20425 }, { "entropy": 0.577940677665174, "epoch": 1.5310587590548526, "grad_norm": 0.21700093150138855, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8197968598455191, "num_tokens": 126009973.0, "step": 20430 }, { "entropy": 0.5833158448338509, "epoch": 1.5314334829887044, "grad_norm": 0.2195175737142563, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.8192099791020155, "num_tokens": 127528220.0, "step": 20435 }, { "entropy": 0.5736799705773592, "epoch": 1.5318082069225563, "grad_norm": 0.2394532412290573, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.8270853150635957, "num_tokens": 128980278.0, "step": 20440 }, { "entropy": 0.5901008013635873, "epoch": 1.532182930856408, "grad_norm": 0.2523703873157501, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.8254819121211767, "num_tokens": 130455332.0, "step": 20445 }, { "entropy": 0.5971696430817246, "epoch": 1.53255765479026, "grad_norm": 0.28198811411857605, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.8206112802028656, "num_tokens": 131933949.0, "step": 20450 }, { "entropy": 0.5833905424922705, "epoch": 1.5329323787241118, "grad_norm": 0.21590527892112732, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8241698637604713, "num_tokens": 133407012.0, "step": 20455 }, { "entropy": 0.6091650389134884, "epoch": 1.5333071026579637, "grad_norm": 0.24982963502407074, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8218126766383648, "num_tokens": 134876154.0, "step": 20460 }, { "entropy": 0.5980490619316697, "epoch": 1.5336818265918155, "grad_norm": 0.267160028219223, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.8252584099769592, "num_tokens": 136348706.0, "step": 20465 }, { "entropy": 0.5729536913335324, "epoch": 1.5340565505256674, "grad_norm": 0.2532750070095062, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.8213712178170681, "num_tokens": 137826969.0, "step": 20470 }, { "entropy": 0.547929460555315, "epoch": 1.5344312744595192, "grad_norm": 0.22679902613162994, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.8268338136374951, "num_tokens": 139284061.0, "step": 20475 }, { "entropy": 0.5765217458829284, "epoch": 1.534805998393371, "grad_norm": 0.21651698648929596, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8229967758059502, "num_tokens": 140765185.0, "step": 20480 }, { "entropy": 0.574123983643949, "epoch": 1.535180722327223, "grad_norm": 0.2668929100036621, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8232029009610414, "num_tokens": 142227251.0, "step": 20485 }, { "entropy": 0.5608955143019557, "epoch": 1.5355554462610748, "grad_norm": 0.22753174602985382, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.826780977845192, "num_tokens": 143683944.0, "step": 20490 }, { "entropy": 0.5888738917186856, "epoch": 1.5359301701949266, "grad_norm": 0.20228025317192078, "learning_rate": 0.0002, "loss": 0.645, "mean_token_accuracy": 0.8202553749084472, "num_tokens": 145231388.0, "step": 20495 }, { "entropy": 0.5635782733559609, "epoch": 1.5363048941287785, "grad_norm": 0.22940488159656525, "learning_rate": 0.0002, "loss": 0.6173, "mean_token_accuracy": 0.8240818820893765, "num_tokens": 146696046.0, "step": 20500 }, { "entropy": 0.5736390622332692, "epoch": 1.5366796180626303, "grad_norm": 0.2398775815963745, "learning_rate": 0.0002, "loss": 0.6297, "mean_token_accuracy": 0.8252075932919979, "num_tokens": 148154431.0, "step": 20505 }, { "entropy": 0.564595727995038, "epoch": 1.5370543419964822, "grad_norm": 0.23974715173244476, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.8230349529534579, "num_tokens": 149638238.0, "step": 20510 }, { "entropy": 0.5575997566804289, "epoch": 1.537429065930334, "grad_norm": 0.2635347545146942, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.8205412965267896, "num_tokens": 151095546.0, "step": 20515 }, { "entropy": 0.5413709655404091, "epoch": 1.5378037898641859, "grad_norm": 0.23093107342720032, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.828204432502389, "num_tokens": 152532101.0, "step": 20520 }, { "entropy": 0.5493024472147227, "epoch": 1.5381785137980377, "grad_norm": 0.2716561555862427, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.8247150525450706, "num_tokens": 153966810.0, "step": 20525 }, { "entropy": 0.5754288118332624, "epoch": 1.5385532377318896, "grad_norm": 0.326303094625473, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8206160817295313, "num_tokens": 155403191.0, "step": 20530 }, { "entropy": 0.5666543867439031, "epoch": 1.5389279616657414, "grad_norm": 0.22045135498046875, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.8247824739664793, "num_tokens": 156916242.0, "step": 20535 }, { "entropy": 0.5562436893582344, "epoch": 1.5393026855995933, "grad_norm": 0.2318027913570404, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.821679774671793, "num_tokens": 158406936.0, "step": 20540 }, { "entropy": 0.5694274229928851, "epoch": 1.5396774095334451, "grad_norm": 0.2824723720550537, "learning_rate": 0.0002, "loss": 0.6377, "mean_token_accuracy": 0.8184169925749302, "num_tokens": 159914678.0, "step": 20545 }, { "entropy": 0.5478874834254384, "epoch": 1.5400521334672972, "grad_norm": 0.2286110818386078, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.8260124333202838, "num_tokens": 161396268.0, "step": 20550 }, { "entropy": 0.5536359491758048, "epoch": 1.540426857401149, "grad_norm": 0.2549944519996643, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.8238245010375976, "num_tokens": 162853381.0, "step": 20555 }, { "entropy": 0.5474575297906995, "epoch": 1.540801581335001, "grad_norm": 0.23129069805145264, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.8239465661346912, "num_tokens": 164312704.0, "step": 20560 }, { "entropy": 0.5554010471329093, "epoch": 1.5411763052688527, "grad_norm": 0.29366031289100647, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8226529616862536, "num_tokens": 165781727.0, "step": 20565 }, { "entropy": 0.5817666586488486, "epoch": 1.5415510292027046, "grad_norm": 0.30262163281440735, "learning_rate": 0.0002, "loss": 0.6355, "mean_token_accuracy": 0.8211708646267653, "num_tokens": 167240364.0, "step": 20570 }, { "entropy": 0.5562260089442134, "epoch": 1.5419257531365564, "grad_norm": 0.21745245158672333, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8228360950946808, "num_tokens": 168700159.0, "step": 20575 }, { "entropy": 0.5751066552475095, "epoch": 1.5423004770704083, "grad_norm": 0.24915087223052979, "learning_rate": 0.0002, "loss": 0.6368, "mean_token_accuracy": 0.8204403761774302, "num_tokens": 170132535.0, "step": 20580 }, { "entropy": 0.5759136396460235, "epoch": 1.5426752010042601, "grad_norm": 0.2766787111759186, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8201265726238489, "num_tokens": 171596698.0, "step": 20585 }, { "entropy": 0.5538654234260321, "epoch": 1.543049924938112, "grad_norm": 0.2517149746417999, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8246654588729143, "num_tokens": 173072170.0, "step": 20590 }, { "entropy": 0.5505644891411066, "epoch": 1.5434246488719638, "grad_norm": 0.23839592933654785, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.8255181189626455, "num_tokens": 174556981.0, "step": 20595 }, { "entropy": 0.5489900266751647, "epoch": 1.5437993728058157, "grad_norm": 0.24145206809043884, "learning_rate": 0.0002, "loss": 0.6342, "mean_token_accuracy": 0.8204487819224596, "num_tokens": 176042306.0, "step": 20600 }, { "entropy": 0.5400242811068893, "epoch": 1.5441740967396675, "grad_norm": 0.21273556351661682, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8250741008669138, "num_tokens": 1480149.0, "step": 20605 }, { "entropy": 0.5446140227839351, "epoch": 1.5445488206735194, "grad_norm": 0.24609681963920593, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.8222718425095081, "num_tokens": 2955097.0, "step": 20610 }, { "entropy": 0.5513022899627685, "epoch": 1.5449235446073712, "grad_norm": 0.31194233894348145, "learning_rate": 0.0002, "loss": 0.6349, "mean_token_accuracy": 0.8183193631470204, "num_tokens": 4440838.0, "step": 20615 }, { "entropy": 0.558218688890338, "epoch": 1.545298268541223, "grad_norm": 0.2154223769903183, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.822848679125309, "num_tokens": 5937819.0, "step": 20620 }, { "entropy": 0.5497607320547104, "epoch": 1.5456729924750752, "grad_norm": 0.2424250990152359, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8200298205018044, "num_tokens": 7410856.0, "step": 20625 }, { "entropy": 0.5415777917951345, "epoch": 1.546047716408927, "grad_norm": 0.24442917108535767, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.8241279922425747, "num_tokens": 8921614.0, "step": 20630 }, { "entropy": 0.5589156379923225, "epoch": 1.5464224403427789, "grad_norm": 0.23657162487506866, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.8193877805024385, "num_tokens": 10369156.0, "step": 20635 }, { "entropy": 0.5417553664185106, "epoch": 1.5467971642766307, "grad_norm": 0.29810723662376404, "learning_rate": 0.0002, "loss": 0.6258, "mean_token_accuracy": 0.8232709772884845, "num_tokens": 11810295.0, "step": 20640 }, { "entropy": 0.5469608470797539, "epoch": 1.5471718882104826, "grad_norm": 0.2496020644903183, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.8230864468961954, "num_tokens": 13267768.0, "step": 20645 }, { "entropy": 0.5583577474579215, "epoch": 1.5475466121443344, "grad_norm": 0.2334013283252716, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.8213396888226271, "num_tokens": 14741232.0, "step": 20650 }, { "entropy": 0.5416411589831114, "epoch": 1.5479213360781863, "grad_norm": 0.27191171050071716, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.8260732620954514, "num_tokens": 16206638.0, "step": 20655 }, { "entropy": 0.5572425231337548, "epoch": 1.5482960600120381, "grad_norm": 0.23294781148433685, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8223546840250492, "num_tokens": 17688053.0, "step": 20660 }, { "entropy": 0.5376308998093009, "epoch": 1.54867078394589, "grad_norm": 0.25916787981987, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.8265289973467589, "num_tokens": 19140468.0, "step": 20665 }, { "entropy": 0.5474349707365036, "epoch": 1.5490455078797418, "grad_norm": 0.2781670093536377, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.8200645353645086, "num_tokens": 20647746.0, "step": 20670 }, { "entropy": 0.532412033341825, "epoch": 1.5494202318135937, "grad_norm": 0.22971415519714355, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.8248737059533596, "num_tokens": 22068940.0, "step": 20675 }, { "entropy": 0.5456386528909206, "epoch": 1.5497949557474455, "grad_norm": 0.226980522274971, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.8217189855873585, "num_tokens": 23583373.0, "step": 20680 }, { "entropy": 0.5486686408519745, "epoch": 1.5501696796812974, "grad_norm": 0.25078126788139343, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.8197312861680984, "num_tokens": 25050864.0, "step": 20685 }, { "entropy": 0.5552808189764619, "epoch": 1.5505444036151492, "grad_norm": 0.26717785000801086, "learning_rate": 0.0002, "loss": 0.6359, "mean_token_accuracy": 0.819684574380517, "num_tokens": 26546977.0, "step": 20690 }, { "entropy": 0.5410879233852028, "epoch": 1.550919127549001, "grad_norm": 0.25193309783935547, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.82145058773458, "num_tokens": 28010804.0, "step": 20695 }, { "entropy": 0.5214231194928288, "epoch": 1.551293851482853, "grad_norm": 0.22994251549243927, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.8247982751578092, "num_tokens": 29465158.0, "step": 20700 }, { "entropy": 0.5452104020863772, "epoch": 1.5516685754167048, "grad_norm": 0.23802122473716736, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8229472942650318, "num_tokens": 30941057.0, "step": 20705 }, { "entropy": 0.5489320604130625, "epoch": 1.5520432993505566, "grad_norm": 0.2106172740459442, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.8198793575167656, "num_tokens": 32442993.0, "step": 20710 }, { "entropy": 0.5352871559560299, "epoch": 1.5524180232844085, "grad_norm": 0.2945164442062378, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.8261377919465304, "num_tokens": 33906843.0, "step": 20715 }, { "entropy": 0.5591617662459611, "epoch": 1.5527927472182603, "grad_norm": 0.23885692656040192, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8237957961857318, "num_tokens": 35377180.0, "step": 20720 }, { "entropy": 0.5681552711874247, "epoch": 1.5531674711521122, "grad_norm": 0.21699784696102142, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8247284919023514, "num_tokens": 36866314.0, "step": 20725 }, { "entropy": 0.5851278075948357, "epoch": 1.553542195085964, "grad_norm": 0.285003662109375, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.8215200260281563, "num_tokens": 38304097.0, "step": 20730 }, { "entropy": 0.5776584975421428, "epoch": 1.5539169190198159, "grad_norm": 0.21492509543895721, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.8205952845513821, "num_tokens": 39781667.0, "step": 20735 }, { "entropy": 0.5489239504560828, "epoch": 1.5542916429536677, "grad_norm": 0.2719987630844116, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.821756773814559, "num_tokens": 41278892.0, "step": 20740 }, { "entropy": 0.568826999515295, "epoch": 1.5546663668875196, "grad_norm": 0.2967146933078766, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.8210621599107981, "num_tokens": 42750390.0, "step": 20745 }, { "entropy": 0.5834575602784753, "epoch": 1.5550410908213714, "grad_norm": 0.25831472873687744, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.8195087682455778, "num_tokens": 44241499.0, "step": 20750 }, { "entropy": 0.5937809016555547, "epoch": 1.5554158147552233, "grad_norm": 0.23120693862438202, "learning_rate": 0.0002, "loss": 0.6437, "mean_token_accuracy": 0.819771084561944, "num_tokens": 45731884.0, "step": 20755 }, { "entropy": 0.5848525004461408, "epoch": 1.5557905386890751, "grad_norm": 0.23487117886543274, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.8208602268248797, "num_tokens": 47182396.0, "step": 20760 }, { "entropy": 0.5735884448513389, "epoch": 1.556165262622927, "grad_norm": 0.22352442145347595, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8200161449611187, "num_tokens": 48651310.0, "step": 20765 }, { "entropy": 0.573778385296464, "epoch": 1.5565399865567788, "grad_norm": 0.24929189682006836, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8198213610798121, "num_tokens": 50146821.0, "step": 20770 }, { "entropy": 0.5712427733466029, "epoch": 1.5569147104906307, "grad_norm": 0.23652616143226624, "learning_rate": 0.0002, "loss": 0.6373, "mean_token_accuracy": 0.8210174906998873, "num_tokens": 51593427.0, "step": 20775 }, { "entropy": 0.564096755720675, "epoch": 1.5572894344244825, "grad_norm": 0.2318771481513977, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.8228785499930382, "num_tokens": 53075097.0, "step": 20780 }, { "entropy": 0.5789984315633774, "epoch": 1.5576641583583344, "grad_norm": 0.25371792912483215, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.8223415095359087, "num_tokens": 54540646.0, "step": 20785 }, { "entropy": 0.5728953087702393, "epoch": 1.5580388822921862, "grad_norm": 0.2383098304271698, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.824604045972228, "num_tokens": 55992508.0, "step": 20790 }, { "entropy": 0.5662619665265083, "epoch": 1.558413606226038, "grad_norm": 0.22684933245182037, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8226312048733234, "num_tokens": 57449865.0, "step": 20795 }, { "entropy": 0.552147432230413, "epoch": 1.55878833015989, "grad_norm": 0.2529435157775879, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.8263409536331892, "num_tokens": 58910612.0, "step": 20800 }, { "entropy": 0.5935651251114905, "epoch": 1.5591630540937418, "grad_norm": 0.24590250849723816, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.8241826441138983, "num_tokens": 60315813.0, "step": 20805 }, { "entropy": 0.5921170150861144, "epoch": 1.5595377780275936, "grad_norm": 0.2374740093946457, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8253916792571545, "num_tokens": 61793527.0, "step": 20810 }, { "entropy": 0.5797510221600533, "epoch": 1.5599125019614455, "grad_norm": 0.2233729064464569, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.8193614427000284, "num_tokens": 63283706.0, "step": 20815 }, { "entropy": 0.5814077239483595, "epoch": 1.5602872258952973, "grad_norm": 0.2608109712600708, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.8228835362941027, "num_tokens": 64792946.0, "step": 20820 }, { "entropy": 0.5757188078016042, "epoch": 1.5606619498291492, "grad_norm": 0.24771250784397125, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.8271049793809653, "num_tokens": 66253803.0, "step": 20825 }, { "entropy": 0.5784394888207316, "epoch": 1.561036673763001, "grad_norm": 0.23444151878356934, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.8227148152887821, "num_tokens": 67710087.0, "step": 20830 }, { "entropy": 0.5731799622997642, "epoch": 1.5614113976968529, "grad_norm": 0.30156317353248596, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.8256457719951869, "num_tokens": 69204030.0, "step": 20835 }, { "entropy": 0.5712811466306448, "epoch": 1.5617861216307047, "grad_norm": 0.23929716646671295, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.826425202190876, "num_tokens": 70668418.0, "step": 20840 }, { "entropy": 0.5906168859452009, "epoch": 1.5621608455645566, "grad_norm": 0.2268339991569519, "learning_rate": 0.0002, "loss": 0.6361, "mean_token_accuracy": 0.8208085741847754, "num_tokens": 72122109.0, "step": 20845 }, { "entropy": 0.5717036593705416, "epoch": 1.5625355694984084, "grad_norm": 0.24631844460964203, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.8237026851624251, "num_tokens": 73597692.0, "step": 20850 }, { "entropy": 0.5797807270660996, "epoch": 1.5629102934322603, "grad_norm": 0.2954524755477905, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.8221133965998888, "num_tokens": 75079635.0, "step": 20855 }, { "entropy": 0.579280984774232, "epoch": 1.5632850173661124, "grad_norm": 0.2373565286397934, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.8210008651018142, "num_tokens": 76569291.0, "step": 20860 }, { "entropy": 0.5857789933681488, "epoch": 1.5636597412999642, "grad_norm": 0.22413581609725952, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8271234348416329, "num_tokens": 78061139.0, "step": 20865 }, { "entropy": 0.6063278866931796, "epoch": 1.564034465233816, "grad_norm": 0.23619872331619263, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.8225223127752542, "num_tokens": 79520184.0, "step": 20870 }, { "entropy": 0.5894115105271339, "epoch": 1.564409189167668, "grad_norm": 0.2320241928100586, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.8246382508426905, "num_tokens": 81007416.0, "step": 20875 }, { "entropy": 0.5788199182599782, "epoch": 1.5647839131015198, "grad_norm": 0.21862278878688812, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8242744565010071, "num_tokens": 82495792.0, "step": 20880 }, { "entropy": 0.5830720195546746, "epoch": 1.5651586370353716, "grad_norm": 0.26152318716049194, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.8226394958794117, "num_tokens": 83974810.0, "step": 20885 }, { "entropy": 0.5876906156539917, "epoch": 1.5655333609692235, "grad_norm": 0.21499910950660706, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.8254285279661417, "num_tokens": 85453041.0, "step": 20890 }, { "entropy": 0.5778729818761349, "epoch": 1.5659080849030753, "grad_norm": 0.265349417924881, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8212972693145275, "num_tokens": 86958160.0, "step": 20895 }, { "entropy": 0.5592421405017376, "epoch": 1.5662828088369272, "grad_norm": 0.26750749349594116, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8236667197197676, "num_tokens": 88414043.0, "step": 20900 }, { "entropy": 0.5823794716969133, "epoch": 1.566657532770779, "grad_norm": 0.22176899015903473, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.8189610306173563, "num_tokens": 89894708.0, "step": 20905 }, { "entropy": 0.5843296863138676, "epoch": 1.5670322567046309, "grad_norm": 0.23378567397594452, "learning_rate": 0.0002, "loss": 0.637, "mean_token_accuracy": 0.8197629012167453, "num_tokens": 91383576.0, "step": 20910 }, { "entropy": 0.5563793536275625, "epoch": 1.5674069806384827, "grad_norm": 0.26052477955818176, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8266893189400435, "num_tokens": 92860309.0, "step": 20915 }, { "entropy": 0.5692546905949711, "epoch": 1.5677817045723346, "grad_norm": 0.24671509861946106, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8213677495718003, "num_tokens": 94321587.0, "step": 20920 }, { "entropy": 0.5618269547820092, "epoch": 1.5681564285061864, "grad_norm": 0.24485059082508087, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8239411439746618, "num_tokens": 95796317.0, "step": 20925 }, { "entropy": 0.547921609506011, "epoch": 1.5685311524400383, "grad_norm": 0.275482177734375, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.8274452749639749, "num_tokens": 97275114.0, "step": 20930 }, { "entropy": 0.5834817456081509, "epoch": 1.5689058763738901, "grad_norm": 0.24184657633304596, "learning_rate": 0.0002, "loss": 0.6457, "mean_token_accuracy": 0.8167880821973086, "num_tokens": 98759917.0, "step": 20935 }, { "entropy": 0.5859330739825964, "epoch": 1.5692806003077422, "grad_norm": 0.25716155767440796, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.8255576506257057, "num_tokens": 100198962.0, "step": 20940 }, { "entropy": 0.5726820722222328, "epoch": 1.569655324241594, "grad_norm": 0.2536241412162781, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.826457628980279, "num_tokens": 101721048.0, "step": 20945 }, { "entropy": 0.5920535130426288, "epoch": 1.570030048175446, "grad_norm": 0.23347394168376923, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8248052727431059, "num_tokens": 103191636.0, "step": 20950 }, { "entropy": 0.5740554893389345, "epoch": 1.5704047721092977, "grad_norm": 0.22402817010879517, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8225957497954368, "num_tokens": 104638994.0, "step": 20955 }, { "entropy": 0.5600388523191213, "epoch": 1.5707794960431496, "grad_norm": 0.24692367017269135, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.8246654521673917, "num_tokens": 106112512.0, "step": 20960 }, { "entropy": 0.5703086504712701, "epoch": 1.5711542199770014, "grad_norm": 0.23316144943237305, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.8217408504337073, "num_tokens": 107582835.0, "step": 20965 }, { "entropy": 0.5917672397568822, "epoch": 1.5715289439108533, "grad_norm": 0.22788013517856598, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.8216013636440038, "num_tokens": 109047994.0, "step": 20970 }, { "entropy": 0.6029377149417996, "epoch": 1.5719036678447051, "grad_norm": 0.21441814303398132, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.8205932978540659, "num_tokens": 110532549.0, "step": 20975 }, { "entropy": 0.5788244253024459, "epoch": 1.572278391778557, "grad_norm": 0.23134765028953552, "learning_rate": 0.0002, "loss": 0.6328, "mean_token_accuracy": 0.8220352068543434, "num_tokens": 112007126.0, "step": 20980 }, { "entropy": 0.5761287525296211, "epoch": 1.5726531157124088, "grad_norm": 0.24550355970859528, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8223584935069084, "num_tokens": 113427104.0, "step": 20985 }, { "entropy": 0.5746223717927933, "epoch": 1.5730278396462607, "grad_norm": 0.25487059354782104, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8245226137340069, "num_tokens": 114871818.0, "step": 20990 }, { "entropy": 0.6125171585008502, "epoch": 1.5734025635801125, "grad_norm": 0.25210046768188477, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.8234486024826765, "num_tokens": 116328435.0, "step": 20995 }, { "entropy": 0.6126046441495419, "epoch": 1.5737772875139644, "grad_norm": 0.2573935389518738, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.8260029200464487, "num_tokens": 117779088.0, "step": 21000 }, { "entropy": 0.6063166489824653, "epoch": 1.5741520114478162, "grad_norm": 0.22329585254192352, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.8283204793930053, "num_tokens": 119251937.0, "step": 21005 }, { "entropy": 0.5971649019047618, "epoch": 1.574526735381668, "grad_norm": 0.2997703552246094, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.8221603147685528, "num_tokens": 120766876.0, "step": 21010 }, { "entropy": 0.5876068999990821, "epoch": 1.57490145931552, "grad_norm": 0.2584898769855499, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.8229948595166207, "num_tokens": 122216821.0, "step": 21015 }, { "entropy": 0.5804713763296604, "epoch": 1.5752761832493718, "grad_norm": 0.21415339410305023, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.8224530424922705, "num_tokens": 123685245.0, "step": 21020 }, { "entropy": 0.5868807574734092, "epoch": 1.5756509071832236, "grad_norm": 0.26973801851272583, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.8237260229885578, "num_tokens": 125133041.0, "step": 21025 }, { "entropy": 0.5866925692185759, "epoch": 1.5760256311170755, "grad_norm": 0.2399323731660843, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.8229312758892775, "num_tokens": 126573404.0, "step": 21030 }, { "entropy": 0.6159909648820758, "epoch": 1.5764003550509273, "grad_norm": 0.2625921666622162, "learning_rate": 0.0002, "loss": 0.647, "mean_token_accuracy": 0.819633923843503, "num_tokens": 127988147.0, "step": 21035 }, { "entropy": 0.5939430098980665, "epoch": 1.5767750789847792, "grad_norm": 0.22917529940605164, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8240275721997022, "num_tokens": 129435380.0, "step": 21040 }, { "entropy": 0.5921495355665684, "epoch": 1.577149802918631, "grad_norm": 0.2250984162092209, "learning_rate": 0.0002, "loss": 0.6256, "mean_token_accuracy": 0.8235674921423197, "num_tokens": 130919926.0, "step": 21045 }, { "entropy": 0.5724755948409438, "epoch": 1.577524526852483, "grad_norm": 0.27569448947906494, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.8278125699609518, "num_tokens": 132366610.0, "step": 21050 }, { "entropy": 0.5707032196223736, "epoch": 1.5778992507863348, "grad_norm": 0.2529895007610321, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8237403187900781, "num_tokens": 133839650.0, "step": 21055 }, { "entropy": 0.5759435711428523, "epoch": 1.5782739747201866, "grad_norm": 0.2399522215127945, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.8239029157906771, "num_tokens": 135283633.0, "step": 21060 }, { "entropy": 0.5775823716074229, "epoch": 1.5786486986540385, "grad_norm": 0.24824044108390808, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8229716278612613, "num_tokens": 136729979.0, "step": 21065 }, { "entropy": 0.5711610464379191, "epoch": 1.5790234225878903, "grad_norm": 0.23910219967365265, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.8264092996716499, "num_tokens": 138179372.0, "step": 21070 }, { "entropy": 0.5712891517207026, "epoch": 1.5793981465217422, "grad_norm": 0.22466903924942017, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8265659045428038, "num_tokens": 139611542.0, "step": 21075 }, { "entropy": 0.5682205675169826, "epoch": 1.579772870455594, "grad_norm": 0.23261624574661255, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.8262200117111206, "num_tokens": 141063226.0, "step": 21080 }, { "entropy": 0.5734266011044383, "epoch": 1.5801475943894459, "grad_norm": 0.26190346479415894, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.8220623534172773, "num_tokens": 142502831.0, "step": 21085 }, { "entropy": 0.5800749059766531, "epoch": 1.5805223183232977, "grad_norm": 0.28570160269737244, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8209350265562534, "num_tokens": 143985379.0, "step": 21090 }, { "entropy": 0.5673055578023195, "epoch": 1.5808970422571496, "grad_norm": 0.22555772960186005, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8227870296686888, "num_tokens": 145463222.0, "step": 21095 }, { "entropy": 0.5827539306133985, "epoch": 1.5812717661910014, "grad_norm": 0.22398754954338074, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.8188820645213127, "num_tokens": 146925900.0, "step": 21100 }, { "entropy": 0.5617051174864173, "epoch": 1.5816464901248533, "grad_norm": 0.25958338379859924, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.825512059032917, "num_tokens": 148365433.0, "step": 21105 }, { "entropy": 0.580884818173945, "epoch": 1.582021214058705, "grad_norm": 0.2488737255334854, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.8235869146883488, "num_tokens": 149827075.0, "step": 21110 }, { "entropy": 0.5926420666277409, "epoch": 1.582395937992557, "grad_norm": 0.2534932792186737, "learning_rate": 0.0002, "loss": 0.6431, "mean_token_accuracy": 0.8171744462102651, "num_tokens": 151360760.0, "step": 21115 }, { "entropy": 0.5750817388296128, "epoch": 1.5827706619264088, "grad_norm": 0.21414558589458466, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.8211111068725586, "num_tokens": 152841925.0, "step": 21120 }, { "entropy": 0.5673602884635329, "epoch": 1.5831453858602607, "grad_norm": 0.286945641040802, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.826096773520112, "num_tokens": 154298870.0, "step": 21125 }, { "entropy": 0.5779914315789938, "epoch": 1.5835201097941125, "grad_norm": 0.24751022458076477, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8217668138444424, "num_tokens": 155824331.0, "step": 21130 }, { "entropy": 0.5615755693987012, "epoch": 1.5838948337279644, "grad_norm": 0.2057180106639862, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.8251824107021093, "num_tokens": 157263593.0, "step": 21135 }, { "entropy": 0.5788657961413264, "epoch": 1.5842695576618162, "grad_norm": 0.23288315534591675, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8222769379615784, "num_tokens": 158743458.0, "step": 21140 }, { "entropy": 0.5794073909521102, "epoch": 1.584644281595668, "grad_norm": 0.2278956174850464, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.8233776096254587, "num_tokens": 160251295.0, "step": 21145 }, { "entropy": 0.5744436634704471, "epoch": 1.58501900552952, "grad_norm": 0.2838103175163269, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8226023513823748, "num_tokens": 161726863.0, "step": 21150 }, { "entropy": 0.5907306341454387, "epoch": 1.5853937294633718, "grad_norm": 0.250577837228775, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.823852988705039, "num_tokens": 163166404.0, "step": 21155 }, { "entropy": 0.6141717512160539, "epoch": 1.5857684533972236, "grad_norm": 0.22492823004722595, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.823479725420475, "num_tokens": 164597722.0, "step": 21160 }, { "entropy": 0.6023563805967569, "epoch": 1.5861431773310755, "grad_norm": 0.26710090041160583, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.8252482656389475, "num_tokens": 166078807.0, "step": 21165 }, { "entropy": 0.5988332305103541, "epoch": 1.5865179012649275, "grad_norm": 0.28761398792266846, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8219511233270168, "num_tokens": 167536809.0, "step": 21170 }, { "entropy": 0.5781072355806828, "epoch": 1.5868926251987794, "grad_norm": 0.21349363029003143, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.8207053571939469, "num_tokens": 169009030.0, "step": 21175 }, { "entropy": 0.5870904605835676, "epoch": 1.5872673491326312, "grad_norm": 0.23708949983119965, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.820133912563324, "num_tokens": 170435584.0, "step": 21180 }, { "entropy": 0.5998379793018103, "epoch": 1.587642073066483, "grad_norm": 0.262153685092926, "learning_rate": 0.0002, "loss": 0.6413, "mean_token_accuracy": 0.8192712355405092, "num_tokens": 171921589.0, "step": 21185 }, { "entropy": 0.5781120125204324, "epoch": 1.588016797000335, "grad_norm": 0.24238327145576477, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8249090380966664, "num_tokens": 173400693.0, "step": 21190 }, { "entropy": 0.5881959058344364, "epoch": 1.5883915209341868, "grad_norm": 0.23712901771068573, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8239924620836974, "num_tokens": 174870654.0, "step": 21195 }, { "entropy": 0.5870156511664391, "epoch": 1.5887662448680386, "grad_norm": 0.22678200900554657, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.823623926192522, "num_tokens": 176343176.0, "step": 21200 }, { "entropy": 0.6017902720719576, "epoch": 1.5891409688018905, "grad_norm": 0.23123157024383545, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.8215292818844319, "num_tokens": 177836229.0, "step": 21205 }, { "entropy": 0.5896786557510495, "epoch": 1.5895156927357423, "grad_norm": 0.24352794885635376, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8218499787151814, "num_tokens": 179313216.0, "step": 21210 }, { "entropy": 0.5826162664219737, "epoch": 1.5898904166695942, "grad_norm": 0.2933867573738098, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8260617543011903, "num_tokens": 180794230.0, "step": 21215 }, { "entropy": 0.6147926500067115, "epoch": 1.590265140603446, "grad_norm": 0.34936100244522095, "learning_rate": 0.0002, "loss": 0.6456, "mean_token_accuracy": 0.8181553140282631, "num_tokens": 182299420.0, "step": 21220 }, { "entropy": 0.6183992244303227, "epoch": 1.5906398645372979, "grad_norm": 0.3438504934310913, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8210065003484488, "num_tokens": 183771093.0, "step": 21225 }, { "entropy": 0.6019972966983914, "epoch": 1.5910145884711497, "grad_norm": 0.2778000235557556, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.8221017606556416, "num_tokens": 185225564.0, "step": 21230 }, { "entropy": 0.5777462314814329, "epoch": 1.5913893124050016, "grad_norm": 0.22973927855491638, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.8271876148879528, "num_tokens": 186697436.0, "step": 21235 }, { "entropy": 0.5995140029117465, "epoch": 1.5917640363388534, "grad_norm": 0.2588672339916229, "learning_rate": 0.0002, "loss": 0.6319, "mean_token_accuracy": 0.8198786050081253, "num_tokens": 188148081.0, "step": 21240 }, { "entropy": 0.5906918831169605, "epoch": 1.5921387602727053, "grad_norm": 0.28759145736694336, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.8222871381789446, "num_tokens": 189647891.0, "step": 21245 }, { "entropy": 0.5754435054957867, "epoch": 1.5925134842065574, "grad_norm": 0.2497623860836029, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.8227969322353601, "num_tokens": 191082911.0, "step": 21250 }, { "entropy": 0.5773149792104959, "epoch": 1.5928882081404092, "grad_norm": 0.32876577973365784, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.8237752225250006, "num_tokens": 192564096.0, "step": 21255 }, { "entropy": 0.5588790733367205, "epoch": 1.593262932074261, "grad_norm": 0.30350369215011597, "learning_rate": 0.0002, "loss": 0.615, "mean_token_accuracy": 0.8271852236241102, "num_tokens": 194040886.0, "step": 21260 }, { "entropy": 0.5749099232256413, "epoch": 1.593637656008113, "grad_norm": 0.2094484120607376, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.8242886129766702, "num_tokens": 195531246.0, "step": 21265 }, { "entropy": 0.5952992333099246, "epoch": 1.5940123799419648, "grad_norm": 0.22058755159378052, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.8228721246123314, "num_tokens": 196962808.0, "step": 21270 }, { "entropy": 0.5908411007374526, "epoch": 1.5943871038758166, "grad_norm": 0.24516735970973969, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.8218279082328082, "num_tokens": 198440983.0, "step": 21275 }, { "entropy": 0.5834316397085786, "epoch": 1.5947618278096685, "grad_norm": 0.26953357458114624, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8231410618871451, "num_tokens": 199947775.0, "step": 21280 }, { "entropy": 0.5726229695603251, "epoch": 1.5951365517435203, "grad_norm": 0.2267308384180069, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.8264009919017553, "num_tokens": 201377569.0, "step": 21285 }, { "entropy": 0.5858025761321187, "epoch": 1.5955112756773722, "grad_norm": 0.23054474592208862, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8231948722153902, "num_tokens": 202856978.0, "step": 21290 }, { "entropy": 0.5781295804306865, "epoch": 1.595885999611224, "grad_norm": 0.23386679589748383, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.8247339613735676, "num_tokens": 204311620.0, "step": 21295 }, { "entropy": 0.5770698856562376, "epoch": 1.5962607235450759, "grad_norm": 0.324972927570343, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.8240850478410721, "num_tokens": 205798877.0, "step": 21300 }, { "entropy": 0.5807706283405423, "epoch": 1.5966354474789277, "grad_norm": 0.21616195142269135, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.8229043953120708, "num_tokens": 207247303.0, "step": 21305 }, { "entropy": 0.5754869485273957, "epoch": 1.5970101714127796, "grad_norm": 0.23800241947174072, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.817412031814456, "num_tokens": 208726295.0, "step": 21310 }, { "entropy": 0.5696481106802821, "epoch": 1.5973848953466314, "grad_norm": 0.23411035537719727, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8243426073342561, "num_tokens": 210173214.0, "step": 21315 }, { "entropy": 0.5682998789474368, "epoch": 1.5977596192804833, "grad_norm": 0.23150227963924408, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.8263826701790095, "num_tokens": 211648494.0, "step": 21320 }, { "entropy": 0.5842795055359602, "epoch": 1.5981343432143351, "grad_norm": 0.2549414336681366, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.8239973381161689, "num_tokens": 213125026.0, "step": 21325 }, { "entropy": 0.5676878519356251, "epoch": 1.598509067148187, "grad_norm": 0.21820563077926636, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8255077745765448, "num_tokens": 214614574.0, "step": 21330 }, { "entropy": 0.5712791150435805, "epoch": 1.5988837910820388, "grad_norm": 0.2126346081495285, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8221534878015518, "num_tokens": 216081647.0, "step": 21335 }, { "entropy": 0.5736344497650862, "epoch": 1.5992585150158907, "grad_norm": 0.22544218599796295, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8220775924623013, "num_tokens": 217539660.0, "step": 21340 }, { "entropy": 0.5848297361284495, "epoch": 1.5996332389497425, "grad_norm": 0.24774152040481567, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.8224608477205038, "num_tokens": 218986355.0, "step": 21345 }, { "entropy": 0.5702755531296134, "epoch": 1.6000079628835944, "grad_norm": 0.23398557305335999, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.8260373141616583, "num_tokens": 220439225.0, "step": 21350 }, { "entropy": 0.5678003400564193, "epoch": 1.6003826868174462, "grad_norm": 0.23884671926498413, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8250802624970675, "num_tokens": 221924811.0, "step": 21355 }, { "entropy": 0.5609184054657816, "epoch": 1.600757410751298, "grad_norm": 0.2674374580383301, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.8286558989435434, "num_tokens": 223341513.0, "step": 21360 }, { "entropy": 0.5496268097311259, "epoch": 1.60113213468515, "grad_norm": 0.23732584714889526, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.8294548530131578, "num_tokens": 224830480.0, "step": 21365 }, { "entropy": 0.5831966990604996, "epoch": 1.6015068586190018, "grad_norm": 0.22366948425769806, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.8259199455380439, "num_tokens": 226328058.0, "step": 21370 }, { "entropy": 0.5877696022391319, "epoch": 1.6018815825528536, "grad_norm": 0.2063703089952469, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8195280846208334, "num_tokens": 227828708.0, "step": 21375 }, { "entropy": 0.5875281361863017, "epoch": 1.6022563064867055, "grad_norm": 0.22993850708007812, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.821103922277689, "num_tokens": 229344489.0, "step": 21380 }, { "entropy": 0.5886637987568974, "epoch": 1.6026310304205573, "grad_norm": 0.22364211082458496, "learning_rate": 0.0002, "loss": 0.6395, "mean_token_accuracy": 0.8213142264634371, "num_tokens": 230801019.0, "step": 21385 }, { "entropy": 0.5780318174511194, "epoch": 1.6030057543544092, "grad_norm": 0.23067378997802734, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8230352133512497, "num_tokens": 232307140.0, "step": 21390 }, { "entropy": 0.5711435713805258, "epoch": 1.603380478288261, "grad_norm": 0.2386089563369751, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.8240315563976764, "num_tokens": 233756115.0, "step": 21395 }, { "entropy": 0.5865009214729071, "epoch": 1.6037552022221129, "grad_norm": 0.217142716050148, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8227637108415365, "num_tokens": 235263060.0, "step": 21400 }, { "entropy": 0.5861660894006491, "epoch": 1.6041299261559647, "grad_norm": 0.2072082757949829, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8247960805892944, "num_tokens": 236732082.0, "step": 21405 }, { "entropy": 0.5896481474861502, "epoch": 1.6045046500898166, "grad_norm": 0.24461637437343597, "learning_rate": 0.0002, "loss": 0.6368, "mean_token_accuracy": 0.8212980795651674, "num_tokens": 238207074.0, "step": 21410 }, { "entropy": 0.5818133557215333, "epoch": 1.6048793740236684, "grad_norm": 0.2676967680454254, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8234158404171467, "num_tokens": 239691122.0, "step": 21415 }, { "entropy": 0.5981652192771435, "epoch": 1.6052540979575203, "grad_norm": 0.24382716417312622, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8203738048672676, "num_tokens": 241218082.0, "step": 21420 }, { "entropy": 0.5841409288346767, "epoch": 1.6056288218913721, "grad_norm": 0.2416231334209442, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8252534847706556, "num_tokens": 242723648.0, "step": 21425 }, { "entropy": 0.5947109088301659, "epoch": 1.606003545825224, "grad_norm": 0.30023935437202454, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.8200702365487814, "num_tokens": 244184613.0, "step": 21430 }, { "entropy": 0.5584115091711283, "epoch": 1.6063782697590758, "grad_norm": 0.22921349108219147, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.823947386443615, "num_tokens": 245624477.0, "step": 21435 }, { "entropy": 0.5759826846420765, "epoch": 1.6067529936929277, "grad_norm": 0.29227396845817566, "learning_rate": 0.0002, "loss": 0.6343, "mean_token_accuracy": 0.8206414494663476, "num_tokens": 247088312.0, "step": 21440 }, { "entropy": 0.5895926848053932, "epoch": 1.6071277176267795, "grad_norm": 0.19822204113006592, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.8224532879889012, "num_tokens": 248569221.0, "step": 21445 }, { "entropy": 0.5974510045722127, "epoch": 1.6075024415606314, "grad_norm": 0.2531324028968811, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.8228726591914892, "num_tokens": 250092724.0, "step": 21450 }, { "entropy": 0.6033687759190798, "epoch": 1.6078771654944832, "grad_norm": 0.24178795516490936, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.8221471555531025, "num_tokens": 251579457.0, "step": 21455 }, { "entropy": 0.600255030207336, "epoch": 1.608251889428335, "grad_norm": 0.24025629460811615, "learning_rate": 0.0002, "loss": 0.648, "mean_token_accuracy": 0.8180364992469549, "num_tokens": 253047106.0, "step": 21460 }, { "entropy": 0.5892023462802172, "epoch": 1.608626613362187, "grad_norm": 0.28976136445999146, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.8222565457224846, "num_tokens": 254515079.0, "step": 21465 }, { "entropy": 0.5673034057021141, "epoch": 1.6090013372960388, "grad_norm": 0.2205674648284912, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8229355204850435, "num_tokens": 256007470.0, "step": 21470 }, { "entropy": 0.5615049686282874, "epoch": 1.6093760612298906, "grad_norm": 0.22063367068767548, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.8238249827176333, "num_tokens": 257440260.0, "step": 21475 }, { "entropy": 0.5772243475541472, "epoch": 1.6097507851637427, "grad_norm": 0.23938696086406708, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.8234284728765487, "num_tokens": 258922996.0, "step": 21480 }, { "entropy": 0.5707463601604104, "epoch": 1.6101255090975946, "grad_norm": 0.26053303480148315, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8260425519198179, "num_tokens": 260419402.0, "step": 21485 }, { "entropy": 0.5799816619604826, "epoch": 1.6105002330314464, "grad_norm": 0.23958030343055725, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.8184848815202713, "num_tokens": 261888054.0, "step": 21490 }, { "entropy": 0.5736477792263031, "epoch": 1.6108749569652983, "grad_norm": 0.21779559552669525, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.8248697552829981, "num_tokens": 263390659.0, "step": 21495 }, { "entropy": 0.6026528980582952, "epoch": 1.61124968089915, "grad_norm": 0.2536238431930542, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.8244670648127794, "num_tokens": 264844097.0, "step": 21500 }, { "entropy": 0.615504495985806, "epoch": 1.611624404833002, "grad_norm": 0.26232150197029114, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.821436283364892, "num_tokens": 266312220.0, "step": 21505 }, { "entropy": 0.5885714706033468, "epoch": 1.6119991287668538, "grad_norm": 0.24626651406288147, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.8242444705218077, "num_tokens": 267757479.0, "step": 21510 }, { "entropy": 0.5788185870274901, "epoch": 1.6123738527007057, "grad_norm": 0.24199865758419037, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.8275424350053072, "num_tokens": 269178120.0, "step": 21515 }, { "entropy": 0.6169776489958168, "epoch": 1.6127485766345575, "grad_norm": 0.23768973350524902, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.8249190289527177, "num_tokens": 270658104.0, "step": 21520 }, { "entropy": 0.5864980118349195, "epoch": 1.6131233005684094, "grad_norm": 0.2633982002735138, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8262855406850577, "num_tokens": 272118084.0, "step": 21525 }, { "entropy": 0.5898979738354683, "epoch": 1.6134980245022612, "grad_norm": 0.23495711386203766, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.8280678309500218, "num_tokens": 273556927.0, "step": 21530 }, { "entropy": 0.5775605330243707, "epoch": 1.613872748436113, "grad_norm": 0.23238693177700043, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8246961042284966, "num_tokens": 275033640.0, "step": 21535 }, { "entropy": 0.5627665681764483, "epoch": 1.614247472369965, "grad_norm": 0.3033461570739746, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.8234102427959442, "num_tokens": 276498809.0, "step": 21540 }, { "entropy": 0.5567151946946979, "epoch": 1.6146221963038168, "grad_norm": 0.28145235776901245, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8237506978213787, "num_tokens": 277955695.0, "step": 21545 }, { "entropy": 0.5704765270464123, "epoch": 1.6149969202376686, "grad_norm": 0.238990917801857, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.8238540027290583, "num_tokens": 279439431.0, "step": 21550 }, { "entropy": 0.5479163689538836, "epoch": 1.6153716441715205, "grad_norm": 0.2634041905403137, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.8309866178780794, "num_tokens": 280850612.0, "step": 21555 }, { "entropy": 0.5702830012887716, "epoch": 1.6157463681053725, "grad_norm": 0.2680835723876953, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.8222187351435423, "num_tokens": 282303149.0, "step": 21560 }, { "entropy": 0.5617482854053378, "epoch": 1.6161210920392244, "grad_norm": 0.21969833970069885, "learning_rate": 0.0002, "loss": 0.6256, "mean_token_accuracy": 0.8241456937044859, "num_tokens": 283790847.0, "step": 21565 }, { "entropy": 0.5510366940870881, "epoch": 1.6164958159730762, "grad_norm": 0.26593586802482605, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8249091535806656, "num_tokens": 285246552.0, "step": 21570 }, { "entropy": 0.5720216285437345, "epoch": 1.616870539906928, "grad_norm": 0.30547821521759033, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8250853840261698, "num_tokens": 286707159.0, "step": 21575 }, { "entropy": 0.5631296280771494, "epoch": 1.61724526384078, "grad_norm": 0.24502134323120117, "learning_rate": 0.0002, "loss": 0.6347, "mean_token_accuracy": 0.8219385232776404, "num_tokens": 288190469.0, "step": 21580 }, { "entropy": 0.5585092826746404, "epoch": 1.6176199877746318, "grad_norm": 0.254214882850647, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.825038480758667, "num_tokens": 289613177.0, "step": 21585 }, { "entropy": 0.5672719918191433, "epoch": 1.6179947117084836, "grad_norm": 0.2286824733018875, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.82327737249434, "num_tokens": 291084544.0, "step": 21590 }, { "entropy": 0.5832114150747657, "epoch": 1.6183694356423355, "grad_norm": 0.24098187685012817, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.8194538120180368, "num_tokens": 292568570.0, "step": 21595 }, { "entropy": 0.566020623780787, "epoch": 1.6187441595761873, "grad_norm": 0.21796263754367828, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.8274850469082594, "num_tokens": 294043957.0, "step": 21600 }, { "entropy": 0.5834032030776143, "epoch": 1.6191188835100392, "grad_norm": 0.3076985478401184, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8238796692341566, "num_tokens": 295511495.0, "step": 21605 }, { "entropy": 0.567290092818439, "epoch": 1.619493607443891, "grad_norm": 0.2754177451133728, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.8262561947107315, "num_tokens": 296956452.0, "step": 21610 }, { "entropy": 0.56667217630893, "epoch": 1.6198683313777429, "grad_norm": 0.2386280745267868, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8228587586432695, "num_tokens": 298379716.0, "step": 21615 }, { "entropy": 0.5605000223964453, "epoch": 1.6202430553115947, "grad_norm": 0.2270268052816391, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.8238028313964605, "num_tokens": 299861304.0, "step": 21620 }, { "entropy": 0.5731315106153488, "epoch": 1.6206177792454466, "grad_norm": 0.2276366651058197, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8251129139214755, "num_tokens": 301314335.0, "step": 21625 }, { "entropy": 0.5691775575280189, "epoch": 1.6209925031792984, "grad_norm": 0.23083937168121338, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.822486498951912, "num_tokens": 302765259.0, "step": 21630 }, { "entropy": 0.577876889705658, "epoch": 1.6213672271131503, "grad_norm": 0.2300570160150528, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.8237215016037226, "num_tokens": 304249994.0, "step": 21635 }, { "entropy": 0.571863584779203, "epoch": 1.6217419510470021, "grad_norm": 0.21651145815849304, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8251395292580128, "num_tokens": 305663766.0, "step": 21640 }, { "entropy": 0.5595396430231631, "epoch": 1.622116674980854, "grad_norm": 0.21836505830287933, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8271476056426763, "num_tokens": 307155286.0, "step": 21645 }, { "entropy": 0.5504636242985725, "epoch": 1.6224913989147058, "grad_norm": 0.23952169716358185, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.8247218627482653, "num_tokens": 308592976.0, "step": 21650 }, { "entropy": 0.5483821097761392, "epoch": 1.6228661228485577, "grad_norm": 0.23362335562705994, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8287600871175528, "num_tokens": 310058087.0, "step": 21655 }, { "entropy": 0.5531410232186318, "epoch": 1.6232408467824095, "grad_norm": 0.23353084921836853, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.8244792826473712, "num_tokens": 311514983.0, "step": 21660 }, { "entropy": 0.5663080502301454, "epoch": 1.6236155707162614, "grad_norm": 0.2587563693523407, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.8223680041730403, "num_tokens": 312992098.0, "step": 21665 }, { "entropy": 0.5811742769554258, "epoch": 1.6239902946501132, "grad_norm": 0.2084832340478897, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.822270580381155, "num_tokens": 314462541.0, "step": 21670 }, { "entropy": 0.5781519128009677, "epoch": 1.624365018583965, "grad_norm": 0.2330836057662964, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8224631268531084, "num_tokens": 315934903.0, "step": 21675 }, { "entropy": 0.5780642610043287, "epoch": 1.624739742517817, "grad_norm": 0.22768345475196838, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.8242169771343469, "num_tokens": 317426094.0, "step": 21680 }, { "entropy": 0.5796404706314207, "epoch": 1.6251144664516688, "grad_norm": 0.3103260397911072, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.8218926161527633, "num_tokens": 318869489.0, "step": 21685 }, { "entropy": 0.5619574612006545, "epoch": 1.6254891903855206, "grad_norm": 0.2531040608882904, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8232702657580375, "num_tokens": 320321596.0, "step": 21690 }, { "entropy": 0.5693974388763309, "epoch": 1.6258639143193725, "grad_norm": 0.2353082299232483, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.8204625897109509, "num_tokens": 321781894.0, "step": 21695 }, { "entropy": 0.5613201608881354, "epoch": 1.6262386382532243, "grad_norm": 0.2670305371284485, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8265965804457664, "num_tokens": 323257526.0, "step": 21700 }, { "entropy": 0.5467006879858672, "epoch": 1.6266133621870762, "grad_norm": 0.2530473470687866, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.8277896918356419, "num_tokens": 324684324.0, "step": 21705 }, { "entropy": 0.5700495515018702, "epoch": 1.626988086120928, "grad_norm": 0.2295946478843689, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.8221381273120641, "num_tokens": 326184372.0, "step": 21710 }, { "entropy": 0.5671440489590168, "epoch": 1.62736281005478, "grad_norm": 0.2236892431974411, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.8227873969823122, "num_tokens": 327614755.0, "step": 21715 }, { "entropy": 0.5719897171482444, "epoch": 1.6277375339886317, "grad_norm": 0.23777534067630768, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.8227067988365888, "num_tokens": 329073131.0, "step": 21720 }, { "entropy": 0.5697572892531753, "epoch": 1.6281122579224836, "grad_norm": 0.2163059115409851, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.82217151299119, "num_tokens": 330511735.0, "step": 21725 }, { "entropy": 0.561367798037827, "epoch": 1.6284869818563354, "grad_norm": 0.2576998770236969, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.8245858374983073, "num_tokens": 331994032.0, "step": 21730 }, { "entropy": 0.5857861403375864, "epoch": 1.6288617057901873, "grad_norm": 0.24761183559894562, "learning_rate": 0.0002, "loss": 0.6457, "mean_token_accuracy": 0.8189274143427611, "num_tokens": 333455843.0, "step": 21735 }, { "entropy": 0.5700666416436434, "epoch": 1.6292364297240391, "grad_norm": 0.25458523631095886, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8251005124300719, "num_tokens": 334933125.0, "step": 21740 }, { "entropy": 0.5645604671910405, "epoch": 1.629611153657891, "grad_norm": 0.2289927452802658, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.8249215263873338, "num_tokens": 336374370.0, "step": 21745 }, { "entropy": 0.5807696923613548, "epoch": 1.6299858775917428, "grad_norm": 0.22011399269104004, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.8210720822215081, "num_tokens": 337871428.0, "step": 21750 }, { "entropy": 0.5686649644747377, "epoch": 1.6303606015255947, "grad_norm": 0.2133554369211197, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.8268918368965388, "num_tokens": 339312232.0, "step": 21755 }, { "entropy": 0.5744839303195477, "epoch": 1.6307353254594465, "grad_norm": 0.26708564162254333, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.826873367279768, "num_tokens": 340751534.0, "step": 21760 }, { "entropy": 0.566369442641735, "epoch": 1.6311100493932984, "grad_norm": 0.2132103443145752, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.825406152009964, "num_tokens": 342167525.0, "step": 21765 }, { "entropy": 0.5781621528789401, "epoch": 1.6314847733271503, "grad_norm": 0.24292615056037903, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8230346776545048, "num_tokens": 343601013.0, "step": 21770 }, { "entropy": 0.5691863400861621, "epoch": 1.631859497261002, "grad_norm": 0.2422584742307663, "learning_rate": 0.0002, "loss": 0.6155, "mean_token_accuracy": 0.8242981437593698, "num_tokens": 345076096.0, "step": 21775 }, { "entropy": 0.5809478404000401, "epoch": 1.632234221194854, "grad_norm": 0.22693412005901337, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.8238076642155647, "num_tokens": 346571247.0, "step": 21780 }, { "entropy": 0.5553932702168822, "epoch": 1.6326089451287058, "grad_norm": 0.25429782271385193, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.8286346189677716, "num_tokens": 348062411.0, "step": 21785 }, { "entropy": 0.5756281059235334, "epoch": 1.6329836690625577, "grad_norm": 0.21664750576019287, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.8226004112511873, "num_tokens": 349548404.0, "step": 21790 }, { "entropy": 0.5861048568040133, "epoch": 1.6333583929964097, "grad_norm": 0.22582100331783295, "learning_rate": 0.0002, "loss": 0.6408, "mean_token_accuracy": 0.818412534147501, "num_tokens": 351039984.0, "step": 21795 }, { "entropy": 0.5766924811527133, "epoch": 1.6337331169302616, "grad_norm": 0.2784685492515564, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8206237178295851, "num_tokens": 352545885.0, "step": 21800 }, { "entropy": 0.5631104366853833, "epoch": 1.6341078408641134, "grad_norm": 0.2541244328022003, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.825833547860384, "num_tokens": 353994626.0, "step": 21805 }, { "entropy": 0.5806199382990599, "epoch": 1.6344825647979653, "grad_norm": 0.22781410813331604, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.8266551200300455, "num_tokens": 355418454.0, "step": 21810 }, { "entropy": 0.5825686069205404, "epoch": 1.6348572887318171, "grad_norm": 0.26067018508911133, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8256616730242967, "num_tokens": 356900505.0, "step": 21815 }, { "entropy": 0.5740105668082833, "epoch": 1.635232012665669, "grad_norm": 0.2166675180196762, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.8250761814415455, "num_tokens": 358359911.0, "step": 21820 }, { "entropy": 0.5578246293589473, "epoch": 1.6356067365995208, "grad_norm": 0.26483961939811707, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.8263456340879202, "num_tokens": 359842270.0, "step": 21825 }, { "entropy": 0.5564733618870378, "epoch": 1.6359814605333727, "grad_norm": 0.22341327369213104, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.821404342725873, "num_tokens": 361319552.0, "step": 21830 }, { "entropy": 0.5626713126897812, "epoch": 1.6363561844672245, "grad_norm": 0.22531427443027496, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8257659189403057, "num_tokens": 362845734.0, "step": 21835 }, { "entropy": 0.5678618746809662, "epoch": 1.6367309084010764, "grad_norm": 0.28291088342666626, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.8243411112576723, "num_tokens": 364307762.0, "step": 21840 }, { "entropy": 0.5729370605200529, "epoch": 1.6371056323349282, "grad_norm": 0.21874895691871643, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8228249706327915, "num_tokens": 365786465.0, "step": 21845 }, { "entropy": 0.5682997189462184, "epoch": 1.63748035626878, "grad_norm": 0.23845431208610535, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8240668680518866, "num_tokens": 367242168.0, "step": 21850 }, { "entropy": 0.5855593506246806, "epoch": 1.637855080202632, "grad_norm": 0.265372633934021, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8228186186403036, "num_tokens": 368731878.0, "step": 21855 }, { "entropy": 0.5875844733789564, "epoch": 1.6382298041364838, "grad_norm": 0.26413023471832275, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8269605591893197, "num_tokens": 370190800.0, "step": 21860 }, { "entropy": 0.5732980338856578, "epoch": 1.6386045280703356, "grad_norm": 0.21892979741096497, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8250936355441809, "num_tokens": 371662039.0, "step": 21865 }, { "entropy": 0.5646902848035097, "epoch": 1.6389792520041877, "grad_norm": 0.23272103071212769, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.8252815343439579, "num_tokens": 373114677.0, "step": 21870 }, { "entropy": 0.5693275049328804, "epoch": 1.6393539759380396, "grad_norm": 0.2262629270553589, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8214972332119942, "num_tokens": 374603487.0, "step": 21875 }, { "entropy": 0.5584684755653143, "epoch": 1.6397286998718914, "grad_norm": 0.2661517262458801, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8222613025456667, "num_tokens": 376056037.0, "step": 21880 }, { "entropy": 0.5578297829255462, "epoch": 1.6401034238057433, "grad_norm": 0.25033602118492126, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8232162531465292, "num_tokens": 377493818.0, "step": 21885 }, { "entropy": 0.5488777451217175, "epoch": 1.640478147739595, "grad_norm": 0.2371152639389038, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.8271799463778734, "num_tokens": 378940180.0, "step": 21890 }, { "entropy": 0.5597940233536065, "epoch": 1.640852871673447, "grad_norm": 0.23901651799678802, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8257185112684965, "num_tokens": 380422916.0, "step": 21895 }, { "entropy": 0.5518261345103384, "epoch": 1.6412275956072988, "grad_norm": 0.2614063620567322, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.827464996278286, "num_tokens": 381880165.0, "step": 21900 }, { "entropy": 0.5639897551387548, "epoch": 1.6416023195411507, "grad_norm": 0.21581697463989258, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8203225906938314, "num_tokens": 383371870.0, "step": 21905 }, { "entropy": 0.5630325211212039, "epoch": 1.6419770434750025, "grad_norm": 0.2288476675748825, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.8251198701560497, "num_tokens": 384832851.0, "step": 21910 }, { "entropy": 0.5647503225132823, "epoch": 1.6423517674088544, "grad_norm": 0.25052228569984436, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.8261098459362983, "num_tokens": 386282094.0, "step": 21915 }, { "entropy": 0.5780191082507372, "epoch": 1.6427264913427062, "grad_norm": 0.2276102900505066, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8221879087388515, "num_tokens": 387824703.0, "step": 21920 }, { "entropy": 0.5613031143322587, "epoch": 1.643101215276558, "grad_norm": 0.26149722933769226, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.8223131712526083, "num_tokens": 389256150.0, "step": 21925 }, { "entropy": 0.565944804251194, "epoch": 1.64347593921041, "grad_norm": 0.22297786176204681, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.8249937042593956, "num_tokens": 390714591.0, "step": 21930 }, { "entropy": 0.5754708038643003, "epoch": 1.6438506631442618, "grad_norm": 0.23415984213352203, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.821722349897027, "num_tokens": 392150821.0, "step": 21935 }, { "entropy": 0.5894401326775551, "epoch": 1.6442253870781136, "grad_norm": 0.23673704266548157, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8230931650847196, "num_tokens": 393624661.0, "step": 21940 }, { "entropy": 0.5937993429601193, "epoch": 1.6446001110119655, "grad_norm": 0.22758467495441437, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.8242815144360065, "num_tokens": 395090741.0, "step": 21945 }, { "entropy": 0.5823782783001661, "epoch": 1.6449748349458173, "grad_norm": 0.25496408343315125, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.8243515729904175, "num_tokens": 396550981.0, "step": 21950 }, { "entropy": 0.5853358983993531, "epoch": 1.6453495588796692, "grad_norm": 0.28531336784362793, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.820722834020853, "num_tokens": 398016638.0, "step": 21955 }, { "entropy": 0.587766033038497, "epoch": 1.645724282813521, "grad_norm": 0.22760365903377533, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8223330363631248, "num_tokens": 399488523.0, "step": 21960 }, { "entropy": 0.5961080929264426, "epoch": 1.6460990067473729, "grad_norm": 0.23780792951583862, "learning_rate": 0.0002, "loss": 0.6398, "mean_token_accuracy": 0.8189356699585915, "num_tokens": 400950744.0, "step": 21965 }, { "entropy": 0.5956605236977339, "epoch": 1.6464737306812247, "grad_norm": 0.24511215090751648, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.8191812194883823, "num_tokens": 402444191.0, "step": 21970 }, { "entropy": 0.5818166732788086, "epoch": 1.6468484546150766, "grad_norm": 0.24556949734687805, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8222758628427982, "num_tokens": 403936799.0, "step": 21975 }, { "entropy": 0.5633613798767328, "epoch": 1.6472231785489284, "grad_norm": 0.2296147644519806, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.8299608409404755, "num_tokens": 405342264.0, "step": 21980 }, { "entropy": 0.5997445903718471, "epoch": 1.6475979024827803, "grad_norm": 0.2252432256937027, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.8204528767615556, "num_tokens": 406778534.0, "step": 21985 }, { "entropy": 0.5864499418064952, "epoch": 1.6479726264166321, "grad_norm": 0.23601093888282776, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.8261076305061579, "num_tokens": 408229384.0, "step": 21990 }, { "entropy": 0.5774087537080049, "epoch": 1.648347350350484, "grad_norm": 0.24723634123802185, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8211357586085797, "num_tokens": 409682313.0, "step": 21995 }, { "entropy": 0.579698734357953, "epoch": 1.6487220742843358, "grad_norm": 0.2794935405254364, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8211952436715364, "num_tokens": 411146363.0, "step": 22000 }, { "entropy": 0.5814308060333133, "epoch": 1.6490967982181877, "grad_norm": 0.2577756941318512, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8227798741310834, "num_tokens": 412617457.0, "step": 22005 }, { "entropy": 0.5794171713292599, "epoch": 1.6494715221520395, "grad_norm": 0.23593644797801971, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.8253320109099149, "num_tokens": 414087289.0, "step": 22010 }, { "entropy": 0.5826452031731606, "epoch": 1.6498462460858914, "grad_norm": 0.26889678835868835, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8241739604622126, "num_tokens": 415566664.0, "step": 22015 }, { "entropy": 0.591394598968327, "epoch": 1.6502209700197432, "grad_norm": 0.2544881999492645, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.8226647831499576, "num_tokens": 416983045.0, "step": 22020 }, { "entropy": 0.5827396897599101, "epoch": 1.650595693953595, "grad_norm": 0.23385296761989594, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8216931853443384, "num_tokens": 418472035.0, "step": 22025 }, { "entropy": 0.5785718625411391, "epoch": 1.650970417887447, "grad_norm": 0.22615966200828552, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.8249866954982281, "num_tokens": 419917943.0, "step": 22030 }, { "entropy": 0.582101252116263, "epoch": 1.6513451418212988, "grad_norm": 0.23783250153064728, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.827590486779809, "num_tokens": 421371652.0, "step": 22035 }, { "entropy": 0.5885595180094242, "epoch": 1.6517198657551506, "grad_norm": 0.22642892599105835, "learning_rate": 0.0002, "loss": 0.6297, "mean_token_accuracy": 0.8246154353022576, "num_tokens": 422855574.0, "step": 22040 }, { "entropy": 0.5708355817943811, "epoch": 1.6520945896890025, "grad_norm": 0.22775301337242126, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.8246940538287163, "num_tokens": 424307404.0, "step": 22045 }, { "entropy": 0.5473725227639079, "epoch": 1.6524693136228543, "grad_norm": 0.21472793817520142, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.8260077502578497, "num_tokens": 425757083.0, "step": 22050 }, { "entropy": 0.5692283849231898, "epoch": 1.6528440375567062, "grad_norm": 0.23332741856575012, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8229455497115851, "num_tokens": 427273876.0, "step": 22055 }, { "entropy": 0.5684347555041314, "epoch": 1.653218761490558, "grad_norm": 0.2275707721710205, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8241293054074049, "num_tokens": 428770523.0, "step": 22060 }, { "entropy": 0.5773626526817679, "epoch": 1.6535934854244099, "grad_norm": 0.2586822211742401, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8210516300052404, "num_tokens": 430231967.0, "step": 22065 }, { "entropy": 0.5794421266764402, "epoch": 1.6539682093582617, "grad_norm": 0.2828232944011688, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8250331994146108, "num_tokens": 431678776.0, "step": 22070 }, { "entropy": 0.5849839642643928, "epoch": 1.6543429332921136, "grad_norm": 0.23143644630908966, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.8230583630502224, "num_tokens": 433115786.0, "step": 22075 }, { "entropy": 0.5701329523697496, "epoch": 1.6547176572259654, "grad_norm": 0.2534964978694916, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8215539090335369, "num_tokens": 434555456.0, "step": 22080 }, { "entropy": 0.5642497865483165, "epoch": 1.6550923811598173, "grad_norm": 0.22523269057273865, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.8248834047466517, "num_tokens": 436084823.0, "step": 22085 }, { "entropy": 0.5699716636911034, "epoch": 1.6554671050936691, "grad_norm": 0.2692004442214966, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.82540079690516, "num_tokens": 437587162.0, "step": 22090 }, { "entropy": 0.5744011582806706, "epoch": 1.655841829027521, "grad_norm": 0.25731733441352844, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.8280691541731358, "num_tokens": 439040704.0, "step": 22095 }, { "entropy": 0.5749597342684865, "epoch": 1.6562165529613728, "grad_norm": 0.2466869205236435, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8230635024607181, "num_tokens": 440474901.0, "step": 22100 }, { "entropy": 0.566566107980907, "epoch": 1.656591276895225, "grad_norm": 0.24651561677455902, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8254842139780522, "num_tokens": 441961804.0, "step": 22105 }, { "entropy": 0.588614791445434, "epoch": 1.6569660008290767, "grad_norm": 0.28909575939178467, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.8176324803382158, "num_tokens": 443398504.0, "step": 22110 }, { "entropy": 0.5648691169917583, "epoch": 1.6573407247629286, "grad_norm": 0.21846072375774384, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.8254827085882426, "num_tokens": 444869556.0, "step": 22115 }, { "entropy": 0.5770204849541187, "epoch": 1.6577154486967804, "grad_norm": 0.23451434075832367, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.8208125039935112, "num_tokens": 446344628.0, "step": 22120 }, { "entropy": 0.5574254158884286, "epoch": 1.6580901726306323, "grad_norm": 0.25625666975975037, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8286577314138412, "num_tokens": 447810357.0, "step": 22125 }, { "entropy": 0.5629556795582176, "epoch": 1.6584648965644841, "grad_norm": 0.26269006729125977, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.823217112571001, "num_tokens": 449286651.0, "step": 22130 }, { "entropy": 0.5646901724860072, "epoch": 1.658839620498336, "grad_norm": 0.22565501928329468, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8225843533873558, "num_tokens": 450755650.0, "step": 22135 }, { "entropy": 0.5573884703218936, "epoch": 1.6592143444321878, "grad_norm": 0.23570045828819275, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8235248431563378, "num_tokens": 452232146.0, "step": 22140 }, { "entropy": 0.5775795951485634, "epoch": 1.6595890683660397, "grad_norm": 0.2411641925573349, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8230970285832881, "num_tokens": 453697116.0, "step": 22145 }, { "entropy": 0.5820532128214836, "epoch": 1.6599637922998915, "grad_norm": 0.22338111698627472, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.823677609488368, "num_tokens": 455153697.0, "step": 22150 }, { "entropy": 0.5402007708325982, "epoch": 1.6603385162337434, "grad_norm": 0.2533090114593506, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.8269328314810991, "num_tokens": 456587604.0, "step": 22155 }, { "entropy": 0.55347672495991, "epoch": 1.6607132401675953, "grad_norm": 0.22080332040786743, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.8240157533437014, "num_tokens": 458063040.0, "step": 22160 }, { "entropy": 0.5494637369178236, "epoch": 1.661087964101447, "grad_norm": 0.2211865931749344, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.8227040246129036, "num_tokens": 459557654.0, "step": 22165 }, { "entropy": 0.5664953333325684, "epoch": 1.661462688035299, "grad_norm": 0.2577325403690338, "learning_rate": 0.0002, "loss": 0.6375, "mean_token_accuracy": 0.8205716859549284, "num_tokens": 461058468.0, "step": 22170 }, { "entropy": 0.5590586319565773, "epoch": 1.6618374119691508, "grad_norm": 0.20977726578712463, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.8255564145743847, "num_tokens": 462526781.0, "step": 22175 }, { "entropy": 0.5855766657739878, "epoch": 1.6622121359030029, "grad_norm": 0.2642596662044525, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.8218121081590652, "num_tokens": 464041669.0, "step": 22180 }, { "entropy": 0.5719107033684849, "epoch": 1.6625868598368547, "grad_norm": 0.21611689031124115, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8235815722495318, "num_tokens": 465539133.0, "step": 22185 }, { "entropy": 0.5585629841312766, "epoch": 1.6629615837707066, "grad_norm": 0.2348795086145401, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8233176350593567, "num_tokens": 467017022.0, "step": 22190 }, { "entropy": 0.5640458940528333, "epoch": 1.6633363077045584, "grad_norm": 0.26582422852516174, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8232957527041436, "num_tokens": 468508034.0, "step": 22195 }, { "entropy": 0.5682473780587316, "epoch": 1.6637110316384103, "grad_norm": 0.22884072363376617, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.8198675025254488, "num_tokens": 470021544.0, "step": 22200 }, { "entropy": 0.5631500668823719, "epoch": 1.6640857555722621, "grad_norm": 0.2456151694059372, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.8239535510540008, "num_tokens": 471510852.0, "step": 22205 }, { "entropy": 0.5470332028344274, "epoch": 1.664460479506114, "grad_norm": 0.26240527629852295, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.8283279687166214, "num_tokens": 472987278.0, "step": 22210 }, { "entropy": 0.5342357110232114, "epoch": 1.6648352034399658, "grad_norm": 0.2154230773448944, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.8272606026381254, "num_tokens": 474429762.0, "step": 22215 }, { "entropy": 0.556783401966095, "epoch": 1.6652099273738177, "grad_norm": 0.339810848236084, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8217621497809887, "num_tokens": 475932579.0, "step": 22220 }, { "entropy": 0.5685012208297848, "epoch": 1.6655846513076695, "grad_norm": 0.23287522792816162, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.8219991687685251, "num_tokens": 477453017.0, "step": 22225 }, { "entropy": 0.5448008641600609, "epoch": 1.6659593752415214, "grad_norm": 0.2234766185283661, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.8219900812953711, "num_tokens": 478891160.0, "step": 22230 }, { "entropy": 0.5461978401988745, "epoch": 1.6663340991753732, "grad_norm": 0.21282489597797394, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.8249628182500601, "num_tokens": 480344234.0, "step": 22235 }, { "entropy": 0.5586129322648048, "epoch": 1.666708823109225, "grad_norm": 0.22630205750465393, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8230560481548309, "num_tokens": 481837530.0, "step": 22240 }, { "entropy": 0.5591969039291144, "epoch": 1.667083547043077, "grad_norm": 0.2394460290670395, "learning_rate": 0.0002, "loss": 0.6278, "mean_token_accuracy": 0.8219689760357142, "num_tokens": 483342447.0, "step": 22245 }, { "entropy": 0.5542982568964362, "epoch": 1.6674582709769288, "grad_norm": 0.24172337353229523, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.8200198076665401, "num_tokens": 484846220.0, "step": 22250 }, { "entropy": 0.5588256135582924, "epoch": 1.6678329949107806, "grad_norm": 0.2077193260192871, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.8224176682531834, "num_tokens": 486330305.0, "step": 22255 }, { "entropy": 0.5595951320603489, "epoch": 1.6682077188446325, "grad_norm": 0.23010358214378357, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.8262601237744093, "num_tokens": 487818041.0, "step": 22260 }, { "entropy": 0.5697363577783108, "epoch": 1.6685824427784843, "grad_norm": 0.24335114657878876, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8240788176655769, "num_tokens": 489275218.0, "step": 22265 }, { "entropy": 0.5564850324764847, "epoch": 1.6689571667123362, "grad_norm": 0.2506742775440216, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.8247298456728458, "num_tokens": 490722281.0, "step": 22270 }, { "entropy": 0.566107708401978, "epoch": 1.669331890646188, "grad_norm": 0.2649068534374237, "learning_rate": 0.0002, "loss": 0.6347, "mean_token_accuracy": 0.8207983881235122, "num_tokens": 492174242.0, "step": 22275 }, { "entropy": 0.5525896949693561, "epoch": 1.6697066145800399, "grad_norm": 0.21844393014907837, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.8219817332923413, "num_tokens": 493671214.0, "step": 22280 }, { "entropy": 0.5434584571048617, "epoch": 1.6700813385138917, "grad_norm": 0.22489149868488312, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.8230168119072914, "num_tokens": 495163939.0, "step": 22285 }, { "entropy": 0.5481268169358373, "epoch": 1.6704560624477436, "grad_norm": 0.2540808916091919, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8250320345163346, "num_tokens": 496598525.0, "step": 22290 }, { "entropy": 0.5534886255860328, "epoch": 1.6708307863815954, "grad_norm": 0.21017619967460632, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.8230214927345514, "num_tokens": 498061009.0, "step": 22295 }, { "entropy": 0.5722745258361102, "epoch": 1.6712055103154473, "grad_norm": 0.2550262212753296, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.8246476806700229, "num_tokens": 499527866.0, "step": 22300 }, { "entropy": 0.5569101011380553, "epoch": 1.6715802342492991, "grad_norm": 0.2763004004955292, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8248559560626745, "num_tokens": 500970945.0, "step": 22305 }, { "entropy": 0.5752576764672994, "epoch": 1.671954958183151, "grad_norm": 0.3795672357082367, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.8206763878464699, "num_tokens": 502428530.0, "step": 22310 }, { "entropy": 0.5607437990605831, "epoch": 1.6723296821170028, "grad_norm": 0.28054237365722656, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.8272024471312761, "num_tokens": 503857612.0, "step": 22315 }, { "entropy": 0.5938538521528244, "epoch": 1.6727044060508547, "grad_norm": 0.22878699004650116, "learning_rate": 0.0002, "loss": 0.6375, "mean_token_accuracy": 0.8238382015377284, "num_tokens": 505297787.0, "step": 22320 }, { "entropy": 0.5903862841427326, "epoch": 1.6730791299847065, "grad_norm": 0.2779439389705658, "learning_rate": 0.0002, "loss": 0.6266, "mean_token_accuracy": 0.8234796050935984, "num_tokens": 506771502.0, "step": 22325 }, { "entropy": 0.5873186973854899, "epoch": 1.6734538539185584, "grad_norm": 0.22428296506404877, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.8246740475296974, "num_tokens": 508223344.0, "step": 22330 }, { "entropy": 0.568581017293036, "epoch": 1.6738285778524102, "grad_norm": 0.25272151827812195, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.8245160266757011, "num_tokens": 509730880.0, "step": 22335 }, { "entropy": 0.5534762922674418, "epoch": 1.674203301786262, "grad_norm": 0.24327236413955688, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8225790668278933, "num_tokens": 511181712.0, "step": 22340 }, { "entropy": 0.562811741232872, "epoch": 1.674578025720114, "grad_norm": 0.22875820100307465, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.8251830626279115, "num_tokens": 512642074.0, "step": 22345 }, { "entropy": 0.5591786123812199, "epoch": 1.6749527496539658, "grad_norm": 0.250479519367218, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8243034146726131, "num_tokens": 514148971.0, "step": 22350 }, { "entropy": 0.5594370566308499, "epoch": 1.6753274735878176, "grad_norm": 0.28448519110679626, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.8251616604626179, "num_tokens": 515628067.0, "step": 22355 }, { "entropy": 0.5537125915288925, "epoch": 1.6757021975216695, "grad_norm": 0.23695039749145508, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.8224320888519288, "num_tokens": 517112671.0, "step": 22360 }, { "entropy": 0.5519643223844468, "epoch": 1.6760769214555213, "grad_norm": 0.2747432589530945, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.823658126220107, "num_tokens": 518546314.0, "step": 22365 }, { "entropy": 0.557232941314578, "epoch": 1.6764516453893732, "grad_norm": 0.23279431462287903, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8246975392103195, "num_tokens": 520005355.0, "step": 22370 }, { "entropy": 0.5743730513378977, "epoch": 1.676826369323225, "grad_norm": 0.27096617221832275, "learning_rate": 0.0002, "loss": 0.6342, "mean_token_accuracy": 0.8227309726178647, "num_tokens": 521479264.0, "step": 22375 }, { "entropy": 0.5486547353677451, "epoch": 1.677201093257077, "grad_norm": 0.39720630645751953, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.8266680706292391, "num_tokens": 522928495.0, "step": 22380 }, { "entropy": 0.5632760141044855, "epoch": 1.6775758171909287, "grad_norm": 0.23261061310768127, "learning_rate": 0.0002, "loss": 0.6183, "mean_token_accuracy": 0.8241979464888572, "num_tokens": 524421642.0, "step": 22385 }, { "entropy": 0.5567158531397581, "epoch": 1.6779505411247806, "grad_norm": 0.23714348673820496, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.8270470954477787, "num_tokens": 525891205.0, "step": 22390 }, { "entropy": 0.572763704136014, "epoch": 1.6783252650586324, "grad_norm": 0.35680145025253296, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.8226411331444978, "num_tokens": 527393196.0, "step": 22395 }, { "entropy": 0.5618600647896528, "epoch": 1.6786999889924843, "grad_norm": 0.27555397152900696, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.8262375064194203, "num_tokens": 528865892.0, "step": 22400 }, { "entropy": 0.5719286557286978, "epoch": 1.6790747129263361, "grad_norm": 0.2467241883277893, "learning_rate": 0.0002, "loss": 0.6256, "mean_token_accuracy": 0.8220950186252594, "num_tokens": 530330724.0, "step": 22405 }, { "entropy": 0.5618877526372671, "epoch": 1.679449436860188, "grad_norm": 0.29057878255844116, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8284148149192333, "num_tokens": 531787578.0, "step": 22410 }, { "entropy": 0.5715273395180702, "epoch": 1.67982416079404, "grad_norm": 0.3474242687225342, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8267129801213742, "num_tokens": 533189305.0, "step": 22415 }, { "entropy": 0.5637648411095142, "epoch": 1.680198884727892, "grad_norm": 0.3240899443626404, "learning_rate": 0.0002, "loss": 0.6326, "mean_token_accuracy": 0.8201906863600016, "num_tokens": 534644430.0, "step": 22420 }, { "entropy": 0.5642764531075954, "epoch": 1.6805736086617438, "grad_norm": 0.23703795671463013, "learning_rate": 0.0002, "loss": 0.633, "mean_token_accuracy": 0.8202781114727259, "num_tokens": 536109433.0, "step": 22425 }, { "entropy": 0.5435346512123942, "epoch": 1.6809483325955956, "grad_norm": 0.2160673588514328, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.8256457440555096, "num_tokens": 537523898.0, "step": 22430 }, { "entropy": 0.5744025617837906, "epoch": 1.6813230565294475, "grad_norm": 0.25581276416778564, "learning_rate": 0.0002, "loss": 0.6352, "mean_token_accuracy": 0.8188620205968619, "num_tokens": 538951668.0, "step": 22435 }, { "entropy": 0.5605024693533778, "epoch": 1.6816977804632993, "grad_norm": 0.26060837507247925, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8263946507126093, "num_tokens": 540424202.0, "step": 22440 }, { "entropy": 0.5831867685541511, "epoch": 1.6820725043971512, "grad_norm": 0.2641807496547699, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.8250779390335083, "num_tokens": 541851632.0, "step": 22445 }, { "entropy": 0.5833802053704857, "epoch": 1.682447228331003, "grad_norm": 0.46742764115333557, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.8213245704770088, "num_tokens": 543321530.0, "step": 22450 }, { "entropy": 0.5673633396625519, "epoch": 1.6828219522648549, "grad_norm": 0.23139764368534088, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8221462421119213, "num_tokens": 544821094.0, "step": 22455 }, { "entropy": 0.5520541053265333, "epoch": 1.6831966761987067, "grad_norm": 0.23188239336013794, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8252028658986091, "num_tokens": 546276385.0, "step": 22460 }, { "entropy": 0.5547691065818071, "epoch": 1.6835714001325586, "grad_norm": 0.35308170318603516, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.827018303796649, "num_tokens": 547716978.0, "step": 22465 }, { "entropy": 0.5711286701261997, "epoch": 1.6839461240664104, "grad_norm": 0.29957184195518494, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.8202120032161474, "num_tokens": 549190192.0, "step": 22470 }, { "entropy": 0.5544914314523339, "epoch": 1.6843208480002623, "grad_norm": 0.28207242488861084, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.8290743555873632, "num_tokens": 550634440.0, "step": 22475 }, { "entropy": 0.5670091653242707, "epoch": 1.6846955719341141, "grad_norm": 0.2671906650066376, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8212650429457427, "num_tokens": 552133337.0, "step": 22480 }, { "entropy": 0.5725349729880691, "epoch": 1.685070295867966, "grad_norm": 0.3103625476360321, "learning_rate": 0.0002, "loss": 0.6324, "mean_token_accuracy": 0.8210069827735424, "num_tokens": 553642747.0, "step": 22485 }, { "entropy": 0.5569515861570835, "epoch": 1.685445019801818, "grad_norm": 0.2554684281349182, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.8257125958800315, "num_tokens": 555163574.0, "step": 22490 }, { "entropy": 0.5659192388877272, "epoch": 1.68581974373567, "grad_norm": 0.2228127270936966, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.8204892791807652, "num_tokens": 556663635.0, "step": 22495 }, { "entropy": 0.5714155018329621, "epoch": 1.6861944676695217, "grad_norm": 0.2551049292087555, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8260651648044586, "num_tokens": 558132365.0, "step": 22500 }, { "entropy": 0.5606713762506843, "epoch": 1.6865691916033736, "grad_norm": 0.23729228973388672, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.822645815461874, "num_tokens": 559605643.0, "step": 22505 }, { "entropy": 0.5506237939000129, "epoch": 1.6869439155372254, "grad_norm": 0.22534312307834625, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.8235216539353132, "num_tokens": 561123040.0, "step": 22510 }, { "entropy": 0.5474358439445496, "epoch": 1.6873186394710773, "grad_norm": 0.2610824406147003, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.8277074985206128, "num_tokens": 562523205.0, "step": 22515 }, { "entropy": 0.5709275124594569, "epoch": 1.6876933634049291, "grad_norm": 0.2037067860364914, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.8237570114433765, "num_tokens": 563999233.0, "step": 22520 }, { "entropy": 0.5862649135291577, "epoch": 1.688068087338781, "grad_norm": 0.22366690635681152, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8252634417265654, "num_tokens": 565440828.0, "step": 22525 }, { "entropy": 0.5788586921989918, "epoch": 1.6884428112726328, "grad_norm": 0.26751089096069336, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.8225314192473888, "num_tokens": 566914786.0, "step": 22530 }, { "entropy": 0.5651432335376739, "epoch": 1.6888175352064847, "grad_norm": 0.2343319207429886, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.8270160429179668, "num_tokens": 568348552.0, "step": 22535 }, { "entropy": 0.5724685812368989, "epoch": 1.6891922591403365, "grad_norm": 0.24304015934467316, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.8211936119943857, "num_tokens": 569823917.0, "step": 22540 }, { "entropy": 0.5708281524479389, "epoch": 1.6895669830741884, "grad_norm": 0.24674174189567566, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.8234215188771487, "num_tokens": 571274858.0, "step": 22545 }, { "entropy": 0.5652345184236764, "epoch": 1.6899417070080403, "grad_norm": 0.23745976388454437, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.8289394974708557, "num_tokens": 572768347.0, "step": 22550 }, { "entropy": 0.5753073969855904, "epoch": 1.690316430941892, "grad_norm": 0.2419762909412384, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8242521055042744, "num_tokens": 574233363.0, "step": 22555 }, { "entropy": 0.5681339763104916, "epoch": 1.690691154875744, "grad_norm": 0.22903063893318176, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8244259502738714, "num_tokens": 575701660.0, "step": 22560 }, { "entropy": 0.5691647229716181, "epoch": 1.6910658788095958, "grad_norm": 0.2442907691001892, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8211589213460684, "num_tokens": 577173927.0, "step": 22565 }, { "entropy": 0.5597865307703614, "epoch": 1.6914406027434477, "grad_norm": 0.22546182572841644, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8212131161242724, "num_tokens": 578624645.0, "step": 22570 }, { "entropy": 0.5644084997475147, "epoch": 1.6918153266772995, "grad_norm": 0.24613508582115173, "learning_rate": 0.0002, "loss": 0.6264, "mean_token_accuracy": 0.8235973492264748, "num_tokens": 580058476.0, "step": 22575 }, { "entropy": 0.5614751376211643, "epoch": 1.6921900506111514, "grad_norm": 0.2738058269023895, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.823634222522378, "num_tokens": 581574790.0, "step": 22580 }, { "entropy": 0.5653905155137181, "epoch": 1.6925647745450032, "grad_norm": 0.2497725486755371, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.8215110074728728, "num_tokens": 583038957.0, "step": 22585 }, { "entropy": 0.5699217166751623, "epoch": 1.692939498478855, "grad_norm": 0.2795870900154114, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.8243305008858443, "num_tokens": 584515151.0, "step": 22590 }, { "entropy": 0.5414055349305272, "epoch": 1.693314222412707, "grad_norm": 0.22719129920005798, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8280941274017095, "num_tokens": 585985927.0, "step": 22595 }, { "entropy": 0.5452942775562406, "epoch": 1.6936889463465588, "grad_norm": 0.24050883948802948, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8263744585216045, "num_tokens": 587382963.0, "step": 22600 }, { "entropy": 0.5576423994265497, "epoch": 1.6940636702804106, "grad_norm": 0.22976087033748627, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.8247368428856134, "num_tokens": 588857882.0, "step": 22605 }, { "entropy": 0.5516075095161795, "epoch": 1.6944383942142625, "grad_norm": 0.2755395770072937, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.8242044154554605, "num_tokens": 590335368.0, "step": 22610 }, { "entropy": 0.5455880900844932, "epoch": 1.6948131181481143, "grad_norm": 0.21731559932231903, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.8229384709149599, "num_tokens": 591828028.0, "step": 22615 }, { "entropy": 0.5620538081042469, "epoch": 1.6951878420819662, "grad_norm": 0.22751688957214355, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8235024232417345, "num_tokens": 593236811.0, "step": 22620 }, { "entropy": 0.5499286327511073, "epoch": 1.695562566015818, "grad_norm": 0.2266782522201538, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.8255312126129866, "num_tokens": 594703860.0, "step": 22625 }, { "entropy": 0.5634828506037592, "epoch": 1.6959372899496699, "grad_norm": 0.2552539110183716, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.8262265980243683, "num_tokens": 596160492.0, "step": 22630 }, { "entropy": 0.5867874013260007, "epoch": 1.6963120138835217, "grad_norm": 0.2660416066646576, "learning_rate": 0.0002, "loss": 0.6343, "mean_token_accuracy": 0.8211677297949791, "num_tokens": 597614986.0, "step": 22635 }, { "entropy": 0.5759105740115047, "epoch": 1.6966867378173736, "grad_norm": 0.22766849398612976, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.8217196151614189, "num_tokens": 599073899.0, "step": 22640 }, { "entropy": 0.5686196573078632, "epoch": 1.6970614617512254, "grad_norm": 0.2516596019268036, "learning_rate": 0.0002, "loss": 0.6324, "mean_token_accuracy": 0.8232419941574335, "num_tokens": 600496231.0, "step": 22645 }, { "entropy": 0.5364616062492132, "epoch": 1.6974361856850773, "grad_norm": 0.23589679598808289, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.8270925294607878, "num_tokens": 601918059.0, "step": 22650 }, { "entropy": 0.5436954023316503, "epoch": 1.697810909618929, "grad_norm": 0.23068131506443024, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8269533671438694, "num_tokens": 603376791.0, "step": 22655 }, { "entropy": 0.5606953041628003, "epoch": 1.698185633552781, "grad_norm": 0.21533769369125366, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8256107613444328, "num_tokens": 604885095.0, "step": 22660 }, { "entropy": 0.5580501247197389, "epoch": 1.6985603574866328, "grad_norm": 0.2253926396369934, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8238552007824183, "num_tokens": 606387803.0, "step": 22665 }, { "entropy": 0.5684476662427187, "epoch": 1.6989350814204847, "grad_norm": 0.22186070680618286, "learning_rate": 0.0002, "loss": 0.639, "mean_token_accuracy": 0.8189900252968073, "num_tokens": 607885894.0, "step": 22670 }, { "entropy": 0.550896791741252, "epoch": 1.6993098053543365, "grad_norm": 0.2370474636554718, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8240449231117963, "num_tokens": 609377025.0, "step": 22675 }, { "entropy": 0.5586091589182616, "epoch": 1.6996845292881884, "grad_norm": 0.23183231055736542, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8249823153018951, "num_tokens": 610833824.0, "step": 22680 }, { "entropy": 0.5651203555986285, "epoch": 1.7000592532220402, "grad_norm": 0.21230915188789368, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8230390850454569, "num_tokens": 612364528.0, "step": 22685 }, { "entropy": 0.5711775770410895, "epoch": 1.700433977155892, "grad_norm": 0.2395762950181961, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8228697899729014, "num_tokens": 613826182.0, "step": 22690 }, { "entropy": 0.5776034664362669, "epoch": 1.700808701089744, "grad_norm": 0.21593676507472992, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.8225717794150114, "num_tokens": 615352050.0, "step": 22695 }, { "entropy": 0.5690777927637101, "epoch": 1.7011834250235958, "grad_norm": 0.24511116743087769, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.8204660534858703, "num_tokens": 616847218.0, "step": 22700 }, { "entropy": 0.5539747888222337, "epoch": 1.7015581489574476, "grad_norm": 0.31523674726486206, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8240058697760105, "num_tokens": 618298998.0, "step": 22705 }, { "entropy": 0.5759333908557892, "epoch": 1.7019328728912995, "grad_norm": 0.2474796175956726, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.8209058031439781, "num_tokens": 619780982.0, "step": 22710 }, { "entropy": 0.5619762510061264, "epoch": 1.7023075968251513, "grad_norm": 0.32450249791145325, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.8219110850244761, "num_tokens": 621214541.0, "step": 22715 }, { "entropy": 0.5533817848190665, "epoch": 1.7026823207590032, "grad_norm": 0.2469484955072403, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.8238107845187187, "num_tokens": 622627863.0, "step": 22720 }, { "entropy": 0.5522076742723584, "epoch": 1.7030570446928552, "grad_norm": 0.24862530827522278, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.8286203797906637, "num_tokens": 624129088.0, "step": 22725 }, { "entropy": 0.5849438266828656, "epoch": 1.703431768626707, "grad_norm": 0.23011353611946106, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8181309483945369, "num_tokens": 625616658.0, "step": 22730 }, { "entropy": 0.571597065217793, "epoch": 1.703806492560559, "grad_norm": 0.23069161176681519, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.8207931708544492, "num_tokens": 627090878.0, "step": 22735 }, { "entropy": 0.5662063861265778, "epoch": 1.7041812164944108, "grad_norm": 0.2582344114780426, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.8205678157508374, "num_tokens": 628587614.0, "step": 22740 }, { "entropy": 0.5698215581476689, "epoch": 1.7045559404282626, "grad_norm": 0.391255259513855, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.8227544650435448, "num_tokens": 630056245.0, "step": 22745 }, { "entropy": 0.5586214262992144, "epoch": 1.7049306643621145, "grad_norm": 0.24600368738174438, "learning_rate": 0.0002, "loss": 0.6206, "mean_token_accuracy": 0.8220846399664878, "num_tokens": 631573007.0, "step": 22750 }, { "entropy": 0.5619718860834837, "epoch": 1.7053053882959663, "grad_norm": 0.2620200514793396, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8245607949793339, "num_tokens": 633048911.0, "step": 22755 }, { "entropy": 0.5762408044189214, "epoch": 1.7056801122298182, "grad_norm": 0.2683603763580322, "learning_rate": 0.0002, "loss": 0.6371, "mean_token_accuracy": 0.8226236797869205, "num_tokens": 634502323.0, "step": 22760 }, { "entropy": 0.5707425184547901, "epoch": 1.70605483616367, "grad_norm": 0.22619426250457764, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8247799821197986, "num_tokens": 635984971.0, "step": 22765 }, { "entropy": 0.5536578757688403, "epoch": 1.706429560097522, "grad_norm": 0.25737816095352173, "learning_rate": 0.0002, "loss": 0.6, "mean_token_accuracy": 0.826652466505766, "num_tokens": 637455736.0, "step": 22770 }, { "entropy": 0.5524802028201521, "epoch": 1.7068042840313737, "grad_norm": 0.2157609462738037, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8307724975049495, "num_tokens": 638925007.0, "step": 22775 }, { "entropy": 0.5581434767693281, "epoch": 1.7071790079652256, "grad_norm": 0.22744491696357727, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8227389562875033, "num_tokens": 640355163.0, "step": 22780 }, { "entropy": 0.5623919323086739, "epoch": 1.7075537318990774, "grad_norm": 0.22484278678894043, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.8242181967943907, "num_tokens": 641834488.0, "step": 22785 }, { "entropy": 0.5757542813196779, "epoch": 1.7079284558329293, "grad_norm": 0.2325027585029602, "learning_rate": 0.0002, "loss": 0.634, "mean_token_accuracy": 0.8185530550777912, "num_tokens": 643326127.0, "step": 22790 }, { "entropy": 0.5801168101839721, "epoch": 1.7083031797667811, "grad_norm": 0.23267914354801178, "learning_rate": 0.0002, "loss": 0.6456, "mean_token_accuracy": 0.8226554729044437, "num_tokens": 644769977.0, "step": 22795 }, { "entropy": 0.5915732471272349, "epoch": 1.708677903700633, "grad_norm": 0.25788614153862, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.823129902780056, "num_tokens": 646268162.0, "step": 22800 }, { "entropy": 0.5876464676111937, "epoch": 1.709052627634485, "grad_norm": 0.35636621713638306, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.8268324669450522, "num_tokens": 647809199.0, "step": 22805 }, { "entropy": 0.5864704350009561, "epoch": 1.709427351568337, "grad_norm": 0.24076144397258759, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.8241095900535583, "num_tokens": 649281982.0, "step": 22810 }, { "entropy": 0.5854944288730621, "epoch": 1.7098020755021888, "grad_norm": 0.28517085313796997, "learning_rate": 0.0002, "loss": 0.6059, "mean_token_accuracy": 0.8257125295698643, "num_tokens": 650710713.0, "step": 22815 }, { "entropy": 0.5845791595056653, "epoch": 1.7101767994360406, "grad_norm": 0.22553476691246033, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8224936004728078, "num_tokens": 652128510.0, "step": 22820 }, { "entropy": 0.5673274792730808, "epoch": 1.7105515233698925, "grad_norm": 0.23104368150234222, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.8232918880879879, "num_tokens": 653646141.0, "step": 22825 }, { "entropy": 0.5586719082668423, "epoch": 1.7109262473037443, "grad_norm": 0.21651731431484222, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.8249974895268679, "num_tokens": 655106175.0, "step": 22830 }, { "entropy": 0.5418209007009864, "epoch": 1.7113009712375962, "grad_norm": 0.2246314436197281, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.821705624461174, "num_tokens": 656604785.0, "step": 22835 }, { "entropy": 0.5364978343248368, "epoch": 1.711675695171448, "grad_norm": 0.23204444348812103, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.8223279573023319, "num_tokens": 658105834.0, "step": 22840 }, { "entropy": 0.5211771447211504, "epoch": 1.7120504191052999, "grad_norm": 0.2303294986486435, "learning_rate": 0.0002, "loss": 0.6028, "mean_token_accuracy": 0.8253537703305482, "num_tokens": 659546318.0, "step": 22845 }, { "entropy": 0.5292529517784714, "epoch": 1.7124251430391517, "grad_norm": 0.2180705964565277, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8256280217319727, "num_tokens": 661037098.0, "step": 22850 }, { "entropy": 0.5526520505547523, "epoch": 1.7127998669730036, "grad_norm": 0.2205137312412262, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.8215926144272089, "num_tokens": 662519788.0, "step": 22855 }, { "entropy": 0.533359669148922, "epoch": 1.7131745909068554, "grad_norm": 0.2274995744228363, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.8277448914945126, "num_tokens": 663973375.0, "step": 22860 }, { "entropy": 0.5355330741032958, "epoch": 1.7135493148407073, "grad_norm": 0.2562617063522339, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.8267214607447386, "num_tokens": 665392861.0, "step": 22865 }, { "entropy": 0.5513211030513048, "epoch": 1.7139240387745591, "grad_norm": 0.22815215587615967, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.8240111481398344, "num_tokens": 666832117.0, "step": 22870 }, { "entropy": 0.5690052112564444, "epoch": 1.714298762708411, "grad_norm": 0.2277127057313919, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8211011547595263, "num_tokens": 668313990.0, "step": 22875 }, { "entropy": 0.5740903960540891, "epoch": 1.7146734866422628, "grad_norm": 0.21924486756324768, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.8233008816838264, "num_tokens": 669835714.0, "step": 22880 }, { "entropy": 0.596241832524538, "epoch": 1.7150482105761147, "grad_norm": 0.2549746334552765, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8230196613818407, "num_tokens": 671306891.0, "step": 22885 }, { "entropy": 0.5743513630703092, "epoch": 1.7154229345099665, "grad_norm": 0.24999307096004486, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.8264520723372698, "num_tokens": 672808899.0, "step": 22890 }, { "entropy": 0.5803443187847733, "epoch": 1.7157976584438184, "grad_norm": 0.21809503436088562, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.8226394180208445, "num_tokens": 674259822.0, "step": 22895 }, { "entropy": 0.5651140388101339, "epoch": 1.7161723823776702, "grad_norm": 0.25011953711509705, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.8219268906861543, "num_tokens": 675752752.0, "step": 22900 }, { "entropy": 0.5614835858345032, "epoch": 1.716547106311522, "grad_norm": 0.255839079618454, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.8237041871994734, "num_tokens": 677206014.0, "step": 22905 }, { "entropy": 0.5512228401377797, "epoch": 1.716921830245374, "grad_norm": 0.2635680139064789, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.8241624925285578, "num_tokens": 678695750.0, "step": 22910 }, { "entropy": 0.5778043732047081, "epoch": 1.7172965541792258, "grad_norm": 0.22888216376304626, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8200137067586184, "num_tokens": 680177528.0, "step": 22915 }, { "entropy": 0.5688441877253354, "epoch": 1.7176712781130776, "grad_norm": 0.2615741193294525, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8269883837550879, "num_tokens": 681630410.0, "step": 22920 }, { "entropy": 0.5839952483773232, "epoch": 1.7180460020469295, "grad_norm": 0.24220895767211914, "learning_rate": 0.0002, "loss": 0.6406, "mean_token_accuracy": 0.820214281976223, "num_tokens": 683093177.0, "step": 22925 }, { "entropy": 0.5706031698733568, "epoch": 1.7184207259807813, "grad_norm": 0.24721774458885193, "learning_rate": 0.0002, "loss": 0.6288, "mean_token_accuracy": 0.8221920009702444, "num_tokens": 684597910.0, "step": 22930 }, { "entropy": 0.5837050955742598, "epoch": 1.7187954499146332, "grad_norm": 0.27576956152915955, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8193539135158062, "num_tokens": 686093481.0, "step": 22935 }, { "entropy": 0.5549495261162519, "epoch": 1.719170173848485, "grad_norm": 0.23146814107894897, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8284423820674419, "num_tokens": 687541507.0, "step": 22940 }, { "entropy": 0.5552992498502135, "epoch": 1.7195448977823369, "grad_norm": 0.24029071629047394, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8284046281129122, "num_tokens": 689004983.0, "step": 22945 }, { "entropy": 0.5655665706843138, "epoch": 1.7199196217161887, "grad_norm": 0.24131415784358978, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8261512957513333, "num_tokens": 690461200.0, "step": 22950 }, { "entropy": 0.5579219900071621, "epoch": 1.7202943456500406, "grad_norm": 0.23918119072914124, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8258570525795221, "num_tokens": 691877665.0, "step": 22955 }, { "entropy": 0.5544588817283511, "epoch": 1.7206690695838924, "grad_norm": 0.23530548810958862, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8240240391343832, "num_tokens": 693357109.0, "step": 22960 }, { "entropy": 0.5529450517147779, "epoch": 1.7210437935177443, "grad_norm": 0.29019424319267273, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8210900820791721, "num_tokens": 694824415.0, "step": 22965 }, { "entropy": 0.5589935829862952, "epoch": 1.7214185174515961, "grad_norm": 0.2228519320487976, "learning_rate": 0.0002, "loss": 0.6173, "mean_token_accuracy": 0.8235544990748167, "num_tokens": 696300213.0, "step": 22970 }, { "entropy": 0.5608569666743278, "epoch": 1.721793241385448, "grad_norm": 0.2398669719696045, "learning_rate": 0.0002, "loss": 0.648, "mean_token_accuracy": 0.8198130697011947, "num_tokens": 697788087.0, "step": 22975 }, { "entropy": 0.5535319272428751, "epoch": 1.7221679653192998, "grad_norm": 0.24410706758499146, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8244892872869969, "num_tokens": 699227209.0, "step": 22980 }, { "entropy": 0.5463955957442522, "epoch": 1.7225426892531517, "grad_norm": 0.23123563826084137, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.8264918144792318, "num_tokens": 700661414.0, "step": 22985 }, { "entropy": 0.5528108341619372, "epoch": 1.7229174131870035, "grad_norm": 0.2511530816555023, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.8212541207671166, "num_tokens": 702139852.0, "step": 22990 }, { "entropy": 0.5281540609896183, "epoch": 1.7232921371208554, "grad_norm": 0.2181536704301834, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.8244669452309609, "num_tokens": 703606904.0, "step": 22995 }, { "entropy": 0.5472357148304582, "epoch": 1.7236668610547072, "grad_norm": 0.22327759861946106, "learning_rate": 0.0002, "loss": 0.6344, "mean_token_accuracy": 0.8205209333449602, "num_tokens": 705049096.0, "step": 23000 }, { "entropy": 0.5496057203039527, "epoch": 1.724041584988559, "grad_norm": 0.23897269368171692, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.8225245714187622, "num_tokens": 706498343.0, "step": 23005 }, { "entropy": 0.545398616604507, "epoch": 1.724416308922411, "grad_norm": 0.2174462080001831, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.8238134529441595, "num_tokens": 707952690.0, "step": 23010 }, { "entropy": 0.5420893342234194, "epoch": 1.7247910328562628, "grad_norm": 0.22505885362625122, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.8254637435078621, "num_tokens": 709380329.0, "step": 23015 }, { "entropy": 0.5516165439039469, "epoch": 1.7251657567901146, "grad_norm": 0.21584543585777283, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.8216937847435475, "num_tokens": 710823163.0, "step": 23020 }, { "entropy": 0.5448170155286789, "epoch": 1.7255404807239665, "grad_norm": 0.2485368698835373, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8229444965720176, "num_tokens": 712300195.0, "step": 23025 }, { "entropy": 0.5503126744180917, "epoch": 1.7259152046578183, "grad_norm": 0.24698282778263092, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.826753631234169, "num_tokens": 713768903.0, "step": 23030 }, { "entropy": 0.5371586490422487, "epoch": 1.7262899285916704, "grad_norm": 0.21156872808933258, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8283305909484625, "num_tokens": 715240630.0, "step": 23035 }, { "entropy": 0.5330006528645754, "epoch": 1.7266646525255223, "grad_norm": 0.2130710780620575, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8250524219125509, "num_tokens": 716715838.0, "step": 23040 }, { "entropy": 0.5468152955174446, "epoch": 1.727039376459374, "grad_norm": 0.20201709866523743, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8242831073701382, "num_tokens": 718247144.0, "step": 23045 }, { "entropy": 0.5459514787420631, "epoch": 1.727414100393226, "grad_norm": 0.23299500346183777, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.8234732676297426, "num_tokens": 719683338.0, "step": 23050 }, { "entropy": 0.5382135825231671, "epoch": 1.7277888243270778, "grad_norm": 0.2391936480998993, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.8265488013625145, "num_tokens": 721104208.0, "step": 23055 }, { "entropy": 0.5514962555840611, "epoch": 1.7281635482609297, "grad_norm": 0.2519688606262207, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8205960538238287, "num_tokens": 722541271.0, "step": 23060 }, { "entropy": 0.5541705094277859, "epoch": 1.7285382721947815, "grad_norm": 0.20843406021595, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8240409627556801, "num_tokens": 724044888.0, "step": 23065 }, { "entropy": 0.5565432529896498, "epoch": 1.7289129961286334, "grad_norm": 0.22625529766082764, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8209947526454926, "num_tokens": 725529253.0, "step": 23070 }, { "entropy": 0.5430802505463361, "epoch": 1.7292877200624852, "grad_norm": 0.24765276908874512, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8232810743153095, "num_tokens": 727017583.0, "step": 23075 }, { "entropy": 0.5235096596181392, "epoch": 1.729662443996337, "grad_norm": 0.25091788172721863, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.8320063348859549, "num_tokens": 728450371.0, "step": 23080 }, { "entropy": 0.5381023053079843, "epoch": 1.730037167930189, "grad_norm": 0.24194218218326569, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8271087348461151, "num_tokens": 729890645.0, "step": 23085 }, { "entropy": 0.5326255686581135, "epoch": 1.7304118918640408, "grad_norm": 0.23454095423221588, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.82598250284791, "num_tokens": 731322401.0, "step": 23090 }, { "entropy": 0.565023492835462, "epoch": 1.7307866157978926, "grad_norm": 0.22487950325012207, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.8247860964387655, "num_tokens": 732830317.0, "step": 23095 }, { "entropy": 0.5850747719407081, "epoch": 1.7311613397317445, "grad_norm": 0.284723699092865, "learning_rate": 0.0002, "loss": 0.6455, "mean_token_accuracy": 0.820859682559967, "num_tokens": 734302688.0, "step": 23100 }, { "entropy": 0.5576442077755928, "epoch": 1.7315360636655963, "grad_norm": 0.22585289180278778, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.8254363667219877, "num_tokens": 735731515.0, "step": 23105 }, { "entropy": 0.562561658769846, "epoch": 1.7319107875994482, "grad_norm": 0.23439595103263855, "learning_rate": 0.0002, "loss": 0.6345, "mean_token_accuracy": 0.8224814645946026, "num_tokens": 737223606.0, "step": 23110 }, { "entropy": 0.5281252469867468, "epoch": 1.7322855115333002, "grad_norm": 0.24670056998729706, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.8310920599848032, "num_tokens": 738650680.0, "step": 23115 }, { "entropy": 0.5568367548286914, "epoch": 1.732660235467152, "grad_norm": 0.2613934874534607, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8262316063046455, "num_tokens": 740084404.0, "step": 23120 }, { "entropy": 0.5687573865056038, "epoch": 1.733034959401004, "grad_norm": 0.24166779220104218, "learning_rate": 0.0002, "loss": 0.6327, "mean_token_accuracy": 0.8200013130903244, "num_tokens": 741549236.0, "step": 23125 }, { "entropy": 0.556380058452487, "epoch": 1.7334096833348558, "grad_norm": 0.23670771718025208, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8272006940096617, "num_tokens": 743006716.0, "step": 23130 }, { "entropy": 0.5773618396371603, "epoch": 1.7337844072687076, "grad_norm": 0.2608295679092407, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8255649730563164, "num_tokens": 744475120.0, "step": 23135 }, { "entropy": 0.5819599019363523, "epoch": 1.7341591312025595, "grad_norm": 0.25775346159935, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8238953679800034, "num_tokens": 745933218.0, "step": 23140 }, { "entropy": 0.5536993881687522, "epoch": 1.7345338551364113, "grad_norm": 0.3044836223125458, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.8246550299227238, "num_tokens": 747417744.0, "step": 23145 }, { "entropy": 0.5556094393134117, "epoch": 1.7349085790702632, "grad_norm": 0.2580968737602234, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8251174882054328, "num_tokens": 748869626.0, "step": 23150 }, { "entropy": 0.5621109766885638, "epoch": 1.735283303004115, "grad_norm": 0.2309723198413849, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.8222654197365046, "num_tokens": 750353912.0, "step": 23155 }, { "entropy": 0.5491555797867477, "epoch": 1.735658026937967, "grad_norm": 0.2229190468788147, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.8252888660877943, "num_tokens": 751791610.0, "step": 23160 }, { "entropy": 0.5601950848475099, "epoch": 1.7360327508718187, "grad_norm": 0.24852344393730164, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8257499732077122, "num_tokens": 753204259.0, "step": 23165 }, { "entropy": 0.5571356447413564, "epoch": 1.7364074748056706, "grad_norm": 0.2886582911014557, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8214308768510818, "num_tokens": 754637261.0, "step": 23170 }, { "entropy": 0.578379643894732, "epoch": 1.7367821987395224, "grad_norm": 0.2364957481622696, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.8197326131165028, "num_tokens": 756177053.0, "step": 23175 }, { "entropy": 0.5564591519534587, "epoch": 1.7371569226733743, "grad_norm": 0.2459242045879364, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.8254019431769848, "num_tokens": 757658889.0, "step": 23180 }, { "entropy": 0.5586902417242527, "epoch": 1.7375316466072261, "grad_norm": 0.2261529564857483, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.8231428436934948, "num_tokens": 759103467.0, "step": 23185 }, { "entropy": 0.5712322415784001, "epoch": 1.737906370541078, "grad_norm": 0.25696861743927, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.821765835955739, "num_tokens": 760560359.0, "step": 23190 }, { "entropy": 0.5749824872240424, "epoch": 1.7382810944749298, "grad_norm": 0.25367802381515503, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8251812577247619, "num_tokens": 762022389.0, "step": 23195 }, { "entropy": 0.5820931358262896, "epoch": 1.7386558184087817, "grad_norm": 0.22421488165855408, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8233275789767504, "num_tokens": 763454752.0, "step": 23200 }, { "entropy": 0.5896379644051194, "epoch": 1.7390305423426335, "grad_norm": 0.2115163505077362, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.8211043477058411, "num_tokens": 764900022.0, "step": 23205 }, { "entropy": 0.5881080098450184, "epoch": 1.7394052662764854, "grad_norm": 0.30208292603492737, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8269662816077471, "num_tokens": 766325694.0, "step": 23210 }, { "entropy": 0.5778294580057264, "epoch": 1.7397799902103372, "grad_norm": 0.22817350924015045, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.8259030401706695, "num_tokens": 767771769.0, "step": 23215 }, { "entropy": 0.5510360963642598, "epoch": 1.740154714144189, "grad_norm": 0.2375863641500473, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.8257479708641767, "num_tokens": 769227216.0, "step": 23220 }, { "entropy": 0.5554939886555076, "epoch": 1.740529438078041, "grad_norm": 0.23148612678050995, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8300493702292442, "num_tokens": 770664615.0, "step": 23225 }, { "entropy": 0.5528979610651732, "epoch": 1.7409041620118928, "grad_norm": 0.2396923005580902, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.8247740097343922, "num_tokens": 772176238.0, "step": 23230 }, { "entropy": 0.5647343922406435, "epoch": 1.7412788859457446, "grad_norm": 0.2643921375274658, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8218872170895338, "num_tokens": 773655383.0, "step": 23235 }, { "entropy": 0.5686667483299971, "epoch": 1.7416536098795965, "grad_norm": 0.2791277766227722, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8298863649368287, "num_tokens": 775125184.0, "step": 23240 }, { "entropy": 0.5755292339250445, "epoch": 1.7420283338134483, "grad_norm": 0.2104724645614624, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.8253238689154386, "num_tokens": 776616366.0, "step": 23245 }, { "entropy": 0.5672469763085246, "epoch": 1.7424030577473002, "grad_norm": 0.22848308086395264, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.8237776841968298, "num_tokens": 778089344.0, "step": 23250 }, { "entropy": 0.5518207609653473, "epoch": 1.742777781681152, "grad_norm": 0.2810569703578949, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.8227972436696291, "num_tokens": 779578765.0, "step": 23255 }, { "entropy": 0.583913960121572, "epoch": 1.743152505615004, "grad_norm": 0.24686621129512787, "learning_rate": 0.0002, "loss": 0.6397, "mean_token_accuracy": 0.817936310917139, "num_tokens": 781077586.0, "step": 23260 }, { "entropy": 0.57016531676054, "epoch": 1.7435272295488558, "grad_norm": 0.28519323468208313, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8220376078039408, "num_tokens": 782540138.0, "step": 23265 }, { "entropy": 0.5727458667010069, "epoch": 1.7439019534827076, "grad_norm": 0.24340322613716125, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.8228148475289345, "num_tokens": 784046187.0, "step": 23270 }, { "entropy": 0.5847227269783616, "epoch": 1.7442766774165595, "grad_norm": 0.22692149877548218, "learning_rate": 0.0002, "loss": 0.6414, "mean_token_accuracy": 0.8223953668028117, "num_tokens": 785505173.0, "step": 23275 }, { "entropy": 0.5589459782466293, "epoch": 1.7446514013504113, "grad_norm": 0.23459434509277344, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.8254351705312729, "num_tokens": 787000868.0, "step": 23280 }, { "entropy": 0.5890154713764787, "epoch": 1.7450261252842632, "grad_norm": 0.22534801065921783, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.8174033708870411, "num_tokens": 788490734.0, "step": 23285 }, { "entropy": 0.5691613737493754, "epoch": 1.745400849218115, "grad_norm": 0.22559259831905365, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8230052288621664, "num_tokens": 789985141.0, "step": 23290 }, { "entropy": 0.5589027799665928, "epoch": 1.7457755731519669, "grad_norm": 0.2194080948829651, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.8283052757382393, "num_tokens": 791451378.0, "step": 23295 }, { "entropy": 0.5638531817123293, "epoch": 1.7461502970858187, "grad_norm": 0.23981130123138428, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.8279987577348947, "num_tokens": 792911088.0, "step": 23300 }, { "entropy": 0.5651208070106805, "epoch": 1.7465250210196706, "grad_norm": 0.22965458035469055, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8243585117161274, "num_tokens": 794396140.0, "step": 23305 }, { "entropy": 0.5507726561278105, "epoch": 1.7468997449535224, "grad_norm": 0.23572853207588196, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.8245080899447202, "num_tokens": 795868316.0, "step": 23310 }, { "entropy": 0.533110124990344, "epoch": 1.7472744688873743, "grad_norm": 0.2669636309146881, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.8278415940701962, "num_tokens": 797308433.0, "step": 23315 }, { "entropy": 0.5503396589308978, "epoch": 1.747649192821226, "grad_norm": 0.2548346221446991, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.8245221566408872, "num_tokens": 798793272.0, "step": 23320 }, { "entropy": 0.5466241294518113, "epoch": 1.748023916755078, "grad_norm": 0.25700387358665466, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.8262919500470162, "num_tokens": 800300738.0, "step": 23325 }, { "entropy": 0.5400391601026058, "epoch": 1.7483986406889298, "grad_norm": 0.2349686324596405, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.824942072853446, "num_tokens": 801752484.0, "step": 23330 }, { "entropy": 0.5515965176746249, "epoch": 1.7487733646227817, "grad_norm": 0.3055219352245331, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.8216666661202907, "num_tokens": 803226875.0, "step": 23335 }, { "entropy": 0.5674604557454586, "epoch": 1.7491480885566335, "grad_norm": 0.23274420201778412, "learning_rate": 0.0002, "loss": 0.6359, "mean_token_accuracy": 0.8215732119977475, "num_tokens": 804723562.0, "step": 23340 }, { "entropy": 0.5695261670276522, "epoch": 1.7495228124904856, "grad_norm": 0.2049880176782608, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.8211966097354889, "num_tokens": 806208326.0, "step": 23345 }, { "entropy": 0.5712376939132809, "epoch": 1.7498975364243374, "grad_norm": 0.2442905753850937, "learning_rate": 0.0002, "loss": 0.6297, "mean_token_accuracy": 0.8223819505423308, "num_tokens": 807669970.0, "step": 23350 }, { "entropy": 0.550709524191916, "epoch": 1.7502722603581893, "grad_norm": 0.21296492218971252, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.8274991314858198, "num_tokens": 809176055.0, "step": 23355 }, { "entropy": 0.540094515029341, "epoch": 1.7506469842920411, "grad_norm": 0.26965048909187317, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.8278691656887531, "num_tokens": 810647124.0, "step": 23360 }, { "entropy": 0.5329783701337873, "epoch": 1.751021708225893, "grad_norm": 0.23982495069503784, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.8287832528352738, "num_tokens": 812140744.0, "step": 23365 }, { "entropy": 0.5491590669378639, "epoch": 1.7513964321597448, "grad_norm": 0.2304847240447998, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8245989114046097, "num_tokens": 813571346.0, "step": 23370 }, { "entropy": 0.5631014190614223, "epoch": 1.7517711560935967, "grad_norm": 0.2260044515132904, "learning_rate": 0.0002, "loss": 0.6328, "mean_token_accuracy": 0.8223288349807263, "num_tokens": 815009259.0, "step": 23375 }, { "entropy": 0.5644070478156209, "epoch": 1.7521458800274485, "grad_norm": 0.26732757687568665, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.824132290109992, "num_tokens": 816464689.0, "step": 23380 }, { "entropy": 0.5648545191623271, "epoch": 1.7525206039613004, "grad_norm": 0.2306598275899887, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.8242606718093157, "num_tokens": 817919964.0, "step": 23385 }, { "entropy": 0.5703352324664592, "epoch": 1.7528953278951522, "grad_norm": 0.2543081045150757, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.824062418192625, "num_tokens": 819396824.0, "step": 23390 }, { "entropy": 0.5878957321867346, "epoch": 1.753270051829004, "grad_norm": 0.21854037046432495, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.8197724346071482, "num_tokens": 820881779.0, "step": 23395 }, { "entropy": 0.5682462589815259, "epoch": 1.753644775762856, "grad_norm": 0.3470096290111542, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8247608702629805, "num_tokens": 822354356.0, "step": 23400 }, { "entropy": 0.5647120757028461, "epoch": 1.7540194996967078, "grad_norm": 0.2216251790523529, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.8280881267040968, "num_tokens": 823820929.0, "step": 23405 }, { "entropy": 0.5635066222399473, "epoch": 1.7543942236305596, "grad_norm": 0.30222854018211365, "learning_rate": 0.0002, "loss": 0.6028, "mean_token_accuracy": 0.8280680861324072, "num_tokens": 825264993.0, "step": 23410 }, { "entropy": 0.5887393461540341, "epoch": 1.7547689475644115, "grad_norm": 0.21309718489646912, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.8201029915362597, "num_tokens": 826762957.0, "step": 23415 }, { "entropy": 0.5644157666712999, "epoch": 1.7551436714982633, "grad_norm": 0.25832831859588623, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8241861168295145, "num_tokens": 828199709.0, "step": 23420 }, { "entropy": 0.5652311634272337, "epoch": 1.7555183954321154, "grad_norm": 0.2273329496383667, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.8237191654741765, "num_tokens": 829712290.0, "step": 23425 }, { "entropy": 0.568776517175138, "epoch": 1.7558931193659673, "grad_norm": 0.2895629405975342, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8242160804569721, "num_tokens": 831151373.0, "step": 23430 }, { "entropy": 0.5628887930884957, "epoch": 1.756267843299819, "grad_norm": 0.2602870464324951, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.8262964934110641, "num_tokens": 832600695.0, "step": 23435 }, { "entropy": 0.5533483795821666, "epoch": 1.756642567233671, "grad_norm": 0.23766210675239563, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.8243533365428448, "num_tokens": 834083149.0, "step": 23440 }, { "entropy": 0.5549366446211934, "epoch": 1.7570172911675228, "grad_norm": 0.23970593512058258, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8248516090214253, "num_tokens": 835507518.0, "step": 23445 }, { "entropy": 0.5586098704487086, "epoch": 1.7573920151013747, "grad_norm": 0.23157550394535065, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8223621185868979, "num_tokens": 836984085.0, "step": 23450 }, { "entropy": 0.5712758300825953, "epoch": 1.7577667390352265, "grad_norm": 0.22599123418331146, "learning_rate": 0.0002, "loss": 0.6326, "mean_token_accuracy": 0.8203539874404668, "num_tokens": 838453070.0, "step": 23455 }, { "entropy": 0.5680307552218438, "epoch": 1.7581414629690784, "grad_norm": 0.2326681762933731, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.820654546096921, "num_tokens": 839959966.0, "step": 23460 }, { "entropy": 0.5618850491940975, "epoch": 1.7585161869029302, "grad_norm": 0.22316046059131622, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8223292108625173, "num_tokens": 841439474.0, "step": 23465 }, { "entropy": 0.5471236675977706, "epoch": 1.758890910836782, "grad_norm": 0.2633689045906067, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8248823061585426, "num_tokens": 842903956.0, "step": 23470 }, { "entropy": 0.5643752722069622, "epoch": 1.759265634770634, "grad_norm": 0.3302637040615082, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8224990647286177, "num_tokens": 844335796.0, "step": 23475 }, { "entropy": 0.5725280513986946, "epoch": 1.7596403587044858, "grad_norm": 0.29462119936943054, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8249887235462665, "num_tokens": 845793132.0, "step": 23480 }, { "entropy": 0.5693760499358177, "epoch": 1.7600150826383376, "grad_norm": 0.21500681340694427, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.8291139956563711, "num_tokens": 847260981.0, "step": 23485 }, { "entropy": 0.5768402667716146, "epoch": 1.7603898065721895, "grad_norm": 0.4026014804840088, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8247029181569815, "num_tokens": 848691283.0, "step": 23490 }, { "entropy": 0.553422749042511, "epoch": 1.7607645305060413, "grad_norm": 0.3334931433200836, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.8273367349058389, "num_tokens": 850156935.0, "step": 23495 }, { "entropy": 0.5678861947730184, "epoch": 1.7611392544398932, "grad_norm": 0.2706768810749054, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.8264477271586657, "num_tokens": 851612335.0, "step": 23500 }, { "entropy": 0.5702185375615955, "epoch": 1.761513978373745, "grad_norm": 0.2830354869365692, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.8250342041254044, "num_tokens": 853050497.0, "step": 23505 }, { "entropy": 0.5758253103122115, "epoch": 1.7618887023075969, "grad_norm": 0.23302708566188812, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.8243147507309914, "num_tokens": 854520277.0, "step": 23510 }, { "entropy": 0.5648835349828005, "epoch": 1.7622634262414487, "grad_norm": 0.24410341680049896, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.8277772027999163, "num_tokens": 855955816.0, "step": 23515 }, { "entropy": 0.5846830381080508, "epoch": 1.7626381501753006, "grad_norm": 0.21819187700748444, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8220293566584587, "num_tokens": 857405666.0, "step": 23520 }, { "entropy": 0.5818465199321509, "epoch": 1.7630128741091524, "grad_norm": 0.22687208652496338, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.823760075867176, "num_tokens": 858872226.0, "step": 23525 }, { "entropy": 0.569314694777131, "epoch": 1.7633875980430043, "grad_norm": 0.2202397584915161, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.8240256130695343, "num_tokens": 860308657.0, "step": 23530 }, { "entropy": 0.5737431086599827, "epoch": 1.7637623219768561, "grad_norm": 0.2782852053642273, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8252922829240561, "num_tokens": 861780480.0, "step": 23535 }, { "entropy": 0.5714845784008503, "epoch": 1.764137045910708, "grad_norm": 0.216018408536911, "learning_rate": 0.0002, "loss": 0.6347, "mean_token_accuracy": 0.821528572589159, "num_tokens": 863227999.0, "step": 23540 }, { "entropy": 0.5761340154334903, "epoch": 1.7645117698445598, "grad_norm": 0.2500261664390564, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.8213910169899463, "num_tokens": 864698197.0, "step": 23545 }, { "entropy": 0.5868143940344452, "epoch": 1.7648864937784117, "grad_norm": 0.24101482331752777, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.8209796916693449, "num_tokens": 866202292.0, "step": 23550 }, { "entropy": 0.5770668778568506, "epoch": 1.7652612177122635, "grad_norm": 0.2131914496421814, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8260013438761235, "num_tokens": 867701230.0, "step": 23555 }, { "entropy": 0.5697152785956859, "epoch": 1.7656359416461154, "grad_norm": 0.2391759753227234, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.8255291637033224, "num_tokens": 869181274.0, "step": 23560 }, { "entropy": 0.5816537968814373, "epoch": 1.7660106655799672, "grad_norm": 0.23290690779685974, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.824410654231906, "num_tokens": 870669091.0, "step": 23565 }, { "entropy": 0.5692334279417992, "epoch": 1.766385389513819, "grad_norm": 0.2425345629453659, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.8280848953872919, "num_tokens": 872113563.0, "step": 23570 }, { "entropy": 0.568619285337627, "epoch": 1.766760113447671, "grad_norm": 0.2297676056623459, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8244746875017881, "num_tokens": 873589217.0, "step": 23575 }, { "entropy": 0.561399034038186, "epoch": 1.7671348373815228, "grad_norm": 0.26855960488319397, "learning_rate": 0.0002, "loss": 0.5928, "mean_token_accuracy": 0.828380062058568, "num_tokens": 875017804.0, "step": 23580 }, { "entropy": 0.5835052404552699, "epoch": 1.7675095613153746, "grad_norm": 0.21114753186702728, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8193473681807518, "num_tokens": 876493588.0, "step": 23585 }, { "entropy": 0.5713206170126796, "epoch": 1.7678842852492265, "grad_norm": 0.2635127604007721, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.827380470559001, "num_tokens": 877972863.0, "step": 23590 }, { "entropy": 0.5601808026432991, "epoch": 1.7682590091830783, "grad_norm": 0.2541191875934601, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8254760406911373, "num_tokens": 879420577.0, "step": 23595 }, { "entropy": 0.5694099687039852, "epoch": 1.7686337331169302, "grad_norm": 0.24810156226158142, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.8224047254770994, "num_tokens": 880877237.0, "step": 23600 }, { "entropy": 0.5722340852022171, "epoch": 1.769008457050782, "grad_norm": 0.24651174247264862, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8231909953057766, "num_tokens": 882294228.0, "step": 23605 }, { "entropy": 0.5724385540932417, "epoch": 1.7693831809846339, "grad_norm": 0.23015886545181274, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.8257162895053625, "num_tokens": 883765134.0, "step": 23610 }, { "entropy": 0.5954114468768239, "epoch": 1.7697579049184857, "grad_norm": 0.22723840177059174, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.8222298409789801, "num_tokens": 885220034.0, "step": 23615 }, { "entropy": 0.611225675791502, "epoch": 1.7701326288523376, "grad_norm": 0.2386978268623352, "learning_rate": 0.0002, "loss": 0.6397, "mean_token_accuracy": 0.8174357667565346, "num_tokens": 886716919.0, "step": 23620 }, { "entropy": 0.5862408069893718, "epoch": 1.7705073527861894, "grad_norm": 0.26315411925315857, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8255360707640648, "num_tokens": 888126847.0, "step": 23625 }, { "entropy": 0.5810665871948004, "epoch": 1.7708820767200413, "grad_norm": 0.22673754394054413, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.827130489423871, "num_tokens": 889553541.0, "step": 23630 }, { "entropy": 0.6087362755089998, "epoch": 1.7712568006538931, "grad_norm": 0.2222709357738495, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8207958679646253, "num_tokens": 891040862.0, "step": 23635 }, { "entropy": 0.5821422949433327, "epoch": 1.771631524587745, "grad_norm": 0.23133307695388794, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8256812307983636, "num_tokens": 892552290.0, "step": 23640 }, { "entropy": 0.617080757021904, "epoch": 1.7720062485215968, "grad_norm": 0.23389944434165955, "learning_rate": 0.0002, "loss": 0.6449, "mean_token_accuracy": 0.8199798051267863, "num_tokens": 894018506.0, "step": 23645 }, { "entropy": 0.6063617410138249, "epoch": 1.7723809724554487, "grad_norm": 0.22161400318145752, "learning_rate": 0.0002, "loss": 0.6278, "mean_token_accuracy": 0.82301328741014, "num_tokens": 895454424.0, "step": 23650 }, { "entropy": 0.5786415020003914, "epoch": 1.7727556963893008, "grad_norm": 0.25108635425567627, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8253698475658894, "num_tokens": 896958272.0, "step": 23655 }, { "entropy": 0.6030763799324632, "epoch": 1.7731304203231526, "grad_norm": 0.3230302035808563, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.8218886785209178, "num_tokens": 898412614.0, "step": 23660 }, { "entropy": 0.578519719466567, "epoch": 1.7735051442570045, "grad_norm": 0.22929547727108002, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.8261943072080612, "num_tokens": 899797600.0, "step": 23665 }, { "entropy": 0.5740701070055365, "epoch": 1.7738798681908563, "grad_norm": 0.2321326732635498, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.8266204420477152, "num_tokens": 901262534.0, "step": 23670 }, { "entropy": 0.5728446887806058, "epoch": 1.7742545921247082, "grad_norm": 0.21838873624801636, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8229261588305234, "num_tokens": 902732810.0, "step": 23675 }, { "entropy": 0.5693954614922404, "epoch": 1.77462931605856, "grad_norm": 0.226511150598526, "learning_rate": 0.0002, "loss": 0.6264, "mean_token_accuracy": 0.8230946887284517, "num_tokens": 904214607.0, "step": 23680 }, { "entropy": 0.5758426457643508, "epoch": 1.7750040399924119, "grad_norm": 0.24428488314151764, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8234905611723662, "num_tokens": 905707182.0, "step": 23685 }, { "entropy": 0.586402747221291, "epoch": 1.7753787639262637, "grad_norm": 0.2368273288011551, "learning_rate": 0.0002, "loss": 0.6345, "mean_token_accuracy": 0.8212953340262175, "num_tokens": 907208484.0, "step": 23690 }, { "entropy": 0.5944644281640649, "epoch": 1.7757534878601156, "grad_norm": 0.26568278670310974, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.8232542257755995, "num_tokens": 908627541.0, "step": 23695 }, { "entropy": 0.6069883171468973, "epoch": 1.7761282117939674, "grad_norm": 0.2503940463066101, "learning_rate": 0.0002, "loss": 0.6438, "mean_token_accuracy": 0.8194884702563285, "num_tokens": 910145753.0, "step": 23700 }, { "entropy": 0.579868421703577, "epoch": 1.7765029357278193, "grad_norm": 0.23175930976867676, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8220538765192031, "num_tokens": 911632712.0, "step": 23705 }, { "entropy": 0.5756962640210986, "epoch": 1.776877659661671, "grad_norm": 0.24934741854667664, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.826616583764553, "num_tokens": 913091974.0, "step": 23710 }, { "entropy": 0.5771831389516592, "epoch": 1.777252383595523, "grad_norm": 0.23778696358203888, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.8252131048589945, "num_tokens": 914577780.0, "step": 23715 }, { "entropy": 0.5894535524770618, "epoch": 1.7776271075293748, "grad_norm": 0.2335469126701355, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8211797680705786, "num_tokens": 916010568.0, "step": 23720 }, { "entropy": 0.5695085348561406, "epoch": 1.7780018314632267, "grad_norm": 0.24377436935901642, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.8286249008029699, "num_tokens": 917436029.0, "step": 23725 }, { "entropy": 0.5904012359678745, "epoch": 1.7783765553970785, "grad_norm": 0.2726905941963196, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8199192188680172, "num_tokens": 918913343.0, "step": 23730 }, { "entropy": 0.5851827168837189, "epoch": 1.7787512793309306, "grad_norm": 0.26016709208488464, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.8264689806848764, "num_tokens": 920358916.0, "step": 23735 }, { "entropy": 0.5727634703740477, "epoch": 1.7791260032647824, "grad_norm": 0.24139676988124847, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.827617396786809, "num_tokens": 921796668.0, "step": 23740 }, { "entropy": 0.590568957477808, "epoch": 1.7795007271986343, "grad_norm": 0.25615161657333374, "learning_rate": 0.0002, "loss": 0.6344, "mean_token_accuracy": 0.8220448270440102, "num_tokens": 923256920.0, "step": 23745 }, { "entropy": 0.5961750220507384, "epoch": 1.7798754511324861, "grad_norm": 0.2235218584537506, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.825799323990941, "num_tokens": 924706810.0, "step": 23750 }, { "entropy": 0.6245705815032124, "epoch": 1.780250175066338, "grad_norm": 0.25666630268096924, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8223139982670545, "num_tokens": 926175794.0, "step": 23755 }, { "entropy": 0.6232563244178891, "epoch": 1.7806248990001898, "grad_norm": 0.28141796588897705, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8256110079586506, "num_tokens": 927625471.0, "step": 23760 }, { "entropy": 0.6015581518411637, "epoch": 1.7809996229340417, "grad_norm": 0.23205554485321045, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.8262887589633465, "num_tokens": 929096649.0, "step": 23765 }, { "entropy": 0.571490989997983, "epoch": 1.7813743468678935, "grad_norm": 0.23109765350818634, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.8265662629157304, "num_tokens": 930533474.0, "step": 23770 }, { "entropy": 0.595748458430171, "epoch": 1.7817490708017454, "grad_norm": 0.2468220293521881, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.821284606307745, "num_tokens": 932004228.0, "step": 23775 }, { "entropy": 0.5885513139888644, "epoch": 1.7821237947355972, "grad_norm": 0.24490945041179657, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8234316542744636, "num_tokens": 933492779.0, "step": 23780 }, { "entropy": 0.5954612726345658, "epoch": 1.782498518669449, "grad_norm": 0.262428879737854, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.8248866122215986, "num_tokens": 934972072.0, "step": 23785 }, { "entropy": 0.6001560056582094, "epoch": 1.782873242603301, "grad_norm": 0.232773095369339, "learning_rate": 0.0002, "loss": 0.615, "mean_token_accuracy": 0.8243065498769283, "num_tokens": 936462017.0, "step": 23790 }, { "entropy": 0.5990530587732792, "epoch": 1.7832479665371528, "grad_norm": 0.2781904637813568, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8241605993360281, "num_tokens": 937936821.0, "step": 23795 }, { "entropy": 0.6003855746239424, "epoch": 1.7836226904710046, "grad_norm": 0.23045632243156433, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.823141534999013, "num_tokens": 939455572.0, "step": 23800 }, { "entropy": 0.5937170557677746, "epoch": 1.7839974144048565, "grad_norm": 0.24207574129104614, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8241193041205406, "num_tokens": 940946114.0, "step": 23805 }, { "entropy": 0.5864269454032183, "epoch": 1.7843721383387083, "grad_norm": 0.2525002062320709, "learning_rate": 0.0002, "loss": 0.6125, "mean_token_accuracy": 0.8258799772709609, "num_tokens": 942398769.0, "step": 23810 }, { "entropy": 0.5902386967092752, "epoch": 1.7847468622725602, "grad_norm": 0.5499678254127502, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.8292521800845861, "num_tokens": 943862020.0, "step": 23815 }, { "entropy": 0.6121455209329725, "epoch": 1.785121586206412, "grad_norm": 0.2346579134464264, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.8228910945355892, "num_tokens": 945327636.0, "step": 23820 }, { "entropy": 0.5847686175256968, "epoch": 1.7854963101402639, "grad_norm": 0.25487029552459717, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8254548348486423, "num_tokens": 946778745.0, "step": 23825 }, { "entropy": 0.5723187558352947, "epoch": 1.7858710340741157, "grad_norm": 0.24375048279762268, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.8272584289312362, "num_tokens": 948240893.0, "step": 23830 }, { "entropy": 0.5795849364250898, "epoch": 1.7862457580079676, "grad_norm": 0.22240695357322693, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.8272302601486444, "num_tokens": 949744478.0, "step": 23835 }, { "entropy": 0.586606589332223, "epoch": 1.7866204819418194, "grad_norm": 0.2214297205209732, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.828520692139864, "num_tokens": 951215976.0, "step": 23840 }, { "entropy": 0.5859427340328693, "epoch": 1.7869952058756713, "grad_norm": 0.21900174021720886, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8250773523002863, "num_tokens": 952688345.0, "step": 23845 }, { "entropy": 0.5927909513935447, "epoch": 1.7873699298095231, "grad_norm": 0.2452104687690735, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.8212077029049396, "num_tokens": 954157719.0, "step": 23850 }, { "entropy": 0.5787563497200608, "epoch": 1.787744653743375, "grad_norm": 0.24941399693489075, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.824513254314661, "num_tokens": 955651600.0, "step": 23855 }, { "entropy": 0.5746654987335205, "epoch": 1.7881193776772268, "grad_norm": 0.22512175142765045, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.8280336294323206, "num_tokens": 957122507.0, "step": 23860 }, { "entropy": 0.5824610505253076, "epoch": 1.7884941016110787, "grad_norm": 0.30122026801109314, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8216802380979061, "num_tokens": 958642513.0, "step": 23865 }, { "entropy": 0.5965153638273477, "epoch": 1.7888688255449305, "grad_norm": 0.2341109812259674, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.8202799253165722, "num_tokens": 960107338.0, "step": 23870 }, { "entropy": 0.567065997235477, "epoch": 1.7892435494787824, "grad_norm": 0.2609527111053467, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8247080601751804, "num_tokens": 961559870.0, "step": 23875 }, { "entropy": 0.572393361479044, "epoch": 1.7896182734126342, "grad_norm": 0.23487357795238495, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.8232495725154877, "num_tokens": 963023774.0, "step": 23880 }, { "entropy": 0.5688949352130294, "epoch": 1.789992997346486, "grad_norm": 0.22641858458518982, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8260237302631139, "num_tokens": 964471032.0, "step": 23885 }, { "entropy": 0.5695063034072518, "epoch": 1.790367721280338, "grad_norm": 0.2366691529750824, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.8209020674228669, "num_tokens": 965887983.0, "step": 23890 }, { "entropy": 0.5867785044014454, "epoch": 1.7907424452141898, "grad_norm": 0.27380433678627014, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.8258619043976069, "num_tokens": 967381645.0, "step": 23895 }, { "entropy": 0.5852218119427561, "epoch": 1.7911171691480416, "grad_norm": 0.23572850227355957, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.829973180219531, "num_tokens": 968799019.0, "step": 23900 }, { "entropy": 0.5839917968958617, "epoch": 1.7914918930818935, "grad_norm": 0.2243863046169281, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.8237579446285963, "num_tokens": 970257447.0, "step": 23905 }, { "entropy": 0.5817275036126375, "epoch": 1.7918666170157453, "grad_norm": 0.26196983456611633, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8242299031466246, "num_tokens": 971707547.0, "step": 23910 }, { "entropy": 0.5943614182993769, "epoch": 1.7922413409495972, "grad_norm": 0.21876996755599976, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.8211732778698206, "num_tokens": 973207401.0, "step": 23915 }, { "entropy": 0.584512661024928, "epoch": 1.792616064883449, "grad_norm": 0.2482582926750183, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.820413501560688, "num_tokens": 974680966.0, "step": 23920 }, { "entropy": 0.5866220217198134, "epoch": 1.792990788817301, "grad_norm": 0.23354332149028778, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.8251259878277779, "num_tokens": 976192230.0, "step": 23925 }, { "entropy": 0.5870731975883245, "epoch": 1.7933655127511527, "grad_norm": 0.2165449857711792, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.823877977579832, "num_tokens": 977648313.0, "step": 23930 }, { "entropy": 0.5868165086954832, "epoch": 1.7937402366850046, "grad_norm": 0.22602133452892303, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8238988894969225, "num_tokens": 979058589.0, "step": 23935 }, { "entropy": 0.5611592378467322, "epoch": 1.7941149606188564, "grad_norm": 0.2490803450345993, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.827890133857727, "num_tokens": 980489518.0, "step": 23940 }, { "entropy": 0.5640295954421163, "epoch": 1.7944896845527083, "grad_norm": 0.23541328310966492, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8241880014538765, "num_tokens": 982015427.0, "step": 23945 }, { "entropy": 0.5795428350567817, "epoch": 1.7948644084865601, "grad_norm": 0.22750870883464813, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8212237242609263, "num_tokens": 983473834.0, "step": 23950 }, { "entropy": 0.574669424816966, "epoch": 1.795239132420412, "grad_norm": 0.24022288620471954, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.8211094226688147, "num_tokens": 984955815.0, "step": 23955 }, { "entropy": 0.5613427694886923, "epoch": 1.7956138563542638, "grad_norm": 0.21329617500305176, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.8292905427515507, "num_tokens": 986446391.0, "step": 23960 }, { "entropy": 0.5703868668526411, "epoch": 1.7959885802881157, "grad_norm": 0.22738975286483765, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8311524614691734, "num_tokens": 987874177.0, "step": 23965 }, { "entropy": 0.5775369452312589, "epoch": 1.7963633042219678, "grad_norm": 0.2531515955924988, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.8236150600016117, "num_tokens": 989355917.0, "step": 23970 }, { "entropy": 0.5907612077891826, "epoch": 1.7967380281558196, "grad_norm": 0.2259301394224167, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.8218902461230755, "num_tokens": 990878025.0, "step": 23975 }, { "entropy": 0.5892641371116042, "epoch": 1.7971127520896715, "grad_norm": 0.2311295121908188, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8219023525714875, "num_tokens": 992329252.0, "step": 23980 }, { "entropy": 0.589720830321312, "epoch": 1.7974874760235233, "grad_norm": 0.22316575050354004, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8222591262310743, "num_tokens": 993778863.0, "step": 23985 }, { "entropy": 0.6043203927576541, "epoch": 1.7978621999573752, "grad_norm": 0.2396860420703888, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.8222847271710634, "num_tokens": 995262649.0, "step": 23990 }, { "entropy": 0.6020519677549601, "epoch": 1.798236923891227, "grad_norm": 0.22860492765903473, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.8197181023657322, "num_tokens": 996718036.0, "step": 23995 }, { "entropy": 0.5973303504288197, "epoch": 1.7986116478250789, "grad_norm": 0.24901090562343597, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8280276708304882, "num_tokens": 998194250.0, "step": 24000 }, { "entropy": 0.609417875111103, "epoch": 1.7989863717589307, "grad_norm": 0.22744716703891754, "learning_rate": 0.0002, "loss": 0.63, "mean_token_accuracy": 0.8228132925927639, "num_tokens": 999688708.0, "step": 24005 }, { "entropy": 0.5895034739747643, "epoch": 1.7993610956927826, "grad_norm": 0.24330022931098938, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.8235976938158274, "num_tokens": 1001166988.0, "step": 24010 }, { "entropy": 0.5798977624624968, "epoch": 1.7997358196266344, "grad_norm": 0.2706431448459625, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.8226487793028354, "num_tokens": 1002633593.0, "step": 24015 }, { "entropy": 0.5817854089662433, "epoch": 1.8001105435604863, "grad_norm": 0.2318894863128662, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.8246635608375072, "num_tokens": 1004091232.0, "step": 24020 }, { "entropy": 0.5982368512079119, "epoch": 1.8004852674943381, "grad_norm": 0.23245534300804138, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.8234231479465961, "num_tokens": 1005570332.0, "step": 24025 }, { "entropy": 0.5923232138156891, "epoch": 1.80085999142819, "grad_norm": 0.26433444023132324, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.8253256492316723, "num_tokens": 1007033240.0, "step": 24030 }, { "entropy": 0.5647701619192957, "epoch": 1.8012347153620418, "grad_norm": 0.243349090218544, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.8259752836078406, "num_tokens": 1008525457.0, "step": 24035 }, { "entropy": 0.5832120887935162, "epoch": 1.8016094392958937, "grad_norm": 0.23536889255046844, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8258775912225247, "num_tokens": 1009982996.0, "step": 24040 }, { "entropy": 0.5885654570534825, "epoch": 1.8019841632297458, "grad_norm": 0.253118097782135, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.8230384141206741, "num_tokens": 1011486999.0, "step": 24045 }, { "entropy": 0.5863381575793027, "epoch": 1.8023588871635976, "grad_norm": 0.2784633934497833, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.8259377598762512, "num_tokens": 1012944490.0, "step": 24050 }, { "entropy": 0.6000043036416173, "epoch": 1.8027336110974495, "grad_norm": 0.261911004781723, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.8233977120369673, "num_tokens": 1014373971.0, "step": 24055 }, { "entropy": 0.5906980954110622, "epoch": 1.8031083350313013, "grad_norm": 0.23783190548419952, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.8217784494161606, "num_tokens": 1015801691.0, "step": 24060 }, { "entropy": 0.6033401058986783, "epoch": 1.8034830589651532, "grad_norm": 0.2403906285762787, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8239948872476817, "num_tokens": 1017290723.0, "step": 24065 }, { "entropy": 0.61218715403229, "epoch": 1.803857782899005, "grad_norm": 0.26127147674560547, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8218978919088841, "num_tokens": 1018791344.0, "step": 24070 }, { "entropy": 0.6070285117253661, "epoch": 1.8042325068328569, "grad_norm": 0.27069011330604553, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8246157664805651, "num_tokens": 1020192753.0, "step": 24075 }, { "entropy": 0.6094475457444787, "epoch": 1.8046072307667087, "grad_norm": 0.24821141362190247, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.8249994102865458, "num_tokens": 1021674982.0, "step": 24080 }, { "entropy": 0.5996134707704186, "epoch": 1.8049819547005606, "grad_norm": 0.2612936496734619, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.822850839048624, "num_tokens": 1023152972.0, "step": 24085 }, { "entropy": 0.6184102106839419, "epoch": 1.8053566786344124, "grad_norm": 0.23935769498348236, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.8223799299448729, "num_tokens": 1024691248.0, "step": 24090 }, { "entropy": 0.6178361872211099, "epoch": 1.8057314025682643, "grad_norm": 0.2385367751121521, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8221934769302607, "num_tokens": 1026183030.0, "step": 24095 }, { "entropy": 0.607301595620811, "epoch": 1.806106126502116, "grad_norm": 0.282736599445343, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.8207782957702875, "num_tokens": 1027682553.0, "step": 24100 }, { "entropy": 0.5976161532104015, "epoch": 1.806480850435968, "grad_norm": 0.23352472484111786, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.820644486695528, "num_tokens": 1029145809.0, "step": 24105 }, { "entropy": 0.5897068249061703, "epoch": 1.8068555743698198, "grad_norm": 0.22895918786525726, "learning_rate": 0.0002, "loss": 0.6256, "mean_token_accuracy": 0.8236973278224469, "num_tokens": 1030644827.0, "step": 24110 }, { "entropy": 0.5778897793963551, "epoch": 1.8072302983036717, "grad_norm": 0.2623251676559448, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.8241169694811106, "num_tokens": 1032136445.0, "step": 24115 }, { "entropy": 0.5895558655261993, "epoch": 1.8076050222375235, "grad_norm": 0.23997581005096436, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.819759251177311, "num_tokens": 1033652716.0, "step": 24120 }, { "entropy": 0.5846960458904504, "epoch": 1.8079797461713754, "grad_norm": 0.22526657581329346, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8222249947488308, "num_tokens": 1035143520.0, "step": 24125 }, { "entropy": 0.57658452372998, "epoch": 1.8083544701052272, "grad_norm": 0.227503702044487, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8272997993975878, "num_tokens": 1036617808.0, "step": 24130 }, { "entropy": 0.5686422789469361, "epoch": 1.808729194039079, "grad_norm": 0.26122018694877625, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.8262253742665052, "num_tokens": 1038060679.0, "step": 24135 }, { "entropy": 0.5849253933876752, "epoch": 1.809103917972931, "grad_norm": 0.27362537384033203, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.8219957698136569, "num_tokens": 1039542435.0, "step": 24140 }, { "entropy": 0.5670090168714523, "epoch": 1.8094786419067828, "grad_norm": 0.2275075912475586, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8257712472230196, "num_tokens": 1040972698.0, "step": 24145 }, { "entropy": 0.5602695660665631, "epoch": 1.8098533658406346, "grad_norm": 0.23245011270046234, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8236010380089283, "num_tokens": 1042469816.0, "step": 24150 }, { "entropy": 0.5614771930500865, "epoch": 1.8102280897744865, "grad_norm": 0.21690109372138977, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8235088430345059, "num_tokens": 1043907800.0, "step": 24155 }, { "entropy": 0.5692325452342629, "epoch": 1.8106028137083383, "grad_norm": 0.24413467943668365, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.8252030096948146, "num_tokens": 1045409759.0, "step": 24160 }, { "entropy": 0.5610789934173226, "epoch": 1.8109775376421902, "grad_norm": 0.24602706730365753, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.8269301228225231, "num_tokens": 1046853909.0, "step": 24165 }, { "entropy": 0.5798082880675792, "epoch": 1.811352261576042, "grad_norm": 0.24879905581474304, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.8213881380856037, "num_tokens": 1048334447.0, "step": 24170 }, { "entropy": 0.5830364234745502, "epoch": 1.8117269855098939, "grad_norm": 0.22955337166786194, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8230745058506728, "num_tokens": 1049836100.0, "step": 24175 }, { "entropy": 0.5878631314262748, "epoch": 1.8121017094437457, "grad_norm": 0.2255718857049942, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.8229024905711413, "num_tokens": 1051366568.0, "step": 24180 }, { "entropy": 0.6111538864672184, "epoch": 1.8124764333775976, "grad_norm": 0.24013185501098633, "learning_rate": 0.0002, "loss": 0.6395, "mean_token_accuracy": 0.8201940637081861, "num_tokens": 1052831746.0, "step": 24185 }, { "entropy": 0.5974890671670436, "epoch": 1.8128511573114494, "grad_norm": 0.239101380109787, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.8219774447381496, "num_tokens": 1054352506.0, "step": 24190 }, { "entropy": 0.6040069406852127, "epoch": 1.8132258812453013, "grad_norm": 0.2331261932849884, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8202863030135632, "num_tokens": 1055842220.0, "step": 24195 }, { "entropy": 0.5770179163664579, "epoch": 1.8136006051791531, "grad_norm": 0.24120937287807465, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.8262612741440535, "num_tokens": 1057280088.0, "step": 24200 }, { "entropy": 0.5853644790127873, "epoch": 1.813975329113005, "grad_norm": 0.2247835397720337, "learning_rate": 0.0002, "loss": 0.6283, "mean_token_accuracy": 0.8206609413027763, "num_tokens": 1058778052.0, "step": 24205 }, { "entropy": 0.5600472433492542, "epoch": 1.8143500530468568, "grad_norm": 0.23531070351600647, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.828506688028574, "num_tokens": 1060241753.0, "step": 24210 }, { "entropy": 0.5611354352906346, "epoch": 1.8147247769807087, "grad_norm": 0.24779953062534332, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8276545006781817, "num_tokens": 1061709787.0, "step": 24215 }, { "entropy": 0.5930513966828584, "epoch": 1.8150995009145605, "grad_norm": 0.247334286570549, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.8250142857432365, "num_tokens": 1063188965.0, "step": 24220 }, { "entropy": 0.5610285906121135, "epoch": 1.8154742248484124, "grad_norm": 0.21255770325660706, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.8252702813595534, "num_tokens": 1064620267.0, "step": 24225 }, { "entropy": 0.583814456127584, "epoch": 1.8158489487822642, "grad_norm": 0.2382701188325882, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.8221102196723222, "num_tokens": 1066042235.0, "step": 24230 }, { "entropy": 0.577259733621031, "epoch": 1.816223672716116, "grad_norm": 0.251764714717865, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8215737663209438, "num_tokens": 1067555667.0, "step": 24235 }, { "entropy": 0.5678813019767404, "epoch": 1.816598396649968, "grad_norm": 0.23256054520606995, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8208914417773485, "num_tokens": 1069092126.0, "step": 24240 }, { "entropy": 0.5799851955845952, "epoch": 1.8169731205838198, "grad_norm": 0.25845035910606384, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.8239489078521729, "num_tokens": 1070542315.0, "step": 24245 }, { "entropy": 0.5730691675096751, "epoch": 1.8173478445176716, "grad_norm": 0.2484540492296219, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.824318352341652, "num_tokens": 1072006689.0, "step": 24250 }, { "entropy": 0.5822727249935269, "epoch": 1.8177225684515235, "grad_norm": 0.23248283565044403, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.8237657275050878, "num_tokens": 1073476742.0, "step": 24255 }, { "entropy": 0.5570935301482678, "epoch": 1.8180972923853753, "grad_norm": 0.30744674801826477, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.829035634920001, "num_tokens": 1074960543.0, "step": 24260 }, { "entropy": 0.5676291419193149, "epoch": 1.8184720163192272, "grad_norm": 0.3012322187423706, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.8222222704440355, "num_tokens": 1076405969.0, "step": 24265 }, { "entropy": 0.5840797992423177, "epoch": 1.818846740253079, "grad_norm": 0.24438124895095825, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8231356989592313, "num_tokens": 1077871263.0, "step": 24270 }, { "entropy": 0.5831349937245249, "epoch": 1.8192214641869309, "grad_norm": 0.22263696789741516, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.8225672226399183, "num_tokens": 1079363972.0, "step": 24275 }, { "entropy": 0.5754301257431507, "epoch": 1.819596188120783, "grad_norm": 0.23134516179561615, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.8257021229714155, "num_tokens": 1080798211.0, "step": 24280 }, { "entropy": 0.5804049698635936, "epoch": 1.8199709120546348, "grad_norm": 0.23256610333919525, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.8295606344938278, "num_tokens": 1082236476.0, "step": 24285 }, { "entropy": 0.6012609815225005, "epoch": 1.8203456359884866, "grad_norm": 0.2184199094772339, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.8197057083249092, "num_tokens": 1083695651.0, "step": 24290 }, { "entropy": 0.6036921551451087, "epoch": 1.8207203599223385, "grad_norm": 0.21801333129405975, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8237991809844971, "num_tokens": 1085196550.0, "step": 24295 }, { "entropy": 0.6163695368915796, "epoch": 1.8210950838561903, "grad_norm": 0.28273066878318787, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.821368894726038, "num_tokens": 1086682860.0, "step": 24300 }, { "entropy": 0.6034688137471675, "epoch": 1.8214698077900422, "grad_norm": 0.2542716860771179, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.8237187437713146, "num_tokens": 1088141713.0, "step": 24305 }, { "entropy": 0.6116394961252809, "epoch": 1.821844531723894, "grad_norm": 0.27170759439468384, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.8243686135858297, "num_tokens": 1089583879.0, "step": 24310 }, { "entropy": 0.6248993055894971, "epoch": 1.822219255657746, "grad_norm": 0.24820542335510254, "learning_rate": 0.0002, "loss": 0.6438, "mean_token_accuracy": 0.8196150843054056, "num_tokens": 1091069111.0, "step": 24315 }, { "entropy": 0.6206596732139588, "epoch": 1.8225939795915977, "grad_norm": 0.21609821915626526, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.8241421103477478, "num_tokens": 1092520070.0, "step": 24320 }, { "entropy": 0.6039772376418113, "epoch": 1.8229687035254496, "grad_norm": 0.26139014959335327, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.8242991290986538, "num_tokens": 1094012436.0, "step": 24325 }, { "entropy": 0.6071264062076807, "epoch": 1.8233434274593014, "grad_norm": 0.22928357124328613, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8258976012468338, "num_tokens": 1095514686.0, "step": 24330 }, { "entropy": 0.597963547334075, "epoch": 1.8237181513931533, "grad_norm": 0.22069571912288666, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.8242068357765675, "num_tokens": 1097006564.0, "step": 24335 }, { "entropy": 0.6012905908748507, "epoch": 1.8240928753270051, "grad_norm": 0.2238093912601471, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8228304550051689, "num_tokens": 1098523253.0, "step": 24340 }, { "entropy": 0.6005835067480803, "epoch": 1.824467599260857, "grad_norm": 0.2335936427116394, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.8248078495264053, "num_tokens": 1100035206.0, "step": 24345 }, { "entropy": 0.6154565492644906, "epoch": 1.8248423231947088, "grad_norm": 0.3030649423599243, "learning_rate": 0.0002, "loss": 0.6422, "mean_token_accuracy": 0.8189531069248914, "num_tokens": 1101530984.0, "step": 24350 }, { "entropy": 0.5830674469470978, "epoch": 1.825217047128561, "grad_norm": 0.24855616688728333, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.8268689818680286, "num_tokens": 1102964881.0, "step": 24355 }, { "entropy": 0.6007215265184641, "epoch": 1.8255917710624128, "grad_norm": 0.2385462373495102, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8236087415367365, "num_tokens": 1104443005.0, "step": 24360 }, { "entropy": 0.5749737313017249, "epoch": 1.8259664949962646, "grad_norm": 0.2733103930950165, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.8296256624162197, "num_tokens": 1105861473.0, "step": 24365 }, { "entropy": 0.5764261145144701, "epoch": 1.8263412189301165, "grad_norm": 0.24632896482944489, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8237066857516766, "num_tokens": 1107317372.0, "step": 24370 }, { "entropy": 0.5713635422289371, "epoch": 1.8267159428639683, "grad_norm": 0.23731142282485962, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.8249840181320905, "num_tokens": 1108754686.0, "step": 24375 }, { "entropy": 0.5840207960456609, "epoch": 1.8270906667978202, "grad_norm": 0.23122715950012207, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.8265416383743286, "num_tokens": 1110224212.0, "step": 24380 }, { "entropy": 0.586630336381495, "epoch": 1.827465390731672, "grad_norm": 0.2302350252866745, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.8243232306092978, "num_tokens": 1111728963.0, "step": 24385 }, { "entropy": 0.5873022926971316, "epoch": 1.8278401146655239, "grad_norm": 0.26023268699645996, "learning_rate": 0.0002, "loss": 0.6403, "mean_token_accuracy": 0.8196296982467175, "num_tokens": 1113154962.0, "step": 24390 }, { "entropy": 0.5873105784878134, "epoch": 1.8282148385993757, "grad_norm": 0.2722148895263672, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.8251823503524065, "num_tokens": 1114593777.0, "step": 24395 }, { "entropy": 0.5848234359174966, "epoch": 1.8285895625332276, "grad_norm": 0.27054476737976074, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8236448232084512, "num_tokens": 1116086920.0, "step": 24400 }, { "entropy": 0.5906822377815842, "epoch": 1.8289642864670794, "grad_norm": 0.23923783004283905, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.8257452469319105, "num_tokens": 1117565350.0, "step": 24405 }, { "entropy": 0.5916870404034853, "epoch": 1.8293390104009313, "grad_norm": 0.23786430060863495, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.8252582956105471, "num_tokens": 1119068029.0, "step": 24410 }, { "entropy": 0.5879182126373053, "epoch": 1.8297137343347831, "grad_norm": 0.25205618143081665, "learning_rate": 0.0002, "loss": 0.6297, "mean_token_accuracy": 0.8232823051512241, "num_tokens": 1120528275.0, "step": 24415 }, { "entropy": 0.6001204891130328, "epoch": 1.830088458268635, "grad_norm": 0.2303682565689087, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.8217118252068758, "num_tokens": 1122021881.0, "step": 24420 }, { "entropy": 0.5857802102342248, "epoch": 1.8304631822024868, "grad_norm": 0.22516582906246185, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.8248486649245024, "num_tokens": 1123517832.0, "step": 24425 }, { "entropy": 0.5854550087824464, "epoch": 1.8308379061363387, "grad_norm": 0.21655377745628357, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.8250096514821053, "num_tokens": 1124999193.0, "step": 24430 }, { "entropy": 0.5703201560303569, "epoch": 1.8312126300701905, "grad_norm": 0.2369614690542221, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8274830702692271, "num_tokens": 1126477021.0, "step": 24435 }, { "entropy": 0.5805042952299118, "epoch": 1.8315873540040424, "grad_norm": 0.2309758961200714, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8247569672763347, "num_tokens": 1127985579.0, "step": 24440 }, { "entropy": 0.5822816316038371, "epoch": 1.8319620779378942, "grad_norm": 0.21757091581821442, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.8244194246828556, "num_tokens": 1129455132.0, "step": 24445 }, { "entropy": 0.584273518435657, "epoch": 1.832336801871746, "grad_norm": 0.22574463486671448, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.8200131338089705, "num_tokens": 1130928219.0, "step": 24450 }, { "entropy": 0.5707978760823608, "epoch": 1.832711525805598, "grad_norm": 0.253505140542984, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8268721923232079, "num_tokens": 1132387810.0, "step": 24455 }, { "entropy": 0.5689200032502413, "epoch": 1.8330862497394498, "grad_norm": 0.21073079109191895, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.8273220341652632, "num_tokens": 1133858737.0, "step": 24460 }, { "entropy": 0.5863487880676985, "epoch": 1.8334609736733016, "grad_norm": 0.24774911999702454, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8227078627794981, "num_tokens": 1135320972.0, "step": 24465 }, { "entropy": 0.5942417567595839, "epoch": 1.8338356976071535, "grad_norm": 0.22177782654762268, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.821510611101985, "num_tokens": 1136778935.0, "step": 24470 }, { "entropy": 0.5954820657148957, "epoch": 1.8342104215410053, "grad_norm": 0.2452692985534668, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.8241930786520243, "num_tokens": 1138256307.0, "step": 24475 }, { "entropy": 0.6023160353302955, "epoch": 1.8345851454748572, "grad_norm": 0.2262432724237442, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.8261266734451056, "num_tokens": 1139696733.0, "step": 24480 }, { "entropy": 0.6271339423954487, "epoch": 1.834959869408709, "grad_norm": 0.24960526823997498, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.8232122786343098, "num_tokens": 1141122449.0, "step": 24485 }, { "entropy": 0.6131317485123873, "epoch": 1.8353345933425609, "grad_norm": 0.2525804042816162, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.8252401061356067, "num_tokens": 1142597855.0, "step": 24490 }, { "entropy": 0.5890182953327894, "epoch": 1.8357093172764127, "grad_norm": 0.27553805708885193, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8250795982778072, "num_tokens": 1144060282.0, "step": 24495 }, { "entropy": 0.5751840710639954, "epoch": 1.8360840412102646, "grad_norm": 0.24958741664886475, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.8316173050552607, "num_tokens": 1145513073.0, "step": 24500 }, { "entropy": 0.6103781776502728, "epoch": 1.8364587651441164, "grad_norm": 0.24493493139743805, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.8247563216835261, "num_tokens": 1146954028.0, "step": 24505 }, { "entropy": 0.6278395891189575, "epoch": 1.8368334890779683, "grad_norm": 0.2895508408546448, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.8248699102550745, "num_tokens": 1148406360.0, "step": 24510 }, { "entropy": 0.6289258299395442, "epoch": 1.8372082130118201, "grad_norm": 0.23481661081314087, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.8230706062167883, "num_tokens": 1149849473.0, "step": 24515 }, { "entropy": 0.6073805129155516, "epoch": 1.837582936945672, "grad_norm": 0.2279219776391983, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.8274141520261764, "num_tokens": 1151331517.0, "step": 24520 }, { "entropy": 0.6104252733290195, "epoch": 1.8379576608795238, "grad_norm": 0.23233039677143097, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8252209007740021, "num_tokens": 1152778571.0, "step": 24525 }, { "entropy": 0.6308047376573086, "epoch": 1.8383323848133757, "grad_norm": 0.2605922818183899, "learning_rate": 0.0002, "loss": 0.6183, "mean_token_accuracy": 0.8242530029267072, "num_tokens": 1154223398.0, "step": 24530 }, { "entropy": 0.6093518152832985, "epoch": 1.8387071087472275, "grad_norm": 0.21992532908916473, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.8259144447743892, "num_tokens": 1155714852.0, "step": 24535 }, { "entropy": 0.5970372304320335, "epoch": 1.8390818326810794, "grad_norm": 0.3664977550506592, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8257287431508302, "num_tokens": 1157181940.0, "step": 24540 }, { "entropy": 0.600533701479435, "epoch": 1.8394565566149312, "grad_norm": 0.2822592556476593, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8215375814586878, "num_tokens": 1158669521.0, "step": 24545 }, { "entropy": 0.5832845456898212, "epoch": 1.839831280548783, "grad_norm": 0.2896806299686432, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8253879699856043, "num_tokens": 1160117148.0, "step": 24550 }, { "entropy": 0.6004794159904122, "epoch": 1.840206004482635, "grad_norm": 0.2529181241989136, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8252750001847744, "num_tokens": 1161598517.0, "step": 24555 }, { "entropy": 0.625918690674007, "epoch": 1.8405807284164868, "grad_norm": 0.2359808385372162, "learning_rate": 0.0002, "loss": 0.6332, "mean_token_accuracy": 0.8206235133111477, "num_tokens": 1163076919.0, "step": 24560 }, { "entropy": 0.6175019098445773, "epoch": 1.8409554523503386, "grad_norm": 0.233567014336586, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.8269888415932656, "num_tokens": 1164547411.0, "step": 24565 }, { "entropy": 0.6082459585741162, "epoch": 1.8413301762841905, "grad_norm": 0.2238294929265976, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.8248518664389849, "num_tokens": 1166028243.0, "step": 24570 }, { "entropy": 0.6004905002191663, "epoch": 1.8417049002180423, "grad_norm": 0.25700920820236206, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.825759956613183, "num_tokens": 1167468922.0, "step": 24575 }, { "entropy": 0.6202940287068486, "epoch": 1.8420796241518942, "grad_norm": 0.23241344094276428, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.8256276629865169, "num_tokens": 1168932984.0, "step": 24580 }, { "entropy": 0.6211052477359772, "epoch": 1.842454348085746, "grad_norm": 0.22767870128154755, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.8235216367989778, "num_tokens": 1170414108.0, "step": 24585 }, { "entropy": 0.622607737313956, "epoch": 1.8428290720195981, "grad_norm": 0.22504481673240662, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.823496850579977, "num_tokens": 1171900847.0, "step": 24590 }, { "entropy": 0.6198470728471875, "epoch": 1.84320379595345, "grad_norm": 0.34608030319213867, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.8233469933271408, "num_tokens": 1173355035.0, "step": 24595 }, { "entropy": 0.6103344328701497, "epoch": 1.8435785198873018, "grad_norm": 0.23599864542484283, "learning_rate": 0.0002, "loss": 0.6266, "mean_token_accuracy": 0.8215735726058483, "num_tokens": 1174820222.0, "step": 24600 }, { "entropy": 0.6073199221864343, "epoch": 1.8439532438211537, "grad_norm": 0.23104344308376312, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.8244436509907246, "num_tokens": 1176279124.0, "step": 24605 }, { "entropy": 0.6153193857520819, "epoch": 1.8443279677550055, "grad_norm": 0.23253388702869415, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8235784955322742, "num_tokens": 1177775946.0, "step": 24610 }, { "entropy": 0.6132195137441159, "epoch": 1.8447026916888574, "grad_norm": 0.24427974224090576, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.8242855135351419, "num_tokens": 1179252339.0, "step": 24615 }, { "entropy": 0.6106891751289367, "epoch": 1.8450774156227092, "grad_norm": 0.28124871850013733, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8267601922154426, "num_tokens": 1180736405.0, "step": 24620 }, { "entropy": 0.6165317207574844, "epoch": 1.845452139556561, "grad_norm": 0.22540953755378723, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.8223715364933014, "num_tokens": 1182254641.0, "step": 24625 }, { "entropy": 0.6096318459138275, "epoch": 1.845826863490413, "grad_norm": 0.24370026588439941, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.821464428678155, "num_tokens": 1183722235.0, "step": 24630 }, { "entropy": 0.5794218577444553, "epoch": 1.8462015874242648, "grad_norm": 0.22415371239185333, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.8273111015558243, "num_tokens": 1185164075.0, "step": 24635 }, { "entropy": 0.6010872693732381, "epoch": 1.8465763113581166, "grad_norm": 0.23787875473499298, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.820318441838026, "num_tokens": 1186637422.0, "step": 24640 }, { "entropy": 0.6107164923101663, "epoch": 1.8469510352919685, "grad_norm": 0.2149081528186798, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8218188587576151, "num_tokens": 1188112383.0, "step": 24645 }, { "entropy": 0.5902570815756917, "epoch": 1.8473257592258203, "grad_norm": 0.224813312292099, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.8247678704559803, "num_tokens": 1189570867.0, "step": 24650 }, { "entropy": 0.5868563439697028, "epoch": 1.8477004831596722, "grad_norm": 0.261245459318161, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8262368809431792, "num_tokens": 1191038934.0, "step": 24655 }, { "entropy": 0.587680479325354, "epoch": 1.848075207093524, "grad_norm": 0.24012857675552368, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.8246461383998394, "num_tokens": 1192505277.0, "step": 24660 }, { "entropy": 0.5927163692191243, "epoch": 1.848449931027376, "grad_norm": 0.24523738026618958, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.8244537018239498, "num_tokens": 1193930037.0, "step": 24665 }, { "entropy": 0.5744764544069767, "epoch": 1.848824654961228, "grad_norm": 0.2642679512500763, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8236295945942402, "num_tokens": 1195399801.0, "step": 24670 }, { "entropy": 0.5760651292279363, "epoch": 1.8491993788950798, "grad_norm": 0.24066130816936493, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.8250770613551139, "num_tokens": 1196901929.0, "step": 24675 }, { "entropy": 0.5831180945038795, "epoch": 1.8495741028289316, "grad_norm": 0.23550991714000702, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.8252586591988802, "num_tokens": 1198367806.0, "step": 24680 }, { "entropy": 0.5815743200480938, "epoch": 1.8499488267627835, "grad_norm": 0.23937740921974182, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8275049544870854, "num_tokens": 1199811931.0, "step": 24685 }, { "entropy": 0.589514970779419, "epoch": 1.8503235506966353, "grad_norm": 0.2540755569934845, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.8209064431488514, "num_tokens": 1201222656.0, "step": 24690 }, { "entropy": 0.5726759085431695, "epoch": 1.8506982746304872, "grad_norm": 0.2602991759777069, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.8255494926124811, "num_tokens": 1202686445.0, "step": 24695 }, { "entropy": 0.5762983247637748, "epoch": 1.851072998564339, "grad_norm": 0.21635869145393372, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.8250299409031868, "num_tokens": 1204087206.0, "step": 24700 }, { "entropy": 0.5942597499117255, "epoch": 1.851447722498191, "grad_norm": 0.23083747923374176, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8223785106092691, "num_tokens": 1205519186.0, "step": 24705 }, { "entropy": 0.5900750566273928, "epoch": 1.8518224464320427, "grad_norm": 0.254299134016037, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8252174511551857, "num_tokens": 1206969642.0, "step": 24710 }, { "entropy": 0.6198918398469686, "epoch": 1.8521971703658946, "grad_norm": 0.22659079730510712, "learning_rate": 0.0002, "loss": 0.6439, "mean_token_accuracy": 0.8184098307043314, "num_tokens": 1208455010.0, "step": 24715 }, { "entropy": 0.5962601188570261, "epoch": 1.8525718942997464, "grad_norm": 0.2596256732940674, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8238306239247322, "num_tokens": 1209910681.0, "step": 24720 }, { "entropy": 0.6136237069964409, "epoch": 1.8529466182335983, "grad_norm": 0.25779616832733154, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.8210465170443058, "num_tokens": 1211375684.0, "step": 24725 }, { "entropy": 0.6104710720479488, "epoch": 1.8533213421674501, "grad_norm": 0.2495502382516861, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.8251737575978041, "num_tokens": 1212841952.0, "step": 24730 }, { "entropy": 0.6234582681208849, "epoch": 1.853696066101302, "grad_norm": 0.2534472346305847, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8253305986523628, "num_tokens": 1214293963.0, "step": 24735 }, { "entropy": 0.6199652027338743, "epoch": 1.8540707900351538, "grad_norm": 0.2561097741127014, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.8226489719003439, "num_tokens": 1215748854.0, "step": 24740 }, { "entropy": 0.6223150338977576, "epoch": 1.8544455139690057, "grad_norm": 0.24351589381694794, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.8253681007772684, "num_tokens": 1217256704.0, "step": 24745 }, { "entropy": 0.6404430003836751, "epoch": 1.8548202379028575, "grad_norm": 0.28859585523605347, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8240275740623474, "num_tokens": 1218733375.0, "step": 24750 }, { "entropy": 0.6378907779231667, "epoch": 1.8551949618367094, "grad_norm": 0.2930898666381836, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.8254788465797901, "num_tokens": 1220156011.0, "step": 24755 }, { "entropy": 0.6137337524443864, "epoch": 1.8555696857705613, "grad_norm": 0.2746044993400574, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.8248117987066508, "num_tokens": 1221617862.0, "step": 24760 }, { "entropy": 0.5910709897056222, "epoch": 1.855944409704413, "grad_norm": 0.23258988559246063, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8283299170434475, "num_tokens": 1223094679.0, "step": 24765 }, { "entropy": 0.5956854446791112, "epoch": 1.856319133638265, "grad_norm": 0.2377784103155136, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8265265762805939, "num_tokens": 1224530169.0, "step": 24770 }, { "entropy": 0.5953910194337368, "epoch": 1.8566938575721168, "grad_norm": 0.29903626441955566, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.8283370461314916, "num_tokens": 1226017347.0, "step": 24775 }, { "entropy": 0.6058859389275313, "epoch": 1.8570685815059687, "grad_norm": 0.2398918867111206, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.8235150806605815, "num_tokens": 1227491177.0, "step": 24780 }, { "entropy": 0.5884340090677143, "epoch": 1.8574433054398205, "grad_norm": 0.33905696868896484, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.8264653984457254, "num_tokens": 1228988766.0, "step": 24785 }, { "entropy": 0.6086905833333731, "epoch": 1.8578180293736724, "grad_norm": 0.24438664317131042, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8232888661324977, "num_tokens": 1230483551.0, "step": 24790 }, { "entropy": 0.6270504703745245, "epoch": 1.8581927533075242, "grad_norm": 0.2235729992389679, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.8212491571903229, "num_tokens": 1231977810.0, "step": 24795 }, { "entropy": 0.6158683590590954, "epoch": 1.858567477241376, "grad_norm": 0.2413730025291443, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.826573607698083, "num_tokens": 1233454570.0, "step": 24800 }, { "entropy": 0.617788146995008, "epoch": 1.858942201175228, "grad_norm": 0.22493281960487366, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8239827454090118, "num_tokens": 1234878156.0, "step": 24805 }, { "entropy": 0.6179253064095974, "epoch": 1.8593169251090798, "grad_norm": 0.2183995395898819, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8263239469379187, "num_tokens": 1236380027.0, "step": 24810 }, { "entropy": 0.6108023963868618, "epoch": 1.8596916490429316, "grad_norm": 0.2565806806087494, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.824007349833846, "num_tokens": 1237852735.0, "step": 24815 }, { "entropy": 0.6111113930121064, "epoch": 1.8600663729767835, "grad_norm": 0.2782679796218872, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.826176792383194, "num_tokens": 1239373539.0, "step": 24820 }, { "entropy": 0.6026251282542944, "epoch": 1.8604410969106353, "grad_norm": 0.23232313990592957, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8225293524563313, "num_tokens": 1240840714.0, "step": 24825 }, { "entropy": 0.6066063228994608, "epoch": 1.8608158208444872, "grad_norm": 0.2383970469236374, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8268943283706903, "num_tokens": 1242286697.0, "step": 24830 }, { "entropy": 0.6184277139604092, "epoch": 1.861190544778339, "grad_norm": 0.2624019682407379, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.8261349197477103, "num_tokens": 1243731567.0, "step": 24835 }, { "entropy": 0.583114012144506, "epoch": 1.8615652687121909, "grad_norm": 0.21981866657733917, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.8304847899824381, "num_tokens": 1245175006.0, "step": 24840 }, { "entropy": 0.6035491174086929, "epoch": 1.8619399926460427, "grad_norm": 0.22276540100574493, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.8264384534209966, "num_tokens": 1246663927.0, "step": 24845 }, { "entropy": 0.6103834683075547, "epoch": 1.8623147165798946, "grad_norm": 0.23872798681259155, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8249407172203064, "num_tokens": 1248141593.0, "step": 24850 }, { "entropy": 0.6215593757107853, "epoch": 1.8626894405137464, "grad_norm": 0.255971759557724, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.822262592241168, "num_tokens": 1249597935.0, "step": 24855 }, { "entropy": 0.6035418720915914, "epoch": 1.8630641644475983, "grad_norm": 0.24144220352172852, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8246809981763363, "num_tokens": 1251078821.0, "step": 24860 }, { "entropy": 0.5935591047629714, "epoch": 1.86343888838145, "grad_norm": 0.24725119769573212, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.826869860291481, "num_tokens": 1252547797.0, "step": 24865 }, { "entropy": 0.5759992836043238, "epoch": 1.863813612315302, "grad_norm": 0.2300768941640854, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.8262093808501959, "num_tokens": 1254023969.0, "step": 24870 }, { "entropy": 0.5957531800493598, "epoch": 1.8641883362491538, "grad_norm": 0.22639226913452148, "learning_rate": 0.0002, "loss": 0.6396, "mean_token_accuracy": 0.8199108608067036, "num_tokens": 1255505685.0, "step": 24875 }, { "entropy": 0.5809493478387594, "epoch": 1.8645630601830057, "grad_norm": 0.24869941174983978, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8233973123133183, "num_tokens": 1256981240.0, "step": 24880 }, { "entropy": 0.5776104215532541, "epoch": 1.8649377841168575, "grad_norm": 0.4378170967102051, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.8264786906540393, "num_tokens": 1258460036.0, "step": 24885 }, { "entropy": 0.5722871366888285, "epoch": 1.8653125080507094, "grad_norm": 0.2754746079444885, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.825802207365632, "num_tokens": 1259887907.0, "step": 24890 }, { "entropy": 0.5869664207100869, "epoch": 1.8656872319845612, "grad_norm": 0.2372027188539505, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.8282582815736532, "num_tokens": 1261328359.0, "step": 24895 }, { "entropy": 0.587493285536766, "epoch": 1.8660619559184133, "grad_norm": 0.33779972791671753, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.8270143024623394, "num_tokens": 1262759963.0, "step": 24900 }, { "entropy": 0.5873772950842977, "epoch": 1.8664366798522651, "grad_norm": 0.2788415551185608, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.8251019060611725, "num_tokens": 1264256290.0, "step": 24905 }, { "entropy": 0.6042635314166546, "epoch": 1.866811403786117, "grad_norm": 0.21024851500988007, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8231443706899881, "num_tokens": 1265743957.0, "step": 24910 }, { "entropy": 0.5864242322742939, "epoch": 1.8671861277199688, "grad_norm": 0.2730160355567932, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.8249515581876039, "num_tokens": 1267185033.0, "step": 24915 }, { "entropy": 0.5984942747280002, "epoch": 1.8675608516538207, "grad_norm": 0.2525821626186371, "learning_rate": 0.0002, "loss": 0.6297, "mean_token_accuracy": 0.8220372829586268, "num_tokens": 1268645127.0, "step": 24920 }, { "entropy": 0.5926859250292182, "epoch": 1.8679355755876725, "grad_norm": 0.28002074360847473, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8242001432925463, "num_tokens": 1270098313.0, "step": 24925 }, { "entropy": 0.6065247818827629, "epoch": 1.8683102995215244, "grad_norm": 0.22982414066791534, "learning_rate": 0.0002, "loss": 0.6468, "mean_token_accuracy": 0.8179846320301294, "num_tokens": 1271576418.0, "step": 24930 }, { "entropy": 0.5595626439899206, "epoch": 1.8686850234553762, "grad_norm": 0.224358931183815, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.8307235900312662, "num_tokens": 1272995794.0, "step": 24935 }, { "entropy": 0.5767881650477648, "epoch": 1.869059747389228, "grad_norm": 0.37893494963645935, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.8274650007486344, "num_tokens": 1274484945.0, "step": 24940 }, { "entropy": 0.5843788938596844, "epoch": 1.86943447132308, "grad_norm": 0.2198997288942337, "learning_rate": 0.0002, "loss": 0.6115, "mean_token_accuracy": 0.8234330113977194, "num_tokens": 1276044316.0, "step": 24945 }, { "entropy": 0.5989829139783979, "epoch": 1.8698091952569318, "grad_norm": 0.2324816733598709, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8230299860239029, "num_tokens": 1277500696.0, "step": 24950 }, { "entropy": 0.5887103037908673, "epoch": 1.8701839191907836, "grad_norm": 0.26160913705825806, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.8284818876534701, "num_tokens": 1278917801.0, "step": 24955 }, { "entropy": 0.6025155778974295, "epoch": 1.8705586431246355, "grad_norm": 0.23380158841609955, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8255818720906973, "num_tokens": 1280391259.0, "step": 24960 }, { "entropy": 0.6219927394762635, "epoch": 1.8709333670584873, "grad_norm": 0.23280590772628784, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.8221248012036085, "num_tokens": 1281902736.0, "step": 24965 }, { "entropy": 0.5913367014378309, "epoch": 1.8713080909923392, "grad_norm": 0.24496668577194214, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8250652596354484, "num_tokens": 1283431944.0, "step": 24970 }, { "entropy": 0.5722745252773166, "epoch": 1.871682814926191, "grad_norm": 0.24999921023845673, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.8284194562584162, "num_tokens": 1284882495.0, "step": 24975 }, { "entropy": 0.5665017735213042, "epoch": 1.8720575388600431, "grad_norm": 0.24208205938339233, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.8291631732136011, "num_tokens": 1286333822.0, "step": 24980 }, { "entropy": 0.5717529352754355, "epoch": 1.872432262793895, "grad_norm": 0.293641597032547, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8291374575346708, "num_tokens": 1287841479.0, "step": 24985 }, { "entropy": 0.5894270600751043, "epoch": 1.8728069867277468, "grad_norm": 0.2530345320701599, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8233603645116091, "num_tokens": 1289306653.0, "step": 24990 }, { "entropy": 0.5857516182586551, "epoch": 1.8731817106615987, "grad_norm": 0.24689112603664398, "learning_rate": 0.0002, "loss": 0.6006, "mean_token_accuracy": 0.8270575940608978, "num_tokens": 1290790212.0, "step": 24995 }, { "entropy": 0.5989462597295642, "epoch": 1.8735564345954505, "grad_norm": 0.24272210896015167, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.828053393214941, "num_tokens": 1292263554.0, "step": 25000 }, { "entropy": 0.6134665828198195, "epoch": 1.8739311585293024, "grad_norm": 0.2931012213230133, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.8197528876364231, "num_tokens": 1293769616.0, "step": 25005 }, { "entropy": 0.5997716808691621, "epoch": 1.8743058824631542, "grad_norm": 0.2547030746936798, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8274753130972385, "num_tokens": 1295273139.0, "step": 25010 }, { "entropy": 0.5855827430263162, "epoch": 1.874680606397006, "grad_norm": 0.250934362411499, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.8291271600872279, "num_tokens": 1296694313.0, "step": 25015 }, { "entropy": 0.5886424547061324, "epoch": 1.875055330330858, "grad_norm": 0.2867506742477417, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8250142317265272, "num_tokens": 1298172362.0, "step": 25020 }, { "entropy": 0.5873112259432673, "epoch": 1.8754300542647098, "grad_norm": 0.27424612641334534, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.8271580934524536, "num_tokens": 1299633110.0, "step": 25025 }, { "entropy": 0.5901447311043739, "epoch": 1.8758047781985616, "grad_norm": 0.2505430281162262, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.8247491620481014, "num_tokens": 1301105454.0, "step": 25030 }, { "entropy": 0.5839159516617656, "epoch": 1.8761795021324135, "grad_norm": 0.2418675720691681, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.8259003095328807, "num_tokens": 1302547530.0, "step": 25035 }, { "entropy": 0.5930907452479005, "epoch": 1.8765542260662653, "grad_norm": 0.28641340136528015, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.8220521699637174, "num_tokens": 1304057629.0, "step": 25040 }, { "entropy": 0.6032157134264707, "epoch": 1.8769289500001172, "grad_norm": 0.21705026924610138, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.8217382688075304, "num_tokens": 1305541360.0, "step": 25045 }, { "entropy": 0.5968624206259847, "epoch": 1.877303673933969, "grad_norm": 0.25658953189849854, "learning_rate": 0.0002, "loss": 0.6258, "mean_token_accuracy": 0.8213435553014279, "num_tokens": 1307024669.0, "step": 25050 }, { "entropy": 0.5742761552333832, "epoch": 1.8776783978678209, "grad_norm": 0.25002962350845337, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.8277882855385542, "num_tokens": 1308514985.0, "step": 25055 }, { "entropy": 0.5913353176787496, "epoch": 1.8780531218016727, "grad_norm": 0.22695522010326385, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8236117910593748, "num_tokens": 1309971412.0, "step": 25060 }, { "entropy": 0.5880039168521762, "epoch": 1.8784278457355246, "grad_norm": 0.2861245572566986, "learning_rate": 0.0002, "loss": 0.6258, "mean_token_accuracy": 0.8227381970733404, "num_tokens": 1311460432.0, "step": 25065 }, { "entropy": 0.5761503433808685, "epoch": 1.8788025696693764, "grad_norm": 0.21966992318630219, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.8299084905534982, "num_tokens": 1312929361.0, "step": 25070 }, { "entropy": 0.59389353916049, "epoch": 1.8791772936032283, "grad_norm": 0.23464563488960266, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.8236547585576772, "num_tokens": 1314415166.0, "step": 25075 }, { "entropy": 0.5991126695647836, "epoch": 1.8795520175370801, "grad_norm": 0.2721364498138428, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8216637823730707, "num_tokens": 1315915646.0, "step": 25080 }, { "entropy": 0.5968026535585522, "epoch": 1.879926741470932, "grad_norm": 0.25059056282043457, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.8249635729938746, "num_tokens": 1317410036.0, "step": 25085 }, { "entropy": 0.5754102911800146, "epoch": 1.8803014654047838, "grad_norm": 0.2477468103170395, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.8282362330704928, "num_tokens": 1318886303.0, "step": 25090 }, { "entropy": 0.5873117819428444, "epoch": 1.8806761893386357, "grad_norm": 0.2410281002521515, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.8253479801118374, "num_tokens": 1320388824.0, "step": 25095 }, { "entropy": 0.582881112396717, "epoch": 1.8810509132724875, "grad_norm": 0.28224846720695496, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.8254081629216671, "num_tokens": 1321875767.0, "step": 25100 }, { "entropy": 0.5786128751933575, "epoch": 1.8814256372063394, "grad_norm": 0.22273124754428864, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.8299757946282625, "num_tokens": 1323349403.0, "step": 25105 }, { "entropy": 0.6030887056142091, "epoch": 1.8818003611401912, "grad_norm": 0.2390497773885727, "learning_rate": 0.0002, "loss": 0.6324, "mean_token_accuracy": 0.8200109455734491, "num_tokens": 1324837674.0, "step": 25110 }, { "entropy": 0.614547174423933, "epoch": 1.882175085074043, "grad_norm": 0.22291457653045654, "learning_rate": 0.0002, "loss": 0.634, "mean_token_accuracy": 0.8173723381012679, "num_tokens": 1326324203.0, "step": 25115 }, { "entropy": 0.5857685565948486, "epoch": 1.882549809007895, "grad_norm": 0.25351032614707947, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.8259515255689621, "num_tokens": 1327774874.0, "step": 25120 }, { "entropy": 0.592725171521306, "epoch": 1.8829245329417468, "grad_norm": 0.24113404750823975, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8246064063161611, "num_tokens": 1329232429.0, "step": 25125 }, { "entropy": 0.6162686072289943, "epoch": 1.8832992568755986, "grad_norm": 0.2441011518239975, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.8235144525766372, "num_tokens": 1330681821.0, "step": 25130 }, { "entropy": 0.6044587921351194, "epoch": 1.8836739808094505, "grad_norm": 0.2238425612449646, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.8252132169902324, "num_tokens": 1332153007.0, "step": 25135 }, { "entropy": 0.6235697465017438, "epoch": 1.8840487047433023, "grad_norm": 0.25770923495292664, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.8209517400711774, "num_tokens": 1333638963.0, "step": 25140 }, { "entropy": 0.5613535827025771, "epoch": 1.8844234286771542, "grad_norm": 0.2498772144317627, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.8330988023430109, "num_tokens": 1335079206.0, "step": 25145 }, { "entropy": 0.5906686782836914, "epoch": 1.884798152611006, "grad_norm": 0.2449447363615036, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8257297839969396, "num_tokens": 1336568231.0, "step": 25150 }, { "entropy": 0.5986617246642709, "epoch": 1.8851728765448579, "grad_norm": 0.2558460533618927, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8221878875046968, "num_tokens": 1338016158.0, "step": 25155 }, { "entropy": 0.6036085365340114, "epoch": 1.8855476004787097, "grad_norm": 0.2329273223876953, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8227381065487862, "num_tokens": 1339511943.0, "step": 25160 }, { "entropy": 0.5965175064280629, "epoch": 1.8859223244125616, "grad_norm": 0.22689706087112427, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.8281743597239256, "num_tokens": 1340974268.0, "step": 25165 }, { "entropy": 0.6227826351299882, "epoch": 1.8862970483464134, "grad_norm": 0.2222214639186859, "learning_rate": 0.0002, "loss": 0.6346, "mean_token_accuracy": 0.8191988445818424, "num_tokens": 1342446204.0, "step": 25170 }, { "entropy": 0.6057692481204867, "epoch": 1.8866717722802653, "grad_norm": 0.23413649201393127, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.8263875301927328, "num_tokens": 1343926195.0, "step": 25175 }, { "entropy": 0.6171077473089099, "epoch": 1.8870464962141171, "grad_norm": 0.22731205821037292, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.82427624091506, "num_tokens": 1345409522.0, "step": 25180 }, { "entropy": 0.6184850102290511, "epoch": 1.887421220147969, "grad_norm": 0.23855160176753998, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8264931708574295, "num_tokens": 1346900306.0, "step": 25185 }, { "entropy": 0.6121781649067998, "epoch": 1.8877959440818208, "grad_norm": 0.22182251513004303, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8249945923686027, "num_tokens": 1348355758.0, "step": 25190 }, { "entropy": 0.5975368585437536, "epoch": 1.8881706680156727, "grad_norm": 0.2664547264575958, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8278205659240484, "num_tokens": 1349799214.0, "step": 25195 }, { "entropy": 0.5868489637970924, "epoch": 1.8885453919495245, "grad_norm": 0.2692663073539734, "learning_rate": 0.0002, "loss": 0.6115, "mean_token_accuracy": 0.8254497531801462, "num_tokens": 1351232726.0, "step": 25200 }, { "entropy": 0.6023152258247138, "epoch": 1.8889201158833764, "grad_norm": 0.23307257890701294, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8225822720676661, "num_tokens": 1352677082.0, "step": 25205 }, { "entropy": 0.5994515128433704, "epoch": 1.8892948398172285, "grad_norm": 0.26849788427352905, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8246100839227438, "num_tokens": 1354148592.0, "step": 25210 }, { "entropy": 0.5925964068621397, "epoch": 1.8896695637510803, "grad_norm": 0.22046086192131042, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.8246294811367989, "num_tokens": 1355657219.0, "step": 25215 }, { "entropy": 0.5827610794454813, "epoch": 1.8900442876849322, "grad_norm": 0.2712383568286896, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.828063553571701, "num_tokens": 1357121825.0, "step": 25220 }, { "entropy": 0.588108767569065, "epoch": 1.890419011618784, "grad_norm": 0.26275908946990967, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.8289320506155491, "num_tokens": 1358589805.0, "step": 25225 }, { "entropy": 0.5908537019044161, "epoch": 1.8907937355526359, "grad_norm": 0.21938784420490265, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8252327617257833, "num_tokens": 1360070395.0, "step": 25230 }, { "entropy": 0.5940032638609409, "epoch": 1.8911684594864877, "grad_norm": 0.27800121903419495, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.8200702793896198, "num_tokens": 1361549452.0, "step": 25235 }, { "entropy": 0.5840416487306357, "epoch": 1.8915431834203396, "grad_norm": 0.2553457021713257, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8269444786012172, "num_tokens": 1362994671.0, "step": 25240 }, { "entropy": 0.6031862944364548, "epoch": 1.8919179073541914, "grad_norm": 0.24162597954273224, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.8245837923139334, "num_tokens": 1364470113.0, "step": 25245 }, { "entropy": 0.5832859462127089, "epoch": 1.8922926312880433, "grad_norm": 0.2337033748626709, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.8259010013192892, "num_tokens": 1365942535.0, "step": 25250 }, { "entropy": 0.5957572016865015, "epoch": 1.892667355221895, "grad_norm": 0.25194427371025085, "learning_rate": 0.0002, "loss": 0.6125, "mean_token_accuracy": 0.8260795805603266, "num_tokens": 1367384975.0, "step": 25255 }, { "entropy": 0.5978129243478179, "epoch": 1.893042079155747, "grad_norm": 0.2349550873041153, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.8241118602454662, "num_tokens": 1368893249.0, "step": 25260 }, { "entropy": 0.5883857188746333, "epoch": 1.8934168030895988, "grad_norm": 0.2146751582622528, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.8253702506422996, "num_tokens": 1370358657.0, "step": 25265 }, { "entropy": 0.5902678605169058, "epoch": 1.8937915270234507, "grad_norm": 0.2144956737756729, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.8270495973527432, "num_tokens": 1371842918.0, "step": 25270 }, { "entropy": 0.6030077932402491, "epoch": 1.8941662509573025, "grad_norm": 0.2809559404850006, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.826240087300539, "num_tokens": 1373263634.0, "step": 25275 }, { "entropy": 0.6061123291030526, "epoch": 1.8945409748911544, "grad_norm": 0.2681557238101959, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.8237202942371369, "num_tokens": 1374695051.0, "step": 25280 }, { "entropy": 0.5941149963065981, "epoch": 1.8949156988250062, "grad_norm": 0.2245340645313263, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.8253740169107914, "num_tokens": 1376185491.0, "step": 25285 }, { "entropy": 0.6146759133785963, "epoch": 1.8952904227588583, "grad_norm": 0.30392855405807495, "learning_rate": 0.0002, "loss": 0.6278, "mean_token_accuracy": 0.8185777183622122, "num_tokens": 1377669796.0, "step": 25290 }, { "entropy": 0.5954269383102655, "epoch": 1.8956651466927101, "grad_norm": 0.23312099277973175, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8231364887207746, "num_tokens": 1379150413.0, "step": 25295 }, { "entropy": 0.6116126120090485, "epoch": 1.896039870626562, "grad_norm": 0.2636493444442749, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.822585878893733, "num_tokens": 1380627546.0, "step": 25300 }, { "entropy": 0.6077557465061545, "epoch": 1.8964145945604138, "grad_norm": 0.22633863985538483, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.8219490069895983, "num_tokens": 1382089058.0, "step": 25305 }, { "entropy": 0.605663456581533, "epoch": 1.8967893184942657, "grad_norm": 0.228026881814003, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.8278830025345087, "num_tokens": 1383546790.0, "step": 25310 }, { "entropy": 0.5994147600606083, "epoch": 1.8971640424281175, "grad_norm": 0.2227364480495453, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8237317487597465, "num_tokens": 1385013780.0, "step": 25315 }, { "entropy": 0.5963879944756627, "epoch": 1.8975387663619694, "grad_norm": 0.45973530411720276, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.823594669625163, "num_tokens": 1386462587.0, "step": 25320 }, { "entropy": 0.5810660695657134, "epoch": 1.8979134902958212, "grad_norm": 0.22622054815292358, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8273158580064773, "num_tokens": 1387905642.0, "step": 25325 }, { "entropy": 0.5827225273475051, "epoch": 1.898288214229673, "grad_norm": 0.28104260563850403, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8216739274561405, "num_tokens": 1389385097.0, "step": 25330 }, { "entropy": 0.5669077806174755, "epoch": 1.898662938163525, "grad_norm": 0.2591700851917267, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.8308661565184593, "num_tokens": 1390900581.0, "step": 25335 }, { "entropy": 0.580938683822751, "epoch": 1.8990376620973768, "grad_norm": 0.23265448212623596, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.8268510308116674, "num_tokens": 1392390614.0, "step": 25340 }, { "entropy": 0.5793154604732991, "epoch": 1.8994123860312286, "grad_norm": 0.23446547985076904, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8273093599826098, "num_tokens": 1393844960.0, "step": 25345 }, { "entropy": 0.60304213501513, "epoch": 1.8997871099650805, "grad_norm": 0.2224300056695938, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.8219745364040136, "num_tokens": 1395301315.0, "step": 25350 }, { "entropy": 0.6078013427555561, "epoch": 1.9001618338989323, "grad_norm": 0.23144829273223877, "learning_rate": 0.0002, "loss": 0.63, "mean_token_accuracy": 0.8207441486418248, "num_tokens": 1396767379.0, "step": 25355 }, { "entropy": 0.6103266617283225, "epoch": 1.9005365578327842, "grad_norm": 0.30764883756637573, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8281295720487833, "num_tokens": 1398233264.0, "step": 25360 }, { "entropy": 0.6071938427165151, "epoch": 1.900911281766636, "grad_norm": 0.2507613003253937, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.8216642189770937, "num_tokens": 1399679886.0, "step": 25365 }, { "entropy": 0.5990001620724797, "epoch": 1.901286005700488, "grad_norm": 0.23149417340755463, "learning_rate": 0.0002, "loss": 0.6059, "mean_token_accuracy": 0.8276969846338034, "num_tokens": 1401119939.0, "step": 25370 }, { "entropy": 0.5944688394665718, "epoch": 1.9016607296343397, "grad_norm": 0.24140872061252594, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.8242877665907145, "num_tokens": 1402580887.0, "step": 25375 }, { "entropy": 0.5908888524398208, "epoch": 1.9020354535681916, "grad_norm": 0.21744230389595032, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8268540278077126, "num_tokens": 1404061959.0, "step": 25380 }, { "entropy": 0.6169875800609589, "epoch": 1.9024101775020434, "grad_norm": 0.26422667503356934, "learning_rate": 0.0002, "loss": 0.6408, "mean_token_accuracy": 0.8215350441634655, "num_tokens": 1405527946.0, "step": 25385 }, { "entropy": 0.5844759847968817, "epoch": 1.9027849014358953, "grad_norm": 0.22942404448986053, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.8238435883074999, "num_tokens": 1407001600.0, "step": 25390 }, { "entropy": 0.5893150420859457, "epoch": 1.9031596253697471, "grad_norm": 0.21895946562290192, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8274295203387737, "num_tokens": 1408436933.0, "step": 25395 }, { "entropy": 0.5869859570637346, "epoch": 1.903534349303599, "grad_norm": 0.3580193817615509, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.8250449202954769, "num_tokens": 1409957383.0, "step": 25400 }, { "entropy": 0.573896761238575, "epoch": 1.9039090732374508, "grad_norm": 0.25527170300483704, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.824926209077239, "num_tokens": 1411444826.0, "step": 25405 }, { "entropy": 0.5731947788968682, "epoch": 1.9042837971713027, "grad_norm": 0.2135271579027176, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.8265456896275282, "num_tokens": 1412903450.0, "step": 25410 }, { "entropy": 0.5850680677220226, "epoch": 1.9046585211051545, "grad_norm": 0.23520274460315704, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.825556631013751, "num_tokens": 1414370905.0, "step": 25415 }, { "entropy": 0.590957990847528, "epoch": 1.9050332450390064, "grad_norm": 0.24019332230091095, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.8214722096920013, "num_tokens": 1415809644.0, "step": 25420 }, { "entropy": 0.5871123405173421, "epoch": 1.9054079689728582, "grad_norm": 0.21711742877960205, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.8205826949328184, "num_tokens": 1417281592.0, "step": 25425 }, { "entropy": 0.5794287230819464, "epoch": 1.90578269290671, "grad_norm": 0.26259419322013855, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8234323795884848, "num_tokens": 1418760511.0, "step": 25430 }, { "entropy": 0.5911189330741763, "epoch": 1.906157416840562, "grad_norm": 0.23252296447753906, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.8243596568703652, "num_tokens": 1420249499.0, "step": 25435 }, { "entropy": 0.580801079235971, "epoch": 1.9065321407744138, "grad_norm": 0.22350746393203735, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.8232195559889078, "num_tokens": 1421714293.0, "step": 25440 }, { "entropy": 0.5762186951935291, "epoch": 1.9069068647082656, "grad_norm": 0.2416691929101944, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8261392202228308, "num_tokens": 1423196599.0, "step": 25445 }, { "entropy": 0.5809514785185457, "epoch": 1.9072815886421175, "grad_norm": 0.2081904113292694, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.8258631877601147, "num_tokens": 1424696677.0, "step": 25450 }, { "entropy": 0.5860581999644637, "epoch": 1.9076563125759693, "grad_norm": 0.27246227860450745, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.8211572144180537, "num_tokens": 1426198118.0, "step": 25455 }, { "entropy": 0.5972066286951303, "epoch": 1.9080310365098212, "grad_norm": 0.2299894243478775, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.8192323114722967, "num_tokens": 1427669520.0, "step": 25460 }, { "entropy": 0.5757892470806837, "epoch": 1.908405760443673, "grad_norm": 0.2168234884738922, "learning_rate": 0.0002, "loss": 0.6, "mean_token_accuracy": 0.8255556643009185, "num_tokens": 1429159892.0, "step": 25465 }, { "entropy": 0.5864704057574273, "epoch": 1.908780484377525, "grad_norm": 0.31412264704704285, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.8201960559934378, "num_tokens": 1430627721.0, "step": 25470 }, { "entropy": 0.6073621252551675, "epoch": 1.9091552083113768, "grad_norm": 0.25173547863960266, "learning_rate": 0.0002, "loss": 0.6438, "mean_token_accuracy": 0.8196699265390635, "num_tokens": 1432147916.0, "step": 25475 }, { "entropy": 0.6064713453873992, "epoch": 1.9095299322452286, "grad_norm": 0.31165504455566406, "learning_rate": 0.0002, "loss": 0.6368, "mean_token_accuracy": 0.8211893100291491, "num_tokens": 1433620921.0, "step": 25480 }, { "entropy": 0.5796294951811433, "epoch": 1.9099046561790805, "grad_norm": 0.23374593257904053, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.8282024160027504, "num_tokens": 1435073771.0, "step": 25485 }, { "entropy": 0.5880508353933692, "epoch": 1.9102793801129323, "grad_norm": 0.25223660469055176, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8284515231847763, "num_tokens": 1436548438.0, "step": 25490 }, { "entropy": 0.601642488874495, "epoch": 1.9106541040467842, "grad_norm": 0.2302904725074768, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8266216412186622, "num_tokens": 1437941899.0, "step": 25495 }, { "entropy": 0.5994819475337863, "epoch": 1.911028827980636, "grad_norm": 0.25183239579200745, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8267401020973921, "num_tokens": 1439397928.0, "step": 25500 }, { "entropy": 0.5980060556903481, "epoch": 1.9114035519144879, "grad_norm": 0.31845784187316895, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.8298162762075663, "num_tokens": 1440882982.0, "step": 25505 }, { "entropy": 0.6083921547979116, "epoch": 1.9117782758483397, "grad_norm": 0.29755017161369324, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.8217012155801058, "num_tokens": 1442340714.0, "step": 25510 }, { "entropy": 0.6020858036354184, "epoch": 1.9121529997821916, "grad_norm": 0.24298571050167084, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.8228733140975237, "num_tokens": 1443813427.0, "step": 25515 }, { "entropy": 0.6178229179233312, "epoch": 1.9125277237160436, "grad_norm": 0.24535422027111053, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.8213480267673731, "num_tokens": 1445288100.0, "step": 25520 }, { "entropy": 0.5920557180419564, "epoch": 1.9129024476498955, "grad_norm": 0.354818731546402, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8256340127438306, "num_tokens": 1446752585.0, "step": 25525 }, { "entropy": 0.6050641629844904, "epoch": 1.9132771715837473, "grad_norm": 0.24628673493862152, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.8225464217364788, "num_tokens": 1448253814.0, "step": 25530 }, { "entropy": 0.601188007555902, "epoch": 1.9136518955175992, "grad_norm": 0.23876823484897614, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.8234363477677107, "num_tokens": 1449738059.0, "step": 25535 }, { "entropy": 0.6196513008326292, "epoch": 1.914026619451451, "grad_norm": 0.24990366399288177, "learning_rate": 0.0002, "loss": 0.6403, "mean_token_accuracy": 0.8201347455382347, "num_tokens": 1451234220.0, "step": 25540 }, { "entropy": 0.5921712934970855, "epoch": 1.9144013433853029, "grad_norm": 0.23695050179958344, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8241920195519924, "num_tokens": 1452716558.0, "step": 25545 }, { "entropy": 0.6130012512207031, "epoch": 1.9147760673191547, "grad_norm": 0.24437007308006287, "learning_rate": 0.0002, "loss": 0.6324, "mean_token_accuracy": 0.8201507352292537, "num_tokens": 1454177796.0, "step": 25550 }, { "entropy": 0.6026687387377023, "epoch": 1.9151507912530066, "grad_norm": 0.25533658266067505, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.8256284814327955, "num_tokens": 1455660151.0, "step": 25555 }, { "entropy": 0.5963298672810197, "epoch": 1.9155255151868584, "grad_norm": 0.239890918135643, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8263048011809587, "num_tokens": 1457104835.0, "step": 25560 }, { "entropy": 0.5876923207193613, "epoch": 1.9159002391207103, "grad_norm": 0.21911774575710297, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.823461914435029, "num_tokens": 1458581329.0, "step": 25565 }, { "entropy": 0.5859302120283246, "epoch": 1.9162749630545621, "grad_norm": 0.24995458126068115, "learning_rate": 0.0002, "loss": 0.6046, "mean_token_accuracy": 0.8293785993009806, "num_tokens": 1460057054.0, "step": 25570 }, { "entropy": 0.5963035464286804, "epoch": 1.916649686988414, "grad_norm": 0.23896875977516174, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8297320619225502, "num_tokens": 1461505496.0, "step": 25575 }, { "entropy": 0.6234509279951453, "epoch": 1.9170244109222658, "grad_norm": 0.2634914815425873, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.8212934996932745, "num_tokens": 1462991672.0, "step": 25580 }, { "entropy": 0.5958159729838371, "epoch": 1.9173991348561177, "grad_norm": 0.2503279447555542, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.8266368512064218, "num_tokens": 1464412589.0, "step": 25585 }, { "entropy": 0.5875166300684214, "epoch": 1.9177738587899695, "grad_norm": 0.2833063304424286, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8257477011531591, "num_tokens": 1465879697.0, "step": 25590 }, { "entropy": 0.5893741071224212, "epoch": 1.9181485827238214, "grad_norm": 0.2413821816444397, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8221629880368709, "num_tokens": 1467389523.0, "step": 25595 }, { "entropy": 0.5784184144809842, "epoch": 1.9185233066576735, "grad_norm": 0.2543241083621979, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.8254987455904483, "num_tokens": 1468852827.0, "step": 25600 }, { "entropy": 0.5874075904488564, "epoch": 1.9188980305915253, "grad_norm": 0.25524768233299255, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.8238630063831807, "num_tokens": 1470334824.0, "step": 25605 }, { "entropy": 0.5882811438292265, "epoch": 1.9192727545253772, "grad_norm": 0.2500133216381073, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.8231612581759691, "num_tokens": 1471826757.0, "step": 25610 }, { "entropy": 0.5775259802117944, "epoch": 1.919647478459229, "grad_norm": 0.27372226119041443, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.8269838962703944, "num_tokens": 1473288125.0, "step": 25615 }, { "entropy": 0.5823035245761276, "epoch": 1.9200222023930809, "grad_norm": 0.24324198067188263, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8253226447850466, "num_tokens": 1474723735.0, "step": 25620 }, { "entropy": 0.5857744446024299, "epoch": 1.9203969263269327, "grad_norm": 0.255340039730072, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.8249082151800394, "num_tokens": 1476205287.0, "step": 25625 }, { "entropy": 0.56366101577878, "epoch": 1.9207716502607846, "grad_norm": 0.2658378779888153, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.8255032408982516, "num_tokens": 1477669824.0, "step": 25630 }, { "entropy": 0.5668471079319716, "epoch": 1.9211463741946364, "grad_norm": 0.23569776117801666, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.8238196633756161, "num_tokens": 1479161275.0, "step": 25635 }, { "entropy": 0.5673710001632571, "epoch": 1.9215210981284883, "grad_norm": 0.27445223927497864, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.8213906679302454, "num_tokens": 1480635577.0, "step": 25640 }, { "entropy": 0.568426126986742, "epoch": 1.92189582206234, "grad_norm": 0.2733519971370697, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.8235320001840591, "num_tokens": 1482109523.0, "step": 25645 }, { "entropy": 0.5705672172829509, "epoch": 1.922270545996192, "grad_norm": 0.2199220061302185, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8265011139214039, "num_tokens": 1483595455.0, "step": 25650 }, { "entropy": 0.555936967767775, "epoch": 1.9226452699300438, "grad_norm": 0.24953950941562653, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.826276034861803, "num_tokens": 1485026212.0, "step": 25655 }, { "entropy": 0.5736785635352135, "epoch": 1.9230199938638957, "grad_norm": 0.2526644766330719, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.8221217397600412, "num_tokens": 1486517514.0, "step": 25660 }, { "entropy": 0.5839030904695391, "epoch": 1.9233947177977475, "grad_norm": 0.2825617492198944, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.8264031562954187, "num_tokens": 1487955474.0, "step": 25665 }, { "entropy": 0.5978593964129686, "epoch": 1.9237694417315994, "grad_norm": 0.27760815620422363, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.823248852044344, "num_tokens": 1489447904.0, "step": 25670 }, { "entropy": 0.5884196845814585, "epoch": 1.9241441656654512, "grad_norm": 0.24013084173202515, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.823550245910883, "num_tokens": 1490959225.0, "step": 25675 }, { "entropy": 0.5887851325795055, "epoch": 1.924518889599303, "grad_norm": 0.2603825628757477, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.8270344588905573, "num_tokens": 1492426664.0, "step": 25680 }, { "entropy": 0.602082341350615, "epoch": 1.924893613533155, "grad_norm": 0.2512229382991791, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.8241531290113926, "num_tokens": 1493914234.0, "step": 25685 }, { "entropy": 0.589099970459938, "epoch": 1.9252683374670068, "grad_norm": 0.2910650968551636, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.8269766174256802, "num_tokens": 1495335189.0, "step": 25690 }, { "entropy": 0.5784912057220936, "epoch": 1.9256430614008586, "grad_norm": 0.2800990641117096, "learning_rate": 0.0002, "loss": 0.6074, "mean_token_accuracy": 0.8246557455509901, "num_tokens": 1496775250.0, "step": 25695 }, { "entropy": 0.5710781496018171, "epoch": 1.9260177853347105, "grad_norm": 0.2330835461616516, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.827728133648634, "num_tokens": 1498241898.0, "step": 25700 }, { "entropy": 0.5679316753521562, "epoch": 1.9263925092685623, "grad_norm": 0.23290005326271057, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8260374080389739, "num_tokens": 1499713830.0, "step": 25705 }, { "entropy": 0.5726318296045065, "epoch": 1.9267672332024142, "grad_norm": 0.2548713982105255, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.8244679637253285, "num_tokens": 1501215401.0, "step": 25710 }, { "entropy": 0.5786132177338004, "epoch": 1.927141957136266, "grad_norm": 0.25742122530937195, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.8246420677751303, "num_tokens": 1502732094.0, "step": 25715 }, { "entropy": 0.5764339003711939, "epoch": 1.9275166810701179, "grad_norm": 0.24361062049865723, "learning_rate": 0.0002, "loss": 0.615, "mean_token_accuracy": 0.8251828026026488, "num_tokens": 1504164857.0, "step": 25720 }, { "entropy": 0.593359711766243, "epoch": 1.9278914050039697, "grad_norm": 0.23245972394943237, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.8244379471987486, "num_tokens": 1505621467.0, "step": 25725 }, { "entropy": 0.6092345491051674, "epoch": 1.9282661289378216, "grad_norm": 0.21277089416980743, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.8224469806998969, "num_tokens": 1507136785.0, "step": 25730 }, { "entropy": 0.6125331232324243, "epoch": 1.9286408528716734, "grad_norm": 0.22212806344032288, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8223191950470209, "num_tokens": 1508585074.0, "step": 25735 }, { "entropy": 0.6132519327104091, "epoch": 1.9290155768055253, "grad_norm": 0.23904873430728912, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.8225095268338919, "num_tokens": 1510040503.0, "step": 25740 }, { "entropy": 0.6107864892110229, "epoch": 1.9293903007393771, "grad_norm": 0.25278377532958984, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.8231151614338159, "num_tokens": 1511490200.0, "step": 25745 }, { "entropy": 0.587964010052383, "epoch": 1.929765024673229, "grad_norm": 0.23399588465690613, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.830345668643713, "num_tokens": 1512964287.0, "step": 25750 }, { "entropy": 0.5938050577417016, "epoch": 1.9301397486070808, "grad_norm": 0.2547435760498047, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.8239239986985922, "num_tokens": 1514432578.0, "step": 25755 }, { "entropy": 0.5965510440990329, "epoch": 1.9305144725409327, "grad_norm": 0.23764915764331818, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.8224544141441583, "num_tokens": 1515965042.0, "step": 25760 }, { "entropy": 0.5940247934311629, "epoch": 1.9308891964747845, "grad_norm": 0.24106456339359283, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.8225108958780766, "num_tokens": 1517432981.0, "step": 25765 }, { "entropy": 0.5804778099060058, "epoch": 1.9312639204086364, "grad_norm": 0.25489211082458496, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.825439064949751, "num_tokens": 1518907994.0, "step": 25770 }, { "entropy": 0.6083348777145148, "epoch": 1.9316386443424882, "grad_norm": 0.22885753214359283, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.8236935988068581, "num_tokens": 1520385658.0, "step": 25775 }, { "entropy": 0.5884446188807487, "epoch": 1.93201336827634, "grad_norm": 0.22484293580055237, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8234231431037188, "num_tokens": 1521874085.0, "step": 25780 }, { "entropy": 0.5989419998601079, "epoch": 1.932388092210192, "grad_norm": 0.24299435317516327, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.8226814094930888, "num_tokens": 1523322480.0, "step": 25785 }, { "entropy": 0.5861924480646848, "epoch": 1.9327628161440438, "grad_norm": 0.23713381588459015, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8270346097648144, "num_tokens": 1524816614.0, "step": 25790 }, { "entropy": 0.5885693999007344, "epoch": 1.9331375400778956, "grad_norm": 0.26795846223831177, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8232842333614826, "num_tokens": 1526245924.0, "step": 25795 }, { "entropy": 0.5812580633908511, "epoch": 1.9335122640117475, "grad_norm": 0.26708418130874634, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8274386584758758, "num_tokens": 1527700724.0, "step": 25800 }, { "entropy": 0.5890687089413404, "epoch": 1.9338869879455993, "grad_norm": 0.2635115385055542, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.8214336805045604, "num_tokens": 1472771.0, "step": 25805 }, { "entropy": 0.5948664719238878, "epoch": 1.9342617118794512, "grad_norm": 0.2481255978345871, "learning_rate": 0.0002, "loss": 0.6206, "mean_token_accuracy": 0.824547692015767, "num_tokens": 2941751.0, "step": 25810 }, { "entropy": 0.6005215782672166, "epoch": 1.934636435813303, "grad_norm": 0.26929306983947754, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.8210404314100742, "num_tokens": 4414490.0, "step": 25815 }, { "entropy": 0.5948374869301916, "epoch": 1.9350111597471549, "grad_norm": 0.22132273018360138, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.8253141429275275, "num_tokens": 5859070.0, "step": 25820 }, { "entropy": 0.5855232944712043, "epoch": 1.9353858836810067, "grad_norm": 0.2231445163488388, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8236203972250223, "num_tokens": 7285575.0, "step": 25825 }, { "entropy": 0.5766067689284682, "epoch": 1.9357606076148586, "grad_norm": 0.24233651161193848, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.8240337878465652, "num_tokens": 8774844.0, "step": 25830 }, { "entropy": 0.5852965768426657, "epoch": 1.9361353315487106, "grad_norm": 0.2396961748600006, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.8249766379594803, "num_tokens": 10213364.0, "step": 25835 }, { "entropy": 0.5976591976359487, "epoch": 1.9365100554825625, "grad_norm": 0.2503151297569275, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8225820478051901, "num_tokens": 11687732.0, "step": 25840 }, { "entropy": 0.5920827055349946, "epoch": 1.9368847794164143, "grad_norm": 0.23596793413162231, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.8263189852237701, "num_tokens": 13177873.0, "step": 25845 }, { "entropy": 0.5957056455314159, "epoch": 1.9372595033502662, "grad_norm": 0.23165695369243622, "learning_rate": 0.0002, "loss": 0.6183, "mean_token_accuracy": 0.8223624773323536, "num_tokens": 14663830.0, "step": 25850 }, { "entropy": 0.5882081162184477, "epoch": 1.937634227284118, "grad_norm": 0.22647042572498322, "learning_rate": 0.0002, "loss": 0.6074, "mean_token_accuracy": 0.8246498759835958, "num_tokens": 16105608.0, "step": 25855 }, { "entropy": 0.594187879562378, "epoch": 1.93800895121797, "grad_norm": 0.31708815693855286, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.8251105576753617, "num_tokens": 17545016.0, "step": 25860 }, { "entropy": 0.6032039152458311, "epoch": 1.9383836751518218, "grad_norm": 0.27270257472991943, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8253863316029311, "num_tokens": 19028810.0, "step": 25865 }, { "entropy": 0.6011664073914289, "epoch": 1.9387583990856736, "grad_norm": 0.2213815301656723, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.827291925996542, "num_tokens": 20494313.0, "step": 25870 }, { "entropy": 0.6244247762486339, "epoch": 1.9391331230195255, "grad_norm": 0.2324940264225006, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8230885572731494, "num_tokens": 22017863.0, "step": 25875 }, { "entropy": 0.6019473744556307, "epoch": 1.9395078469533773, "grad_norm": 0.24248917400836945, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.8282840553671121, "num_tokens": 23468741.0, "step": 25880 }, { "entropy": 0.6103999180719256, "epoch": 1.9398825708872292, "grad_norm": 0.2261429727077484, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.8284533075988293, "num_tokens": 24947655.0, "step": 25885 }, { "entropy": 0.6141865886747837, "epoch": 1.940257294821081, "grad_norm": 0.24292908608913422, "learning_rate": 0.0002, "loss": 0.6349, "mean_token_accuracy": 0.8194290407001972, "num_tokens": 26445182.0, "step": 25890 }, { "entropy": 0.5925233921036124, "epoch": 1.9406320187549329, "grad_norm": 0.2845442593097687, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.8263367157429456, "num_tokens": 27930347.0, "step": 25895 }, { "entropy": 0.6149776179343462, "epoch": 1.9410067426887847, "grad_norm": 0.23173998296260834, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.8254602152854205, "num_tokens": 29440746.0, "step": 25900 }, { "entropy": 0.607605971582234, "epoch": 1.9413814666226366, "grad_norm": 0.27074530720710754, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8250109113752842, "num_tokens": 30935717.0, "step": 25905 }, { "entropy": 0.6083950651809573, "epoch": 1.9417561905564886, "grad_norm": 0.24382780492305756, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8224107008427382, "num_tokens": 32396844.0, "step": 25910 }, { "entropy": 0.5971226915717125, "epoch": 1.9421309144903405, "grad_norm": 0.20616021752357483, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8256474427878857, "num_tokens": 33850601.0, "step": 25915 }, { "entropy": 0.5937972763553262, "epoch": 1.9425056384241923, "grad_norm": 0.23731814324855804, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.824216378480196, "num_tokens": 35289587.0, "step": 25920 }, { "entropy": 0.5949497781693935, "epoch": 1.9428803623580442, "grad_norm": 0.28293153643608093, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8260234531015158, "num_tokens": 36752218.0, "step": 25925 }, { "entropy": 0.5865465911105275, "epoch": 1.943255086291896, "grad_norm": 0.3011299967765808, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.8302482586354017, "num_tokens": 38210998.0, "step": 25930 }, { "entropy": 0.6093134541064501, "epoch": 1.9436298102257479, "grad_norm": 0.27506354451179504, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.824212907999754, "num_tokens": 39653780.0, "step": 25935 }, { "entropy": 0.592563160136342, "epoch": 1.9440045341595997, "grad_norm": 0.22104531526565552, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.8286864440888166, "num_tokens": 41106801.0, "step": 25940 }, { "entropy": 0.5972993671894073, "epoch": 1.9443792580934516, "grad_norm": 0.23330388963222504, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8216350402683019, "num_tokens": 42564274.0, "step": 25945 }, { "entropy": 0.5704649113118648, "epoch": 1.9447539820273034, "grad_norm": 0.24402210116386414, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.8271330442279577, "num_tokens": 44040371.0, "step": 25950 }, { "entropy": 0.5781120667234063, "epoch": 1.9451287059611553, "grad_norm": 0.21759440004825592, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.8255025077611208, "num_tokens": 45488134.0, "step": 25955 }, { "entropy": 0.5750485962256789, "epoch": 1.9455034298950071, "grad_norm": 0.21882885694503784, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.827984756231308, "num_tokens": 46913519.0, "step": 25960 }, { "entropy": 0.5958069561049342, "epoch": 1.945878153828859, "grad_norm": 0.2454458475112915, "learning_rate": 0.0002, "loss": 0.645, "mean_token_accuracy": 0.8191369462758302, "num_tokens": 48404760.0, "step": 25965 }, { "entropy": 0.582484157755971, "epoch": 1.9462528777627108, "grad_norm": 0.2560737431049347, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.8258166115731, "num_tokens": 49879315.0, "step": 25970 }, { "entropy": 0.5893050774931907, "epoch": 1.9466276016965627, "grad_norm": 0.2573314607143402, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.8222794983536005, "num_tokens": 51362312.0, "step": 25975 }, { "entropy": 0.5879449268803001, "epoch": 1.9470023256304145, "grad_norm": 0.24823996424674988, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8268843423575163, "num_tokens": 52851691.0, "step": 25980 }, { "entropy": 0.5949434069916606, "epoch": 1.9473770495642664, "grad_norm": 0.2663544714450836, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8228618551045657, "num_tokens": 54333459.0, "step": 25985 }, { "entropy": 0.583906333334744, "epoch": 1.9477517734981182, "grad_norm": 0.227411150932312, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8286389339715242, "num_tokens": 55801268.0, "step": 25990 }, { "entropy": 0.5977417752146721, "epoch": 1.94812649743197, "grad_norm": 0.2325288951396942, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.8256944265216589, "num_tokens": 57300798.0, "step": 25995 }, { "entropy": 0.6074775004759431, "epoch": 1.948501221365822, "grad_norm": 0.2337213158607483, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8251435443758964, "num_tokens": 58765343.0, "step": 26000 }, { "entropy": 0.5891720734536647, "epoch": 1.9488759452996738, "grad_norm": 0.2296583652496338, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8273073058575392, "num_tokens": 60250314.0, "step": 26005 }, { "entropy": 0.5851678606122732, "epoch": 1.9492506692335256, "grad_norm": 0.22987410426139832, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8227623861283064, "num_tokens": 61687771.0, "step": 26010 }, { "entropy": 0.5883162124082446, "epoch": 1.9496253931673775, "grad_norm": 0.23908886313438416, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.8272796884179116, "num_tokens": 63152270.0, "step": 26015 }, { "entropy": 0.6059172470122576, "epoch": 1.9500001171012293, "grad_norm": 0.2863325774669647, "learning_rate": 0.0002, "loss": 0.6168, "mean_token_accuracy": 0.8260949660092592, "num_tokens": 64618604.0, "step": 26020 }, { "entropy": 0.5996322488412261, "epoch": 1.9503748410350812, "grad_norm": 0.2796788811683655, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8260500263422728, "num_tokens": 66108305.0, "step": 26025 }, { "entropy": 0.6122660513967275, "epoch": 1.950749564968933, "grad_norm": 0.272449254989624, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.822725472226739, "num_tokens": 67598742.0, "step": 26030 }, { "entropy": 0.6059251520782709, "epoch": 1.9511242889027849, "grad_norm": 0.2669229805469513, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8249306533485651, "num_tokens": 69082479.0, "step": 26035 }, { "entropy": 0.605781814083457, "epoch": 1.9514990128366367, "grad_norm": 0.3035043478012085, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.8261829152703285, "num_tokens": 70546810.0, "step": 26040 }, { "entropy": 0.6082359384745359, "epoch": 1.9518737367704886, "grad_norm": 0.2612118721008301, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8226664677262306, "num_tokens": 72059096.0, "step": 26045 }, { "entropy": 0.5990720603615045, "epoch": 1.9522484607043404, "grad_norm": 0.25964316725730896, "learning_rate": 0.0002, "loss": 0.615, "mean_token_accuracy": 0.8247035779058933, "num_tokens": 73527111.0, "step": 26050 }, { "entropy": 0.6084329338744283, "epoch": 1.9526231846381923, "grad_norm": 0.2579631209373474, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8181900296360254, "num_tokens": 75060482.0, "step": 26055 }, { "entropy": 0.6127833051607012, "epoch": 1.9529979085720441, "grad_norm": 0.2849763035774231, "learning_rate": 0.0002, "loss": 0.641, "mean_token_accuracy": 0.8200882367789746, "num_tokens": 76539925.0, "step": 26060 }, { "entropy": 0.6074942788109183, "epoch": 1.953372632505896, "grad_norm": 0.25651979446411133, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.8184791814535857, "num_tokens": 78047965.0, "step": 26065 }, { "entropy": 0.5968333216384053, "epoch": 1.9537473564397478, "grad_norm": 0.23371100425720215, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8243351202458143, "num_tokens": 79490480.0, "step": 26070 }, { "entropy": 0.5937554787844419, "epoch": 1.9541220803735997, "grad_norm": 0.22770793735980988, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8245595060288906, "num_tokens": 80959547.0, "step": 26075 }, { "entropy": 0.5998935882002115, "epoch": 1.9544968043074515, "grad_norm": 0.21130779385566711, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.8253748644143343, "num_tokens": 82442773.0, "step": 26080 }, { "entropy": 0.5777610806748271, "epoch": 1.9548715282413034, "grad_norm": 0.20747217535972595, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8281290985643863, "num_tokens": 83907443.0, "step": 26085 }, { "entropy": 0.5805687246844172, "epoch": 1.9552462521751552, "grad_norm": 0.24696673452854156, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8237863507121801, "num_tokens": 85357337.0, "step": 26090 }, { "entropy": 0.5912977416068316, "epoch": 1.955620976109007, "grad_norm": 0.23288145661354065, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8222731146961451, "num_tokens": 86824693.0, "step": 26095 }, { "entropy": 0.5873122112825513, "epoch": 1.955995700042859, "grad_norm": 0.33203980326652527, "learning_rate": 0.0002, "loss": 0.6258, "mean_token_accuracy": 0.823791005089879, "num_tokens": 88263604.0, "step": 26100 }, { "entropy": 0.5999187413603068, "epoch": 1.9563704239767108, "grad_norm": 0.24210652709007263, "learning_rate": 0.0002, "loss": 0.6419, "mean_token_accuracy": 0.821020046621561, "num_tokens": 89716927.0, "step": 26105 }, { "entropy": 0.5738387359306216, "epoch": 1.9567451479105626, "grad_norm": 0.2925764322280884, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8262517392635346, "num_tokens": 91126513.0, "step": 26110 }, { "entropy": 0.5733636263757944, "epoch": 1.9571198718444145, "grad_norm": 0.2666972577571869, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8260395728051663, "num_tokens": 92569210.0, "step": 26115 }, { "entropy": 0.5842085659503937, "epoch": 1.9574945957782663, "grad_norm": 0.22531573474407196, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.8243949424475432, "num_tokens": 94049580.0, "step": 26120 }, { "entropy": 0.5953168040141463, "epoch": 1.9578693197121182, "grad_norm": 0.24543878436088562, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.8217584427446127, "num_tokens": 95501834.0, "step": 26125 }, { "entropy": 0.5977968372404575, "epoch": 1.95824404364597, "grad_norm": 0.22389137744903564, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.827220081537962, "num_tokens": 96993195.0, "step": 26130 }, { "entropy": 0.6007878357544542, "epoch": 1.958618767579822, "grad_norm": 0.22838471829891205, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.8229058090597391, "num_tokens": 98491016.0, "step": 26135 }, { "entropy": 0.6106753684580326, "epoch": 1.9589934915136737, "grad_norm": 0.2772458791732788, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.8242794468998909, "num_tokens": 99958194.0, "step": 26140 }, { "entropy": 0.5909330511465669, "epoch": 1.9593682154475258, "grad_norm": 0.24152037501335144, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.8248504910618066, "num_tokens": 101410062.0, "step": 26145 }, { "entropy": 0.6085104381665587, "epoch": 1.9597429393813777, "grad_norm": 0.2424718737602234, "learning_rate": 0.0002, "loss": 0.6278, "mean_token_accuracy": 0.8236850075423717, "num_tokens": 102903820.0, "step": 26150 }, { "entropy": 0.5899240907281638, "epoch": 1.9601176633152295, "grad_norm": 0.25681281089782715, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.8285179730504751, "num_tokens": 104362913.0, "step": 26155 }, { "entropy": 0.5934245368465781, "epoch": 1.9604923872490814, "grad_norm": 0.22882050275802612, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8249780561774969, "num_tokens": 105809666.0, "step": 26160 }, { "entropy": 0.5984877122566103, "epoch": 1.9608671111829332, "grad_norm": 0.35495349764823914, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.8248902827501297, "num_tokens": 107364311.0, "step": 26165 }, { "entropy": 0.5830071996897459, "epoch": 1.961241835116785, "grad_norm": 0.3242053985595703, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.8287275835871697, "num_tokens": 108836151.0, "step": 26170 }, { "entropy": 0.5942465217784048, "epoch": 1.961616559050637, "grad_norm": 0.23483175039291382, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.8229913897812366, "num_tokens": 110318819.0, "step": 26175 }, { "entropy": 0.5748112738132477, "epoch": 1.9619912829844888, "grad_norm": 0.24688628315925598, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8273730900138616, "num_tokens": 111809050.0, "step": 26180 }, { "entropy": 0.5732085946947336, "epoch": 1.9623660069183406, "grad_norm": 0.33347412943840027, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.8290322948247194, "num_tokens": 113272915.0, "step": 26185 }, { "entropy": 0.5779432041570545, "epoch": 1.9627407308521925, "grad_norm": 0.2595624327659607, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8223965164273978, "num_tokens": 114716901.0, "step": 26190 }, { "entropy": 0.5874349242076278, "epoch": 1.9631154547860443, "grad_norm": 0.2540680468082428, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.8246492367237807, "num_tokens": 116160909.0, "step": 26195 }, { "entropy": 0.578520299680531, "epoch": 1.9634901787198962, "grad_norm": 0.24625197052955627, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.82503358758986, "num_tokens": 117592738.0, "step": 26200 }, { "entropy": 0.5813098257407546, "epoch": 1.963864902653748, "grad_norm": 0.22985808551311493, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8247170347720385, "num_tokens": 119057248.0, "step": 26205 }, { "entropy": 0.5648361548781395, "epoch": 1.9642396265875999, "grad_norm": 0.2448948174715042, "learning_rate": 0.0002, "loss": 0.6074, "mean_token_accuracy": 0.8277863573282958, "num_tokens": 120540589.0, "step": 26210 }, { "entropy": 0.5786413710564375, "epoch": 1.9646143505214517, "grad_norm": 0.2542787790298462, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.8254399716854095, "num_tokens": 122011203.0, "step": 26215 }, { "entropy": 0.5969409139826893, "epoch": 1.9649890744553038, "grad_norm": 0.2569825351238251, "learning_rate": 0.0002, "loss": 0.6297, "mean_token_accuracy": 0.8209719721227884, "num_tokens": 123448443.0, "step": 26220 }, { "entropy": 0.5675378471612931, "epoch": 1.9653637983891556, "grad_norm": 0.3370172679424286, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.8253618758171797, "num_tokens": 124885945.0, "step": 26225 }, { "entropy": 0.5823122713714838, "epoch": 1.9657385223230075, "grad_norm": 0.22224007546901703, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8251466568559408, "num_tokens": 126372084.0, "step": 26230 }, { "entropy": 0.5879185825586319, "epoch": 1.9661132462568593, "grad_norm": 0.26297029852867126, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.8225628916174174, "num_tokens": 127853630.0, "step": 26235 }, { "entropy": 0.5845142204314471, "epoch": 1.9664879701907112, "grad_norm": 0.24594436585903168, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.8263237461447716, "num_tokens": 129325072.0, "step": 26240 }, { "entropy": 0.5965312417596579, "epoch": 1.966862694124563, "grad_norm": 0.28922104835510254, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.8271456729620695, "num_tokens": 130769582.0, "step": 26245 }, { "entropy": 0.6036937665194273, "epoch": 1.967237418058415, "grad_norm": 0.23025548458099365, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.820811940357089, "num_tokens": 132221380.0, "step": 26250 }, { "entropy": 0.5883490588515997, "epoch": 1.9676121419922667, "grad_norm": 0.2907620370388031, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.8291618980467319, "num_tokens": 133702302.0, "step": 26255 }, { "entropy": 0.5919439984485507, "epoch": 1.9679868659261186, "grad_norm": 0.261481374502182, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.8254494834691286, "num_tokens": 135186295.0, "step": 26260 }, { "entropy": 0.5910911843180656, "epoch": 1.9683615898599705, "grad_norm": 0.25093212723731995, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.8254299696534872, "num_tokens": 136666380.0, "step": 26265 }, { "entropy": 0.5961570540443063, "epoch": 1.9687363137938223, "grad_norm": 0.22955575585365295, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.8269493315368891, "num_tokens": 138142689.0, "step": 26270 }, { "entropy": 0.5914493488147855, "epoch": 1.9691110377276742, "grad_norm": 0.22581180930137634, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8240629807114601, "num_tokens": 139616586.0, "step": 26275 }, { "entropy": 0.5915333535522223, "epoch": 1.969485761661526, "grad_norm": 0.2385571002960205, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8241822980344296, "num_tokens": 141113882.0, "step": 26280 }, { "entropy": 0.5891587786376477, "epoch": 1.9698604855953779, "grad_norm": 0.23064643144607544, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.8240341305732727, "num_tokens": 142563109.0, "step": 26285 }, { "entropy": 0.5978619046509266, "epoch": 1.9702352095292297, "grad_norm": 0.25717782974243164, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.8268027894198895, "num_tokens": 144019454.0, "step": 26290 }, { "entropy": 0.5974737469106912, "epoch": 1.9706099334630816, "grad_norm": 0.261447548866272, "learning_rate": 0.0002, "loss": 0.6125, "mean_token_accuracy": 0.8237415857613086, "num_tokens": 145450870.0, "step": 26295 }, { "entropy": 0.5999531116336584, "epoch": 1.9709846573969334, "grad_norm": 0.25964218378067017, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.8215520277619361, "num_tokens": 146942814.0, "step": 26300 }, { "entropy": 0.5840532071888447, "epoch": 1.9713593813307853, "grad_norm": 0.24745582044124603, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8248787347227335, "num_tokens": 148476921.0, "step": 26305 }, { "entropy": 0.6056008210405708, "epoch": 1.971734105264637, "grad_norm": 0.2749038636684418, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.8213661354035139, "num_tokens": 149953169.0, "step": 26310 }, { "entropy": 0.5904569588601589, "epoch": 1.972108829198489, "grad_norm": 0.2721565067768097, "learning_rate": 0.0002, "loss": 0.6039, "mean_token_accuracy": 0.8276990227401256, "num_tokens": 151422773.0, "step": 26315 }, { "entropy": 0.5918485384434462, "epoch": 1.9724835531323408, "grad_norm": 0.22334161400794983, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8252490606158972, "num_tokens": 152921704.0, "step": 26320 }, { "entropy": 0.6027303284034133, "epoch": 1.9728582770661927, "grad_norm": 0.22793860733509064, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8259943630546331, "num_tokens": 154384893.0, "step": 26325 }, { "entropy": 0.5866541758179664, "epoch": 1.9732330010000445, "grad_norm": 0.2408718466758728, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8255648095160723, "num_tokens": 155836756.0, "step": 26330 }, { "entropy": 0.6023217024281621, "epoch": 1.9736077249338964, "grad_norm": 0.356512188911438, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.8257626127451658, "num_tokens": 157349685.0, "step": 26335 }, { "entropy": 0.5909181552007794, "epoch": 1.9739824488677482, "grad_norm": 0.23088006675243378, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.8251981813460588, "num_tokens": 158835432.0, "step": 26340 }, { "entropy": 0.5915780659765005, "epoch": 1.9743571728016, "grad_norm": 0.36743974685668945, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8270438350737095, "num_tokens": 160340500.0, "step": 26345 }, { "entropy": 0.5968485416844487, "epoch": 1.974731896735452, "grad_norm": 0.25413277745246887, "learning_rate": 0.0002, "loss": 0.6046, "mean_token_accuracy": 0.82412523701787, "num_tokens": 161794561.0, "step": 26350 }, { "entropy": 0.613394434377551, "epoch": 1.9751066206693038, "grad_norm": 0.26083967089653015, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.8248979523777962, "num_tokens": 163302707.0, "step": 26355 }, { "entropy": 0.585911040008068, "epoch": 1.9754813446031556, "grad_norm": 0.317783385515213, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.8299449507147074, "num_tokens": 164779850.0, "step": 26360 }, { "entropy": 0.589511445723474, "epoch": 1.9758560685370075, "grad_norm": 0.2507086396217346, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.8303582184016705, "num_tokens": 166255161.0, "step": 26365 }, { "entropy": 0.5847002455964685, "epoch": 1.9762307924708593, "grad_norm": 0.25215059518814087, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8274968888610601, "num_tokens": 167697537.0, "step": 26370 }, { "entropy": 0.6073427377268672, "epoch": 1.9766055164047112, "grad_norm": 0.23072852194309235, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.8212268393486738, "num_tokens": 169207551.0, "step": 26375 }, { "entropy": 0.5975000996142625, "epoch": 1.976980240338563, "grad_norm": 0.27427971363067627, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.8280824694782496, "num_tokens": 170656006.0, "step": 26380 }, { "entropy": 0.6069650463759899, "epoch": 1.9773549642724149, "grad_norm": 0.2849292457103729, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.8239812474697829, "num_tokens": 172149351.0, "step": 26385 }, { "entropy": 0.6034294473007321, "epoch": 1.9777296882062667, "grad_norm": 0.3868841230869293, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.8240374688059091, "num_tokens": 173652384.0, "step": 26390 }, { "entropy": 0.5854467811062932, "epoch": 1.9781044121401186, "grad_norm": 0.24622252583503723, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8262446731328964, "num_tokens": 175149601.0, "step": 26395 }, { "entropy": 0.607771229185164, "epoch": 1.9784791360739704, "grad_norm": 0.22717727720737457, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.8214649368077517, "num_tokens": 176646017.0, "step": 26400 }, { "entropy": 0.587752834148705, "epoch": 1.9788538600078223, "grad_norm": 0.2723914086818695, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.8272875215858221, "num_tokens": 178111099.0, "step": 26405 }, { "entropy": 0.599476788751781, "epoch": 1.9792285839416741, "grad_norm": 0.27139919996261597, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.824680944532156, "num_tokens": 179560310.0, "step": 26410 }, { "entropy": 0.6020775686949491, "epoch": 1.979603307875526, "grad_norm": 0.23712681233882904, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.824565451219678, "num_tokens": 181061468.0, "step": 26415 }, { "entropy": 0.5933213103562593, "epoch": 1.9799780318093778, "grad_norm": 0.3048665523529053, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.8217222515493632, "num_tokens": 182557454.0, "step": 26420 }, { "entropy": 0.580111320130527, "epoch": 1.9803527557432297, "grad_norm": 0.2397320419549942, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.827034980431199, "num_tokens": 184038938.0, "step": 26425 }, { "entropy": 0.6082004049792886, "epoch": 1.9807274796770815, "grad_norm": 0.22787651419639587, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.8204731963574886, "num_tokens": 185529374.0, "step": 26430 }, { "entropy": 0.6061400391161442, "epoch": 1.9811022036109334, "grad_norm": 0.2169894427061081, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.8252893671393394, "num_tokens": 187032519.0, "step": 26435 }, { "entropy": 0.5877265691757202, "epoch": 1.9814769275447852, "grad_norm": 0.25809839367866516, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.8256535436958075, "num_tokens": 188499716.0, "step": 26440 }, { "entropy": 0.5776393931359053, "epoch": 1.981851651478637, "grad_norm": 0.2331065833568573, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.8239523351192475, "num_tokens": 189941611.0, "step": 26445 }, { "entropy": 0.5686948165297508, "epoch": 1.982226375412489, "grad_norm": 0.25964343547821045, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.8272545259445906, "num_tokens": 191435252.0, "step": 26450 }, { "entropy": 0.5848906707018614, "epoch": 1.982601099346341, "grad_norm": 0.2693438231945038, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.8261385701596737, "num_tokens": 192933037.0, "step": 26455 }, { "entropy": 0.5927821947261691, "epoch": 1.9829758232801928, "grad_norm": 0.22943823039531708, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.8251082427799702, "num_tokens": 194450181.0, "step": 26460 }, { "entropy": 0.5995290953665972, "epoch": 1.9833505472140447, "grad_norm": 0.22566990554332733, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8230024550110102, "num_tokens": 195961893.0, "step": 26465 }, { "entropy": 0.5918855985626579, "epoch": 1.9837252711478965, "grad_norm": 0.2252884954214096, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.8234981890767813, "num_tokens": 197460462.0, "step": 26470 }, { "entropy": 0.5973582815378904, "epoch": 1.9840999950817484, "grad_norm": 0.231316938996315, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8241278231143951, "num_tokens": 198904083.0, "step": 26475 }, { "entropy": 0.6005895348265767, "epoch": 1.9844747190156002, "grad_norm": 0.34423139691352844, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.826152466610074, "num_tokens": 200376830.0, "step": 26480 }, { "entropy": 0.6037478426471352, "epoch": 1.984849442949452, "grad_norm": 0.9577329754829407, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.8220184817910194, "num_tokens": 201845278.0, "step": 26485 }, { "entropy": 0.6029273660853505, "epoch": 1.985224166883304, "grad_norm": 0.22908395528793335, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8238917652517557, "num_tokens": 203294605.0, "step": 26490 }, { "entropy": 0.604142608307302, "epoch": 1.9855988908171558, "grad_norm": 0.2291843593120575, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8238874658942222, "num_tokens": 204776215.0, "step": 26495 }, { "entropy": 0.5868363901972771, "epoch": 1.9859736147510076, "grad_norm": 0.44177839159965515, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.8264834374189377, "num_tokens": 206199361.0, "step": 26500 }, { "entropy": 0.603475728072226, "epoch": 1.9863483386848595, "grad_norm": 0.24996158480644226, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8273143380880356, "num_tokens": 207632064.0, "step": 26505 }, { "entropy": 0.5829911133274436, "epoch": 1.9867230626187113, "grad_norm": 0.2321632206439972, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8283224880695343, "num_tokens": 209098136.0, "step": 26510 }, { "entropy": 0.5853326492011547, "epoch": 1.9870977865525632, "grad_norm": 0.2967575788497925, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.8288592360913754, "num_tokens": 210590877.0, "step": 26515 }, { "entropy": 0.5860830979421735, "epoch": 1.987472510486415, "grad_norm": 0.2449166625738144, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.8225750848650932, "num_tokens": 212065185.0, "step": 26520 }, { "entropy": 0.5906287925317883, "epoch": 1.987847234420267, "grad_norm": 0.2510605454444885, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.8197634067386389, "num_tokens": 213603391.0, "step": 26525 }, { "entropy": 0.599095406010747, "epoch": 1.988221958354119, "grad_norm": 0.30577102303504944, "learning_rate": 0.0002, "loss": 0.6337, "mean_token_accuracy": 0.8217611618340015, "num_tokens": 215067731.0, "step": 26530 }, { "entropy": 0.5665709163993597, "epoch": 1.9885966822879708, "grad_norm": 0.23884202539920807, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.8279894433915616, "num_tokens": 216520237.0, "step": 26535 }, { "entropy": 0.5869825173169374, "epoch": 1.9889714062218227, "grad_norm": 0.26562339067459106, "learning_rate": 0.0002, "loss": 0.6256, "mean_token_accuracy": 0.8242473036050797, "num_tokens": 217946154.0, "step": 26540 }, { "entropy": 0.5811146646738052, "epoch": 1.9893461301556745, "grad_norm": 0.24768029153347015, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8230411499738693, "num_tokens": 219388023.0, "step": 26545 }, { "entropy": 0.5787871609441936, "epoch": 1.9897208540895264, "grad_norm": 0.4000564217567444, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.8284757729619742, "num_tokens": 220804337.0, "step": 26550 }, { "entropy": 0.5809161467477679, "epoch": 1.9900955780233782, "grad_norm": 0.25196021795272827, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.826192195340991, "num_tokens": 222298033.0, "step": 26555 }, { "entropy": 0.5984633691608906, "epoch": 1.99047030195723, "grad_norm": 0.24497413635253906, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8225912351161242, "num_tokens": 223787426.0, "step": 26560 }, { "entropy": 0.5787488522008062, "epoch": 1.990845025891082, "grad_norm": 0.2588868737220764, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.8306217692792416, "num_tokens": 225314337.0, "step": 26565 }, { "entropy": 0.5938095126301051, "epoch": 1.9912197498249338, "grad_norm": 0.23395805060863495, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.8266245622187853, "num_tokens": 226765089.0, "step": 26570 }, { "entropy": 0.5832427447661758, "epoch": 1.9915944737587856, "grad_norm": 0.2875167727470398, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.8247217912226915, "num_tokens": 228261432.0, "step": 26575 }, { "entropy": 0.5957107175141573, "epoch": 1.9919691976926375, "grad_norm": 0.2986392080783844, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.824340683221817, "num_tokens": 229706676.0, "step": 26580 }, { "entropy": 0.5870132889598608, "epoch": 1.9923439216264893, "grad_norm": 0.2727271318435669, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.8244659904390573, "num_tokens": 231167431.0, "step": 26585 }, { "entropy": 0.587210631556809, "epoch": 1.9927186455603412, "grad_norm": 0.24750691652297974, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.8246493488550186, "num_tokens": 232634180.0, "step": 26590 }, { "entropy": 0.59911601357162, "epoch": 1.993093369494193, "grad_norm": 0.21010850369930267, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8265341222286224, "num_tokens": 234067253.0, "step": 26595 }, { "entropy": 0.579389913752675, "epoch": 1.9934680934280449, "grad_norm": 0.2555595636367798, "learning_rate": 0.0002, "loss": 0.6074, "mean_token_accuracy": 0.8277370717376471, "num_tokens": 235515164.0, "step": 26600 }, { "entropy": 0.5908417360857129, "epoch": 1.9938428173618967, "grad_norm": 0.2489747852087021, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8237824391573667, "num_tokens": 236957102.0, "step": 26605 }, { "entropy": 0.582246010005474, "epoch": 1.9942175412957486, "grad_norm": 0.2366480678319931, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.8257036756724119, "num_tokens": 238461343.0, "step": 26610 }, { "entropy": 0.5773574909195304, "epoch": 1.9945922652296004, "grad_norm": 0.24033528566360474, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.8276914589107036, "num_tokens": 239921948.0, "step": 26615 }, { "entropy": 0.5796498661860824, "epoch": 1.9949669891634523, "grad_norm": 0.23701801896095276, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8268893100321293, "num_tokens": 241381880.0, "step": 26620 }, { "entropy": 0.579760423488915, "epoch": 1.9953417130973041, "grad_norm": 0.22348082065582275, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.8274849846959114, "num_tokens": 242861740.0, "step": 26625 }, { "entropy": 0.585285360738635, "epoch": 1.995716437031156, "grad_norm": 0.23794421553611755, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8224194552749395, "num_tokens": 244339772.0, "step": 26630 }, { "entropy": 0.5919573925435543, "epoch": 1.9960911609650078, "grad_norm": 0.27847999334335327, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.8241445206105709, "num_tokens": 245821072.0, "step": 26635 }, { "entropy": 0.597807128727436, "epoch": 1.9964658848988597, "grad_norm": 0.24212861061096191, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.8261039193719626, "num_tokens": 247288084.0, "step": 26640 }, { "entropy": 0.605435405112803, "epoch": 1.9968406088327115, "grad_norm": 0.23981241881847382, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.8255311023443938, "num_tokens": 248786474.0, "step": 26645 }, { "entropy": 0.5959627209231257, "epoch": 1.9972153327665634, "grad_norm": 0.23679345846176147, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8268770068883896, "num_tokens": 250213441.0, "step": 26650 }, { "entropy": 0.5933291582390666, "epoch": 1.9975900567004152, "grad_norm": 0.21984705328941345, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.8301341451704503, "num_tokens": 251615962.0, "step": 26655 }, { "entropy": 0.6057115789502859, "epoch": 1.997964780634267, "grad_norm": 0.2542871832847595, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.8273612599819898, "num_tokens": 253024485.0, "step": 26660 }, { "entropy": 0.6146315762773156, "epoch": 1.998339504568119, "grad_norm": 0.2760353982448578, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.824252513051033, "num_tokens": 254507701.0, "step": 26665 }, { "entropy": 0.6094476846978069, "epoch": 1.9987142285019708, "grad_norm": 0.26611649990081787, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.8237610332667827, "num_tokens": 256018735.0, "step": 26670 }, { "entropy": 0.5973253354430199, "epoch": 1.9990889524358226, "grad_norm": 0.3210649788379669, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8248776961117983, "num_tokens": 257456297.0, "step": 26675 }, { "entropy": 0.5961148919537663, "epoch": 1.9994636763696745, "grad_norm": 0.2262161374092102, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8263170771300793, "num_tokens": 258945284.0, "step": 26680 }, { "entropy": 0.5917741177603603, "epoch": 1.9998384003035263, "grad_norm": 0.24844299256801605, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.8232959609478712, "num_tokens": 260465556.0, "step": 26685 }, { "entropy": 0.6086660970870713, "epoch": 2.0001498895735406, "grad_norm": 0.23737159371376038, "learning_rate": 0.0002, "loss": 0.6256, "mean_token_accuracy": 0.8204808607137293, "num_tokens": 261734297.0, "step": 26690 }, { "entropy": 0.5771837640553713, "epoch": 2.0005246135073924, "grad_norm": 0.2631221413612366, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8274178467690945, "num_tokens": 263224626.0, "step": 26695 }, { "entropy": 0.5884828511625528, "epoch": 2.0008993374412443, "grad_norm": 0.22817906737327576, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8241645861417055, "num_tokens": 264715814.0, "step": 26700 }, { "entropy": 0.578592044301331, "epoch": 2.001274061375096, "grad_norm": 0.24138061702251434, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.8255031801760196, "num_tokens": 266188892.0, "step": 26705 }, { "entropy": 0.5789668606594205, "epoch": 2.001648785308948, "grad_norm": 0.2575417160987854, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8312565639615059, "num_tokens": 267632697.0, "step": 26710 }, { "entropy": 0.5882806740701199, "epoch": 2.0020235092428, "grad_norm": 0.24298277497291565, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.8279138177633285, "num_tokens": 269091369.0, "step": 26715 }, { "entropy": 0.6055224318057298, "epoch": 2.002398233176652, "grad_norm": 0.2473972886800766, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.8275780614465476, "num_tokens": 270534884.0, "step": 26720 }, { "entropy": 0.6002696730196476, "epoch": 2.002772957110504, "grad_norm": 0.2540464699268341, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.8274539731442928, "num_tokens": 272020150.0, "step": 26725 }, { "entropy": 0.6013143630698323, "epoch": 2.003147681044356, "grad_norm": 0.2712500989437103, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.8278876360505819, "num_tokens": 273513555.0, "step": 26730 }, { "entropy": 0.6134433826431632, "epoch": 2.0035224049782077, "grad_norm": 0.2216261774301529, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.8252346072345972, "num_tokens": 274985906.0, "step": 26735 }, { "entropy": 0.588517058826983, "epoch": 2.0038971289120595, "grad_norm": 0.2732073664665222, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.8277607951313257, "num_tokens": 276459710.0, "step": 26740 }, { "entropy": 0.5957229223102332, "epoch": 2.0042718528459114, "grad_norm": 0.2436404973268509, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.8262510262429714, "num_tokens": 277942299.0, "step": 26745 }, { "entropy": 0.5907959211617708, "epoch": 2.004646576779763, "grad_norm": 0.27349936962127686, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.823165214434266, "num_tokens": 279387162.0, "step": 26750 }, { "entropy": 0.5959409523755312, "epoch": 2.005021300713615, "grad_norm": 0.2633330225944519, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8244098763912916, "num_tokens": 280899796.0, "step": 26755 }, { "entropy": 0.6031670186668634, "epoch": 2.005396024647467, "grad_norm": 0.25932490825653076, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8207872468978167, "num_tokens": 282458219.0, "step": 26760 }, { "entropy": 0.5839017568156123, "epoch": 2.0057707485813188, "grad_norm": 0.22876769304275513, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.8259336598217487, "num_tokens": 283945922.0, "step": 26765 }, { "entropy": 0.6016558146104216, "epoch": 2.0061454725151706, "grad_norm": 0.25447165966033936, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.8259745467454195, "num_tokens": 285391831.0, "step": 26770 }, { "entropy": 0.5946881081908941, "epoch": 2.0065201964490225, "grad_norm": 0.22263804078102112, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.825185801461339, "num_tokens": 286905059.0, "step": 26775 }, { "entropy": 0.5938623758032918, "epoch": 2.0068949203828743, "grad_norm": 0.2291947454214096, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.8265160579234362, "num_tokens": 288385005.0, "step": 26780 }, { "entropy": 0.5952842788770795, "epoch": 2.007269644316726, "grad_norm": 0.2693077325820923, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.8281639851629734, "num_tokens": 289837117.0, "step": 26785 }, { "entropy": 0.5982706224545836, "epoch": 2.007644368250578, "grad_norm": 0.28009963035583496, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.8274324610829353, "num_tokens": 291290042.0, "step": 26790 }, { "entropy": 0.6091718876734376, "epoch": 2.00801909218443, "grad_norm": 0.24990569055080414, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.8264685548841953, "num_tokens": 292747822.0, "step": 26795 }, { "entropy": 0.5994821107015014, "epoch": 2.0083938161182817, "grad_norm": 0.237440824508667, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.8265676781535148, "num_tokens": 294219209.0, "step": 26800 }, { "entropy": 0.5917622465640306, "epoch": 2.0087685400521336, "grad_norm": 0.24487516283988953, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.8255145013332367, "num_tokens": 295718486.0, "step": 26805 }, { "entropy": 0.5932751828804612, "epoch": 2.0091432639859854, "grad_norm": 0.24866430461406708, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.8281317908316851, "num_tokens": 297154968.0, "step": 26810 }, { "entropy": 0.6046684110537172, "epoch": 2.0095179879198373, "grad_norm": 0.22799885272979736, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.8293153330683708, "num_tokens": 298638019.0, "step": 26815 }, { "entropy": 0.6025205194950104, "epoch": 2.009892711853689, "grad_norm": 0.20953519642353058, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.8303606417030096, "num_tokens": 300148632.0, "step": 26820 }, { "entropy": 0.6081800943240523, "epoch": 2.010267435787541, "grad_norm": 0.23602835834026337, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.8312235731631518, "num_tokens": 301587382.0, "step": 26825 }, { "entropy": 0.6334350574761629, "epoch": 2.010642159721393, "grad_norm": 0.27441009879112244, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8203420042991638, "num_tokens": 303104893.0, "step": 26830 }, { "entropy": 0.6143886471167207, "epoch": 2.0110168836552447, "grad_norm": 0.23521031439304352, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.8267871256917715, "num_tokens": 304540949.0, "step": 26835 }, { "entropy": 0.6058722136542201, "epoch": 2.0113916075890965, "grad_norm": 0.24343617260456085, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8282064590603113, "num_tokens": 305957295.0, "step": 26840 }, { "entropy": 0.5974116992205382, "epoch": 2.0117663315229484, "grad_norm": 0.24492862820625305, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.8296484511345625, "num_tokens": 307438581.0, "step": 26845 }, { "entropy": 0.6152048077434301, "epoch": 2.0121410554568, "grad_norm": 0.22717110812664032, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.826123209297657, "num_tokens": 308914859.0, "step": 26850 }, { "entropy": 0.6088526321575045, "epoch": 2.012515779390652, "grad_norm": 0.2500881552696228, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.8254837084561586, "num_tokens": 310399861.0, "step": 26855 }, { "entropy": 0.5901775769889355, "epoch": 2.012890503324504, "grad_norm": 0.254595547914505, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.8287965524941683, "num_tokens": 311841907.0, "step": 26860 }, { "entropy": 0.586624194867909, "epoch": 2.0132652272583558, "grad_norm": 0.22737574577331543, "learning_rate": 0.0002, "loss": 0.5928, "mean_token_accuracy": 0.8301995169371367, "num_tokens": 313280506.0, "step": 26865 }, { "entropy": 0.5973213566467166, "epoch": 2.0136399511922076, "grad_norm": 0.2476395070552826, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.8259969368577004, "num_tokens": 314746615.0, "step": 26870 }, { "entropy": 0.598671312443912, "epoch": 2.0140146751260595, "grad_norm": 0.2512878179550171, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.827300376445055, "num_tokens": 316201371.0, "step": 26875 }, { "entropy": 0.5881544448435306, "epoch": 2.0143893990599113, "grad_norm": 0.23766161501407623, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.82884985730052, "num_tokens": 317637774.0, "step": 26880 }, { "entropy": 0.5873301196843386, "epoch": 2.014764122993763, "grad_norm": 0.23923325538635254, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.8266318276524544, "num_tokens": 319115905.0, "step": 26885 }, { "entropy": 0.5954338716343045, "epoch": 2.015138846927615, "grad_norm": 0.22810304164886475, "learning_rate": 0.0002, "loss": 0.607, "mean_token_accuracy": 0.8270083129405975, "num_tokens": 320597082.0, "step": 26890 }, { "entropy": 0.599723681807518, "epoch": 2.015513570861467, "grad_norm": 0.3002369999885559, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.8253579076379538, "num_tokens": 322029210.0, "step": 26895 }, { "entropy": 0.6016477750614285, "epoch": 2.0158882947953187, "grad_norm": 0.2314557135105133, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8254353493452072, "num_tokens": 323535542.0, "step": 26900 }, { "entropy": 0.6009952960535884, "epoch": 2.0162630187291706, "grad_norm": 0.22122497856616974, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.8251292876899242, "num_tokens": 325018252.0, "step": 26905 }, { "entropy": 0.603752551600337, "epoch": 2.0166377426630224, "grad_norm": 0.24495787918567657, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.8261353306472301, "num_tokens": 326524939.0, "step": 26910 }, { "entropy": 0.6010821437463164, "epoch": 2.0170124665968743, "grad_norm": 0.2513398826122284, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.8289453119039536, "num_tokens": 327964641.0, "step": 26915 }, { "entropy": 0.6198259644210339, "epoch": 2.017387190530726, "grad_norm": 0.3290111720561981, "learning_rate": 0.0002, "loss": 0.6372, "mean_token_accuracy": 0.8216986317187548, "num_tokens": 329459594.0, "step": 26920 }, { "entropy": 0.5867593554779887, "epoch": 2.017761914464578, "grad_norm": 0.2792423367500305, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.827397010102868, "num_tokens": 330899276.0, "step": 26925 }, { "entropy": 0.5892785092815757, "epoch": 2.01813663839843, "grad_norm": 0.26057738065719604, "learning_rate": 0.0002, "loss": 0.5956, "mean_token_accuracy": 0.8287837352603674, "num_tokens": 332343609.0, "step": 26930 }, { "entropy": 0.5933166043832898, "epoch": 2.0185113623322817, "grad_norm": 0.31115585565567017, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.8268754974007606, "num_tokens": 333780092.0, "step": 26935 }, { "entropy": 0.5954577568918467, "epoch": 2.0188860862661335, "grad_norm": 0.29747769236564636, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.8285983014851809, "num_tokens": 335221392.0, "step": 26940 }, { "entropy": 0.5958637071773409, "epoch": 2.0192608101999854, "grad_norm": 0.2653684914112091, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.8280130170285702, "num_tokens": 336686936.0, "step": 26945 }, { "entropy": 0.5948242953047156, "epoch": 2.0196355341338372, "grad_norm": 0.23912657797336578, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8259360603988171, "num_tokens": 338174276.0, "step": 26950 }, { "entropy": 0.5773105127736926, "epoch": 2.020010258067689, "grad_norm": 0.3075768053531647, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.8260987479239702, "num_tokens": 339657725.0, "step": 26955 }, { "entropy": 0.5871544735506177, "epoch": 2.020384982001541, "grad_norm": 0.2745771110057831, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8273787941783667, "num_tokens": 341149694.0, "step": 26960 }, { "entropy": 0.5748363455757499, "epoch": 2.020759705935393, "grad_norm": 0.22617186605930328, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.8276427384465933, "num_tokens": 342623883.0, "step": 26965 }, { "entropy": 0.5869982974603772, "epoch": 2.0211344298692446, "grad_norm": 0.27684178948402405, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8243160802870989, "num_tokens": 344124985.0, "step": 26970 }, { "entropy": 0.59200817681849, "epoch": 2.0215091538030965, "grad_norm": 0.22864790260791779, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.8262759633362293, "num_tokens": 345590504.0, "step": 26975 }, { "entropy": 0.5988256411626935, "epoch": 2.0218838777369483, "grad_norm": 0.25045379996299744, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.8255646400153637, "num_tokens": 347057269.0, "step": 26980 }, { "entropy": 0.5934097973629833, "epoch": 2.0222586016708, "grad_norm": 0.264252632856369, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.8285515815019607, "num_tokens": 348558270.0, "step": 26985 }, { "entropy": 0.6016195056959986, "epoch": 2.022633325604652, "grad_norm": 0.26537764072418213, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.826461598649621, "num_tokens": 349954145.0, "step": 26990 }, { "entropy": 0.6002611171454191, "epoch": 2.023008049538504, "grad_norm": 0.23959501087665558, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.8257009379565716, "num_tokens": 351438700.0, "step": 26995 }, { "entropy": 0.6096050463616848, "epoch": 2.0233827734723557, "grad_norm": 0.485981822013855, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.824307993426919, "num_tokens": 352879332.0, "step": 27000 }, { "entropy": 0.5945497885346412, "epoch": 2.0237574974062076, "grad_norm": 0.24078352749347687, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.8246859956532717, "num_tokens": 1486877.0, "step": 27005 }, { "entropy": 0.5896658018231392, "epoch": 2.0241322213400594, "grad_norm": 0.22922857105731964, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8281280979514122, "num_tokens": 2943060.0, "step": 27010 }, { "entropy": 0.5979463752359152, "epoch": 2.0245069452739113, "grad_norm": 0.24989306926727295, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8254360862076282, "num_tokens": 4426392.0, "step": 27015 }, { "entropy": 0.586722238920629, "epoch": 2.024881669207763, "grad_norm": 0.22297942638397217, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.8259856857359409, "num_tokens": 5913338.0, "step": 27020 }, { "entropy": 0.5959366820752621, "epoch": 2.025256393141615, "grad_norm": 0.23383328318595886, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.825752617418766, "num_tokens": 7366429.0, "step": 27025 }, { "entropy": 0.5933318827301264, "epoch": 2.0256311170754673, "grad_norm": 0.29641565680503845, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8253182839602232, "num_tokens": 8835006.0, "step": 27030 }, { "entropy": 0.6006581937894225, "epoch": 2.026005841009319, "grad_norm": 0.2677904963493347, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.828586995229125, "num_tokens": 10305974.0, "step": 27035 }, { "entropy": 0.6026527434587479, "epoch": 2.026380564943171, "grad_norm": 0.253022700548172, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.8228376042097807, "num_tokens": 11796599.0, "step": 27040 }, { "entropy": 0.5956917919218541, "epoch": 2.026755288877023, "grad_norm": 0.2220163643360138, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.8259055972099304, "num_tokens": 13207841.0, "step": 27045 }, { "entropy": 0.5890123037621379, "epoch": 2.0271300128108747, "grad_norm": 0.24931076169013977, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.8245350077748299, "num_tokens": 14664670.0, "step": 27050 }, { "entropy": 0.5820705378428102, "epoch": 2.0275047367447265, "grad_norm": 0.27007290720939636, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.826691548153758, "num_tokens": 16128692.0, "step": 27055 }, { "entropy": 0.5870777241885662, "epoch": 2.0278794606785784, "grad_norm": 0.2746545672416687, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.826801597699523, "num_tokens": 17571861.0, "step": 27060 }, { "entropy": 0.5987028319388628, "epoch": 2.0282541846124302, "grad_norm": 0.27007409930229187, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.8259171143174171, "num_tokens": 19022600.0, "step": 27065 }, { "entropy": 0.6031523300334811, "epoch": 2.028628908546282, "grad_norm": 0.4599672853946686, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.8232980638742446, "num_tokens": 20494430.0, "step": 27070 }, { "entropy": 0.5867784796282649, "epoch": 2.029003632480134, "grad_norm": 0.2275441437959671, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8270463548600674, "num_tokens": 21982688.0, "step": 27075 }, { "entropy": 0.5854662394151091, "epoch": 2.029378356413986, "grad_norm": 0.24629376828670502, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.8270881276577711, "num_tokens": 23446487.0, "step": 27080 }, { "entropy": 0.6026045268401503, "epoch": 2.0297530803478376, "grad_norm": 0.2169923633337021, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.8238271679729223, "num_tokens": 24963583.0, "step": 27085 }, { "entropy": 0.5814634399488569, "epoch": 2.0301278042816895, "grad_norm": 0.22938120365142822, "learning_rate": 0.0002, "loss": 0.5904, "mean_token_accuracy": 0.8291513442993164, "num_tokens": 26484829.0, "step": 27090 }, { "entropy": 0.5997279051691293, "epoch": 2.0305025282155413, "grad_norm": 0.25275471806526184, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.8260777462273836, "num_tokens": 27992492.0, "step": 27095 }, { "entropy": 0.5946123411878943, "epoch": 2.030877252149393, "grad_norm": 0.2381369024515152, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.8248967550694942, "num_tokens": 29481475.0, "step": 27100 }, { "entropy": 0.6230403417721391, "epoch": 2.031251976083245, "grad_norm": 0.23676779866218567, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.822975542768836, "num_tokens": 30938990.0, "step": 27105 }, { "entropy": 0.6184109961614013, "epoch": 2.031626700017097, "grad_norm": 0.23049134016036987, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8261321540921926, "num_tokens": 32397491.0, "step": 27110 }, { "entropy": 0.6243892204016447, "epoch": 2.0320014239509487, "grad_norm": 0.22560296952724457, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8253690853714943, "num_tokens": 33867237.0, "step": 27115 }, { "entropy": 0.6065986817702651, "epoch": 2.0323761478848006, "grad_norm": 0.2693448066711426, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.8256696335971355, "num_tokens": 35323661.0, "step": 27120 }, { "entropy": 0.6059661921113729, "epoch": 2.0327508718186524, "grad_norm": 0.24541962146759033, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.8262641087174416, "num_tokens": 36745723.0, "step": 27125 }, { "entropy": 0.5992450715973974, "epoch": 2.0331255957525043, "grad_norm": 0.2313215285539627, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.8290981840342283, "num_tokens": 38194898.0, "step": 27130 }, { "entropy": 0.605517796240747, "epoch": 2.033500319686356, "grad_norm": 0.24926728010177612, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.8248253554105759, "num_tokens": 39695696.0, "step": 27135 }, { "entropy": 0.5941408222541213, "epoch": 2.033875043620208, "grad_norm": 0.25787949562072754, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.8261226423084735, "num_tokens": 41143990.0, "step": 27140 }, { "entropy": 0.5852117573842406, "epoch": 2.03424976755406, "grad_norm": 0.25626271963119507, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.8265170119702816, "num_tokens": 42598644.0, "step": 27145 }, { "entropy": 0.6060135662555695, "epoch": 2.0346244914879117, "grad_norm": 0.25871971249580383, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8217566035687923, "num_tokens": 44098510.0, "step": 27150 }, { "entropy": 0.6056313393637538, "epoch": 2.0349992154217635, "grad_norm": 0.22044946253299713, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.8234956514090299, "num_tokens": 45580959.0, "step": 27155 }, { "entropy": 0.6049505362287164, "epoch": 2.0353739393556154, "grad_norm": 0.25004932284355164, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.825543861836195, "num_tokens": 47045357.0, "step": 27160 }, { "entropy": 0.5895659673959017, "epoch": 2.0357486632894672, "grad_norm": 0.2529848515987396, "learning_rate": 0.0002, "loss": 0.5863, "mean_token_accuracy": 0.8315881606191396, "num_tokens": 48477031.0, "step": 27165 }, { "entropy": 0.5878336330875754, "epoch": 2.036123387223319, "grad_norm": 0.2404332011938095, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8256149746477603, "num_tokens": 49946515.0, "step": 27170 }, { "entropy": 0.593574489839375, "epoch": 2.036498111157171, "grad_norm": 0.2501292824745178, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8259136900305748, "num_tokens": 51447804.0, "step": 27175 }, { "entropy": 0.6040347259491682, "epoch": 2.036872835091023, "grad_norm": 0.2123415768146515, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.8284664753824472, "num_tokens": 52894407.0, "step": 27180 }, { "entropy": 0.5904272424057126, "epoch": 2.0372475590248746, "grad_norm": 0.24012792110443115, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.830261729285121, "num_tokens": 54351535.0, "step": 27185 }, { "entropy": 0.6000138130038977, "epoch": 2.0376222829587265, "grad_norm": 0.32346111536026, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.8255277048796416, "num_tokens": 55861722.0, "step": 27190 }, { "entropy": 0.6104092162102461, "epoch": 2.0379970068925783, "grad_norm": 0.24314406514167786, "learning_rate": 0.0002, "loss": 0.6125, "mean_token_accuracy": 0.8258366160094738, "num_tokens": 57268213.0, "step": 27195 }, { "entropy": 0.6189892292022705, "epoch": 2.03837173082643, "grad_norm": 0.2383318543434143, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.8206689417362213, "num_tokens": 58707133.0, "step": 27200 }, { "entropy": 0.5972379022277892, "epoch": 2.038746454760282, "grad_norm": 0.2404962033033371, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8273565236479044, "num_tokens": 60184063.0, "step": 27205 }, { "entropy": 0.5830365344882011, "epoch": 2.039121178694134, "grad_norm": 0.24041375517845154, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.831073221936822, "num_tokens": 61672676.0, "step": 27210 }, { "entropy": 0.6017352705821395, "epoch": 2.0394959026279857, "grad_norm": 0.22788359224796295, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8265752233564854, "num_tokens": 63131927.0, "step": 27215 }, { "entropy": 0.612584912776947, "epoch": 2.0398706265618376, "grad_norm": 0.22647646069526672, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.8236782182008028, "num_tokens": 64596050.0, "step": 27220 }, { "entropy": 0.5994164856150747, "epoch": 2.0402453504956894, "grad_norm": 0.47719234228134155, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.8254249401390552, "num_tokens": 66063996.0, "step": 27225 }, { "entropy": 0.609409592486918, "epoch": 2.0406200744295413, "grad_norm": 0.25856977701187134, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.8236877497285604, "num_tokens": 67536218.0, "step": 27230 }, { "entropy": 0.6102656548842788, "epoch": 2.040994798363393, "grad_norm": 0.22958894073963165, "learning_rate": 0.0002, "loss": 0.6115, "mean_token_accuracy": 0.8238056179136037, "num_tokens": 68974574.0, "step": 27235 }, { "entropy": 0.600311983563006, "epoch": 2.041369522297245, "grad_norm": 0.25919410586357117, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8258134678006173, "num_tokens": 70468442.0, "step": 27240 }, { "entropy": 0.5987512774765491, "epoch": 2.041744246231097, "grad_norm": 0.25941628217697144, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.8256413727998734, "num_tokens": 71958985.0, "step": 27245 }, { "entropy": 0.5868002060800791, "epoch": 2.0421189701649487, "grad_norm": 0.242800772190094, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.8286790035665035, "num_tokens": 73434424.0, "step": 27250 }, { "entropy": 0.5911146312952041, "epoch": 2.0424936940988005, "grad_norm": 0.2472355216741562, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.8272600691765547, "num_tokens": 74907914.0, "step": 27255 }, { "entropy": 0.5991921465843916, "epoch": 2.0428684180326524, "grad_norm": 0.2341611236333847, "learning_rate": 0.0002, "loss": 0.6155, "mean_token_accuracy": 0.823807242885232, "num_tokens": 76400735.0, "step": 27260 }, { "entropy": 0.5971611743792892, "epoch": 2.0432431419665043, "grad_norm": 0.27969786524772644, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.8233453955501318, "num_tokens": 77851938.0, "step": 27265 }, { "entropy": 0.5878964750096202, "epoch": 2.043617865900356, "grad_norm": 0.27720391750335693, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.8268775682896375, "num_tokens": 79327014.0, "step": 27270 }, { "entropy": 0.5865248277783394, "epoch": 2.043992589834208, "grad_norm": 0.26242825388908386, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8289370547980071, "num_tokens": 80829573.0, "step": 27275 }, { "entropy": 0.6031025763601064, "epoch": 2.04436731376806, "grad_norm": 0.2197725772857666, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8223586399108171, "num_tokens": 82310282.0, "step": 27280 }, { "entropy": 0.5936502845957875, "epoch": 2.0447420377019117, "grad_norm": 0.25299161672592163, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.8267874322831631, "num_tokens": 83776842.0, "step": 27285 }, { "entropy": 0.6248359428718686, "epoch": 2.0451167616357635, "grad_norm": 0.2417335957288742, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.8216944146901369, "num_tokens": 85251559.0, "step": 27290 }, { "entropy": 0.6166465884074569, "epoch": 2.0454914855696154, "grad_norm": 0.24188777804374695, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.8263456027954816, "num_tokens": 86725474.0, "step": 27295 }, { "entropy": 0.6082981377840042, "epoch": 2.045866209503467, "grad_norm": 0.2779759466648102, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.8258145555853844, "num_tokens": 88187154.0, "step": 27300 }, { "entropy": 0.5943273577839137, "epoch": 2.046240933437319, "grad_norm": 0.2459595501422882, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.8282722413539887, "num_tokens": 89670222.0, "step": 27305 }, { "entropy": 0.6011217651888728, "epoch": 2.046615657371171, "grad_norm": 0.24223442375659943, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.8281256645917893, "num_tokens": 91095878.0, "step": 27310 }, { "entropy": 0.6137834956869483, "epoch": 2.0469903813050228, "grad_norm": 0.24805553257465363, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.8247367773205042, "num_tokens": 92562284.0, "step": 27315 }, { "entropy": 0.6193879343569278, "epoch": 2.0473651052388746, "grad_norm": 0.23854750394821167, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.8230283696204423, "num_tokens": 94037940.0, "step": 27320 }, { "entropy": 0.608734199218452, "epoch": 2.0477398291727265, "grad_norm": 0.25981849431991577, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.827113189548254, "num_tokens": 95460749.0, "step": 27325 }, { "entropy": 0.6115036038681865, "epoch": 2.0481145531065783, "grad_norm": 0.3487916588783264, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8264156833291054, "num_tokens": 96958101.0, "step": 27330 }, { "entropy": 0.6095089184120297, "epoch": 2.04848927704043, "grad_norm": 0.22510507702827454, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8249003063887358, "num_tokens": 98458028.0, "step": 27335 }, { "entropy": 0.5854966621845961, "epoch": 2.048864000974282, "grad_norm": 0.2257436364889145, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.8290320996195077, "num_tokens": 99869246.0, "step": 27340 }, { "entropy": 0.5896180020645261, "epoch": 2.0492387249081343, "grad_norm": 0.21786634624004364, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8288296859711408, "num_tokens": 101304605.0, "step": 27345 }, { "entropy": 0.5978792389854789, "epoch": 2.049613448841986, "grad_norm": 0.22883926331996918, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8264860145747661, "num_tokens": 102756455.0, "step": 27350 }, { "entropy": 0.6132361553609371, "epoch": 2.049988172775838, "grad_norm": 0.2964344024658203, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8240900076925755, "num_tokens": 104193108.0, "step": 27355 }, { "entropy": 0.6063279531896114, "epoch": 2.05036289670969, "grad_norm": 0.2298608422279358, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.8266089767217636, "num_tokens": 105644922.0, "step": 27360 }, { "entropy": 0.5837928617373109, "epoch": 2.0507376206435417, "grad_norm": 0.23009537160396576, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.8322098042815924, "num_tokens": 107119858.0, "step": 27365 }, { "entropy": 0.6085226587951184, "epoch": 2.0511123445773936, "grad_norm": 0.2335895150899887, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8250892281532287, "num_tokens": 108640491.0, "step": 27370 }, { "entropy": 0.5906320996582508, "epoch": 2.0514870685112454, "grad_norm": 0.2826938331127167, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.8263714794069529, "num_tokens": 110085415.0, "step": 27375 }, { "entropy": 0.5808702800422907, "epoch": 2.0518617924450973, "grad_norm": 0.42187321186065674, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.8302614245563745, "num_tokens": 111572371.0, "step": 27380 }, { "entropy": 0.59756686296314, "epoch": 2.052236516378949, "grad_norm": 0.27547892928123474, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.8258647881448269, "num_tokens": 113037175.0, "step": 27385 }, { "entropy": 0.594453402236104, "epoch": 2.052611240312801, "grad_norm": 0.22423705458641052, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.8237814299762249, "num_tokens": 114511913.0, "step": 27390 }, { "entropy": 0.5975711980834604, "epoch": 2.052985964246653, "grad_norm": 0.21628428995609283, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.8279456347227097, "num_tokens": 115997196.0, "step": 27395 }, { "entropy": 0.6012409335002303, "epoch": 2.0533606881805047, "grad_norm": 0.25867483019828796, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.8224715474992991, "num_tokens": 117485405.0, "step": 27400 }, { "entropy": 0.604047792777419, "epoch": 2.0537354121143565, "grad_norm": 0.2838650345802307, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.8250374473631382, "num_tokens": 118957683.0, "step": 27405 }, { "entropy": 0.6132749084383249, "epoch": 2.0541101360482084, "grad_norm": 0.25251489877700806, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.8250743750482797, "num_tokens": 120395001.0, "step": 27410 }, { "entropy": 0.6068360788747669, "epoch": 2.05448485998206, "grad_norm": 0.2917647659778595, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8239242352545262, "num_tokens": 121857769.0, "step": 27415 }, { "entropy": 0.5919506939128041, "epoch": 2.054859583915912, "grad_norm": 0.2614468038082123, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.8285769458860159, "num_tokens": 123298458.0, "step": 27420 }, { "entropy": 0.6041140381246806, "epoch": 2.055234307849764, "grad_norm": 0.21753592789173126, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8231668647378683, "num_tokens": 124808342.0, "step": 27425 }, { "entropy": 0.6132740218192338, "epoch": 2.0556090317836158, "grad_norm": 0.29938268661499023, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.8232881426811218, "num_tokens": 126297641.0, "step": 27430 }, { "entropy": 0.5995913287624717, "epoch": 2.0559837557174676, "grad_norm": 0.285114049911499, "learning_rate": 0.0002, "loss": 0.5931, "mean_token_accuracy": 0.8286926358938217, "num_tokens": 127733158.0, "step": 27435 }, { "entropy": 0.6025960836559534, "epoch": 2.0563584796513195, "grad_norm": 0.29098647832870483, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.8259203981608152, "num_tokens": 129215322.0, "step": 27440 }, { "entropy": 0.6108989633619786, "epoch": 2.0567332035851713, "grad_norm": 0.23154541850090027, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.8237193424254656, "num_tokens": 130669403.0, "step": 27445 }, { "entropy": 0.6111178603023291, "epoch": 2.057107927519023, "grad_norm": 0.2442483752965927, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8240297317504883, "num_tokens": 132144617.0, "step": 27450 }, { "entropy": 0.6103514285758138, "epoch": 2.057482651452875, "grad_norm": 0.4042951464653015, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.8227129690349102, "num_tokens": 133616982.0, "step": 27455 }, { "entropy": 0.5953747041523456, "epoch": 2.057857375386727, "grad_norm": 0.5315353870391846, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.8284896366298199, "num_tokens": 135061966.0, "step": 27460 }, { "entropy": 0.5986319366842509, "epoch": 2.0582320993205787, "grad_norm": 0.22561684250831604, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.8251932196319103, "num_tokens": 136542033.0, "step": 27465 }, { "entropy": 0.5907520581036806, "epoch": 2.0586068232544306, "grad_norm": 0.22588124871253967, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8268628843128681, "num_tokens": 137995678.0, "step": 27470 }, { "entropy": 0.6002456838265061, "epoch": 2.0589815471882824, "grad_norm": 0.37237945199012756, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.8241237480193376, "num_tokens": 139497303.0, "step": 27475 }, { "entropy": 0.5913759557530284, "epoch": 2.0593562711221343, "grad_norm": 0.23273970186710358, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.8223440475761891, "num_tokens": 140995892.0, "step": 27480 }, { "entropy": 0.5869690835475921, "epoch": 2.059730995055986, "grad_norm": 0.2674430310726166, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.8249039493501187, "num_tokens": 142485590.0, "step": 27485 }, { "entropy": 0.5999947708100081, "epoch": 2.060105718989838, "grad_norm": 0.2872920036315918, "learning_rate": 0.0002, "loss": 0.6173, "mean_token_accuracy": 0.8236346494406461, "num_tokens": 143948777.0, "step": 27490 }, { "entropy": 0.5874331597238779, "epoch": 2.06048044292369, "grad_norm": 0.21964357793331146, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.8274873115122319, "num_tokens": 145441980.0, "step": 27495 }, { "entropy": 0.5897275915369391, "epoch": 2.0608551668575417, "grad_norm": 0.21924351155757904, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.8293765842914581, "num_tokens": 146923268.0, "step": 27500 }, { "entropy": 0.6079806484282017, "epoch": 2.0612298907913935, "grad_norm": 0.20598487555980682, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.8243913743644953, "num_tokens": 148350034.0, "step": 27505 }, { "entropy": 0.5964029313996434, "epoch": 2.0616046147252454, "grad_norm": 0.24523621797561646, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.825958889350295, "num_tokens": 149813077.0, "step": 27510 }, { "entropy": 0.596509731002152, "epoch": 2.061979338659097, "grad_norm": 0.2501387298107147, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.8278182994574308, "num_tokens": 151256070.0, "step": 27515 }, { "entropy": 0.6142162749543786, "epoch": 2.062354062592949, "grad_norm": 0.2440178394317627, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.825521806254983, "num_tokens": 152727382.0, "step": 27520 }, { "entropy": 0.6135577406734228, "epoch": 2.062728786526801, "grad_norm": 0.23370710015296936, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.826565059274435, "num_tokens": 154153002.0, "step": 27525 }, { "entropy": 0.6196733066812158, "epoch": 2.0631035104606528, "grad_norm": 0.27972182631492615, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.8268660854548215, "num_tokens": 155590070.0, "step": 27530 }, { "entropy": 0.6088277038186789, "epoch": 2.0634782343945046, "grad_norm": 0.25310760736465454, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.8251929592341185, "num_tokens": 157040819.0, "step": 27535 }, { "entropy": 0.6195800069719553, "epoch": 2.0638529583283565, "grad_norm": 0.37288254499435425, "learning_rate": 0.0002, "loss": 0.607, "mean_token_accuracy": 0.8264471318572759, "num_tokens": 158473484.0, "step": 27540 }, { "entropy": 0.6374248810112476, "epoch": 2.0642276822622083, "grad_norm": 0.22933755815029144, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8275329478085041, "num_tokens": 159945197.0, "step": 27545 }, { "entropy": 0.6263699753209948, "epoch": 2.06460240619606, "grad_norm": 0.312262624502182, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.8288340356200934, "num_tokens": 161382404.0, "step": 27550 }, { "entropy": 0.6287090023979545, "epoch": 2.064977130129912, "grad_norm": 0.29809701442718506, "learning_rate": 0.0002, "loss": 0.6074, "mean_token_accuracy": 0.8256029352545738, "num_tokens": 162844826.0, "step": 27555 }, { "entropy": 0.6085126822814345, "epoch": 2.065351854063764, "grad_norm": 0.21363745629787445, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.8261563092470169, "num_tokens": 164338547.0, "step": 27560 }, { "entropy": 0.6036114029586315, "epoch": 2.0657265779976157, "grad_norm": 0.22537502646446228, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.8321242853999138, "num_tokens": 165782622.0, "step": 27565 }, { "entropy": 0.6104231033474207, "epoch": 2.0661013019314676, "grad_norm": 0.32356250286102295, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.8269936941564083, "num_tokens": 167265787.0, "step": 27570 }, { "entropy": 0.6033198520541191, "epoch": 2.0664760258653194, "grad_norm": 0.29325008392333984, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.8264279119670391, "num_tokens": 168762091.0, "step": 27575 }, { "entropy": 0.6293136024847626, "epoch": 2.0668507497991713, "grad_norm": 0.22565533220767975, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.8253174424171448, "num_tokens": 170235428.0, "step": 27580 }, { "entropy": 0.6451014904305339, "epoch": 2.067225473733023, "grad_norm": 0.2665345072746277, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.8205167729407549, "num_tokens": 171726145.0, "step": 27585 }, { "entropy": 0.6394248098134995, "epoch": 2.067600197666875, "grad_norm": 0.22571569681167603, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.824163106456399, "num_tokens": 173211256.0, "step": 27590 }, { "entropy": 0.6269386483356356, "epoch": 2.067974921600727, "grad_norm": 0.2066512256860733, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.8259471785277128, "num_tokens": 174689810.0, "step": 27595 }, { "entropy": 0.6286160727962852, "epoch": 2.0683496455345787, "grad_norm": 0.25549572706222534, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.8322187390178442, "num_tokens": 176119980.0, "step": 27600 }, { "entropy": 0.6266347888857127, "epoch": 2.0687243694684305, "grad_norm": 0.2705455720424652, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.8237992435693741, "num_tokens": 177609921.0, "step": 27605 }, { "entropy": 0.6285433867946267, "epoch": 2.0690990934022824, "grad_norm": 0.21550162136554718, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.82097562327981, "num_tokens": 179083554.0, "step": 27610 }, { "entropy": 0.5998614985495806, "epoch": 2.0694738173361342, "grad_norm": 0.3949885666370392, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.8276468504220247, "num_tokens": 180543029.0, "step": 27615 }, { "entropy": 0.594497942738235, "epoch": 2.069848541269986, "grad_norm": 0.23642969131469727, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8259099524468183, "num_tokens": 182034084.0, "step": 27620 }, { "entropy": 0.5915019609034061, "epoch": 2.070223265203838, "grad_norm": 0.24095121026039124, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8281352337449789, "num_tokens": 183536211.0, "step": 27625 }, { "entropy": 0.605752881988883, "epoch": 2.0705979891376898, "grad_norm": 0.22874468564987183, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.8250128742307424, "num_tokens": 184976485.0, "step": 27630 }, { "entropy": 0.6088701616972685, "epoch": 2.0709727130715416, "grad_norm": 0.23022206127643585, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.827466094121337, "num_tokens": 186451811.0, "step": 27635 }, { "entropy": 0.6045728515833616, "epoch": 2.0713474370053935, "grad_norm": 0.23914846777915955, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.8238317511975766, "num_tokens": 187934392.0, "step": 27640 }, { "entropy": 0.5980664467439055, "epoch": 2.0717221609392453, "grad_norm": 0.21749840676784515, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8230042133480311, "num_tokens": 189398892.0, "step": 27645 }, { "entropy": 0.6023584980517626, "epoch": 2.0720968848730976, "grad_norm": 0.37447214126586914, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.8273107249289751, "num_tokens": 190838903.0, "step": 27650 }, { "entropy": 0.619824837334454, "epoch": 2.072471608806949, "grad_norm": 0.23398011922836304, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.8223368115723133, "num_tokens": 192311256.0, "step": 27655 }, { "entropy": 0.6203382613137365, "epoch": 2.0728463327408013, "grad_norm": 0.27768251299858093, "learning_rate": 0.0002, "loss": 0.6266, "mean_token_accuracy": 0.8204748507589101, "num_tokens": 193782057.0, "step": 27660 }, { "entropy": 0.603634730912745, "epoch": 2.073221056674653, "grad_norm": 0.4606991112232208, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.8263078823685646, "num_tokens": 195239972.0, "step": 27665 }, { "entropy": 0.6043858317658305, "epoch": 2.073595780608505, "grad_norm": 0.2623804211616516, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.8224283948540687, "num_tokens": 196707918.0, "step": 27670 }, { "entropy": 0.6036441402509809, "epoch": 2.073970504542357, "grad_norm": 0.24156077206134796, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8252816028892994, "num_tokens": 198175741.0, "step": 27675 }, { "entropy": 0.5994035979732871, "epoch": 2.0743452284762087, "grad_norm": 0.22182606160640717, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.8277744699269534, "num_tokens": 199616368.0, "step": 27680 }, { "entropy": 0.6188640207052231, "epoch": 2.0747199524100606, "grad_norm": 0.21486063301563263, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.823127481713891, "num_tokens": 201104715.0, "step": 27685 }, { "entropy": 0.598865375481546, "epoch": 2.0750946763439124, "grad_norm": 0.27034881711006165, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.8268838688731194, "num_tokens": 202596819.0, "step": 27690 }, { "entropy": 0.605124038271606, "epoch": 2.0754694002777643, "grad_norm": 0.25704848766326904, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8281519874930382, "num_tokens": 204073013.0, "step": 27695 }, { "entropy": 0.6201539201661944, "epoch": 2.075844124211616, "grad_norm": 0.27476292848587036, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.8231887795031071, "num_tokens": 205562207.0, "step": 27700 }, { "entropy": 0.6131379058584571, "epoch": 2.076218848145468, "grad_norm": 0.2717430591583252, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.8252883981913328, "num_tokens": 207021515.0, "step": 27705 }, { "entropy": 0.61037368606776, "epoch": 2.07659357207932, "grad_norm": 0.23927703499794006, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8263510152697563, "num_tokens": 208450715.0, "step": 27710 }, { "entropy": 0.610737993568182, "epoch": 2.0769682960131717, "grad_norm": 0.24409162998199463, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8271835636347532, "num_tokens": 209879738.0, "step": 27715 }, { "entropy": 0.605196586996317, "epoch": 2.0773430199470235, "grad_norm": 0.2454313337802887, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.8256806552410125, "num_tokens": 211325759.0, "step": 27720 }, { "entropy": 0.617846872843802, "epoch": 2.0777177438808754, "grad_norm": 0.2308313250541687, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.8276424024254083, "num_tokens": 212847412.0, "step": 27725 }, { "entropy": 0.6223897410556674, "epoch": 2.0780924678147272, "grad_norm": 0.24226975440979004, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.8226568277925252, "num_tokens": 214328054.0, "step": 27730 }, { "entropy": 0.6013298636302352, "epoch": 2.078467191748579, "grad_norm": 0.23912020027637482, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.8256117820739746, "num_tokens": 215804657.0, "step": 27735 }, { "entropy": 0.5978856349363924, "epoch": 2.078841915682431, "grad_norm": 0.2815781831741333, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.8296574518084526, "num_tokens": 217250268.0, "step": 27740 }, { "entropy": 0.6183365643024444, "epoch": 2.079216639616283, "grad_norm": 0.22815997898578644, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.827629679441452, "num_tokens": 218743456.0, "step": 27745 }, { "entropy": 0.6370685206726193, "epoch": 2.0795913635501346, "grad_norm": 0.2366718351840973, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.8245736692100764, "num_tokens": 220210488.0, "step": 27750 }, { "entropy": 0.617943681590259, "epoch": 2.0799660874839865, "grad_norm": 0.3526599407196045, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.8267183229327202, "num_tokens": 221614076.0, "step": 27755 }, { "entropy": 0.623692349717021, "epoch": 2.0803408114178383, "grad_norm": 0.31233054399490356, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.822981221601367, "num_tokens": 223094040.0, "step": 27760 }, { "entropy": 0.6193120343610644, "epoch": 2.08071553535169, "grad_norm": 0.2607054114341736, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.8243760466575623, "num_tokens": 224548329.0, "step": 27765 }, { "entropy": 0.6194034930318594, "epoch": 2.081090259285542, "grad_norm": 0.37230175733566284, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8266801681369543, "num_tokens": 225989052.0, "step": 27770 }, { "entropy": 0.6178814334794879, "epoch": 2.081464983219394, "grad_norm": 0.23921309411525726, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.8261921286582947, "num_tokens": 227431739.0, "step": 27775 }, { "entropy": 0.6023144435137511, "epoch": 2.0818397071532457, "grad_norm": 0.2378142923116684, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.8245095018297434, "num_tokens": 228897381.0, "step": 27780 }, { "entropy": 0.6038913255557418, "epoch": 2.0822144310870976, "grad_norm": 0.2622896432876587, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.8263993408530951, "num_tokens": 230351843.0, "step": 27785 }, { "entropy": 0.6071311082690954, "epoch": 2.0825891550209494, "grad_norm": 0.26202696561813354, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.8242925118654967, "num_tokens": 231854957.0, "step": 27790 }, { "entropy": 0.5839624410495162, "epoch": 2.0829638789548013, "grad_norm": 0.2586475908756256, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.8308310620486736, "num_tokens": 233264123.0, "step": 27795 }, { "entropy": 0.589655016362667, "epoch": 2.083338602888653, "grad_norm": 0.2831708490848541, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.8280140202492475, "num_tokens": 234720406.0, "step": 27800 }, { "entropy": 0.5867407029494643, "epoch": 2.083713326822505, "grad_norm": 0.2441731095314026, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.8248521815985441, "num_tokens": 236164920.0, "step": 27805 }, { "entropy": 0.5777542106807232, "epoch": 2.084088050756357, "grad_norm": 0.27709493041038513, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.8268853895366192, "num_tokens": 237652835.0, "step": 27810 }, { "entropy": 0.5936647174879909, "epoch": 2.0844627746902087, "grad_norm": 0.24908684194087982, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.8217273760586977, "num_tokens": 239119243.0, "step": 27815 }, { "entropy": 0.5722470251843333, "epoch": 2.0848374986240605, "grad_norm": 0.24690088629722595, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.8271882653236389, "num_tokens": 240580494.0, "step": 27820 }, { "entropy": 0.5787847883999347, "epoch": 2.0852122225579124, "grad_norm": 0.22881470620632172, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.8268360655754805, "num_tokens": 242025825.0, "step": 27825 }, { "entropy": 0.5791231513023376, "epoch": 2.0855869464917642, "grad_norm": 0.22957743704319, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.8291801165789365, "num_tokens": 243504675.0, "step": 27830 }, { "entropy": 0.578324756026268, "epoch": 2.085961670425616, "grad_norm": 0.24358195066452026, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.8264084108173847, "num_tokens": 244969385.0, "step": 27835 }, { "entropy": 0.5726155512034893, "epoch": 2.086336394359468, "grad_norm": 0.26648929715156555, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.8273774210363627, "num_tokens": 246470320.0, "step": 27840 }, { "entropy": 0.5645999182015657, "epoch": 2.08671111829332, "grad_norm": 0.2569597065448761, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.8296757705509663, "num_tokens": 247916662.0, "step": 27845 }, { "entropy": 0.583992270566523, "epoch": 2.0870858422271716, "grad_norm": 0.24345800280570984, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.8250502571463585, "num_tokens": 249376001.0, "step": 27850 }, { "entropy": 0.5911372551694513, "epoch": 2.0874605661610235, "grad_norm": 0.23723623156547546, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.8190941125154495, "num_tokens": 250858266.0, "step": 27855 }, { "entropy": 0.5769346920773387, "epoch": 2.0878352900948753, "grad_norm": 0.3812768757343292, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.823964262753725, "num_tokens": 252353120.0, "step": 27860 }, { "entropy": 0.5665472814813256, "epoch": 2.088210014028727, "grad_norm": 0.22413821518421173, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.827096088975668, "num_tokens": 253768870.0, "step": 27865 }, { "entropy": 0.5754097012802959, "epoch": 2.088584737962579, "grad_norm": 0.25850456953048706, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8252246104180813, "num_tokens": 255304373.0, "step": 27870 }, { "entropy": 0.5650437390431762, "epoch": 2.088959461896431, "grad_norm": 0.23685136437416077, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.829358508065343, "num_tokens": 256755491.0, "step": 27875 }, { "entropy": 0.5817534983158111, "epoch": 2.0893341858302827, "grad_norm": 0.22883747518062592, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.8249434787780047, "num_tokens": 258229810.0, "step": 27880 }, { "entropy": 0.5749439680948853, "epoch": 2.0897089097641346, "grad_norm": 0.24331383407115936, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.8283079952001572, "num_tokens": 259686654.0, "step": 27885 }, { "entropy": 0.5972931941971182, "epoch": 2.0900836336979864, "grad_norm": 0.22580744326114655, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8270797312259675, "num_tokens": 261166559.0, "step": 27890 }, { "entropy": 0.6315293859690427, "epoch": 2.0904583576318383, "grad_norm": 0.30467522144317627, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.8203904371708631, "num_tokens": 262618795.0, "step": 27895 }, { "entropy": 0.6013838008046151, "epoch": 2.09083308156569, "grad_norm": 0.2275654375553131, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.825120110809803, "num_tokens": 264071159.0, "step": 27900 }, { "entropy": 0.5969643041491508, "epoch": 2.091207805499542, "grad_norm": 0.21382062137126923, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.8293532781302929, "num_tokens": 265490984.0, "step": 27905 }, { "entropy": 0.6053679691627621, "epoch": 2.091582529433394, "grad_norm": 0.31523463129997253, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.8261399284005165, "num_tokens": 266937861.0, "step": 27910 }, { "entropy": 0.5969174098223448, "epoch": 2.0919572533672457, "grad_norm": 0.4303741753101349, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.8264789331704379, "num_tokens": 268433840.0, "step": 27915 }, { "entropy": 0.5880933500826359, "epoch": 2.0923319773010975, "grad_norm": 0.2635699510574341, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.8268055871129036, "num_tokens": 269936066.0, "step": 27920 }, { "entropy": 0.6052381021901965, "epoch": 2.0927067012349494, "grad_norm": 0.27307838201522827, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.8254101775586605, "num_tokens": 271392539.0, "step": 27925 }, { "entropy": 0.5930571857839823, "epoch": 2.0930814251688012, "grad_norm": 0.26187804341316223, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.8237510245293379, "num_tokens": 272897371.0, "step": 27930 }, { "entropy": 0.5830917671322823, "epoch": 2.093456149102653, "grad_norm": 0.24802328646183014, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.8283838164061308, "num_tokens": 274338730.0, "step": 27935 }, { "entropy": 0.5909548318013549, "epoch": 2.093830873036505, "grad_norm": 0.2234833538532257, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.8206934813410044, "num_tokens": 275817828.0, "step": 27940 }, { "entropy": 0.5901047201827169, "epoch": 2.094205596970357, "grad_norm": 0.23561593890190125, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.8301822043955326, "num_tokens": 277241883.0, "step": 27945 }, { "entropy": 0.5995597375556827, "epoch": 2.0945803209042086, "grad_norm": 0.25141477584838867, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.8249769736081362, "num_tokens": 278688816.0, "step": 27950 }, { "entropy": 0.5893301462754608, "epoch": 2.0949550448380605, "grad_norm": 0.23945759236812592, "learning_rate": 0.0002, "loss": 0.6039, "mean_token_accuracy": 0.8264217741787434, "num_tokens": 280159649.0, "step": 27955 }, { "entropy": 0.5912339156493545, "epoch": 2.0953297687719123, "grad_norm": 0.3023797571659088, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.8257241804152727, "num_tokens": 281683795.0, "step": 27960 }, { "entropy": 0.5926779510453344, "epoch": 2.0957044927057646, "grad_norm": 0.22550541162490845, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.824087205901742, "num_tokens": 283147593.0, "step": 27965 }, { "entropy": 0.5877802103757859, "epoch": 2.0960792166396165, "grad_norm": 0.2733149528503418, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.825980394333601, "num_tokens": 284585485.0, "step": 27970 }, { "entropy": 0.5887030642479658, "epoch": 2.0964539405734683, "grad_norm": 0.24099589884281158, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.8235306713730097, "num_tokens": 286045743.0, "step": 27975 }, { "entropy": 0.5931937359273434, "epoch": 2.09682866450732, "grad_norm": 0.2419109344482422, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.8304174289107322, "num_tokens": 287527494.0, "step": 27980 }, { "entropy": 0.586847192235291, "epoch": 2.097203388441172, "grad_norm": 0.23707163333892822, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8271337382495403, "num_tokens": 288970208.0, "step": 27985 }, { "entropy": 0.5844407668337226, "epoch": 2.097578112375024, "grad_norm": 0.2444852739572525, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.8275886341929436, "num_tokens": 290410881.0, "step": 27990 }, { "entropy": 0.6026759980246424, "epoch": 2.0979528363088757, "grad_norm": 0.2459949254989624, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.8240005511790514, "num_tokens": 291863421.0, "step": 27995 }, { "entropy": 0.6004265990108252, "epoch": 2.0983275602427276, "grad_norm": 0.2698865234851837, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.828050896152854, "num_tokens": 293318502.0, "step": 28000 }, { "entropy": 0.588858200237155, "epoch": 2.0987022841765794, "grad_norm": 0.21950487792491913, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.8274943202733993, "num_tokens": 294817121.0, "step": 28005 }, { "entropy": 0.6062011647969484, "epoch": 2.0990770081104313, "grad_norm": 0.2617434561252594, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8234148673713207, "num_tokens": 296326207.0, "step": 28010 }, { "entropy": 0.6145452057942749, "epoch": 2.099451732044283, "grad_norm": 0.3054213225841522, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.8196629341691732, "num_tokens": 297858278.0, "step": 28015 }, { "entropy": 0.592034506238997, "epoch": 2.099826455978135, "grad_norm": 0.2356855422258377, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.8273363269865512, "num_tokens": 299284653.0, "step": 28020 }, { "entropy": 0.59103381652385, "epoch": 2.100201179911987, "grad_norm": 0.2410339117050171, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.8263643596321344, "num_tokens": 300728638.0, "step": 28025 }, { "entropy": 0.5880189431831241, "epoch": 2.1005759038458387, "grad_norm": 0.2342410683631897, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.8283220883458853, "num_tokens": 302196026.0, "step": 28030 }, { "entropy": 0.5773056453093887, "epoch": 2.1009506277796905, "grad_norm": 0.40315383672714233, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.8317733246833086, "num_tokens": 303596481.0, "step": 28035 }, { "entropy": 0.6037221992388367, "epoch": 2.1013253517135424, "grad_norm": 0.25604504346847534, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.8256909593939781, "num_tokens": 305082626.0, "step": 28040 }, { "entropy": 0.6082896221429109, "epoch": 2.1017000756473943, "grad_norm": 0.23086030781269073, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.8273591216653585, "num_tokens": 306552390.0, "step": 28045 }, { "entropy": 0.594217948615551, "epoch": 2.102074799581246, "grad_norm": 0.22861646115779877, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.8264262225478888, "num_tokens": 307997808.0, "step": 28050 }, { "entropy": 0.6099692238494754, "epoch": 2.102449523515098, "grad_norm": 0.24458198249340057, "learning_rate": 0.0002, "loss": 0.6283, "mean_token_accuracy": 0.8212161272764206, "num_tokens": 309498344.0, "step": 28055 }, { "entropy": 0.6071317059919238, "epoch": 2.10282424744895, "grad_norm": 0.24592337012290955, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.8274702288210392, "num_tokens": 310928178.0, "step": 28060 }, { "entropy": 0.5963842511177063, "epoch": 2.1031989713828017, "grad_norm": 0.22962263226509094, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.8261251784861088, "num_tokens": 312414786.0, "step": 28065 }, { "entropy": 0.602533831819892, "epoch": 2.1035736953166535, "grad_norm": 0.26612699031829834, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.8268738113343715, "num_tokens": 313858798.0, "step": 28070 }, { "entropy": 0.5999545261263848, "epoch": 2.1039484192505054, "grad_norm": 0.22646528482437134, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8281550168991089, "num_tokens": 315328562.0, "step": 28075 }, { "entropy": 0.6146028021350503, "epoch": 2.104323143184357, "grad_norm": 0.23360830545425415, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.8218329183757305, "num_tokens": 316820391.0, "step": 28080 }, { "entropy": 0.6126904467120766, "epoch": 2.104697867118209, "grad_norm": 0.22382725775241852, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.8256873100996017, "num_tokens": 318291963.0, "step": 28085 }, { "entropy": 0.6056471971794963, "epoch": 2.105072591052061, "grad_norm": 0.25989142060279846, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.8242385130375623, "num_tokens": 319742931.0, "step": 28090 }, { "entropy": 0.5870266210287809, "epoch": 2.1054473149859128, "grad_norm": 0.22233393788337708, "learning_rate": 0.0002, "loss": 0.6059, "mean_token_accuracy": 0.8267340496182441, "num_tokens": 321190326.0, "step": 28095 }, { "entropy": 0.6069131007418036, "epoch": 2.1058220389197646, "grad_norm": 0.23077015578746796, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.822118642181158, "num_tokens": 322713755.0, "step": 28100 }, { "entropy": 0.5993590198457242, "epoch": 2.1061967628536165, "grad_norm": 0.21485665440559387, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.8218768533319235, "num_tokens": 324154003.0, "step": 28105 }, { "entropy": 0.6016289250925183, "epoch": 2.1065714867874683, "grad_norm": 0.28088149428367615, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.8210013553500175, "num_tokens": 325630921.0, "step": 28110 }, { "entropy": 0.5798190455883742, "epoch": 2.10694621072132, "grad_norm": 0.25130337476730347, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.8279737312346697, "num_tokens": 327118855.0, "step": 28115 }, { "entropy": 0.5999014168977738, "epoch": 2.107320934655172, "grad_norm": 0.28082990646362305, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.8227740172296762, "num_tokens": 328602020.0, "step": 28120 }, { "entropy": 0.5883054461330175, "epoch": 2.107695658589024, "grad_norm": 0.23335862159729004, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.8272126276046038, "num_tokens": 330056337.0, "step": 28125 }, { "entropy": 0.6114635806530714, "epoch": 2.1080703825228757, "grad_norm": 0.32347872853279114, "learning_rate": 0.0002, "loss": 0.6173, "mean_token_accuracy": 0.8240579284727574, "num_tokens": 331518932.0, "step": 28130 }, { "entropy": 0.6026299422606826, "epoch": 2.1084451064567276, "grad_norm": 0.22405856847763062, "learning_rate": 0.0002, "loss": 0.6061, "mean_token_accuracy": 0.8275743156671524, "num_tokens": 333013120.0, "step": 28135 }, { "entropy": 0.5989869205281139, "epoch": 2.1088198303905794, "grad_norm": 0.305019348859787, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8270471565425396, "num_tokens": 334483232.0, "step": 28140 }, { "entropy": 0.605066335014999, "epoch": 2.1091945543244313, "grad_norm": 0.317240446805954, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.822347192838788, "num_tokens": 335952741.0, "step": 28145 }, { "entropy": 0.582925901003182, "epoch": 2.109569278258283, "grad_norm": 0.21490120887756348, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.8242077764123679, "num_tokens": 337441540.0, "step": 28150 }, { "entropy": 0.5959538197144866, "epoch": 2.109944002192135, "grad_norm": 0.21866975724697113, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.8259957242757082, "num_tokens": 338919420.0, "step": 28155 }, { "entropy": 0.5791963940486312, "epoch": 2.110318726125987, "grad_norm": 0.2442891001701355, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.8295104060322046, "num_tokens": 340392766.0, "step": 28160 }, { "entropy": 0.584405485726893, "epoch": 2.1106934500598387, "grad_norm": 0.2703487277030945, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.8261181026697159, "num_tokens": 341849240.0, "step": 28165 }, { "entropy": 0.5679236484691501, "epoch": 2.1110681739936905, "grad_norm": 0.22650983929634094, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.8293112251907587, "num_tokens": 343323512.0, "step": 28170 }, { "entropy": 0.5965134708210826, "epoch": 2.1114428979275424, "grad_norm": 0.22420229017734528, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.8227494083344936, "num_tokens": 344835338.0, "step": 28175 }, { "entropy": 0.5966590942814947, "epoch": 2.111817621861394, "grad_norm": 0.2279561161994934, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.8216403026133776, "num_tokens": 346306374.0, "step": 28180 }, { "entropy": 0.5891945527866482, "epoch": 2.112192345795246, "grad_norm": 0.24693818390369415, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8248169016093015, "num_tokens": 347792850.0, "step": 28185 }, { "entropy": 0.5674772400408983, "epoch": 2.112567069729098, "grad_norm": 0.2753317952156067, "learning_rate": 0.0002, "loss": 0.583, "mean_token_accuracy": 0.8273343067616225, "num_tokens": 349287608.0, "step": 28190 }, { "entropy": 0.5795035697519779, "epoch": 2.1129417936629498, "grad_norm": 0.2551945447921753, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.8272667277604342, "num_tokens": 350760054.0, "step": 28195 }, { "entropy": 0.5928620386868715, "epoch": 2.1133165175968016, "grad_norm": 0.2742956578731537, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8248224373906851, "num_tokens": 352253401.0, "step": 28200 }, { "entropy": 0.5940129062160849, "epoch": 2.1136912415306535, "grad_norm": 0.292326956987381, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8272979389876127, "num_tokens": 353687682.0, "step": 28205 }, { "entropy": 0.5892327753826976, "epoch": 2.1140659654645053, "grad_norm": 0.23189933598041534, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.8277465611696243, "num_tokens": 355135950.0, "step": 28210 }, { "entropy": 0.5927427219226956, "epoch": 2.114440689398357, "grad_norm": 0.23151330649852753, "learning_rate": 0.0002, "loss": 0.6059, "mean_token_accuracy": 0.8267101187258958, "num_tokens": 356594883.0, "step": 28215 }, { "entropy": 0.5798923535272479, "epoch": 2.114815413332209, "grad_norm": 0.23004348576068878, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.8265260506421328, "num_tokens": 358082655.0, "step": 28220 }, { "entropy": 0.5900891331955791, "epoch": 2.115190137266061, "grad_norm": 0.23343819379806519, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.8275845427066088, "num_tokens": 359583112.0, "step": 28225 }, { "entropy": 0.5803086426109075, "epoch": 2.1155648611999127, "grad_norm": 0.24812890589237213, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.8290455855429173, "num_tokens": 361079093.0, "step": 28230 }, { "entropy": 0.5805335318669677, "epoch": 2.1159395851337646, "grad_norm": 0.25621628761291504, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8298328876495361, "num_tokens": 362539834.0, "step": 28235 }, { "entropy": 0.5855695713311434, "epoch": 2.1163143090676164, "grad_norm": 0.25652623176574707, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.8264663856476545, "num_tokens": 363960153.0, "step": 28240 }, { "entropy": 0.5884634230285883, "epoch": 2.1166890330014683, "grad_norm": 0.2287832796573639, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.8233685828745365, "num_tokens": 365455189.0, "step": 28245 }, { "entropy": 0.5826260404661298, "epoch": 2.11706375693532, "grad_norm": 0.2319459319114685, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.8260916966944933, "num_tokens": 366970898.0, "step": 28250 }, { "entropy": 0.5828912653028965, "epoch": 2.117438480869172, "grad_norm": 0.23474250733852386, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.826286231353879, "num_tokens": 368455806.0, "step": 28255 }, { "entropy": 0.583026416040957, "epoch": 2.117813204803024, "grad_norm": 0.22942884266376495, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8289009556174278, "num_tokens": 369927609.0, "step": 28260 }, { "entropy": 0.5781817216426134, "epoch": 2.1181879287368757, "grad_norm": 0.24597039818763733, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.8268902972340584, "num_tokens": 371364056.0, "step": 28265 }, { "entropy": 0.573668253608048, "epoch": 2.118562652670728, "grad_norm": 0.24838092923164368, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.827988875284791, "num_tokens": 372832067.0, "step": 28270 }, { "entropy": 0.5687096362933517, "epoch": 2.1189373766045794, "grad_norm": 0.22616785764694214, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.8272732455283404, "num_tokens": 374301405.0, "step": 28275 }, { "entropy": 0.5689394157379866, "epoch": 2.1193121005384317, "grad_norm": 0.26988527178764343, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.8278079878538847, "num_tokens": 375768766.0, "step": 28280 }, { "entropy": 0.5589707877486945, "epoch": 2.1196868244722835, "grad_norm": 0.24732497334480286, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.8253045208752156, "num_tokens": 377209005.0, "step": 28285 }, { "entropy": 0.5436333006247878, "epoch": 2.1200615484061354, "grad_norm": 0.21772338449954987, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.8273695651441813, "num_tokens": 378693129.0, "step": 28290 }, { "entropy": 0.5450730128213763, "epoch": 2.120436272339987, "grad_norm": 0.37740886211395264, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.8264125902205706, "num_tokens": 380171018.0, "step": 28295 }, { "entropy": 0.5428341902792454, "epoch": 2.120810996273839, "grad_norm": 0.236437126994133, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8254857253283262, "num_tokens": 381662004.0, "step": 28300 }, { "entropy": 0.5626398386433721, "epoch": 2.121185720207691, "grad_norm": 0.25799909234046936, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.824120344221592, "num_tokens": 383156209.0, "step": 28305 }, { "entropy": 0.5532929925248027, "epoch": 2.1215604441415428, "grad_norm": 0.27317193150520325, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8250029578804969, "num_tokens": 384613919.0, "step": 28310 }, { "entropy": 0.5364638421684503, "epoch": 2.1219351680753946, "grad_norm": 0.2693554759025574, "learning_rate": 0.0002, "loss": 0.5795, "mean_token_accuracy": 0.8346329856663942, "num_tokens": 386043779.0, "step": 28315 }, { "entropy": 0.5462154006585479, "epoch": 2.1223098920092465, "grad_norm": 0.2352704256772995, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.8257940251380205, "num_tokens": 387548332.0, "step": 28320 }, { "entropy": 0.5499209336936474, "epoch": 2.1226846159430983, "grad_norm": 0.26353511214256287, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.8275880984961986, "num_tokens": 388988406.0, "step": 28325 }, { "entropy": 0.5529207728803158, "epoch": 2.12305933987695, "grad_norm": 0.2585431635379791, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.827890320494771, "num_tokens": 390462355.0, "step": 28330 }, { "entropy": 0.5535660969093442, "epoch": 2.123434063810802, "grad_norm": 0.2824636697769165, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.825318592786789, "num_tokens": 391955630.0, "step": 28335 }, { "entropy": 0.5457971664145589, "epoch": 2.123808787744654, "grad_norm": 0.2518618106842041, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.8296705890446902, "num_tokens": 393386542.0, "step": 28340 }, { "entropy": 0.5572770528495312, "epoch": 2.1241835116785057, "grad_norm": 0.22949078679084778, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.8225116357207298, "num_tokens": 394868867.0, "step": 28345 }, { "entropy": 0.5352124109864235, "epoch": 2.1245582356123576, "grad_norm": 0.31303659081459045, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.8294790655374527, "num_tokens": 396328739.0, "step": 28350 }, { "entropy": 0.5361457772552967, "epoch": 2.1249329595462094, "grad_norm": 0.27704185247421265, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.8290943533182145, "num_tokens": 397837306.0, "step": 28355 }, { "entropy": 0.5404321815818548, "epoch": 2.1253076834800613, "grad_norm": 0.3119017481803894, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8289932921528816, "num_tokens": 399323450.0, "step": 28360 }, { "entropy": 0.5413702119141817, "epoch": 2.125682407413913, "grad_norm": 0.5021958947181702, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.8260693773627281, "num_tokens": 400778967.0, "step": 28365 }, { "entropy": 0.531216118671, "epoch": 2.126057131347765, "grad_norm": 0.296527236700058, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.8239303801208735, "num_tokens": 402223394.0, "step": 28370 }, { "entropy": 0.5383089583367109, "epoch": 2.126431855281617, "grad_norm": 0.24506735801696777, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8269063867628574, "num_tokens": 403718756.0, "step": 28375 }, { "entropy": 0.529143925756216, "epoch": 2.1268065792154687, "grad_norm": 0.23117516934871674, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.8266814634203911, "num_tokens": 405196924.0, "step": 28380 }, { "entropy": 0.5476965650916099, "epoch": 2.1271813031493205, "grad_norm": 0.2520897686481476, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.8233717534691095, "num_tokens": 406683147.0, "step": 28385 }, { "entropy": 0.5493812175467611, "epoch": 2.1275560270831724, "grad_norm": 0.2946329414844513, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.8212327610701322, "num_tokens": 408146051.0, "step": 28390 }, { "entropy": 0.5379192413762212, "epoch": 2.1279307510170242, "grad_norm": 0.2692098021507263, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.8268349274992943, "num_tokens": 409631900.0, "step": 28395 }, { "entropy": 0.5395254921168089, "epoch": 2.128305474950876, "grad_norm": 0.2727383077144623, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.8267490047961473, "num_tokens": 411080504.0, "step": 28400 }, { "entropy": 0.5583788719028234, "epoch": 2.128680198884728, "grad_norm": 0.22833411395549774, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.8193272195756436, "num_tokens": 412588163.0, "step": 28405 }, { "entropy": 0.5487338720820845, "epoch": 2.1290549228185798, "grad_norm": 0.2401517629623413, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.8245834928005934, "num_tokens": 414027162.0, "step": 28410 }, { "entropy": 0.5522146973758936, "epoch": 2.1294296467524316, "grad_norm": 0.26421549916267395, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.8262020654976368, "num_tokens": 415497813.0, "step": 28415 }, { "entropy": 0.5533691981807352, "epoch": 2.1298043706862835, "grad_norm": 0.2367095649242401, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.827966534346342, "num_tokens": 416931941.0, "step": 28420 }, { "entropy": 0.5426125656813383, "epoch": 2.1301790946201353, "grad_norm": 0.25051257014274597, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.8274696558713913, "num_tokens": 418390229.0, "step": 28425 }, { "entropy": 0.5447257310152054, "epoch": 2.130553818553987, "grad_norm": 0.3388362526893616, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.8288097374141217, "num_tokens": 419839178.0, "step": 28430 }, { "entropy": 0.553308691456914, "epoch": 2.130928542487839, "grad_norm": 0.23388345539569855, "learning_rate": 0.0002, "loss": 0.607, "mean_token_accuracy": 0.824004216864705, "num_tokens": 421260545.0, "step": 28435 }, { "entropy": 0.549404769577086, "epoch": 2.131303266421691, "grad_norm": 0.22571879625320435, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.8256593991070986, "num_tokens": 422755157.0, "step": 28440 }, { "entropy": 0.5358613707125187, "epoch": 2.1316779903555427, "grad_norm": 0.28659912943840027, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.8273632936179638, "num_tokens": 424202270.0, "step": 28445 }, { "entropy": 0.530379599891603, "epoch": 2.1320527142893946, "grad_norm": 0.23477235436439514, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.8294879231601954, "num_tokens": 425654250.0, "step": 28450 }, { "entropy": 0.5331263473257423, "epoch": 2.1324274382232464, "grad_norm": 0.2256575971841812, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.8285694397985935, "num_tokens": 427099397.0, "step": 28455 }, { "entropy": 0.5417606651782989, "epoch": 2.1328021621570983, "grad_norm": 0.22965706884860992, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.8302829582244158, "num_tokens": 428543021.0, "step": 28460 }, { "entropy": 0.54784872867167, "epoch": 2.13317688609095, "grad_norm": 0.23829905688762665, "learning_rate": 0.0002, "loss": 0.5928, "mean_token_accuracy": 0.8281224116683006, "num_tokens": 429990058.0, "step": 28465 }, { "entropy": 0.5399151911959053, "epoch": 2.133551610024802, "grad_norm": 0.25787556171417236, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.8274760719388723, "num_tokens": 431438259.0, "step": 28470 }, { "entropy": 0.5416841521859169, "epoch": 2.133926333958654, "grad_norm": 0.24842189252376556, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.8259558409452439, "num_tokens": 432885393.0, "step": 28475 }, { "entropy": 0.558495968952775, "epoch": 2.1343010578925057, "grad_norm": 0.26309916377067566, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8254961300641298, "num_tokens": 434407520.0, "step": 28480 }, { "entropy": 0.5540667485445738, "epoch": 2.1346757818263575, "grad_norm": 0.2399505078792572, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.8258020728826523, "num_tokens": 435859899.0, "step": 28485 }, { "entropy": 0.5470607068389655, "epoch": 2.1350505057602094, "grad_norm": 0.2636556327342987, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.8289180882275105, "num_tokens": 437315514.0, "step": 28490 }, { "entropy": 0.5454508733004332, "epoch": 2.1354252296940612, "grad_norm": 0.5752881765365601, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.8260739848017693, "num_tokens": 438782173.0, "step": 28495 }, { "entropy": 0.5543725706636906, "epoch": 2.135799953627913, "grad_norm": 0.21868060529232025, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.8272608168423176, "num_tokens": 440257249.0, "step": 28500 }, { "entropy": 0.5611293682828545, "epoch": 2.136174677561765, "grad_norm": 0.2318209409713745, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.8276586819440126, "num_tokens": 441717440.0, "step": 28505 }, { "entropy": 0.5553218929097057, "epoch": 2.136549401495617, "grad_norm": 0.23099425435066223, "learning_rate": 0.0002, "loss": 0.5773, "mean_token_accuracy": 0.8272181674838066, "num_tokens": 443237778.0, "step": 28510 }, { "entropy": 0.5673793274909258, "epoch": 2.1369241254294686, "grad_norm": 0.22963444888591766, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8280694391578436, "num_tokens": 444715757.0, "step": 28515 }, { "entropy": 0.5622144477441907, "epoch": 2.1372988493633205, "grad_norm": 0.22805581986904144, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8272855043411255, "num_tokens": 446149057.0, "step": 28520 }, { "entropy": 0.5606565121561289, "epoch": 2.1376735732971723, "grad_norm": 0.2535133957862854, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.8273719500750303, "num_tokens": 447629047.0, "step": 28525 }, { "entropy": 0.5578585680574178, "epoch": 2.138048297231024, "grad_norm": 0.27599525451660156, "learning_rate": 0.0002, "loss": 0.587, "mean_token_accuracy": 0.829550176486373, "num_tokens": 449109360.0, "step": 28530 }, { "entropy": 0.5766076577827335, "epoch": 2.138423021164876, "grad_norm": 0.23269003629684448, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.8254890292882919, "num_tokens": 450589236.0, "step": 28535 }, { "entropy": 0.5616382790729404, "epoch": 2.138797745098728, "grad_norm": 0.23637425899505615, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.8295540764927865, "num_tokens": 452038774.0, "step": 28540 }, { "entropy": 0.5530067289248108, "epoch": 2.1391724690325797, "grad_norm": 0.24916483461856842, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.8304505724459886, "num_tokens": 453504878.0, "step": 28545 }, { "entropy": 0.55505998916924, "epoch": 2.1395471929664316, "grad_norm": 0.24962593615055084, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.8263996243476868, "num_tokens": 454966502.0, "step": 28550 }, { "entropy": 0.5659868650138378, "epoch": 2.1399219169002834, "grad_norm": 0.2903580665588379, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.8231091048568487, "num_tokens": 456451605.0, "step": 28555 }, { "entropy": 0.5668309926986694, "epoch": 2.1402966408341353, "grad_norm": 0.23154857754707336, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.8241942211985588, "num_tokens": 457959956.0, "step": 28560 }, { "entropy": 0.5441176053136587, "epoch": 2.140671364767987, "grad_norm": 0.21538691222667694, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.8287921600043774, "num_tokens": 459412459.0, "step": 28565 }, { "entropy": 0.5432639747858048, "epoch": 2.141046088701839, "grad_norm": 0.2652607560157776, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.8315729450434446, "num_tokens": 460841268.0, "step": 28570 }, { "entropy": 0.5527218310162425, "epoch": 2.1414208126356913, "grad_norm": 0.24396659433841705, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.8317321691662073, "num_tokens": 462311674.0, "step": 28575 }, { "entropy": 0.5549553768709302, "epoch": 2.1417955365695427, "grad_norm": 0.23083633184432983, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.8254100255668163, "num_tokens": 463787153.0, "step": 28580 }, { "entropy": 0.5610224401578308, "epoch": 2.142170260503395, "grad_norm": 0.30685552954673767, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8274249754846096, "num_tokens": 465239799.0, "step": 28585 }, { "entropy": 0.5736199762672186, "epoch": 2.1425449844372464, "grad_norm": 0.2720862030982971, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.8224710829555988, "num_tokens": 466702271.0, "step": 28590 }, { "entropy": 0.57176519241184, "epoch": 2.1429197083710987, "grad_norm": 0.251760333776474, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.8266206216067076, "num_tokens": 468183054.0, "step": 28595 }, { "entropy": 0.5624113504774868, "epoch": 2.1432944323049505, "grad_norm": 0.2547959089279175, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.8263838116079569, "num_tokens": 469682720.0, "step": 28600 }, { "entropy": 0.560633217729628, "epoch": 2.1436691562388024, "grad_norm": 0.23831920325756073, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.8323086071759462, "num_tokens": 471156337.0, "step": 28605 }, { "entropy": 0.5638384405523539, "epoch": 2.1440438801726542, "grad_norm": 0.2659015953540802, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.830680949985981, "num_tokens": 472577901.0, "step": 28610 }, { "entropy": 0.5643579062074423, "epoch": 2.144418604106506, "grad_norm": 0.24948038160800934, "learning_rate": 0.0002, "loss": 0.5917, "mean_token_accuracy": 0.8263152200728655, "num_tokens": 474048385.0, "step": 28615 }, { "entropy": 0.5635792896151542, "epoch": 2.144793328040358, "grad_norm": 0.3084021210670471, "learning_rate": 0.0002, "loss": 0.5863, "mean_token_accuracy": 0.8284591384232044, "num_tokens": 475498424.0, "step": 28620 }, { "entropy": 0.5841326836496592, "epoch": 2.14516805197421, "grad_norm": 0.283666729927063, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8280251603573561, "num_tokens": 476927971.0, "step": 28625 }, { "entropy": 0.5866151569411159, "epoch": 2.1455427759080616, "grad_norm": 0.24815545976161957, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.823848032951355, "num_tokens": 478379477.0, "step": 28630 }, { "entropy": 0.5837672933936119, "epoch": 2.1459174998419135, "grad_norm": 0.28080272674560547, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.8286763004958629, "num_tokens": 479848269.0, "step": 28635 }, { "entropy": 0.5905488811433315, "epoch": 2.1462922237757653, "grad_norm": 0.29874351620674133, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.8252269707620143, "num_tokens": 481305557.0, "step": 28640 }, { "entropy": 0.5593234287574887, "epoch": 2.146666947709617, "grad_norm": 0.23416557908058167, "learning_rate": 0.0002, "loss": 0.5654, "mean_token_accuracy": 0.8345272541046143, "num_tokens": 482751165.0, "step": 28645 }, { "entropy": 0.5807751398533583, "epoch": 2.147041671643469, "grad_norm": 0.3019266426563263, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.8287777651101351, "num_tokens": 484207400.0, "step": 28650 }, { "entropy": 0.5674560528248549, "epoch": 2.147416395577321, "grad_norm": 0.24990135431289673, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.8283372685313225, "num_tokens": 485648601.0, "step": 28655 }, { "entropy": 0.5731411321088672, "epoch": 2.1477911195111727, "grad_norm": 0.23646922409534454, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8225125193595886, "num_tokens": 487108151.0, "step": 28660 }, { "entropy": 0.5655405232682824, "epoch": 2.1481658434450246, "grad_norm": 0.25129440426826477, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.8225079257041216, "num_tokens": 488579997.0, "step": 28665 }, { "entropy": 0.5446358265355229, "epoch": 2.1485405673788764, "grad_norm": 0.25909921526908875, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.8276268117129802, "num_tokens": 490079315.0, "step": 28670 }, { "entropy": 0.5186176766641438, "epoch": 2.1489152913127283, "grad_norm": 0.2598549425601959, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.8324495516717434, "num_tokens": 491508510.0, "step": 28675 }, { "entropy": 0.5473261080682278, "epoch": 2.14929001524658, "grad_norm": 0.35371512174606323, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.824786015227437, "num_tokens": 492981920.0, "step": 28680 }, { "entropy": 0.546160789579153, "epoch": 2.149664739180432, "grad_norm": 0.2850242853164673, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.8253403548151255, "num_tokens": 494478118.0, "step": 28685 }, { "entropy": 0.5484871231019497, "epoch": 2.150039463114284, "grad_norm": 0.23530913889408112, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8307395804673433, "num_tokens": 495911836.0, "step": 28690 }, { "entropy": 0.5495235495269298, "epoch": 2.1504141870481357, "grad_norm": 0.2385445088148117, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.8273027248680591, "num_tokens": 497388624.0, "step": 28695 }, { "entropy": 0.541305049136281, "epoch": 2.1507889109819875, "grad_norm": 1.0977874994277954, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.8307233903557062, "num_tokens": 498800461.0, "step": 28700 }, { "entropy": 0.5544049274176359, "epoch": 2.1511636349158394, "grad_norm": 0.31711438298225403, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8273117437958717, "num_tokens": 500253231.0, "step": 28705 }, { "entropy": 0.5548480119556188, "epoch": 2.1515383588496912, "grad_norm": 0.24247245490550995, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.8272790361195803, "num_tokens": 501707923.0, "step": 28710 }, { "entropy": 0.546922892332077, "epoch": 2.151913082783543, "grad_norm": 0.2982594072818756, "learning_rate": 0.0002, "loss": 0.5811, "mean_token_accuracy": 0.8309690199792386, "num_tokens": 503178525.0, "step": 28715 }, { "entropy": 0.5686561347916722, "epoch": 2.152287806717395, "grad_norm": 0.24043016135692596, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.8227785304188728, "num_tokens": 504661819.0, "step": 28720 }, { "entropy": 0.5672959696501494, "epoch": 2.152662530651247, "grad_norm": 0.2805580794811249, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.8244945202022791, "num_tokens": 506097118.0, "step": 28725 }, { "entropy": 0.5684263648465275, "epoch": 2.1530372545850986, "grad_norm": 0.31799688935279846, "learning_rate": 0.0002, "loss": 0.6046, "mean_token_accuracy": 0.825389688462019, "num_tokens": 507555644.0, "step": 28730 }, { "entropy": 0.5613925129175186, "epoch": 2.1534119785189505, "grad_norm": 0.331802099943161, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.827223926782608, "num_tokens": 509044102.0, "step": 28735 }, { "entropy": 0.5538201762363315, "epoch": 2.1537867024528023, "grad_norm": 0.3689895272254944, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.8295028638094664, "num_tokens": 510511848.0, "step": 28740 }, { "entropy": 0.5593860244378448, "epoch": 2.154161426386654, "grad_norm": 0.22026878595352173, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.8279387224465609, "num_tokens": 511995510.0, "step": 28745 }, { "entropy": 0.5499405117705465, "epoch": 2.154536150320506, "grad_norm": 0.21202082931995392, "learning_rate": 0.0002, "loss": 0.5816, "mean_token_accuracy": 0.832116487249732, "num_tokens": 513462057.0, "step": 28750 }, { "entropy": 0.5681096481159329, "epoch": 2.154910874254358, "grad_norm": 0.22702693939208984, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8252293035387993, "num_tokens": 514949172.0, "step": 28755 }, { "entropy": 0.5733717449009419, "epoch": 2.1552855981882098, "grad_norm": 0.23456253111362457, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.82225241959095, "num_tokens": 516454981.0, "step": 28760 }, { "entropy": 0.5648787369951606, "epoch": 2.1556603221220616, "grad_norm": 0.2587275505065918, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.8275481503456831, "num_tokens": 517906317.0, "step": 28765 }, { "entropy": 0.565972576662898, "epoch": 2.1560350460559135, "grad_norm": 0.24781319499015808, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8248731855303049, "num_tokens": 519354240.0, "step": 28770 }, { "entropy": 0.5620773287490011, "epoch": 2.1564097699897653, "grad_norm": 0.2903972566127777, "learning_rate": 0.0002, "loss": 0.5889, "mean_token_accuracy": 0.8291736159473657, "num_tokens": 520792355.0, "step": 28775 }, { "entropy": 0.5567056614905596, "epoch": 2.156784493923617, "grad_norm": 0.2171122282743454, "learning_rate": 0.0002, "loss": 0.5814, "mean_token_accuracy": 0.8295509684830904, "num_tokens": 522269627.0, "step": 28780 }, { "entropy": 0.5713989220559597, "epoch": 2.157159217857469, "grad_norm": 0.2493867427110672, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.8259333901107311, "num_tokens": 523778998.0, "step": 28785 }, { "entropy": 0.5678383423015475, "epoch": 2.157533941791321, "grad_norm": 0.22842179238796234, "learning_rate": 0.0002, "loss": 0.5787, "mean_token_accuracy": 0.8334318842738867, "num_tokens": 525262446.0, "step": 28790 }, { "entropy": 0.576661285944283, "epoch": 2.1579086657251727, "grad_norm": 0.2489868402481079, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.8260597515851259, "num_tokens": 526749795.0, "step": 28795 }, { "entropy": 0.5786144655197859, "epoch": 2.1582833896590246, "grad_norm": 0.23223957419395447, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8262428902089596, "num_tokens": 528229592.0, "step": 28800 }, { "entropy": 0.579367645457387, "epoch": 2.1586581135928764, "grad_norm": 0.24803711473941803, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.8264780901372433, "num_tokens": 529701818.0, "step": 28805 }, { "entropy": 0.5596974508836865, "epoch": 2.1590328375267283, "grad_norm": 0.2398519366979599, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.8338488850742578, "num_tokens": 531145914.0, "step": 28810 }, { "entropy": 0.5811570398509502, "epoch": 2.15940756146058, "grad_norm": 0.301949679851532, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.8296400085091591, "num_tokens": 532606440.0, "step": 28815 }, { "entropy": 0.5772441636770964, "epoch": 2.159782285394432, "grad_norm": 0.23595210909843445, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.8261118482798337, "num_tokens": 534085394.0, "step": 28820 }, { "entropy": 0.5770837778225542, "epoch": 2.160157009328284, "grad_norm": 0.24788117408752441, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8274365793913603, "num_tokens": 535534671.0, "step": 28825 }, { "entropy": 0.5893869446590543, "epoch": 2.1605317332621357, "grad_norm": 0.24461357295513153, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.8237841755151749, "num_tokens": 537007473.0, "step": 28830 }, { "entropy": 0.5787079717963934, "epoch": 2.1609064571959875, "grad_norm": 0.26804086565971375, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8261829778552056, "num_tokens": 538455016.0, "step": 28835 }, { "entropy": 0.5867732495069504, "epoch": 2.1612811811298394, "grad_norm": 0.2387176752090454, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8227273568511009, "num_tokens": 539929634.0, "step": 28840 }, { "entropy": 0.5859843265265227, "epoch": 2.161655905063691, "grad_norm": 0.3207625150680542, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.823983070999384, "num_tokens": 541403925.0, "step": 28845 }, { "entropy": 0.5744790185242892, "epoch": 2.162030628997543, "grad_norm": 0.23550499975681305, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.8252887118607759, "num_tokens": 542861988.0, "step": 28850 }, { "entropy": 0.5852554412558675, "epoch": 2.162405352931395, "grad_norm": 0.3281664550304413, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8242629032582045, "num_tokens": 544332492.0, "step": 28855 }, { "entropy": 0.5773992609232664, "epoch": 2.1627800768652468, "grad_norm": 0.2364226132631302, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.8244673259556293, "num_tokens": 545797404.0, "step": 28860 }, { "entropy": 0.5733273448422551, "epoch": 2.1631548007990986, "grad_norm": 0.2481352537870407, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.8259204417467118, "num_tokens": 547298500.0, "step": 28865 }, { "entropy": 0.5664196232333779, "epoch": 2.1635295247329505, "grad_norm": 0.2348560094833374, "learning_rate": 0.0002, "loss": 0.5789, "mean_token_accuracy": 0.8316749725490808, "num_tokens": 548780313.0, "step": 28870 }, { "entropy": 0.5736606983467937, "epoch": 2.1639042486668023, "grad_norm": 0.2482811063528061, "learning_rate": 0.0002, "loss": 0.5837, "mean_token_accuracy": 0.8301619168370962, "num_tokens": 550267459.0, "step": 28875 }, { "entropy": 0.5530989173799753, "epoch": 2.164278972600654, "grad_norm": 0.2963274121284485, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8243417967110872, "num_tokens": 551738094.0, "step": 28880 }, { "entropy": 0.5166340002790093, "epoch": 2.164653696534506, "grad_norm": 0.22232890129089355, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.8293512728065252, "num_tokens": 553193291.0, "step": 28885 }, { "entropy": 0.5209354164078832, "epoch": 2.1650284204683583, "grad_norm": 0.46854156255722046, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8237321555614472, "num_tokens": 554650881.0, "step": 28890 }, { "entropy": 0.5252610189840198, "epoch": 2.1654031444022097, "grad_norm": 0.23759862780570984, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.8256659500300885, "num_tokens": 556126942.0, "step": 28895 }, { "entropy": 0.5051272256299854, "epoch": 2.165777868336062, "grad_norm": 0.30779439210891724, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.8281458150595427, "num_tokens": 557561587.0, "step": 28900 }, { "entropy": 0.5239719863049686, "epoch": 2.166152592269914, "grad_norm": 0.30091995000839233, "learning_rate": 0.0002, "loss": 0.5984, "mean_token_accuracy": 0.8250426076352596, "num_tokens": 559045891.0, "step": 28905 }, { "entropy": 0.5324108242988587, "epoch": 2.1665273162037657, "grad_norm": 0.2452007681131363, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8285960178822279, "num_tokens": 560542569.0, "step": 28910 }, { "entropy": 0.5353627718053758, "epoch": 2.1669020401376176, "grad_norm": 0.25609469413757324, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.8272206999361515, "num_tokens": 562009441.0, "step": 28915 }, { "entropy": 0.5432441126555204, "epoch": 2.1672767640714694, "grad_norm": 0.27123892307281494, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.8232582390308381, "num_tokens": 563504552.0, "step": 28920 }, { "entropy": 0.506523147970438, "epoch": 2.1676514880053213, "grad_norm": 0.26925140619277954, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.8286344017833471, "num_tokens": 564926515.0, "step": 28925 }, { "entropy": 0.5155210837721824, "epoch": 2.168026211939173, "grad_norm": 0.2253282219171524, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.8276263911277055, "num_tokens": 566398184.0, "step": 28930 }, { "entropy": 0.49741463288664817, "epoch": 2.168400935873025, "grad_norm": 0.2639649510383606, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.8284440584480762, "num_tokens": 567857176.0, "step": 28935 }, { "entropy": 0.5129865096881986, "epoch": 2.168775659806877, "grad_norm": 0.3117418587207794, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.8231038305908441, "num_tokens": 569351714.0, "step": 28940 }, { "entropy": 0.49713927423581483, "epoch": 2.1691503837407287, "grad_norm": 0.25271686911582947, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.8320089776068926, "num_tokens": 570811006.0, "step": 28945 }, { "entropy": 0.513497007265687, "epoch": 2.1695251076745805, "grad_norm": 0.31407251954078674, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8236559960991144, "num_tokens": 572293776.0, "step": 28950 }, { "entropy": 0.5222557498142123, "epoch": 2.1698998316084324, "grad_norm": 0.3558840751647949, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8262258388102055, "num_tokens": 573797899.0, "step": 28955 }, { "entropy": 0.5334392814897001, "epoch": 2.170274555542284, "grad_norm": 0.21748384833335876, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.824284004792571, "num_tokens": 575297704.0, "step": 28960 }, { "entropy": 0.5232736688107252, "epoch": 2.170649279476136, "grad_norm": 0.26959049701690674, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.8250230938196182, "num_tokens": 576798184.0, "step": 28965 }, { "entropy": 0.5196683168411255, "epoch": 2.171024003409988, "grad_norm": 0.23809035122394562, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.8288755439221859, "num_tokens": 578265630.0, "step": 28970 }, { "entropy": 0.5160402780398726, "epoch": 2.1713987273438398, "grad_norm": 0.2297333925962448, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.8281448747962713, "num_tokens": 579737626.0, "step": 28975 }, { "entropy": 0.5084735898301005, "epoch": 2.1717734512776916, "grad_norm": 0.26043862104415894, "learning_rate": 0.0002, "loss": 0.5803, "mean_token_accuracy": 0.8322113592177629, "num_tokens": 581179102.0, "step": 28980 }, { "entropy": 0.5244524901732802, "epoch": 2.1721481752115435, "grad_norm": 0.2612295150756836, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.8279900778084993, "num_tokens": 582636276.0, "step": 28985 }, { "entropy": 0.5281188892200589, "epoch": 2.1725228991453953, "grad_norm": 0.23533830046653748, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.827902901917696, "num_tokens": 584094350.0, "step": 28990 }, { "entropy": 0.5406900929287076, "epoch": 2.172897623079247, "grad_norm": 0.2326522022485733, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.8235617749392986, "num_tokens": 585565039.0, "step": 28995 }, { "entropy": 0.541918714530766, "epoch": 2.173272347013099, "grad_norm": 0.22279725968837738, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.8276726920157671, "num_tokens": 587082748.0, "step": 29000 }, { "entropy": 0.5381030458956957, "epoch": 2.173647070946951, "grad_norm": 0.24473121762275696, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.8265484169125557, "num_tokens": 588525096.0, "step": 29005 }, { "entropy": 0.5543637406080961, "epoch": 2.1740217948808027, "grad_norm": 0.27933311462402344, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.8274026911705732, "num_tokens": 589980095.0, "step": 29010 }, { "entropy": 0.5600931905210018, "epoch": 2.1743965188146546, "grad_norm": 0.23844806849956512, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.828253461048007, "num_tokens": 591473340.0, "step": 29015 }, { "entropy": 0.5657776445150375, "epoch": 2.1747712427485064, "grad_norm": 0.3315119743347168, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.8267966467887163, "num_tokens": 592976144.0, "step": 29020 }, { "entropy": 0.5554273838177324, "epoch": 2.1751459666823583, "grad_norm": 0.2443714141845703, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8267251025885344, "num_tokens": 594479711.0, "step": 29025 }, { "entropy": 0.5327306754887104, "epoch": 2.17552069061621, "grad_norm": 0.2542884051799774, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.8325424708425999, "num_tokens": 595948364.0, "step": 29030 }, { "entropy": 0.5438854718580842, "epoch": 2.175895414550062, "grad_norm": 0.24397023022174835, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.8287234820425511, "num_tokens": 597439266.0, "step": 29035 }, { "entropy": 0.5480520732700824, "epoch": 2.176270138483914, "grad_norm": 0.23545806109905243, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.827089212089777, "num_tokens": 598925118.0, "step": 29040 }, { "entropy": 0.5410604123026133, "epoch": 2.1766448624177657, "grad_norm": 0.21860118210315704, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8261899176985026, "num_tokens": 600360208.0, "step": 29045 }, { "entropy": 0.5525198243558407, "epoch": 2.1770195863516175, "grad_norm": 0.2593172788619995, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.8246821135282516, "num_tokens": 601851783.0, "step": 29050 }, { "entropy": 0.5644030459225178, "epoch": 2.1773943102854694, "grad_norm": 0.2447737604379654, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.8243034701794386, "num_tokens": 603323815.0, "step": 29055 }, { "entropy": 0.5520163195207715, "epoch": 2.177769034219321, "grad_norm": 0.310577392578125, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8252256497740745, "num_tokens": 604820776.0, "step": 29060 }, { "entropy": 0.5472389021888375, "epoch": 2.178143758153173, "grad_norm": 0.2350524663925171, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.8287081647664308, "num_tokens": 606265705.0, "step": 29065 }, { "entropy": 0.5465124880895018, "epoch": 2.178518482087025, "grad_norm": 0.24956290423870087, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.8280153729021549, "num_tokens": 607735863.0, "step": 29070 }, { "entropy": 0.5410796975716948, "epoch": 2.1788932060208768, "grad_norm": 0.236702561378479, "learning_rate": 0.0002, "loss": 0.5876, "mean_token_accuracy": 0.8292175147682428, "num_tokens": 609201976.0, "step": 29075 }, { "entropy": 0.558919095993042, "epoch": 2.1792679299547286, "grad_norm": 0.34305477142333984, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.8245147746056318, "num_tokens": 610668721.0, "step": 29080 }, { "entropy": 0.5438186360523105, "epoch": 2.1796426538885805, "grad_norm": 0.234938383102417, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.8284165516495705, "num_tokens": 612164188.0, "step": 29085 }, { "entropy": 0.5434417212381959, "epoch": 2.1800173778224323, "grad_norm": 0.2509304881095886, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.8247433636337519, "num_tokens": 613634931.0, "step": 29090 }, { "entropy": 0.5493938390165567, "epoch": 2.180392101756284, "grad_norm": 0.27062875032424927, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8258849389851093, "num_tokens": 615123681.0, "step": 29095 }, { "entropy": 0.548328554071486, "epoch": 2.180766825690136, "grad_norm": 0.23709511756896973, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8301049184054137, "num_tokens": 616617477.0, "step": 29100 }, { "entropy": 0.5421425187960267, "epoch": 2.181141549623988, "grad_norm": 0.23836417496204376, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.8250112745910883, "num_tokens": 618068961.0, "step": 29105 }, { "entropy": 0.543283911421895, "epoch": 2.1815162735578397, "grad_norm": 0.29870596528053284, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.8302448198199273, "num_tokens": 619521554.0, "step": 29110 }, { "entropy": 0.5476827669888735, "epoch": 2.1818909974916916, "grad_norm": 0.2144894152879715, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.8269964352250099, "num_tokens": 621019013.0, "step": 29115 }, { "entropy": 0.5294319603592157, "epoch": 2.1822657214255434, "grad_norm": 0.25629565119743347, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.8274747170507908, "num_tokens": 622508376.0, "step": 29120 }, { "entropy": 0.5402454832568765, "epoch": 2.1826404453593953, "grad_norm": 0.2545648515224457, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8280529327690601, "num_tokens": 623910523.0, "step": 29125 }, { "entropy": 0.518861536681652, "epoch": 2.183015169293247, "grad_norm": 0.3023199737071991, "learning_rate": 0.0002, "loss": 0.5715, "mean_token_accuracy": 0.8358221843838691, "num_tokens": 625329956.0, "step": 29130 }, { "entropy": 0.5631886813789606, "epoch": 2.183389893227099, "grad_norm": 0.3697129786014557, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.823123748973012, "num_tokens": 626807946.0, "step": 29135 }, { "entropy": 0.5465556612238288, "epoch": 2.183764617160951, "grad_norm": 0.254582941532135, "learning_rate": 0.0002, "loss": 0.586, "mean_token_accuracy": 0.8293576683849097, "num_tokens": 628287405.0, "step": 29140 }, { "entropy": 0.5490739308297634, "epoch": 2.1841393410948027, "grad_norm": 0.24659115076065063, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.8305345814675092, "num_tokens": 629739883.0, "step": 29145 }, { "entropy": 0.5447013491764665, "epoch": 2.1845140650286545, "grad_norm": 0.2320152074098587, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.828000596538186, "num_tokens": 631204853.0, "step": 29150 }, { "entropy": 0.5503334240987897, "epoch": 2.1848887889625064, "grad_norm": 0.23687002062797546, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.8248417496681213, "num_tokens": 632640714.0, "step": 29155 }, { "entropy": 0.5490172928199172, "epoch": 2.1852635128963582, "grad_norm": 0.2549203038215637, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.8272845476865769, "num_tokens": 634114576.0, "step": 29160 }, { "entropy": 0.5551849884912372, "epoch": 2.18563823683021, "grad_norm": 0.22648705542087555, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.8232392381876708, "num_tokens": 635578837.0, "step": 29165 }, { "entropy": 0.5509480215609074, "epoch": 2.186012960764062, "grad_norm": 0.28713110089302063, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.823611356690526, "num_tokens": 637039606.0, "step": 29170 }, { "entropy": 0.5380149474367499, "epoch": 2.186387684697914, "grad_norm": 0.3326561748981476, "learning_rate": 0.0002, "loss": 0.6074, "mean_token_accuracy": 0.82648507617414, "num_tokens": 638509823.0, "step": 29175 }, { "entropy": 0.562365991435945, "epoch": 2.1867624086317656, "grad_norm": 0.2606699466705322, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.8201039921492338, "num_tokens": 640017047.0, "step": 29180 }, { "entropy": 0.54675273001194, "epoch": 2.1871371325656175, "grad_norm": 0.2696729302406311, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.8249459363520145, "num_tokens": 641498273.0, "step": 29185 }, { "entropy": 0.5566047798842192, "epoch": 2.1875118564994693, "grad_norm": 0.28539803624153137, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8230833377689123, "num_tokens": 642964808.0, "step": 29190 }, { "entropy": 0.5355172490701079, "epoch": 2.187886580433321, "grad_norm": 0.22805197536945343, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.8304251454770565, "num_tokens": 644415760.0, "step": 29195 }, { "entropy": 0.5431317023932933, "epoch": 2.188261304367173, "grad_norm": 0.22271621227264404, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.8243778593838215, "num_tokens": 645879767.0, "step": 29200 }, { "entropy": 0.5319228054955601, "epoch": 2.1886360283010253, "grad_norm": 0.4557146430015564, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.8288418669253588, "num_tokens": 647352374.0, "step": 29205 }, { "entropy": 0.5173623504117131, "epoch": 2.1890107522348767, "grad_norm": 0.28054359555244446, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.8278017655014992, "num_tokens": 648787412.0, "step": 29210 }, { "entropy": 0.5332159209996462, "epoch": 2.189385476168729, "grad_norm": 0.24710237979888916, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.8207726676017046, "num_tokens": 650250861.0, "step": 29215 }, { "entropy": 0.5153416873887181, "epoch": 2.189760200102581, "grad_norm": 0.268947958946228, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.8309697788208723, "num_tokens": 651666443.0, "step": 29220 }, { "entropy": 0.5316873896867037, "epoch": 2.1901349240364327, "grad_norm": 0.23179292678833008, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.8249154414981603, "num_tokens": 653147208.0, "step": 29225 }, { "entropy": 0.5256699671968817, "epoch": 2.1905096479702846, "grad_norm": 0.3626796007156372, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.8301457572728396, "num_tokens": 654591502.0, "step": 29230 }, { "entropy": 0.5434149631299079, "epoch": 2.1908843719041364, "grad_norm": 0.29222699999809265, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.825509712845087, "num_tokens": 656061385.0, "step": 29235 }, { "entropy": 0.5666409805417061, "epoch": 2.1912590958379883, "grad_norm": 0.24076500535011292, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.8262388162314892, "num_tokens": 657547393.0, "step": 29240 }, { "entropy": 0.5377978187054395, "epoch": 2.19163381977184, "grad_norm": 0.21404330432415009, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.8271483909338713, "num_tokens": 659058934.0, "step": 29245 }, { "entropy": 0.527491345256567, "epoch": 2.192008543705692, "grad_norm": 0.238024041056633, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.8273173961788416, "num_tokens": 660506494.0, "step": 29250 }, { "entropy": 0.5267242042347788, "epoch": 2.192383267639544, "grad_norm": 0.24105975031852722, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.8277970563620329, "num_tokens": 661972260.0, "step": 29255 }, { "entropy": 0.5322347551584243, "epoch": 2.1927579915733957, "grad_norm": 0.3760729432106018, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8233402855694294, "num_tokens": 663441806.0, "step": 29260 }, { "entropy": 0.5260056452825665, "epoch": 2.1931327155072475, "grad_norm": 0.23134107887744904, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.8269189700484276, "num_tokens": 664908076.0, "step": 29265 }, { "entropy": 0.5288573928177357, "epoch": 2.1935074394410994, "grad_norm": 0.7011323571205139, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.8273238927125931, "num_tokens": 666355925.0, "step": 29270 }, { "entropy": 0.5315614309161901, "epoch": 2.1938821633749512, "grad_norm": 0.22058206796646118, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.8296957641839982, "num_tokens": 667794577.0, "step": 29275 }, { "entropy": 0.5341566499322653, "epoch": 2.194256887308803, "grad_norm": 0.2368537187576294, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.827523748204112, "num_tokens": 669246462.0, "step": 29280 }, { "entropy": 0.5362162180244923, "epoch": 2.194631611242655, "grad_norm": 0.23966647684574127, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.8258816666901112, "num_tokens": 670773912.0, "step": 29285 }, { "entropy": 0.5325005248188972, "epoch": 2.195006335176507, "grad_norm": 0.2521001100540161, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.8275773741304875, "num_tokens": 672226840.0, "step": 29290 }, { "entropy": 0.5367296910844743, "epoch": 2.1953810591103586, "grad_norm": 0.2599581182003021, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.8294469580054283, "num_tokens": 673671688.0, "step": 29295 }, { "entropy": 0.5398028003051877, "epoch": 2.1957557830442105, "grad_norm": 0.2660042643547058, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8274650126695633, "num_tokens": 675123103.0, "step": 29300 }, { "entropy": 0.5296708021312952, "epoch": 2.1961305069780623, "grad_norm": 0.2510922849178314, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.8279217544943094, "num_tokens": 676613424.0, "step": 29305 }, { "entropy": 0.5283604010939598, "epoch": 2.196505230911914, "grad_norm": 0.3114733397960663, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.8278944887220859, "num_tokens": 678090782.0, "step": 29310 }, { "entropy": 0.5436442617326975, "epoch": 2.196879954845766, "grad_norm": 0.28554248809814453, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.8265901371836663, "num_tokens": 679563401.0, "step": 29315 }, { "entropy": 0.5338202580809593, "epoch": 2.197254678779618, "grad_norm": 0.29060450196266174, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8269827272742987, "num_tokens": 681021916.0, "step": 29320 }, { "entropy": 0.5401175815612078, "epoch": 2.1976294027134697, "grad_norm": 0.2882141172885895, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.8283580034971237, "num_tokens": 682479472.0, "step": 29325 }, { "entropy": 0.5500358667224645, "epoch": 2.1980041266473216, "grad_norm": 0.2498389035463333, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.8266871221363544, "num_tokens": 683944319.0, "step": 29330 }, { "entropy": 0.5483491225168109, "epoch": 2.1983788505811734, "grad_norm": 0.23870782554149628, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.8271277986466885, "num_tokens": 685407850.0, "step": 29335 }, { "entropy": 0.5445594495162368, "epoch": 2.1987535745150253, "grad_norm": 0.24657343327999115, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8292805593460798, "num_tokens": 686832210.0, "step": 29340 }, { "entropy": 0.5560399053618312, "epoch": 2.199128298448877, "grad_norm": 0.3735024034976959, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.8246846914291381, "num_tokens": 688321087.0, "step": 29345 }, { "entropy": 0.5766245359554887, "epoch": 2.199503022382729, "grad_norm": 0.24686764180660248, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.8208852723240853, "num_tokens": 689811739.0, "step": 29350 }, { "entropy": 0.5459514420479537, "epoch": 2.199877746316581, "grad_norm": 0.24363435804843903, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.8281732615083456, "num_tokens": 691273458.0, "step": 29355 }, { "entropy": 0.5502787908539176, "epoch": 2.2002524702504327, "grad_norm": 0.30663540959358215, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8251979697495699, "num_tokens": 692751526.0, "step": 29360 }, { "entropy": 0.5355699233710766, "epoch": 2.2006271941842845, "grad_norm": 0.2441694438457489, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.8302197385579347, "num_tokens": 694258113.0, "step": 29365 }, { "entropy": 0.5270497022196651, "epoch": 2.2010019181181364, "grad_norm": 0.2292497307062149, "learning_rate": 0.0002, "loss": 0.5747, "mean_token_accuracy": 0.8323084820061922, "num_tokens": 695754724.0, "step": 29370 }, { "entropy": 0.5296929303556681, "epoch": 2.2013766420519882, "grad_norm": 0.22331953048706055, "learning_rate": 0.0002, "loss": 0.5733, "mean_token_accuracy": 0.831391080096364, "num_tokens": 697200156.0, "step": 29375 }, { "entropy": 0.5563674392178655, "epoch": 2.20175136598584, "grad_norm": 0.3132978677749634, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.8222976315766573, "num_tokens": 698678957.0, "step": 29380 }, { "entropy": 0.5510443234816194, "epoch": 2.202126089919692, "grad_norm": 0.23952819406986237, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8227766282856465, "num_tokens": 700153174.0, "step": 29385 }, { "entropy": 0.5281240612268447, "epoch": 2.202500813853544, "grad_norm": 0.24341532588005066, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.8273822981864214, "num_tokens": 701629592.0, "step": 29390 }, { "entropy": 0.5460993051528931, "epoch": 2.2028755377873956, "grad_norm": 0.2413148283958435, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.8269878998398781, "num_tokens": 703146923.0, "step": 29395 }, { "entropy": 0.5415790002793074, "epoch": 2.2032502617212475, "grad_norm": 0.233343243598938, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8296374909579753, "num_tokens": 704597887.0, "step": 29400 }, { "entropy": 0.5483077261596918, "epoch": 2.2036249856550993, "grad_norm": 0.2626986801624298, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.8280101589858532, "num_tokens": 706085303.0, "step": 29405 }, { "entropy": 0.5579145602881909, "epoch": 2.203999709588951, "grad_norm": 0.24161297082901, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8201811842620372, "num_tokens": 707614186.0, "step": 29410 }, { "entropy": 0.5365854389965534, "epoch": 2.204374433522803, "grad_norm": 0.2919309437274933, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.8293185565620661, "num_tokens": 709090515.0, "step": 29415 }, { "entropy": 0.5380104817450047, "epoch": 2.204749157456655, "grad_norm": 0.22684651613235474, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.8268035400658846, "num_tokens": 710600700.0, "step": 29420 }, { "entropy": 0.536933486070484, "epoch": 2.2051238813905067, "grad_norm": 0.2723143398761749, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8284131448715926, "num_tokens": 712117036.0, "step": 29425 }, { "entropy": 0.5463985614478588, "epoch": 2.2054986053243586, "grad_norm": 0.2306266576051712, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.8261674687266349, "num_tokens": 713600564.0, "step": 29430 }, { "entropy": 0.5368195535615087, "epoch": 2.2058733292582104, "grad_norm": 0.25609126687049866, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.8279404871165752, "num_tokens": 715072907.0, "step": 29435 }, { "entropy": 0.542291846498847, "epoch": 2.2062480531920623, "grad_norm": 0.2682764530181885, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.8272953435778618, "num_tokens": 716541282.0, "step": 29440 }, { "entropy": 0.5330667601898312, "epoch": 2.206622777125914, "grad_norm": 0.2629755735397339, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.82897062972188, "num_tokens": 718023798.0, "step": 29445 }, { "entropy": 0.5410901308059692, "epoch": 2.206997501059766, "grad_norm": 0.23410409688949585, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.8240939557552338, "num_tokens": 719460910.0, "step": 29450 }, { "entropy": 0.5350570099428296, "epoch": 2.207372224993618, "grad_norm": 0.29972517490386963, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.8304869275540113, "num_tokens": 720908213.0, "step": 29455 }, { "entropy": 0.5406288973987102, "epoch": 2.2077469489274697, "grad_norm": 0.288918137550354, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.8283060330897569, "num_tokens": 722366609.0, "step": 29460 }, { "entropy": 0.5410604787990451, "epoch": 2.2081216728613215, "grad_norm": 0.25331974029541016, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.8251708827912807, "num_tokens": 723851766.0, "step": 29465 }, { "entropy": 0.5427046282216906, "epoch": 2.2084963967951734, "grad_norm": 0.2942405343055725, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8256516367197037, "num_tokens": 725306968.0, "step": 29470 }, { "entropy": 0.5548179453238845, "epoch": 2.2088711207290253, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8247608676552772, "num_tokens": 726774181.0, "step": 29475 }, { "entropy": 0.5514738438650966, "epoch": 2.209245844662877, "grad_norm": 0.22692212462425232, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.8244105331599713, "num_tokens": 728274796.0, "step": 29480 }, { "entropy": 0.5475392315536738, "epoch": 2.209620568596729, "grad_norm": 0.23642566800117493, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.8294642247259617, "num_tokens": 729739174.0, "step": 29485 }, { "entropy": 0.5583768632262945, "epoch": 2.209995292530581, "grad_norm": 0.2326965630054474, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.828818815946579, "num_tokens": 731237576.0, "step": 29490 }, { "entropy": 0.5485259039327502, "epoch": 2.2103700164644327, "grad_norm": 0.22473357617855072, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.8284360993653536, "num_tokens": 732685736.0, "step": 29495 }, { "entropy": 0.5766409756615758, "epoch": 2.2107447403982845, "grad_norm": 0.25056731700897217, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.8253389950841665, "num_tokens": 734196822.0, "step": 29500 }, { "entropy": 0.5658443242311477, "epoch": 2.2111194643321364, "grad_norm": 0.2292793244123459, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8275316406041384, "num_tokens": 735705632.0, "step": 29505 }, { "entropy": 0.5552662631496788, "epoch": 2.2114941882659886, "grad_norm": 0.23808175325393677, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.828274380788207, "num_tokens": 737141262.0, "step": 29510 }, { "entropy": 0.5740658953785897, "epoch": 2.21186891219984, "grad_norm": 0.27626875042915344, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.824902455508709, "num_tokens": 738603498.0, "step": 29515 }, { "entropy": 0.5597456779330969, "epoch": 2.2122436361336923, "grad_norm": 0.2123783826828003, "learning_rate": 0.0002, "loss": 0.5904, "mean_token_accuracy": 0.8273885075002909, "num_tokens": 740056142.0, "step": 29520 }, { "entropy": 0.5573654171079397, "epoch": 2.2126183600675438, "grad_norm": 0.3157748878002167, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.8266442347317934, "num_tokens": 741554670.0, "step": 29525 }, { "entropy": 0.5655892726033926, "epoch": 2.212993084001396, "grad_norm": 0.2299482524394989, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.8288362540304661, "num_tokens": 743001602.0, "step": 29530 }, { "entropy": 0.559451486542821, "epoch": 2.213367807935248, "grad_norm": 0.27008056640625, "learning_rate": 0.0002, "loss": 0.5761, "mean_token_accuracy": 0.8329408243298531, "num_tokens": 744447524.0, "step": 29535 }, { "entropy": 0.5761143984273076, "epoch": 2.2137425318690997, "grad_norm": 0.24266114830970764, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8258761122822762, "num_tokens": 745925806.0, "step": 29540 }, { "entropy": 0.5466820130124688, "epoch": 2.2141172558029516, "grad_norm": 0.23704443871974945, "learning_rate": 0.0002, "loss": 0.5622, "mean_token_accuracy": 0.8366920206695795, "num_tokens": 747403171.0, "step": 29545 }, { "entropy": 0.567573575861752, "epoch": 2.2144919797368035, "grad_norm": 0.23378536105155945, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.826365714147687, "num_tokens": 748904156.0, "step": 29550 }, { "entropy": 0.5550396252423525, "epoch": 2.2148667036706553, "grad_norm": 0.2358839213848114, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.8304952319711447, "num_tokens": 750375590.0, "step": 29555 }, { "entropy": 0.5628495821729302, "epoch": 2.215241427604507, "grad_norm": 0.2574308514595032, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.827613615244627, "num_tokens": 751874461.0, "step": 29560 }, { "entropy": 0.5580152310431004, "epoch": 2.215616151538359, "grad_norm": 0.2887294888496399, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.8262633711099625, "num_tokens": 753347506.0, "step": 29565 }, { "entropy": 0.5488748325034976, "epoch": 2.215990875472211, "grad_norm": 0.24854767322540283, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.8277531046420336, "num_tokens": 754851999.0, "step": 29570 }, { "entropy": 0.5632142871618271, "epoch": 2.2163655994060627, "grad_norm": 0.42761898040771484, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.8282023817300797, "num_tokens": 756291599.0, "step": 29575 }, { "entropy": 0.5748666033148766, "epoch": 2.2167403233399146, "grad_norm": 0.24911749362945557, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.8242589507251978, "num_tokens": 757804253.0, "step": 29580 }, { "entropy": 0.5851305413991212, "epoch": 2.2171150472737664, "grad_norm": 0.2269563227891922, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.8216980572789907, "num_tokens": 759266791.0, "step": 29585 }, { "entropy": 0.5926659870892763, "epoch": 2.2174897712076183, "grad_norm": 0.2506222426891327, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.8255448527634144, "num_tokens": 760738644.0, "step": 29590 }, { "entropy": 0.5690154070034623, "epoch": 2.21786449514147, "grad_norm": 0.23259685933589935, "learning_rate": 0.0002, "loss": 0.5866, "mean_token_accuracy": 0.8266915321350098, "num_tokens": 762191916.0, "step": 29595 }, { "entropy": 0.5763735549524427, "epoch": 2.218239219075322, "grad_norm": 0.22048968076705933, "learning_rate": 0.0002, "loss": 0.5984, "mean_token_accuracy": 0.8267841570079326, "num_tokens": 763643807.0, "step": 29600 }, { "entropy": 0.5863795522600412, "epoch": 2.218613943009174, "grad_norm": 0.249283567070961, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.8246434804052114, "num_tokens": 765092865.0, "step": 29605 }, { "entropy": 0.583566771633923, "epoch": 2.2189886669430257, "grad_norm": 0.23541045188903809, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.825519110634923, "num_tokens": 766588725.0, "step": 29610 }, { "entropy": 0.5791357116773724, "epoch": 2.2193633908768775, "grad_norm": 0.24322834610939026, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.8262923043221235, "num_tokens": 768073979.0, "step": 29615 }, { "entropy": 0.5753343177959322, "epoch": 2.2197381148107294, "grad_norm": 0.24131785333156586, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.8290462229400873, "num_tokens": 769511635.0, "step": 29620 }, { "entropy": 0.5655429013073444, "epoch": 2.220112838744581, "grad_norm": 0.22760501503944397, "learning_rate": 0.0002, "loss": 0.5798, "mean_token_accuracy": 0.8316728502511979, "num_tokens": 770947089.0, "step": 29625 }, { "entropy": 0.570973395742476, "epoch": 2.220487562678433, "grad_norm": 0.24794068932533264, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.8280722521245479, "num_tokens": 772455840.0, "step": 29630 }, { "entropy": 0.5875348713248968, "epoch": 2.220862286612285, "grad_norm": 0.23168380558490753, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.830159941315651, "num_tokens": 773922921.0, "step": 29635 }, { "entropy": 0.5837575735524296, "epoch": 2.2212370105461368, "grad_norm": 0.27139797806739807, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.8288096148520708, "num_tokens": 775393957.0, "step": 29640 }, { "entropy": 0.5591001296415925, "epoch": 2.2216117344799886, "grad_norm": 0.23480601608753204, "learning_rate": 0.0002, "loss": 0.5725, "mean_token_accuracy": 0.8344740729779005, "num_tokens": 776836685.0, "step": 29645 }, { "entropy": 0.5541544489562511, "epoch": 2.2219864584138405, "grad_norm": 0.2483888864517212, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.8311452768743038, "num_tokens": 778213481.0, "step": 29650 }, { "entropy": 0.5699593652039766, "epoch": 2.2223611823476923, "grad_norm": 0.24015220999717712, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8266005080193282, "num_tokens": 779706940.0, "step": 29655 }, { "entropy": 0.5692623823881149, "epoch": 2.222735906281544, "grad_norm": 0.2949755787849426, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.8275459550321103, "num_tokens": 781185376.0, "step": 29660 }, { "entropy": 0.565940305031836, "epoch": 2.223110630215396, "grad_norm": 0.26582011580467224, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.8291928838938475, "num_tokens": 782649054.0, "step": 29665 }, { "entropy": 0.5661148807033897, "epoch": 2.223485354149248, "grad_norm": 0.25631800293922424, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.8256242386996746, "num_tokens": 784097599.0, "step": 29670 }, { "entropy": 0.5759626276791096, "epoch": 2.2238600780830997, "grad_norm": 0.3182002902030945, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.8271537628024817, "num_tokens": 785548207.0, "step": 29675 }, { "entropy": 0.5793337270617485, "epoch": 2.2242348020169516, "grad_norm": 0.29525306820869446, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.8289748728275299, "num_tokens": 787011738.0, "step": 29680 }, { "entropy": 0.5791856583207846, "epoch": 2.2246095259508034, "grad_norm": 0.23046554625034332, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.8288672886788845, "num_tokens": 788496787.0, "step": 29685 }, { "entropy": 0.5851228168234229, "epoch": 2.2249842498846553, "grad_norm": 0.3595074713230133, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.8278138607740402, "num_tokens": 789992200.0, "step": 29690 }, { "entropy": 0.5828134147450328, "epoch": 2.225358973818507, "grad_norm": 0.24137002229690552, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.8263357579708099, "num_tokens": 791472457.0, "step": 29695 }, { "entropy": 0.552247516065836, "epoch": 2.225733697752359, "grad_norm": 0.28385260701179504, "learning_rate": 0.0002, "loss": 0.5712, "mean_token_accuracy": 0.8336217060685158, "num_tokens": 792936069.0, "step": 29700 }, { "entropy": 0.5791467253118754, "epoch": 2.226108421686211, "grad_norm": 0.261081337928772, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.8240511264652014, "num_tokens": 794392195.0, "step": 29705 }, { "entropy": 0.5702958276495338, "epoch": 2.2264831456200627, "grad_norm": 0.28278884291648865, "learning_rate": 0.0002, "loss": 0.5801, "mean_token_accuracy": 0.8301220390945673, "num_tokens": 795862408.0, "step": 29710 }, { "entropy": 0.5737975621595979, "epoch": 2.2268578695539145, "grad_norm": 0.26741552352905273, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8271347176283598, "num_tokens": 797314631.0, "step": 29715 }, { "entropy": 0.5549812860786915, "epoch": 2.2272325934877664, "grad_norm": 0.34754708409309387, "learning_rate": 0.0002, "loss": 0.586, "mean_token_accuracy": 0.8311603374779224, "num_tokens": 798772877.0, "step": 29720 }, { "entropy": 0.5711937068030238, "epoch": 2.227607317421618, "grad_norm": 0.23538506031036377, "learning_rate": 0.0002, "loss": 0.5897, "mean_token_accuracy": 0.8258208516985178, "num_tokens": 800238196.0, "step": 29725 }, { "entropy": 0.5608920373022557, "epoch": 2.22798204135547, "grad_norm": 0.2596566677093506, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.8292876794934273, "num_tokens": 801728393.0, "step": 29730 }, { "entropy": 0.5725503388792277, "epoch": 2.228356765289322, "grad_norm": 0.2910402715206146, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.8275371383875608, "num_tokens": 803182071.0, "step": 29735 }, { "entropy": 0.5665801629424095, "epoch": 2.2287314892231738, "grad_norm": 0.2574111819267273, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.8244276605546474, "num_tokens": 804609797.0, "step": 29740 }, { "entropy": 0.5842675069347024, "epoch": 2.2291062131570256, "grad_norm": 0.2436477243900299, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8248929399996996, "num_tokens": 806120259.0, "step": 29745 }, { "entropy": 0.5766727350652218, "epoch": 2.2294809370908775, "grad_norm": 0.2618686556816101, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.8270966004580259, "num_tokens": 807590364.0, "step": 29750 }, { "entropy": 0.5627671649679542, "epoch": 2.2298556610247293, "grad_norm": 0.2505953311920166, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.8296556796878576, "num_tokens": 809070028.0, "step": 29755 }, { "entropy": 0.579799297451973, "epoch": 2.230230384958581, "grad_norm": 0.3510667085647583, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8281482305377722, "num_tokens": 810550014.0, "step": 29760 }, { "entropy": 0.5757378343492746, "epoch": 2.230605108892433, "grad_norm": 0.32681700587272644, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.8233568266034126, "num_tokens": 812014943.0, "step": 29765 }, { "entropy": 0.5562997186556459, "epoch": 2.230979832826285, "grad_norm": 0.26814600825309753, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.8284334000200033, "num_tokens": 813486256.0, "step": 29770 }, { "entropy": 0.5626898027956486, "epoch": 2.2313545567601367, "grad_norm": 0.3881663680076599, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.827724463865161, "num_tokens": 814966815.0, "step": 29775 }, { "entropy": 0.5634942954406142, "epoch": 2.2317292806939886, "grad_norm": 0.2394987940788269, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.8241786323487759, "num_tokens": 816423246.0, "step": 29780 }, { "entropy": 0.5695645652711392, "epoch": 2.2321040046278404, "grad_norm": 0.24611899256706238, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.8286453451961279, "num_tokens": 817886584.0, "step": 29785 }, { "entropy": 0.585136223398149, "epoch": 2.2324787285616923, "grad_norm": 0.24374781548976898, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.8264318026602269, "num_tokens": 819380167.0, "step": 29790 }, { "entropy": 0.5670833844691515, "epoch": 2.232853452495544, "grad_norm": 0.23797333240509033, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.8266512479633092, "num_tokens": 820885084.0, "step": 29795 }, { "entropy": 0.5749098662286997, "epoch": 2.233228176429396, "grad_norm": 0.2442825436592102, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.8263152725994587, "num_tokens": 822371935.0, "step": 29800 }, { "entropy": 0.5513798261992633, "epoch": 2.233602900363248, "grad_norm": 0.24669525027275085, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.8313356429338455, "num_tokens": 823841341.0, "step": 29805 }, { "entropy": 0.5646902773529291, "epoch": 2.2339776242970997, "grad_norm": 0.26367923617362976, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8279421590268612, "num_tokens": 825292243.0, "step": 29810 }, { "entropy": 0.5558692893013358, "epoch": 2.2343523482309515, "grad_norm": 0.33796244859695435, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.8232785388827324, "num_tokens": 826799717.0, "step": 29815 }, { "entropy": 0.5607697559520602, "epoch": 2.2347270721648034, "grad_norm": 0.26048943400382996, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8222147595137358, "num_tokens": 828322451.0, "step": 29820 }, { "entropy": 0.5543203761801123, "epoch": 2.2351017960986557, "grad_norm": 0.34532272815704346, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.8273830715566873, "num_tokens": 829804132.0, "step": 29825 }, { "entropy": 0.5367040472105146, "epoch": 2.235476520032507, "grad_norm": 0.24132254719734192, "learning_rate": 0.0002, "loss": 0.582, "mean_token_accuracy": 0.8307726014405489, "num_tokens": 831270225.0, "step": 29830 }, { "entropy": 0.5565998826175929, "epoch": 2.2358512439663594, "grad_norm": 0.230245903134346, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.8229469634592533, "num_tokens": 832735597.0, "step": 29835 }, { "entropy": 0.557695990242064, "epoch": 2.236225967900211, "grad_norm": 0.2842468321323395, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.8291021671146155, "num_tokens": 834221704.0, "step": 29840 }, { "entropy": 0.5670332940295338, "epoch": 2.236600691834063, "grad_norm": 0.24606138467788696, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.8316508930176496, "num_tokens": 835725768.0, "step": 29845 }, { "entropy": 0.576038703136146, "epoch": 2.236975415767915, "grad_norm": 0.4123401641845703, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.8237747378647328, "num_tokens": 837198394.0, "step": 29850 }, { "entropy": 0.5676457023248076, "epoch": 2.2373501397017668, "grad_norm": 0.24167288839817047, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.8280981253832579, "num_tokens": 838673747.0, "step": 29855 }, { "entropy": 0.5635495016351342, "epoch": 2.2377248636356186, "grad_norm": 0.263311505317688, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.8330185879021883, "num_tokens": 840115087.0, "step": 29860 }, { "entropy": 0.5668978555127978, "epoch": 2.2380995875694705, "grad_norm": 0.2322802096605301, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.8280794616788626, "num_tokens": 841602438.0, "step": 29865 }, { "entropy": 0.5723569994792342, "epoch": 2.2384743115033223, "grad_norm": 0.25107160210609436, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.8275658100843429, "num_tokens": 843058033.0, "step": 29870 }, { "entropy": 0.5624005561694503, "epoch": 2.238849035437174, "grad_norm": 0.2900739312171936, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.8281900499016046, "num_tokens": 844514139.0, "step": 29875 }, { "entropy": 0.5588383371010422, "epoch": 2.239223759371026, "grad_norm": 0.22345396876335144, "learning_rate": 0.0002, "loss": 0.5825, "mean_token_accuracy": 0.8281366165727377, "num_tokens": 846029568.0, "step": 29880 }, { "entropy": 0.5600601607933641, "epoch": 2.239598483304878, "grad_norm": 0.2386794239282608, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.8288665656000376, "num_tokens": 847473548.0, "step": 29885 }, { "entropy": 0.574822410568595, "epoch": 2.2399732072387297, "grad_norm": 0.24668635427951813, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.8231969445943832, "num_tokens": 848900614.0, "step": 29890 }, { "entropy": 0.5706438310444355, "epoch": 2.2403479311725816, "grad_norm": 0.23213981091976166, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8250727895647287, "num_tokens": 850356108.0, "step": 29895 }, { "entropy": 0.5640717821195722, "epoch": 2.2407226551064334, "grad_norm": 0.29919546842575073, "learning_rate": 0.0002, "loss": 0.5866, "mean_token_accuracy": 0.8303822986781597, "num_tokens": 851815457.0, "step": 29900 }, { "entropy": 0.5713424786925316, "epoch": 2.2410973790402853, "grad_norm": 0.24078020453453064, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.825006277859211, "num_tokens": 853314604.0, "step": 29905 }, { "entropy": 0.5647733287885786, "epoch": 2.241472102974137, "grad_norm": 0.25405997037887573, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.8264350194483996, "num_tokens": 854789814.0, "step": 29910 }, { "entropy": 0.5740187091752886, "epoch": 2.241846826907989, "grad_norm": 0.390239953994751, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.8249683123081922, "num_tokens": 856257309.0, "step": 29915 }, { "entropy": 0.5447427448816597, "epoch": 2.242221550841841, "grad_norm": 0.3321515917778015, "learning_rate": 0.0002, "loss": 0.5816, "mean_token_accuracy": 0.8313568394631148, "num_tokens": 857702333.0, "step": 29920 }, { "entropy": 0.5491242675110698, "epoch": 2.2425962747756927, "grad_norm": 0.30517688393592834, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.8290664289146662, "num_tokens": 859133611.0, "step": 29925 }, { "entropy": 0.5562342792749405, "epoch": 2.2429709987095445, "grad_norm": 0.272686243057251, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.8279590524733067, "num_tokens": 860586731.0, "step": 29930 }, { "entropy": 0.5682722192257643, "epoch": 2.2433457226433964, "grad_norm": 0.24725958704948425, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8217222765088081, "num_tokens": 862094573.0, "step": 29935 }, { "entropy": 0.5681687522679567, "epoch": 2.2437204465772482, "grad_norm": 0.2455282062292099, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.8240564178675414, "num_tokens": 863565102.0, "step": 29940 }, { "entropy": 0.5718919796869159, "epoch": 2.2440951705111, "grad_norm": 0.27659347653388977, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.8239245392382145, "num_tokens": 865085780.0, "step": 29945 }, { "entropy": 0.562154751457274, "epoch": 2.244469894444952, "grad_norm": 0.22526980936527252, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8255276683717966, "num_tokens": 866582109.0, "step": 29950 }, { "entropy": 0.5529136821627617, "epoch": 2.244844618378804, "grad_norm": 0.410190224647522, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.8264068584889174, "num_tokens": 868042627.0, "step": 29955 }, { "entropy": 0.5460242465138435, "epoch": 2.2452193423126556, "grad_norm": 0.2802013158798218, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.8318824540823698, "num_tokens": 869503205.0, "step": 29960 }, { "entropy": 0.5487469246610999, "epoch": 2.2455940662465075, "grad_norm": 0.2158295065164566, "learning_rate": 0.0002, "loss": 0.5721, "mean_token_accuracy": 0.8332658126950264, "num_tokens": 870920539.0, "step": 29965 }, { "entropy": 0.5582254065200687, "epoch": 2.2459687901803593, "grad_norm": 0.24628283083438873, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.8283437836915255, "num_tokens": 872388347.0, "step": 29970 }, { "entropy": 0.5618296937085688, "epoch": 2.246343514114211, "grad_norm": 0.22219213843345642, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.8268006145954132, "num_tokens": 873871026.0, "step": 29975 }, { "entropy": 0.5729801677167415, "epoch": 2.246718238048063, "grad_norm": 0.24551072716712952, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.825959886983037, "num_tokens": 875365620.0, "step": 29980 }, { "entropy": 0.5569476947188378, "epoch": 2.247092961981915, "grad_norm": 0.29094570875167847, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.8289563901722431, "num_tokens": 876842734.0, "step": 29985 }, { "entropy": 0.5600274095311761, "epoch": 2.2474676859157667, "grad_norm": 0.24333463609218597, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.8265852525830268, "num_tokens": 878322994.0, "step": 29990 }, { "entropy": 0.5582928163930774, "epoch": 2.2478424098496186, "grad_norm": 0.2635142505168915, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8264009047299623, "num_tokens": 879773471.0, "step": 29995 }, { "entropy": 0.5570537758991122, "epoch": 2.2482171337834704, "grad_norm": 0.5981279611587524, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.825417060777545, "num_tokens": 881294635.0, "step": 30000 }, { "entropy": 0.5544997630640864, "epoch": 2.2485918577173223, "grad_norm": 0.23055341839790344, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.8294340703636408, "num_tokens": 882761868.0, "step": 30005 }, { "entropy": 0.547325386852026, "epoch": 2.248966581651174, "grad_norm": 0.23522116243839264, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.8310263212770224, "num_tokens": 884234963.0, "step": 30010 }, { "entropy": 0.5663078587502242, "epoch": 2.249341305585026, "grad_norm": 0.24829630553722382, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.8270972691476345, "num_tokens": 885679544.0, "step": 30015 }, { "entropy": 0.5563201572746038, "epoch": 2.249716029518878, "grad_norm": 0.24653874337673187, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.8257892198860646, "num_tokens": 887155470.0, "step": 30020 }, { "entropy": 0.5568827578797937, "epoch": 2.2500907534527297, "grad_norm": 0.2255827933549881, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.827275262773037, "num_tokens": 888591696.0, "step": 30025 }, { "entropy": 0.5601636979728937, "epoch": 2.2504654773865815, "grad_norm": 0.6379826068878174, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.8284722372889519, "num_tokens": 890086213.0, "step": 30030 }, { "entropy": 0.5499820787459612, "epoch": 2.2508402013204334, "grad_norm": 0.23478315770626068, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.8320409346371889, "num_tokens": 891528340.0, "step": 30035 }, { "entropy": 0.5521613992750645, "epoch": 2.2512149252542852, "grad_norm": 0.24320316314697266, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.8247039936482906, "num_tokens": 893029110.0, "step": 30040 }, { "entropy": 0.5341022352688014, "epoch": 2.251589649188137, "grad_norm": 0.2335442751646042, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.830832102522254, "num_tokens": 894492614.0, "step": 30045 }, { "entropy": 0.5474144430831075, "epoch": 2.251964373121989, "grad_norm": 0.21617868542671204, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.8257016450166702, "num_tokens": 895976685.0, "step": 30050 }, { "entropy": 0.5486366575583815, "epoch": 2.252339097055841, "grad_norm": 0.2809966206550598, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.82654033228755, "num_tokens": 897437888.0, "step": 30055 }, { "entropy": 0.5523547034710645, "epoch": 2.2527138209896926, "grad_norm": 0.2421993613243103, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.8291143957525492, "num_tokens": 898892700.0, "step": 30060 }, { "entropy": 0.5607435014098883, "epoch": 2.2530885449235445, "grad_norm": 0.38659316301345825, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.830182333663106, "num_tokens": 900325970.0, "step": 30065 }, { "entropy": 0.5687054893001914, "epoch": 2.2534632688573963, "grad_norm": 0.32523974776268005, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.8267391588538885, "num_tokens": 901791126.0, "step": 30070 }, { "entropy": 0.5608272146433592, "epoch": 2.253837992791248, "grad_norm": 0.42239856719970703, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.8300007876008749, "num_tokens": 903226367.0, "step": 30075 }, { "entropy": 0.5525570107623935, "epoch": 2.2542127167251, "grad_norm": 0.3192571997642517, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.8304305754601955, "num_tokens": 904696153.0, "step": 30080 }, { "entropy": 0.5632004326209425, "epoch": 2.254587440658952, "grad_norm": 0.249164879322052, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.8291204057633876, "num_tokens": 906186234.0, "step": 30085 }, { "entropy": 0.5744803119450808, "epoch": 2.2549621645928037, "grad_norm": 0.231892928481102, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.824311189725995, "num_tokens": 907674625.0, "step": 30090 }, { "entropy": 0.5681999480351806, "epoch": 2.2553368885266556, "grad_norm": 0.2375902384519577, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8270037293434143, "num_tokens": 909185798.0, "step": 30095 }, { "entropy": 0.55904445592314, "epoch": 2.2557116124605074, "grad_norm": 0.315090537071228, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.8301916059106589, "num_tokens": 910627771.0, "step": 30100 }, { "entropy": 0.5735155742615461, "epoch": 2.2560863363943593, "grad_norm": 0.2501789927482605, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.8239176504313945, "num_tokens": 912093940.0, "step": 30105 }, { "entropy": 0.5533436437137425, "epoch": 2.256461060328211, "grad_norm": 0.2419077754020691, "learning_rate": 0.0002, "loss": 0.5852, "mean_token_accuracy": 0.8294026739895344, "num_tokens": 913567303.0, "step": 30110 }, { "entropy": 0.5621696664020419, "epoch": 2.256835784262063, "grad_norm": 0.2205413430929184, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.8319023191928864, "num_tokens": 915125924.0, "step": 30115 }, { "entropy": 0.5749652558937669, "epoch": 2.2572105081959153, "grad_norm": 0.3625737130641937, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.8257653001695872, "num_tokens": 916616799.0, "step": 30120 }, { "entropy": 0.5717734588310123, "epoch": 2.2575852321297667, "grad_norm": 0.2416422963142395, "learning_rate": 0.0002, "loss": 0.5968, "mean_token_accuracy": 0.8285190861672163, "num_tokens": 918115376.0, "step": 30125 }, { "entropy": 0.5730351947247982, "epoch": 2.257959956063619, "grad_norm": 0.23735474050045013, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.8280622392892838, "num_tokens": 919537660.0, "step": 30130 }, { "entropy": 0.5761398866772651, "epoch": 2.2583346799974704, "grad_norm": 0.37636327743530273, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.828460893407464, "num_tokens": 920985717.0, "step": 30135 }, { "entropy": 0.559117041900754, "epoch": 2.2587094039313227, "grad_norm": 0.3499390482902527, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.8289554495364427, "num_tokens": 922425744.0, "step": 30140 }, { "entropy": 0.5743341632187366, "epoch": 2.259084127865174, "grad_norm": 0.26768767833709717, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.8278420679271221, "num_tokens": 923949208.0, "step": 30145 }, { "entropy": 0.5647026395425201, "epoch": 2.2594588517990264, "grad_norm": 0.23238696157932281, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.8294714357703924, "num_tokens": 925436505.0, "step": 30150 }, { "entropy": 0.5827602833509445, "epoch": 2.259833575732878, "grad_norm": 0.24548734724521637, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8264646217226982, "num_tokens": 926898841.0, "step": 30155 }, { "entropy": 0.5782284529879689, "epoch": 2.26020829966673, "grad_norm": 0.29332539439201355, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.8276435419917106, "num_tokens": 928369757.0, "step": 30160 }, { "entropy": 0.5796103037893772, "epoch": 2.260583023600582, "grad_norm": 0.2665368318557739, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.8266089331358671, "num_tokens": 929825693.0, "step": 30165 }, { "entropy": 0.5979709697887301, "epoch": 2.260957747534434, "grad_norm": 0.2708618640899658, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8228403374552726, "num_tokens": 931342693.0, "step": 30170 }, { "entropy": 0.5951769929379225, "epoch": 2.2613324714682856, "grad_norm": 0.26140591502189636, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.8284271284937859, "num_tokens": 932825592.0, "step": 30175 }, { "entropy": 0.6042393049225211, "epoch": 2.2617071954021375, "grad_norm": 0.26833653450012207, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8247043579816818, "num_tokens": 934314276.0, "step": 30180 }, { "entropy": 0.604105906188488, "epoch": 2.2620819193359893, "grad_norm": 0.29385706782341003, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.8261686276644469, "num_tokens": 935775085.0, "step": 30185 }, { "entropy": 0.6152703328058123, "epoch": 2.262456643269841, "grad_norm": 0.2460724413394928, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8265478275716305, "num_tokens": 937264799.0, "step": 30190 }, { "entropy": 0.5847141237929463, "epoch": 2.262831367203693, "grad_norm": 0.30757972598075867, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.8299323234707117, "num_tokens": 938724809.0, "step": 30195 }, { "entropy": 0.5830584226176143, "epoch": 2.263206091137545, "grad_norm": 0.28493642807006836, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.8317073158919811, "num_tokens": 940170782.0, "step": 30200 }, { "entropy": 0.6071354804560543, "epoch": 2.2635808150713967, "grad_norm": 0.2600284814834595, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.8265117526054382, "num_tokens": 941609966.0, "step": 30205 }, { "entropy": 0.6002450684085489, "epoch": 2.2639555390052486, "grad_norm": 0.23882567882537842, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.8328992646187544, "num_tokens": 943051518.0, "step": 30210 }, { "entropy": 0.5885534327477216, "epoch": 2.2643302629391004, "grad_norm": 0.23399794101715088, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.831918365880847, "num_tokens": 944464671.0, "step": 30215 }, { "entropy": 0.5925717094913125, "epoch": 2.2647049868729523, "grad_norm": 0.2355065941810608, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.8273440599441528, "num_tokens": 945969561.0, "step": 30220 }, { "entropy": 0.5863165467977524, "epoch": 2.265079710806804, "grad_norm": 0.2633466124534607, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.8299812838435173, "num_tokens": 947419382.0, "step": 30225 }, { "entropy": 0.5841702282428741, "epoch": 2.265454434740656, "grad_norm": 0.22793762385845184, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.8288929849863053, "num_tokens": 948931890.0, "step": 30230 }, { "entropy": 0.5846249308437109, "epoch": 2.265829158674508, "grad_norm": 0.21593379974365234, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.8274924788624048, "num_tokens": 950424038.0, "step": 30235 }, { "entropy": 0.5810654159635306, "epoch": 2.2662038826083597, "grad_norm": 0.23802796006202698, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.831444101780653, "num_tokens": 951897725.0, "step": 30240 }, { "entropy": 0.5856218721717596, "epoch": 2.2665786065422115, "grad_norm": 0.2797088921070099, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.8272011756896973, "num_tokens": 953368452.0, "step": 30245 }, { "entropy": 0.5917748445644975, "epoch": 2.2669533304760634, "grad_norm": 0.2350783795118332, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.8272908162325621, "num_tokens": 954842931.0, "step": 30250 }, { "entropy": 0.5826430236920714, "epoch": 2.2673280544099153, "grad_norm": 0.27672064304351807, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.8261896498501301, "num_tokens": 956308293.0, "step": 30255 }, { "entropy": 0.5825035968795419, "epoch": 2.267702778343767, "grad_norm": 0.22251389920711517, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.8284301299601793, "num_tokens": 957789962.0, "step": 30260 }, { "entropy": 0.5953902538865805, "epoch": 2.268077502277619, "grad_norm": 0.23899874091148376, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.8278988908976317, "num_tokens": 959212772.0, "step": 30265 }, { "entropy": 0.5880041792988777, "epoch": 2.268452226211471, "grad_norm": 0.23681123554706573, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8264632936567068, "num_tokens": 960671669.0, "step": 30270 }, { "entropy": 0.5909073682501912, "epoch": 2.2688269501453227, "grad_norm": 0.22781150043010712, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8269750986248254, "num_tokens": 962072568.0, "step": 30275 }, { "entropy": 0.5783312320709229, "epoch": 2.2692016740791745, "grad_norm": 0.26276111602783203, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8278118900954723, "num_tokens": 963524078.0, "step": 30280 }, { "entropy": 0.5874262269586324, "epoch": 2.2695763980130264, "grad_norm": 0.25754866003990173, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.8285910364240407, "num_tokens": 964985747.0, "step": 30285 }, { "entropy": 0.5824828801676631, "epoch": 2.269951121946878, "grad_norm": 0.23885414004325867, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.8289552804082632, "num_tokens": 966432345.0, "step": 30290 }, { "entropy": 0.5755215402692556, "epoch": 2.27032584588073, "grad_norm": 0.2613891661167145, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.8286705136299133, "num_tokens": 967928186.0, "step": 30295 }, { "entropy": 0.5952414881438017, "epoch": 2.270700569814582, "grad_norm": 0.25969138741493225, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.8215052042156458, "num_tokens": 969430036.0, "step": 30300 }, { "entropy": 0.5639399589970708, "epoch": 2.2710752937484338, "grad_norm": 0.38668087124824524, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.8286597050726414, "num_tokens": 970948382.0, "step": 30305 }, { "entropy": 0.56746124625206, "epoch": 2.2714500176822856, "grad_norm": 0.2399868220090866, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.8244974769651889, "num_tokens": 972420928.0, "step": 30310 }, { "entropy": 0.5601017998531461, "epoch": 2.2718247416161375, "grad_norm": 0.2734539210796356, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.8327671200037002, "num_tokens": 973887129.0, "step": 30315 }, { "entropy": 0.5632558835670352, "epoch": 2.2721994655499893, "grad_norm": 0.222214937210083, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.8304458007216453, "num_tokens": 975366305.0, "step": 30320 }, { "entropy": 0.5767917942255736, "epoch": 2.272574189483841, "grad_norm": 0.24886463582515717, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.8250406689941883, "num_tokens": 976816893.0, "step": 30325 }, { "entropy": 0.5720984060317278, "epoch": 2.272948913417693, "grad_norm": 0.22594404220581055, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.8261551685631275, "num_tokens": 978281238.0, "step": 30330 }, { "entropy": 0.5769647700712085, "epoch": 2.273323637351545, "grad_norm": 0.22718609869480133, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.8262198250740767, "num_tokens": 979824327.0, "step": 30335 }, { "entropy": 0.5692548597231507, "epoch": 2.2736983612853967, "grad_norm": 0.42614954710006714, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.8264286112040281, "num_tokens": 981300838.0, "step": 30340 }, { "entropy": 0.5599923321977258, "epoch": 2.2740730852192486, "grad_norm": 0.21382670104503632, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.8279903888702392, "num_tokens": 982802013.0, "step": 30345 }, { "entropy": 0.5548151372000575, "epoch": 2.2744478091531004, "grad_norm": 0.22458793222904205, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.8278381746262312, "num_tokens": 984250740.0, "step": 30350 }, { "entropy": 0.5651680298149586, "epoch": 2.2748225330869523, "grad_norm": 0.22682492434978485, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.8234700433909893, "num_tokens": 985727245.0, "step": 30355 }, { "entropy": 0.5521441491320729, "epoch": 2.275197257020804, "grad_norm": 0.34035524725914, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.8276833154261112, "num_tokens": 987206272.0, "step": 30360 }, { "entropy": 0.5440495308488608, "epoch": 2.275571980954656, "grad_norm": 0.2809511125087738, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.8277754351496697, "num_tokens": 988711666.0, "step": 30365 }, { "entropy": 0.5633984526619316, "epoch": 2.275946704888508, "grad_norm": 0.3860737979412079, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.8247249323874712, "num_tokens": 990234264.0, "step": 30370 }, { "entropy": 0.5581991974264383, "epoch": 2.2763214288223597, "grad_norm": 0.2807561159133911, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.8290100313723088, "num_tokens": 991681799.0, "step": 30375 }, { "entropy": 0.5512819034978748, "epoch": 2.2766961527562115, "grad_norm": 0.24163095653057098, "learning_rate": 0.0002, "loss": 0.5817, "mean_token_accuracy": 0.8295398585498333, "num_tokens": 993132275.0, "step": 30380 }, { "entropy": 0.5686680616810917, "epoch": 2.2770708766900634, "grad_norm": 0.23329338431358337, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8268109198659659, "num_tokens": 994618523.0, "step": 30385 }, { "entropy": 0.5771468540653586, "epoch": 2.277445600623915, "grad_norm": 0.25349876284599304, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8257239375263452, "num_tokens": 996120776.0, "step": 30390 }, { "entropy": 0.5777367603033781, "epoch": 2.277820324557767, "grad_norm": 0.2494896948337555, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.8243761386722326, "num_tokens": 997611973.0, "step": 30395 }, { "entropy": 0.5538074795156718, "epoch": 2.278195048491619, "grad_norm": 0.26356756687164307, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.8284344490617513, "num_tokens": 999099387.0, "step": 30400 }, { "entropy": 0.5599976401776076, "epoch": 2.2785697724254708, "grad_norm": 0.26357921957969666, "learning_rate": 0.0002, "loss": 0.5868, "mean_token_accuracy": 0.8279029201716185, "num_tokens": 1000540089.0, "step": 30405 }, { "entropy": 0.563263657130301, "epoch": 2.2789444963593226, "grad_norm": 0.2396586388349533, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.8271870348602534, "num_tokens": 1002027549.0, "step": 30410 }, { "entropy": 0.5637488780543208, "epoch": 2.2793192202931745, "grad_norm": 0.24956798553466797, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.8277385964989662, "num_tokens": 1003515662.0, "step": 30415 }, { "entropy": 0.5540051160380244, "epoch": 2.2796939442270263, "grad_norm": 0.26334893703460693, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.8263368580490351, "num_tokens": 1004976615.0, "step": 30420 }, { "entropy": 0.562870211340487, "epoch": 2.280068668160878, "grad_norm": 0.28462016582489014, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.8257512837648392, "num_tokens": 1006484573.0, "step": 30425 }, { "entropy": 0.5622886503115296, "epoch": 2.28044339209473, "grad_norm": 0.228091761469841, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.8261158619076013, "num_tokens": 1007959285.0, "step": 30430 }, { "entropy": 0.5724986802786589, "epoch": 2.2808181160285823, "grad_norm": 0.2635810375213623, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.82569043263793, "num_tokens": 1009453464.0, "step": 30435 }, { "entropy": 0.565988233871758, "epoch": 2.2811928399624337, "grad_norm": 0.24546600878238678, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.8265752349048853, "num_tokens": 1010936943.0, "step": 30440 }, { "entropy": 0.5761617425829172, "epoch": 2.281567563896286, "grad_norm": 0.26470816135406494, "learning_rate": 0.0002, "loss": 0.5956, "mean_token_accuracy": 0.8272782389074564, "num_tokens": 1012432181.0, "step": 30445 }, { "entropy": 0.5680162793025374, "epoch": 2.2819422878301374, "grad_norm": 0.24085667729377747, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.8266953032463789, "num_tokens": 1013945843.0, "step": 30450 }, { "entropy": 0.56235033236444, "epoch": 2.2823170117639897, "grad_norm": 0.2349500060081482, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.8278907794505358, "num_tokens": 1015412840.0, "step": 30455 }, { "entropy": 0.5602896749973297, "epoch": 2.282691735697841, "grad_norm": 0.24614940583705902, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.8263905569911003, "num_tokens": 1016843404.0, "step": 30460 }, { "entropy": 0.5726953946053982, "epoch": 2.2830664596316934, "grad_norm": 0.2505972385406494, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.8271491404622793, "num_tokens": 1018305648.0, "step": 30465 }, { "entropy": 0.5732722884044051, "epoch": 2.2834411835655453, "grad_norm": 0.24467606842517853, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8249855022877455, "num_tokens": 1019763416.0, "step": 30470 }, { "entropy": 0.5822890903800726, "epoch": 2.283815907499397, "grad_norm": 0.234291210770607, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.8287724822759628, "num_tokens": 1021199009.0, "step": 30475 }, { "entropy": 0.569944136030972, "epoch": 2.284190631433249, "grad_norm": 0.22718627750873566, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.8297379437834025, "num_tokens": 1022661426.0, "step": 30480 }, { "entropy": 0.5568302178755402, "epoch": 2.284565355367101, "grad_norm": 0.2613808810710907, "learning_rate": 0.0002, "loss": 0.5813, "mean_token_accuracy": 0.8330122902989388, "num_tokens": 1024132541.0, "step": 30485 }, { "entropy": 0.578910618275404, "epoch": 2.2849400793009527, "grad_norm": 0.26591962575912476, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.8248019829392433, "num_tokens": 1025607639.0, "step": 30490 }, { "entropy": 0.5629317082464695, "epoch": 2.2853148032348045, "grad_norm": 0.2573268711566925, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.8267624869942665, "num_tokens": 1027112110.0, "step": 30495 }, { "entropy": 0.5572154002264142, "epoch": 2.2856895271686564, "grad_norm": 0.26336824893951416, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.8292158439755439, "num_tokens": 1028585209.0, "step": 30500 }, { "entropy": 0.5641408668830991, "epoch": 2.286064251102508, "grad_norm": 0.2277444303035736, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.8279547110199929, "num_tokens": 1030059701.0, "step": 30505 }, { "entropy": 0.5597941918298602, "epoch": 2.28643897503636, "grad_norm": 0.21320727467536926, "learning_rate": 0.0002, "loss": 0.5928, "mean_token_accuracy": 0.830700957775116, "num_tokens": 1031509564.0, "step": 30510 }, { "entropy": 0.55823088362813, "epoch": 2.286813698970212, "grad_norm": 0.23198238015174866, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.8308357983827591, "num_tokens": 1032983224.0, "step": 30515 }, { "entropy": 0.5693762807175518, "epoch": 2.2871884229040638, "grad_norm": 0.424642950296402, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.826687004417181, "num_tokens": 1034448019.0, "step": 30520 }, { "entropy": 0.546661539003253, "epoch": 2.2875631468379156, "grad_norm": 0.21570858359336853, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.8341871287673712, "num_tokens": 1035895500.0, "step": 30525 }, { "entropy": 0.5505851818248629, "epoch": 2.2879378707717675, "grad_norm": 0.2672179639339447, "learning_rate": 0.0002, "loss": 0.5752, "mean_token_accuracy": 0.8302469115704298, "num_tokens": 1037361860.0, "step": 30530 }, { "entropy": 0.5769900975748896, "epoch": 2.2883125947056193, "grad_norm": 0.23059912025928497, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8230614107102155, "num_tokens": 1038832841.0, "step": 30535 }, { "entropy": 0.5841079499572516, "epoch": 2.288687318639471, "grad_norm": 0.29076769948005676, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.8254938300698995, "num_tokens": 1040295752.0, "step": 30540 }, { "entropy": 0.5732757268473506, "epoch": 2.289062042573323, "grad_norm": 0.2622549533843994, "learning_rate": 0.0002, "loss": 0.583, "mean_token_accuracy": 0.8320473592728377, "num_tokens": 1041737660.0, "step": 30545 }, { "entropy": 0.5720995660871268, "epoch": 2.289436766507175, "grad_norm": 0.38280197978019714, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.8328590836375952, "num_tokens": 1043197157.0, "step": 30550 }, { "entropy": 0.5857152758166194, "epoch": 2.2898114904410267, "grad_norm": 0.28525903820991516, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.8252694390714168, "num_tokens": 1044685184.0, "step": 30555 }, { "entropy": 0.5816283728927374, "epoch": 2.2901862143748786, "grad_norm": 0.26770755648612976, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.8271807711571455, "num_tokens": 1046140128.0, "step": 30560 }, { "entropy": 0.5932178407907486, "epoch": 2.2905609383087304, "grad_norm": 0.2202347368001938, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.8260854925960303, "num_tokens": 1047603948.0, "step": 30565 }, { "entropy": 0.5817353317514062, "epoch": 2.2909356622425823, "grad_norm": 0.23626042902469635, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.82376122251153, "num_tokens": 1049101292.0, "step": 30570 }, { "entropy": 0.5743265300989151, "epoch": 2.291310386176434, "grad_norm": 0.2507588267326355, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.8291286159306764, "num_tokens": 1050612832.0, "step": 30575 }, { "entropy": 0.5739397745579481, "epoch": 2.291685110110286, "grad_norm": 0.28609511256217957, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.8299692168831825, "num_tokens": 1052039604.0, "step": 30580 }, { "entropy": 0.566959411278367, "epoch": 2.292059834044138, "grad_norm": 0.24653029441833496, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.8277049075812102, "num_tokens": 1053505153.0, "step": 30585 }, { "entropy": 0.5774901606142521, "epoch": 2.2924345579779897, "grad_norm": 0.25923171639442444, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.8238996520638466, "num_tokens": 1055002507.0, "step": 30590 }, { "entropy": 0.5841484421864152, "epoch": 2.2928092819118415, "grad_norm": 0.25355422496795654, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.8239340882748365, "num_tokens": 1056458653.0, "step": 30595 }, { "entropy": 0.5639364320784808, "epoch": 2.2931840058456934, "grad_norm": 0.27256664633750916, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.8261064197868109, "num_tokens": 1057887656.0, "step": 30600 }, { "entropy": 0.5704306561499835, "epoch": 2.2935587297795452, "grad_norm": 0.23468324542045593, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.8235494151711464, "num_tokens": 1059353275.0, "step": 30605 }, { "entropy": 0.5500003887340427, "epoch": 2.293933453713397, "grad_norm": 0.2737633287906647, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.8287685871124267, "num_tokens": 1060803495.0, "step": 30610 }, { "entropy": 0.562740414403379, "epoch": 2.294308177647249, "grad_norm": 0.23909102380275726, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.8270723812282086, "num_tokens": 1062225602.0, "step": 30615 }, { "entropy": 0.5674644116312265, "epoch": 2.2946829015811008, "grad_norm": 0.23984317481517792, "learning_rate": 0.0002, "loss": 0.6074, "mean_token_accuracy": 0.8271855868399143, "num_tokens": 1063694426.0, "step": 30620 }, { "entropy": 0.5505656130611897, "epoch": 2.2950576255149526, "grad_norm": 0.24126826226711273, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.8294897697865963, "num_tokens": 1065168319.0, "step": 30625 }, { "entropy": 0.5622221939265728, "epoch": 2.2954323494488045, "grad_norm": 0.23994188010692596, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.8247504681348801, "num_tokens": 1066621175.0, "step": 30630 }, { "entropy": 0.5618781443685293, "epoch": 2.2958070733826563, "grad_norm": 0.2321726381778717, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8245729506015778, "num_tokens": 1068079959.0, "step": 30635 }, { "entropy": 0.5479046289809049, "epoch": 2.296181797316508, "grad_norm": 0.23008304834365845, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8264094959944487, "num_tokens": 1069555459.0, "step": 30640 }, { "entropy": 0.5622271662577987, "epoch": 2.29655652125036, "grad_norm": 0.3045707046985626, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.8230988621711731, "num_tokens": 1071009016.0, "step": 30645 }, { "entropy": 0.5497551459819079, "epoch": 2.296931245184212, "grad_norm": 0.2616031765937805, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.828471440449357, "num_tokens": 1072480513.0, "step": 30650 }, { "entropy": 0.5520896855741739, "epoch": 2.2973059691180637, "grad_norm": 0.3602041006088257, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.8274680655449629, "num_tokens": 1073926158.0, "step": 30655 }, { "entropy": 0.5234482415020466, "epoch": 2.2976806930519156, "grad_norm": 0.24550002813339233, "learning_rate": 0.0002, "loss": 0.5788, "mean_token_accuracy": 0.8310138624161482, "num_tokens": 1075363042.0, "step": 30660 }, { "entropy": 0.5331699652597308, "epoch": 2.2980554169857674, "grad_norm": 0.278001993894577, "learning_rate": 0.0002, "loss": 0.5821, "mean_token_accuracy": 0.8271727573126555, "num_tokens": 1076835119.0, "step": 30665 }, { "entropy": 0.5240421198308468, "epoch": 2.2984301409196193, "grad_norm": 0.2420491725206375, "learning_rate": 0.0002, "loss": 0.5829, "mean_token_accuracy": 0.8339335586875677, "num_tokens": 1078266665.0, "step": 30670 }, { "entropy": 0.5581711111590266, "epoch": 2.298804864853471, "grad_norm": 0.24717850983142853, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.8245218846946955, "num_tokens": 1079749670.0, "step": 30675 }, { "entropy": 0.5485467260703445, "epoch": 2.299179588787323, "grad_norm": 0.2538166344165802, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8268943030387164, "num_tokens": 1081238062.0, "step": 30680 }, { "entropy": 0.5443764500319958, "epoch": 2.299554312721175, "grad_norm": 0.22690783441066742, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.8303640242666006, "num_tokens": 1082695353.0, "step": 30685 }, { "entropy": 0.5328588753938674, "epoch": 2.2999290366550267, "grad_norm": 0.25554490089416504, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.8306614864617586, "num_tokens": 1084108309.0, "step": 30690 }, { "entropy": 0.5268340840935707, "epoch": 2.3003037605888785, "grad_norm": 0.22439488768577576, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.8283416707068681, "num_tokens": 1085593711.0, "step": 30695 }, { "entropy": 0.5448537787422538, "epoch": 2.3006784845227304, "grad_norm": 0.22704795002937317, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.8269654776901006, "num_tokens": 1087064279.0, "step": 30700 }, { "entropy": 0.5372259669005871, "epoch": 2.3010532084565822, "grad_norm": 0.334475040435791, "learning_rate": 0.0002, "loss": 0.5838, "mean_token_accuracy": 0.8282719988375902, "num_tokens": 1088543057.0, "step": 30705 }, { "entropy": 0.5604031619615852, "epoch": 2.301427932390434, "grad_norm": 0.23941153287887573, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.8246685557067395, "num_tokens": 1090080949.0, "step": 30710 }, { "entropy": 0.5254616543650628, "epoch": 2.301802656324286, "grad_norm": 0.24078898131847382, "learning_rate": 0.0002, "loss": 0.5763, "mean_token_accuracy": 0.8305737569928169, "num_tokens": 1091548886.0, "step": 30715 }, { "entropy": 0.5533818781375885, "epoch": 2.302177380258138, "grad_norm": 0.2636030614376068, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.8275712668895722, "num_tokens": 1093025569.0, "step": 30720 }, { "entropy": 0.5435106767341494, "epoch": 2.3025521041919896, "grad_norm": 0.25338998436927795, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.8280106071382761, "num_tokens": 1094496420.0, "step": 30725 }, { "entropy": 0.5475801715627313, "epoch": 2.3029268281258415, "grad_norm": 0.24445286393165588, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.8287892438471317, "num_tokens": 1095949311.0, "step": 30730 }, { "entropy": 0.5543645879253745, "epoch": 2.3033015520596933, "grad_norm": 0.23690594732761383, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.82675042450428, "num_tokens": 1097440830.0, "step": 30735 }, { "entropy": 0.5519372021779418, "epoch": 2.303676275993545, "grad_norm": 0.2515835165977478, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.825119499862194, "num_tokens": 1098903653.0, "step": 30740 }, { "entropy": 0.5538964122533798, "epoch": 2.304050999927397, "grad_norm": 0.24832428991794586, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.8261529497802258, "num_tokens": 1100348164.0, "step": 30745 }, { "entropy": 0.5485427051782608, "epoch": 2.3044257238612493, "grad_norm": 0.2403414249420166, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.8271969128400087, "num_tokens": 1101808060.0, "step": 30750 }, { "entropy": 0.5443807581439615, "epoch": 2.3048004477951007, "grad_norm": 0.3667350709438324, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.8271094180643559, "num_tokens": 1103269280.0, "step": 30755 }, { "entropy": 0.5551697921007872, "epoch": 2.305175171728953, "grad_norm": 0.2370242178440094, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.8283082261681557, "num_tokens": 1104759998.0, "step": 30760 }, { "entropy": 0.5592098886147141, "epoch": 2.3055498956628044, "grad_norm": 0.31298285722732544, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8277370933443308, "num_tokens": 1106193514.0, "step": 30765 }, { "entropy": 0.5580922631546855, "epoch": 2.3059246195966567, "grad_norm": 0.5417881608009338, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.8274560775607824, "num_tokens": 1107632779.0, "step": 30770 }, { "entropy": 0.5524208083748817, "epoch": 2.306299343530508, "grad_norm": 0.2711823582649231, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.8282887168228626, "num_tokens": 1109085801.0, "step": 30775 }, { "entropy": 0.5554026814177633, "epoch": 2.3066740674643604, "grad_norm": 0.23553217947483063, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.8295228891074657, "num_tokens": 1110526567.0, "step": 30780 }, { "entropy": 0.5624912470579148, "epoch": 2.3070487913982123, "grad_norm": 0.22273942828178406, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.82707748003304, "num_tokens": 1111984504.0, "step": 30785 }, { "entropy": 0.5631322527304292, "epoch": 2.307423515332064, "grad_norm": 0.2358706146478653, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8295339811593294, "num_tokens": 1113446781.0, "step": 30790 }, { "entropy": 0.5561795681715012, "epoch": 2.307798239265916, "grad_norm": 0.21668416261672974, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.8281598810106516, "num_tokens": 1114884104.0, "step": 30795 }, { "entropy": 0.5626509673893452, "epoch": 2.308172963199768, "grad_norm": 0.24279852211475372, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.829466451331973, "num_tokens": 1116344317.0, "step": 30800 }, { "entropy": 0.5625198205932975, "epoch": 2.3085476871336197, "grad_norm": 0.25536128878593445, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.8300918396562338, "num_tokens": 1117795874.0, "step": 30805 }, { "entropy": 0.5495354132726789, "epoch": 2.3089224110674715, "grad_norm": 0.22196955978870392, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.8310857750475407, "num_tokens": 1119213286.0, "step": 30810 }, { "entropy": 0.555387701280415, "epoch": 2.3092971350013234, "grad_norm": 0.2880202531814575, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.8282792046666145, "num_tokens": 1120735350.0, "step": 30815 }, { "entropy": 0.5647219233214855, "epoch": 2.3096718589351752, "grad_norm": 0.413031667470932, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.8269190479069948, "num_tokens": 1122203336.0, "step": 30820 }, { "entropy": 0.5753024347126484, "epoch": 2.310046582869027, "grad_norm": 0.23021700978279114, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8271233219653368, "num_tokens": 1123681315.0, "step": 30825 }, { "entropy": 0.5839934131130576, "epoch": 2.310421306802879, "grad_norm": 0.22203999757766724, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.8248299364000559, "num_tokens": 1125137179.0, "step": 30830 }, { "entropy": 0.5601791886612772, "epoch": 2.310796030736731, "grad_norm": 0.2243146002292633, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8256324894726277, "num_tokens": 1126623471.0, "step": 30835 }, { "entropy": 0.5588551174849272, "epoch": 2.3111707546705826, "grad_norm": 0.22691282629966736, "learning_rate": 0.0002, "loss": 0.5931, "mean_token_accuracy": 0.8287938646972179, "num_tokens": 1128096858.0, "step": 30840 }, { "entropy": 0.5546923484653234, "epoch": 2.3115454786044345, "grad_norm": 0.23848548531532288, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.8319150831550359, "num_tokens": 1129601478.0, "step": 30845 }, { "entropy": 0.5635550366714597, "epoch": 2.3119202025382863, "grad_norm": 0.2368849366903305, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.8282029964029789, "num_tokens": 1131043157.0, "step": 30850 }, { "entropy": 0.5759812165051699, "epoch": 2.312294926472138, "grad_norm": 0.2668931782245636, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.8280833385884762, "num_tokens": 1132510051.0, "step": 30855 }, { "entropy": 0.5799791313707828, "epoch": 2.31266965040599, "grad_norm": 0.2664942741394043, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.8236545518040657, "num_tokens": 1133996499.0, "step": 30860 }, { "entropy": 0.5786644283682107, "epoch": 2.313044374339842, "grad_norm": 0.23065835237503052, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8248482078313828, "num_tokens": 1135460328.0, "step": 30865 }, { "entropy": 0.5798336351290345, "epoch": 2.3134190982736937, "grad_norm": 0.22165651619434357, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.8307068675756455, "num_tokens": 1136918853.0, "step": 30870 }, { "entropy": 0.5714520750567317, "epoch": 2.3137938222075456, "grad_norm": 0.2672731280326843, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8273858185857534, "num_tokens": 1138377070.0, "step": 30875 }, { "entropy": 0.5662767887115479, "epoch": 2.3141685461413974, "grad_norm": 0.2353489100933075, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.828460818901658, "num_tokens": 1139877012.0, "step": 30880 }, { "entropy": 0.5697112306952477, "epoch": 2.3145432700752493, "grad_norm": 0.21867835521697998, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8284196656197309, "num_tokens": 1141393325.0, "step": 30885 }, { "entropy": 0.5699557544663548, "epoch": 2.314917994009101, "grad_norm": 0.20258228480815887, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.8259484600275755, "num_tokens": 1142861802.0, "step": 30890 }, { "entropy": 0.5589843641966581, "epoch": 2.315292717942953, "grad_norm": 0.2348291128873825, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.8299199383705854, "num_tokens": 1144287857.0, "step": 30895 }, { "entropy": 0.5989979041740299, "epoch": 2.315667441876805, "grad_norm": 0.23493701219558716, "learning_rate": 0.0002, "loss": 0.6297, "mean_token_accuracy": 0.8206301506608724, "num_tokens": 1145799233.0, "step": 30900 }, { "entropy": 0.5601348740980029, "epoch": 2.3160421658106567, "grad_norm": 0.3035380244255066, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.8309493575245142, "num_tokens": 1147234469.0, "step": 30905 }, { "entropy": 0.5820858554914594, "epoch": 2.3164168897445085, "grad_norm": 0.21454693377017975, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.8267791055142879, "num_tokens": 1148708509.0, "step": 30910 }, { "entropy": 0.5897776909172535, "epoch": 2.3167916136783604, "grad_norm": 0.40060511231422424, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8281441111117601, "num_tokens": 1150126439.0, "step": 30915 }, { "entropy": 0.5790052354335785, "epoch": 2.3171663376122122, "grad_norm": 0.2837221622467041, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.8263934507966042, "num_tokens": 1151573239.0, "step": 30920 }, { "entropy": 0.5853357011452317, "epoch": 2.317541061546064, "grad_norm": 0.3355206549167633, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.8279665797948838, "num_tokens": 1153038078.0, "step": 30925 }, { "entropy": 0.5658855333924293, "epoch": 2.317915785479916, "grad_norm": 0.24756301939487457, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.8296516224741936, "num_tokens": 1154491183.0, "step": 30930 }, { "entropy": 0.5769852260127664, "epoch": 2.318290509413768, "grad_norm": 0.2522296905517578, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.827050457149744, "num_tokens": 1155966784.0, "step": 30935 }, { "entropy": 0.5790222492069006, "epoch": 2.3186652333476196, "grad_norm": 0.23469644784927368, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8285893615335226, "num_tokens": 1157460746.0, "step": 30940 }, { "entropy": 0.5613591741770506, "epoch": 2.3190399572814715, "grad_norm": 0.2661628723144531, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.8319977849721909, "num_tokens": 1158890252.0, "step": 30945 }, { "entropy": 0.5922056881710887, "epoch": 2.3194146812153233, "grad_norm": 0.290307879447937, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.8249381139874459, "num_tokens": 1160377345.0, "step": 30950 }, { "entropy": 0.5979558603838087, "epoch": 2.319789405149175, "grad_norm": 0.29173654317855835, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.8254034332931042, "num_tokens": 1161848956.0, "step": 30955 }, { "entropy": 0.5722424672916532, "epoch": 2.320164129083027, "grad_norm": 0.2593463659286499, "learning_rate": 0.0002, "loss": 0.5896, "mean_token_accuracy": 0.8281623423099518, "num_tokens": 1163302890.0, "step": 30960 }, { "entropy": 0.5965215291827917, "epoch": 2.320538853016879, "grad_norm": 0.26416268944740295, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.824975997954607, "num_tokens": 1164790332.0, "step": 30965 }, { "entropy": 0.5793297750875354, "epoch": 2.3209135769507308, "grad_norm": 0.23392902314662933, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.8291887901723385, "num_tokens": 1166244296.0, "step": 30970 }, { "entropy": 0.5712076397612691, "epoch": 2.3212883008845826, "grad_norm": 0.2710360288619995, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.8292441692203283, "num_tokens": 1167675089.0, "step": 30975 }, { "entropy": 0.5879088779911399, "epoch": 2.3216630248184345, "grad_norm": 0.27714401483535767, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.8262277863919735, "num_tokens": 1169144432.0, "step": 30980 }, { "entropy": 0.5782565541565419, "epoch": 2.3220377487522863, "grad_norm": 0.2594282925128937, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.8268520645797253, "num_tokens": 1170610579.0, "step": 30985 }, { "entropy": 0.5863968351855874, "epoch": 2.322412472686138, "grad_norm": 0.24229559302330017, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.8231562528759241, "num_tokens": 1172084556.0, "step": 30990 }, { "entropy": 0.5810888148844242, "epoch": 2.32278719661999, "grad_norm": 0.24473895132541656, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.8282319951802493, "num_tokens": 1173553287.0, "step": 30995 }, { "entropy": 0.580514020845294, "epoch": 2.323161920553842, "grad_norm": 0.2463901937007904, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.8268295083194971, "num_tokens": 1175010265.0, "step": 31000 }, { "entropy": 0.5901028588414192, "epoch": 2.3235366444876937, "grad_norm": 0.2503586709499359, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.8281080514192581, "num_tokens": 1455274.0, "step": 31005 }, { "entropy": 0.5807734256610274, "epoch": 2.3239113684215456, "grad_norm": 0.35345935821533203, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.8336092121899128, "num_tokens": 2874904.0, "step": 31010 }, { "entropy": 0.6154422573745251, "epoch": 2.3242860923553974, "grad_norm": 0.25063714385032654, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.822099681943655, "num_tokens": 4343100.0, "step": 31015 }, { "entropy": 0.5973182624205947, "epoch": 2.3246608162892493, "grad_norm": 0.22554989159107208, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.8269188988953828, "num_tokens": 5776060.0, "step": 31020 }, { "entropy": 0.5816012915223837, "epoch": 2.325035540223101, "grad_norm": 0.2770936191082001, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.8301197402179241, "num_tokens": 7256282.0, "step": 31025 }, { "entropy": 0.5835265493020415, "epoch": 2.325410264156953, "grad_norm": 0.24094615876674652, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.8251150291413069, "num_tokens": 8744371.0, "step": 31030 }, { "entropy": 0.5891366859897971, "epoch": 2.325784988090805, "grad_norm": 0.23866567015647888, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.8256762754172087, "num_tokens": 10248504.0, "step": 31035 }, { "entropy": 0.5815355105325579, "epoch": 2.3261597120246567, "grad_norm": 0.26801756024360657, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8281556464731693, "num_tokens": 11661670.0, "step": 31040 }, { "entropy": 0.579759961180389, "epoch": 2.3265344359585085, "grad_norm": 0.28760525584220886, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8257053982466459, "num_tokens": 13123476.0, "step": 31045 }, { "entropy": 0.5835690401494503, "epoch": 2.3269091598923604, "grad_norm": 0.2324167639017105, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.8272046461701393, "num_tokens": 14576835.0, "step": 31050 }, { "entropy": 0.5733671847730875, "epoch": 2.3272838838262127, "grad_norm": 0.24685341119766235, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.8265475355088711, "num_tokens": 16063071.0, "step": 31055 }, { "entropy": 0.5703809171915054, "epoch": 2.327658607760064, "grad_norm": 0.23631244897842407, "learning_rate": 0.0002, "loss": 0.5838, "mean_token_accuracy": 0.8290488019585609, "num_tokens": 17573009.0, "step": 31060 }, { "entropy": 0.5962609140202403, "epoch": 2.3280333316939164, "grad_norm": 0.4791445732116699, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.8254910316318274, "num_tokens": 19083366.0, "step": 31065 }, { "entropy": 0.5902905531227589, "epoch": 2.3284080556277678, "grad_norm": 0.28294071555137634, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.8228900242596865, "num_tokens": 20557714.0, "step": 31070 }, { "entropy": 0.5813926935195923, "epoch": 2.32878277956162, "grad_norm": 0.22965766489505768, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.8227020107209683, "num_tokens": 22073814.0, "step": 31075 }, { "entropy": 0.5759088514372707, "epoch": 2.3291575034954715, "grad_norm": 0.23435747623443604, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.8268799498677254, "num_tokens": 23536350.0, "step": 31080 }, { "entropy": 0.5690046412870288, "epoch": 2.3295322274293238, "grad_norm": 0.2458024024963379, "learning_rate": 0.0002, "loss": 0.5928, "mean_token_accuracy": 0.8310895569622516, "num_tokens": 24994177.0, "step": 31085 }, { "entropy": 0.5599111676216125, "epoch": 2.3299069513631756, "grad_norm": 0.28058063983917236, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.8311355870217085, "num_tokens": 26487024.0, "step": 31090 }, { "entropy": 0.5774744642898441, "epoch": 2.3302816752970275, "grad_norm": 0.25274422764778137, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8237370818853378, "num_tokens": 27969805.0, "step": 31095 }, { "entropy": 0.5681332895532251, "epoch": 2.3306563992308793, "grad_norm": 0.34266552329063416, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.8288666747510434, "num_tokens": 29402004.0, "step": 31100 }, { "entropy": 0.5816077135503293, "epoch": 2.331031123164731, "grad_norm": 0.23528476059436798, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.8266837682574988, "num_tokens": 30879844.0, "step": 31105 }, { "entropy": 0.580901600793004, "epoch": 2.331405847098583, "grad_norm": 0.23196618258953094, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.8249468386173249, "num_tokens": 32348538.0, "step": 31110 }, { "entropy": 0.5808057533577085, "epoch": 2.331780571032435, "grad_norm": 0.2325570434331894, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8263949126005172, "num_tokens": 33834700.0, "step": 31115 }, { "entropy": 0.5513842098414898, "epoch": 2.3321552949662867, "grad_norm": 0.27785053849220276, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.8339277967810631, "num_tokens": 35262837.0, "step": 31120 }, { "entropy": 0.5781084703281522, "epoch": 2.3325300189001386, "grad_norm": 0.24675454199314117, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.8229123502969742, "num_tokens": 36735982.0, "step": 31125 }, { "entropy": 0.567069460824132, "epoch": 2.3329047428339904, "grad_norm": 0.25500133633613586, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.8260569699108601, "num_tokens": 38189050.0, "step": 31130 }, { "entropy": 0.5748501645401121, "epoch": 2.3332794667678423, "grad_norm": 0.3397543430328369, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.8302323088049889, "num_tokens": 39641587.0, "step": 31135 }, { "entropy": 0.5888799570500851, "epoch": 2.333654190701694, "grad_norm": 0.28451254963874817, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.8255431588739157, "num_tokens": 41095246.0, "step": 31140 }, { "entropy": 0.5518697071820498, "epoch": 2.334028914635546, "grad_norm": 0.27771812677383423, "learning_rate": 0.0002, "loss": 0.5683, "mean_token_accuracy": 0.8314957335591316, "num_tokens": 42562345.0, "step": 31145 }, { "entropy": 0.5644044487737119, "epoch": 2.334403638569398, "grad_norm": 0.24308015406131744, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.8261136472225189, "num_tokens": 44043879.0, "step": 31150 }, { "entropy": 0.5637058390304446, "epoch": 2.3347783625032497, "grad_norm": 0.22398030757904053, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.8296515293419361, "num_tokens": 45516085.0, "step": 31155 }, { "entropy": 0.559751041047275, "epoch": 2.3351530864371015, "grad_norm": 0.2796287536621094, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.8285297561436892, "num_tokens": 46986376.0, "step": 31160 }, { "entropy": 0.5738558772951364, "epoch": 2.3355278103709534, "grad_norm": 0.25070464611053467, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.8286082569509745, "num_tokens": 48444137.0, "step": 31165 }, { "entropy": 0.5726870620623231, "epoch": 2.335902534304805, "grad_norm": 0.28761282563209534, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.8281773198395967, "num_tokens": 49890352.0, "step": 31170 }, { "entropy": 0.5641782941296697, "epoch": 2.336277258238657, "grad_norm": 0.2286682426929474, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.83127197958529, "num_tokens": 51320400.0, "step": 31175 }, { "entropy": 0.5886174133047462, "epoch": 2.336651982172509, "grad_norm": 0.2223968505859375, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.8229175385087728, "num_tokens": 52782325.0, "step": 31180 }, { "entropy": 0.5808269545435906, "epoch": 2.3370267061063608, "grad_norm": 0.24796785414218903, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.8266877502202987, "num_tokens": 54235038.0, "step": 31185 }, { "entropy": 0.580626112036407, "epoch": 2.3374014300402126, "grad_norm": 0.22096063196659088, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.8273493450134992, "num_tokens": 55703331.0, "step": 31190 }, { "entropy": 0.5799058482050896, "epoch": 2.3377761539740645, "grad_norm": 0.24125316739082336, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.8224948264658452, "num_tokens": 57147694.0, "step": 31195 }, { "entropy": 0.5854608226567507, "epoch": 2.3381508779079163, "grad_norm": 0.26337772607803345, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8250209458172322, "num_tokens": 58665469.0, "step": 31200 }, { "entropy": 0.5877020685002208, "epoch": 2.338525601841768, "grad_norm": 0.23824645578861237, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.8280330289155244, "num_tokens": 60175386.0, "step": 31205 }, { "entropy": 0.5738905649632215, "epoch": 2.33890032577562, "grad_norm": 0.2551211416721344, "learning_rate": 0.0002, "loss": 0.5811, "mean_token_accuracy": 0.8348773781210184, "num_tokens": 61612746.0, "step": 31210 }, { "entropy": 0.5780707402154803, "epoch": 2.339275049709472, "grad_norm": 0.25170454382896423, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.830079510062933, "num_tokens": 63060161.0, "step": 31215 }, { "entropy": 0.5883559105917812, "epoch": 2.3396497736433237, "grad_norm": 0.22690467536449432, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.8261275433003903, "num_tokens": 64505818.0, "step": 31220 }, { "entropy": 0.5880358787253499, "epoch": 2.3400244975771756, "grad_norm": 0.2434202879667282, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8253872286528349, "num_tokens": 66036972.0, "step": 31225 }, { "entropy": 0.5823473745957017, "epoch": 2.3403992215110274, "grad_norm": 0.24949422478675842, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.8275894436985254, "num_tokens": 67535214.0, "step": 31230 }, { "entropy": 0.5745927456766367, "epoch": 2.3407739454448793, "grad_norm": 0.24758432805538177, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.8275155950337648, "num_tokens": 69026160.0, "step": 31235 }, { "entropy": 0.5704536518082023, "epoch": 2.341148669378731, "grad_norm": 0.25536176562309265, "learning_rate": 0.0002, "loss": 0.5794, "mean_token_accuracy": 0.8316865589469671, "num_tokens": 70448627.0, "step": 31240 }, { "entropy": 0.5754206920042634, "epoch": 2.341523393312583, "grad_norm": 0.24917972087860107, "learning_rate": 0.0002, "loss": 0.5807, "mean_token_accuracy": 0.8300635606050492, "num_tokens": 71916568.0, "step": 31245 }, { "entropy": 0.583929781243205, "epoch": 2.341898117246435, "grad_norm": 0.21936607360839844, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8258099626749754, "num_tokens": 73394078.0, "step": 31250 }, { "entropy": 0.5864656653255225, "epoch": 2.3422728411802867, "grad_norm": 0.23494288325309753, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.8249461140483618, "num_tokens": 74876347.0, "step": 31255 }, { "entropy": 0.5743827378377319, "epoch": 2.3426475651141385, "grad_norm": 0.2489030808210373, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.8299147203564644, "num_tokens": 76307736.0, "step": 31260 }, { "entropy": 0.5639070006087422, "epoch": 2.3430222890479904, "grad_norm": 0.2826066017150879, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.830300322920084, "num_tokens": 77754751.0, "step": 31265 }, { "entropy": 0.5723853992298246, "epoch": 2.343397012981842, "grad_norm": 0.22947059571743011, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.8267846014350653, "num_tokens": 79225717.0, "step": 31270 }, { "entropy": 0.5713848151266575, "epoch": 2.343771736915694, "grad_norm": 0.19981519877910614, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.8276477344334126, "num_tokens": 80754157.0, "step": 31275 }, { "entropy": 0.5705990349873901, "epoch": 2.344146460849546, "grad_norm": 0.22920791804790497, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.8277252979576588, "num_tokens": 82263749.0, "step": 31280 }, { "entropy": 0.5637249944731593, "epoch": 2.3445211847833978, "grad_norm": 0.24278080463409424, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.8287624284625054, "num_tokens": 83714588.0, "step": 31285 }, { "entropy": 0.5750782746821642, "epoch": 2.3448959087172496, "grad_norm": 0.25318223237991333, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.8272010948508978, "num_tokens": 85185400.0, "step": 31290 }, { "entropy": 0.5784531969577074, "epoch": 2.3452706326511015, "grad_norm": 0.23008404672145844, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.8250638421624898, "num_tokens": 86674200.0, "step": 31295 }, { "entropy": 0.5775827746838331, "epoch": 2.3456453565849533, "grad_norm": 0.21106185019016266, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8278724618256093, "num_tokens": 88115374.0, "step": 31300 }, { "entropy": 0.6016567537561059, "epoch": 2.346020080518805, "grad_norm": 0.26109936833381653, "learning_rate": 0.0002, "loss": 0.6389, "mean_token_accuracy": 0.8215201925486326, "num_tokens": 89618998.0, "step": 31305 }, { "entropy": 0.5747384421527386, "epoch": 2.346394804452657, "grad_norm": 0.2582244873046875, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.8285988859832287, "num_tokens": 91071948.0, "step": 31310 }, { "entropy": 0.5713606415316462, "epoch": 2.346769528386509, "grad_norm": 0.24109047651290894, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.8259695332497359, "num_tokens": 92565950.0, "step": 31315 }, { "entropy": 0.5652105452492833, "epoch": 2.3471442523203607, "grad_norm": 0.2443837821483612, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.8286720741540193, "num_tokens": 94065301.0, "step": 31320 }, { "entropy": 0.562979813478887, "epoch": 2.3475189762542126, "grad_norm": 0.23729389905929565, "learning_rate": 0.0002, "loss": 0.5889, "mean_token_accuracy": 0.8265338502824306, "num_tokens": 95549200.0, "step": 31325 }, { "entropy": 0.5629799684509635, "epoch": 2.3478937001880644, "grad_norm": 0.21454912424087524, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.829310916364193, "num_tokens": 97023830.0, "step": 31330 }, { "entropy": 0.574602803401649, "epoch": 2.3482684241219163, "grad_norm": 0.22592712938785553, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.8268916450440884, "num_tokens": 98475293.0, "step": 31335 }, { "entropy": 0.5831495627760888, "epoch": 2.348643148055768, "grad_norm": 0.23428888618946075, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.827509456127882, "num_tokens": 99913076.0, "step": 31340 }, { "entropy": 0.57340067923069, "epoch": 2.34901787198962, "grad_norm": 0.28984346985816956, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.8227890390902758, "num_tokens": 101393858.0, "step": 31345 }, { "entropy": 0.569337896630168, "epoch": 2.349392595923472, "grad_norm": 0.24240344762802124, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8294431809335947, "num_tokens": 102877777.0, "step": 31350 }, { "entropy": 0.5771433791145683, "epoch": 2.3497673198573237, "grad_norm": 0.33773887157440186, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.8272117294371129, "num_tokens": 104359193.0, "step": 31355 }, { "entropy": 0.5815864400938153, "epoch": 2.3501420437911755, "grad_norm": 0.2530736029148102, "learning_rate": 0.0002, "loss": 0.5904, "mean_token_accuracy": 0.8311817966401577, "num_tokens": 105812080.0, "step": 31360 }, { "entropy": 0.5887176768854261, "epoch": 2.3505167677250274, "grad_norm": 0.25493085384368896, "learning_rate": 0.0002, "loss": 0.6039, "mean_token_accuracy": 0.8256212159991264, "num_tokens": 107303302.0, "step": 31365 }, { "entropy": 0.5801446042954922, "epoch": 2.3508914916588797, "grad_norm": 0.3080693781375885, "learning_rate": 0.0002, "loss": 0.6, "mean_token_accuracy": 0.8283830758184194, "num_tokens": 108754852.0, "step": 31370 }, { "entropy": 0.5775259433314204, "epoch": 2.351266215592731, "grad_norm": 0.25399065017700195, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.825117439776659, "num_tokens": 110248162.0, "step": 31375 }, { "entropy": 0.5834339886903763, "epoch": 2.3516409395265834, "grad_norm": 0.2484104037284851, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.8264596849679947, "num_tokens": 111721773.0, "step": 31380 }, { "entropy": 0.5827478354796767, "epoch": 2.352015663460435, "grad_norm": 0.2640033960342407, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.8259058196097613, "num_tokens": 113140424.0, "step": 31385 }, { "entropy": 0.5664261335507035, "epoch": 2.352390387394287, "grad_norm": 0.23481011390686035, "learning_rate": 0.0002, "loss": 0.5815, "mean_token_accuracy": 0.82672645971179, "num_tokens": 114605768.0, "step": 31390 }, { "entropy": 0.599953793734312, "epoch": 2.3527651113281385, "grad_norm": 0.34995347261428833, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.8234875503927469, "num_tokens": 116108221.0, "step": 31395 }, { "entropy": 0.5905083758756519, "epoch": 2.3531398352619908, "grad_norm": 0.2403239756822586, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8254049919545651, "num_tokens": 117558575.0, "step": 31400 }, { "entropy": 0.5842440146952868, "epoch": 2.3535145591958426, "grad_norm": 0.24473759531974792, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8281533908098936, "num_tokens": 119014081.0, "step": 31405 }, { "entropy": 0.5931394755840301, "epoch": 2.3538892831296945, "grad_norm": 0.26553910970687866, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8242324572056532, "num_tokens": 120427107.0, "step": 31410 }, { "entropy": 0.5777901979163289, "epoch": 2.3542640070635463, "grad_norm": 0.24363626539707184, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.8294572155922652, "num_tokens": 121891152.0, "step": 31415 }, { "entropy": 0.6020380744710565, "epoch": 2.354638730997398, "grad_norm": 0.23969362676143646, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.8259952630847692, "num_tokens": 123403306.0, "step": 31420 }, { "entropy": 0.5980217881500721, "epoch": 2.35501345493125, "grad_norm": 0.2631835341453552, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.8291136033833026, "num_tokens": 124895684.0, "step": 31425 }, { "entropy": 0.5865612743422389, "epoch": 2.355388178865102, "grad_norm": 0.2885136306285858, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.8271120548248291, "num_tokens": 126351183.0, "step": 31430 }, { "entropy": 0.578868443891406, "epoch": 2.3557629027989537, "grad_norm": 0.24096912145614624, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.8299962170422077, "num_tokens": 127860316.0, "step": 31435 }, { "entropy": 0.5833576342090965, "epoch": 2.3561376267328056, "grad_norm": 0.23106056451797485, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.8266796838492155, "num_tokens": 129371359.0, "step": 31440 }, { "entropy": 0.5806875845417381, "epoch": 2.3565123506666574, "grad_norm": 0.21987678110599518, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.8295911729335785, "num_tokens": 130842169.0, "step": 31445 }, { "entropy": 0.5805419089272619, "epoch": 2.3568870746005093, "grad_norm": 0.22842161357402802, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.8250758465379476, "num_tokens": 132314831.0, "step": 31450 }, { "entropy": 0.576514956727624, "epoch": 2.357261798534361, "grad_norm": 0.2515700161457062, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8278831098228693, "num_tokens": 133790890.0, "step": 31455 }, { "entropy": 0.5650898195803166, "epoch": 2.357636522468213, "grad_norm": 0.2378040999174118, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.8267495300620794, "num_tokens": 135252584.0, "step": 31460 }, { "entropy": 0.5440246447920799, "epoch": 2.358011246402065, "grad_norm": 0.4735899865627289, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.8295205798000097, "num_tokens": 136744150.0, "step": 31465 }, { "entropy": 0.560994659923017, "epoch": 2.3583859703359167, "grad_norm": 0.256911039352417, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.8236211735755206, "num_tokens": 138225576.0, "step": 31470 }, { "entropy": 0.5522196201607585, "epoch": 2.3587606942697685, "grad_norm": 0.2376130372285843, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.8275258980691433, "num_tokens": 139659723.0, "step": 31475 }, { "entropy": 0.5484738236293196, "epoch": 2.3591354182036204, "grad_norm": 0.22389940917491913, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.8326758999377489, "num_tokens": 141084096.0, "step": 31480 }, { "entropy": 0.542274184152484, "epoch": 2.3595101421374722, "grad_norm": 0.23747575283050537, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.8282905526459217, "num_tokens": 142575532.0, "step": 31485 }, { "entropy": 0.55944698844105, "epoch": 2.359884866071324, "grad_norm": 0.26396122574806213, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.8236989796161651, "num_tokens": 144042897.0, "step": 31490 }, { "entropy": 0.5421863315626979, "epoch": 2.360259590005176, "grad_norm": 0.2492949366569519, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.8276008136570454, "num_tokens": 145454879.0, "step": 31495 }, { "entropy": 0.5469182359054685, "epoch": 2.360634313939028, "grad_norm": 0.2666272222995758, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.8265487551689148, "num_tokens": 146939409.0, "step": 31500 }, { "entropy": 0.563115730509162, "epoch": 2.3610090378728796, "grad_norm": 0.24775944650173187, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.8239857006818057, "num_tokens": 148416458.0, "step": 31505 }, { "entropy": 0.5480203593149782, "epoch": 2.3613837618067315, "grad_norm": 0.2189180999994278, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.8281572133302688, "num_tokens": 149912171.0, "step": 31510 }, { "entropy": 0.5655999772250653, "epoch": 2.3617584857405833, "grad_norm": 0.22884316742420197, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.8257303562015295, "num_tokens": 151401925.0, "step": 31515 }, { "entropy": 0.5600044833496213, "epoch": 2.362133209674435, "grad_norm": 0.2223149985074997, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.8280769281089306, "num_tokens": 152909175.0, "step": 31520 }, { "entropy": 0.5602582294493914, "epoch": 2.362507933608287, "grad_norm": 0.34946250915527344, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.827766977250576, "num_tokens": 154357773.0, "step": 31525 }, { "entropy": 0.5666434558108449, "epoch": 2.362882657542139, "grad_norm": 0.2830217182636261, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.8272063322365284, "num_tokens": 155833345.0, "step": 31530 }, { "entropy": 0.5684842182323336, "epoch": 2.3632573814759907, "grad_norm": 0.22547507286071777, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.8291352774947882, "num_tokens": 157362816.0, "step": 31535 }, { "entropy": 0.5667960042133927, "epoch": 2.3636321054098426, "grad_norm": 0.28229621052742004, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.8292410112917423, "num_tokens": 158872478.0, "step": 31540 }, { "entropy": 0.5759741421788931, "epoch": 2.3640068293436944, "grad_norm": 0.2578602135181427, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.8298574153333902, "num_tokens": 160343737.0, "step": 31545 }, { "entropy": 0.5756199471652508, "epoch": 2.3643815532775463, "grad_norm": 0.26070505380630493, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.826805005595088, "num_tokens": 161784627.0, "step": 31550 }, { "entropy": 0.5570037273690105, "epoch": 2.364756277211398, "grad_norm": 0.2674122750759125, "learning_rate": 0.0002, "loss": 0.5699, "mean_token_accuracy": 0.8329801186919212, "num_tokens": 163244021.0, "step": 31555 }, { "entropy": 0.5710486659780145, "epoch": 2.36513100114525, "grad_norm": 0.28149253129959106, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.8285740934312343, "num_tokens": 164686338.0, "step": 31560 }, { "entropy": 0.591329351812601, "epoch": 2.365505725079102, "grad_norm": 0.2662815451622009, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.822596638277173, "num_tokens": 166157444.0, "step": 31565 }, { "entropy": 0.5844504563137889, "epoch": 2.3658804490129537, "grad_norm": 0.28777509927749634, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.8244180146604776, "num_tokens": 167632693.0, "step": 31570 }, { "entropy": 0.5811272256076336, "epoch": 2.3662551729468055, "grad_norm": 0.23526670038700104, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.8315320700407028, "num_tokens": 169029120.0, "step": 31575 }, { "entropy": 0.5758929183706641, "epoch": 2.3666298968806574, "grad_norm": 0.24587760865688324, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.8280300050973892, "num_tokens": 170495906.0, "step": 31580 }, { "entropy": 0.5957433879375458, "epoch": 2.3670046208145092, "grad_norm": 0.24754498898983002, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.8235680256038904, "num_tokens": 171986796.0, "step": 31585 }, { "entropy": 0.5934617277234793, "epoch": 2.367379344748361, "grad_norm": 0.2148224264383316, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.8244051270186901, "num_tokens": 173429031.0, "step": 31590 }, { "entropy": 0.5861922826617956, "epoch": 2.367754068682213, "grad_norm": 0.22837300598621368, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.8261854987591505, "num_tokens": 174864883.0, "step": 31595 }, { "entropy": 0.5887130554765463, "epoch": 2.368128792616065, "grad_norm": 0.2636319398880005, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.8259095109999179, "num_tokens": 176368908.0, "step": 31600 }, { "entropy": 0.5811872301623225, "epoch": 2.3685035165499166, "grad_norm": 0.27814704179763794, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.8270835436880588, "num_tokens": 177873804.0, "step": 31605 }, { "entropy": 0.5676548080518842, "epoch": 2.3688782404837685, "grad_norm": 0.2799417972564697, "learning_rate": 0.0002, "loss": 0.5917, "mean_token_accuracy": 0.8287068381905556, "num_tokens": 179323167.0, "step": 31610 }, { "entropy": 0.5647530568763613, "epoch": 2.3692529644176203, "grad_norm": 0.2565387189388275, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.8302552867680788, "num_tokens": 180780880.0, "step": 31615 }, { "entropy": 0.5666155479848385, "epoch": 2.369627688351472, "grad_norm": 0.2911015450954437, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.8273736886680126, "num_tokens": 182269945.0, "step": 31620 }, { "entropy": 0.5661087419837714, "epoch": 2.370002412285324, "grad_norm": 0.24690720438957214, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.8285403259098529, "num_tokens": 183759252.0, "step": 31625 }, { "entropy": 0.5656003043055534, "epoch": 2.370377136219176, "grad_norm": 0.25886160135269165, "learning_rate": 0.0002, "loss": 0.5788, "mean_token_accuracy": 0.8318205073475837, "num_tokens": 185201386.0, "step": 31630 }, { "entropy": 0.5787554170936346, "epoch": 2.3707518601530277, "grad_norm": 0.27737605571746826, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.8267965123057366, "num_tokens": 186714613.0, "step": 31635 }, { "entropy": 0.5786828896030783, "epoch": 2.3711265840868796, "grad_norm": 0.2561221420764923, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.8283954542130232, "num_tokens": 188181595.0, "step": 31640 }, { "entropy": 0.5744262866675853, "epoch": 2.3715013080207314, "grad_norm": 0.28295066952705383, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8250812660902739, "num_tokens": 189648680.0, "step": 31645 }, { "entropy": 0.5738594602793455, "epoch": 2.3718760319545833, "grad_norm": 0.24559295177459717, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.8262447752058506, "num_tokens": 191086194.0, "step": 31650 }, { "entropy": 0.5723112720996142, "epoch": 2.372250755888435, "grad_norm": 0.24832993745803833, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.8273163985460996, "num_tokens": 192542533.0, "step": 31655 }, { "entropy": 0.5845197105780244, "epoch": 2.372625479822287, "grad_norm": 0.2325076311826706, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.8230484060943126, "num_tokens": 194001060.0, "step": 31660 }, { "entropy": 0.5806993989273905, "epoch": 2.373000203756139, "grad_norm": 0.2451142817735672, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.823180765286088, "num_tokens": 195484049.0, "step": 31665 }, { "entropy": 0.5801516652107239, "epoch": 2.3733749276899907, "grad_norm": 0.23044101893901825, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.8232249304652214, "num_tokens": 196965835.0, "step": 31670 }, { "entropy": 0.5778635708615184, "epoch": 2.373749651623843, "grad_norm": 0.28808581829071045, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.8296654865145683, "num_tokens": 198465987.0, "step": 31675 }, { "entropy": 0.5711821688339114, "epoch": 2.3741243755576944, "grad_norm": 0.26139578223228455, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.8315067972987891, "num_tokens": 199896162.0, "step": 31680 }, { "entropy": 0.5637209286913276, "epoch": 2.3744990994915467, "grad_norm": 0.2736130952835083, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.8305062282830477, "num_tokens": 201341770.0, "step": 31685 }, { "entropy": 0.5700902311131358, "epoch": 2.374873823425398, "grad_norm": 0.2459469437599182, "learning_rate": 0.0002, "loss": 0.586, "mean_token_accuracy": 0.8296544287353754, "num_tokens": 202835714.0, "step": 31690 }, { "entropy": 0.5749735713005066, "epoch": 2.3752485473592504, "grad_norm": 0.23965783417224884, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.8314267292618751, "num_tokens": 204328060.0, "step": 31695 }, { "entropy": 0.5836805999279022, "epoch": 2.375623271293102, "grad_norm": 0.23203271627426147, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8241748556494712, "num_tokens": 205797880.0, "step": 31700 }, { "entropy": 0.5846852170303464, "epoch": 2.375997995226954, "grad_norm": 0.2866305410861969, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.8296076867729425, "num_tokens": 207299180.0, "step": 31705 }, { "entropy": 0.5688976053148508, "epoch": 2.3763727191608055, "grad_norm": 0.2574857175350189, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.829381263256073, "num_tokens": 208775511.0, "step": 31710 }, { "entropy": 0.5789737310260534, "epoch": 2.376747443094658, "grad_norm": 0.22573257982730865, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.8283118791878223, "num_tokens": 210257607.0, "step": 31715 }, { "entropy": 0.5923110155388713, "epoch": 2.3771221670285096, "grad_norm": 0.28397324681282043, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.8280636750161647, "num_tokens": 211751677.0, "step": 31720 }, { "entropy": 0.5755450919270515, "epoch": 2.3774968909623615, "grad_norm": 0.25781771540641785, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.8286034848541022, "num_tokens": 213222715.0, "step": 31725 }, { "entropy": 0.5978421796113252, "epoch": 2.3778716148962133, "grad_norm": 0.27048563957214355, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8259608503431082, "num_tokens": 214707997.0, "step": 31730 }, { "entropy": 0.5860833352431655, "epoch": 2.378246338830065, "grad_norm": 0.2995573580265045, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.8242920111864805, "num_tokens": 216177910.0, "step": 31735 }, { "entropy": 0.5748717674985528, "epoch": 2.378621062763917, "grad_norm": 0.2435111254453659, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.8289298351854086, "num_tokens": 217663341.0, "step": 31740 }, { "entropy": 0.58577899672091, "epoch": 2.378995786697769, "grad_norm": 0.23441408574581146, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.8223038613796234, "num_tokens": 219158135.0, "step": 31745 }, { "entropy": 0.5818076955154539, "epoch": 2.3793705106316207, "grad_norm": 0.22717757523059845, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.8279997400939465, "num_tokens": 220608789.0, "step": 31750 }, { "entropy": 0.5682599406689406, "epoch": 2.3797452345654726, "grad_norm": 0.26610976457595825, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.8292857579886913, "num_tokens": 222118640.0, "step": 31755 }, { "entropy": 0.5894714120775462, "epoch": 2.3801199584993245, "grad_norm": 0.26471778750419617, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.8249919444322587, "num_tokens": 223626962.0, "step": 31760 }, { "entropy": 0.57920145932585, "epoch": 2.3804946824331763, "grad_norm": 0.3093937039375305, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.8315690498799085, "num_tokens": 225074253.0, "step": 31765 }, { "entropy": 0.5830182863399387, "epoch": 2.380869406367028, "grad_norm": 0.2720746695995331, "learning_rate": 0.0002, "loss": 0.607, "mean_token_accuracy": 0.8227760873734951, "num_tokens": 226554217.0, "step": 31770 }, { "entropy": 0.570719576254487, "epoch": 2.38124413030088, "grad_norm": 0.23052185773849487, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.8277657214552164, "num_tokens": 228019650.0, "step": 31775 }, { "entropy": 0.5767193475738168, "epoch": 2.381618854234732, "grad_norm": 0.24364058673381805, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.8247248221188783, "num_tokens": 229495453.0, "step": 31780 }, { "entropy": 0.5833165753632784, "epoch": 2.3819935781685837, "grad_norm": 0.23437567055225372, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8260389868170023, "num_tokens": 230973074.0, "step": 31785 }, { "entropy": 0.5866976611316204, "epoch": 2.3823683021024356, "grad_norm": 0.21993619203567505, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.8280199933797121, "num_tokens": 232479173.0, "step": 31790 }, { "entropy": 0.587349615432322, "epoch": 2.3827430260362874, "grad_norm": 0.22550879418849945, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.8289303060621023, "num_tokens": 233919665.0, "step": 31795 }, { "entropy": 0.5880981039255857, "epoch": 2.3831177499701393, "grad_norm": 0.26587462425231934, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8298203174024821, "num_tokens": 235386603.0, "step": 31800 }, { "entropy": 0.587483317963779, "epoch": 2.383492473903991, "grad_norm": 0.2375945895910263, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.8256377089768648, "num_tokens": 236849993.0, "step": 31805 }, { "entropy": 0.5942364638671279, "epoch": 2.383867197837843, "grad_norm": 0.21933427453041077, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.8285918161273003, "num_tokens": 238316092.0, "step": 31810 }, { "entropy": 0.580949823372066, "epoch": 2.384241921771695, "grad_norm": 0.2593228220939636, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8294793985784054, "num_tokens": 239790221.0, "step": 31815 }, { "entropy": 0.5866340313106775, "epoch": 2.3846166457055467, "grad_norm": 0.2254374623298645, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.8257940802723169, "num_tokens": 241258110.0, "step": 31820 }, { "entropy": 0.5762506948783994, "epoch": 2.3849913696393985, "grad_norm": 0.2438342273235321, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.8266094554215669, "num_tokens": 242732140.0, "step": 31825 }, { "entropy": 0.5904749527573585, "epoch": 2.3853660935732504, "grad_norm": 0.28354009985923767, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.8258054722100496, "num_tokens": 244249898.0, "step": 31830 }, { "entropy": 0.5779352920129895, "epoch": 2.385740817507102, "grad_norm": 0.23916257917881012, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8283789452165365, "num_tokens": 245721003.0, "step": 31835 }, { "entropy": 0.5972892496734857, "epoch": 2.386115541440954, "grad_norm": 0.2612365484237671, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.8237109977751971, "num_tokens": 247221226.0, "step": 31840 }, { "entropy": 0.577489099279046, "epoch": 2.386490265374806, "grad_norm": 0.28057217597961426, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.8332999128848314, "num_tokens": 248706640.0, "step": 31845 }, { "entropy": 0.5790728883817792, "epoch": 2.3868649893086578, "grad_norm": 0.2475820779800415, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.8273459456861019, "num_tokens": 250138251.0, "step": 31850 }, { "entropy": 0.5871975295245647, "epoch": 2.3872397132425096, "grad_norm": 0.23746713995933533, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.823734500259161, "num_tokens": 251626955.0, "step": 31855 }, { "entropy": 0.5670273208990693, "epoch": 2.3876144371763615, "grad_norm": 0.24897564947605133, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.8302919518202543, "num_tokens": 253066360.0, "step": 31860 }, { "entropy": 0.5806392204016447, "epoch": 2.3879891611102133, "grad_norm": 0.2829619348049164, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8281151924282313, "num_tokens": 254523197.0, "step": 31865 }, { "entropy": 0.5817502638325095, "epoch": 2.388363885044065, "grad_norm": 0.24131721258163452, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8257746830582618, "num_tokens": 255967796.0, "step": 31870 }, { "entropy": 0.5813524363562464, "epoch": 2.388738608977917, "grad_norm": 0.2411511093378067, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.8307303316891194, "num_tokens": 257382749.0, "step": 31875 }, { "entropy": 0.5850302536040545, "epoch": 2.389113332911769, "grad_norm": 0.24726121127605438, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8256417810916901, "num_tokens": 258864889.0, "step": 31880 }, { "entropy": 0.575787204131484, "epoch": 2.3894880568456207, "grad_norm": 0.2580661177635193, "learning_rate": 0.0002, "loss": 0.5861, "mean_token_accuracy": 0.8302299667149782, "num_tokens": 260342150.0, "step": 31885 }, { "entropy": 0.5781327987089753, "epoch": 2.3898627807794726, "grad_norm": 0.22731533646583557, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.8289380710572004, "num_tokens": 261880034.0, "step": 31890 }, { "entropy": 0.5710392758250237, "epoch": 2.3902375047133244, "grad_norm": 0.2516385018825531, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.8297437328845263, "num_tokens": 263353060.0, "step": 31895 }, { "entropy": 0.589339017868042, "epoch": 2.3906122286471763, "grad_norm": 0.27400752902030945, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8288548227399588, "num_tokens": 264837073.0, "step": 31900 }, { "entropy": 0.5842547945678234, "epoch": 2.390986952581028, "grad_norm": 0.2388003021478653, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8242570545524359, "num_tokens": 266368057.0, "step": 31905 }, { "entropy": 0.5916743254289031, "epoch": 2.39136167651488, "grad_norm": 0.2357110232114792, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8248881001025439, "num_tokens": 267876702.0, "step": 31910 }, { "entropy": 0.5848558163270354, "epoch": 2.391736400448732, "grad_norm": 0.25811147689819336, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.8258214920759201, "num_tokens": 269337220.0, "step": 31915 }, { "entropy": 0.578358200751245, "epoch": 2.3921111243825837, "grad_norm": 0.23821981251239777, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.8297016065567732, "num_tokens": 270809705.0, "step": 31920 }, { "entropy": 0.6010798674076796, "epoch": 2.3924858483164355, "grad_norm": 0.24131333827972412, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.8233534004539251, "num_tokens": 272329197.0, "step": 31925 }, { "entropy": 0.5833082659170031, "epoch": 2.3928605722502874, "grad_norm": 0.27940043807029724, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.8307878069579602, "num_tokens": 273774783.0, "step": 31930 }, { "entropy": 0.5752652071416378, "epoch": 2.393235296184139, "grad_norm": 0.25081706047058105, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.8280942864716053, "num_tokens": 275215971.0, "step": 31935 }, { "entropy": 0.5989489747211337, "epoch": 2.393610020117991, "grad_norm": 0.2703881561756134, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.8280065223574639, "num_tokens": 276700814.0, "step": 31940 }, { "entropy": 0.5894534332677722, "epoch": 2.393984744051843, "grad_norm": 0.24982286989688873, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.8266176495701074, "num_tokens": 278168174.0, "step": 31945 }, { "entropy": 0.5835925780236721, "epoch": 2.3943594679856948, "grad_norm": 0.2701490521430969, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.8288770474493503, "num_tokens": 279630407.0, "step": 31950 }, { "entropy": 0.5821809023618698, "epoch": 2.3947341919195466, "grad_norm": 0.226424440741539, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.8273421887308359, "num_tokens": 281156216.0, "step": 31955 }, { "entropy": 0.5906208898872137, "epoch": 2.3951089158533985, "grad_norm": 0.618463933467865, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.8272736221551895, "num_tokens": 282620946.0, "step": 31960 }, { "entropy": 0.5984113803133368, "epoch": 2.3954836397872503, "grad_norm": 0.3089694678783417, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8250149909406901, "num_tokens": 284108384.0, "step": 31965 }, { "entropy": 0.5917044084519147, "epoch": 2.395858363721102, "grad_norm": 0.2595638334751129, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.8293745327740908, "num_tokens": 285602837.0, "step": 31970 }, { "entropy": 0.5817019941285253, "epoch": 2.396233087654954, "grad_norm": 0.21802322566509247, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.826355317607522, "num_tokens": 287129124.0, "step": 31975 }, { "entropy": 0.5806429512798786, "epoch": 2.396607811588806, "grad_norm": 0.2301643341779709, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.8252925243228674, "num_tokens": 288613908.0, "step": 31980 }, { "entropy": 0.5676796138286591, "epoch": 2.3969825355226577, "grad_norm": 0.2557547986507416, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.8308270581066608, "num_tokens": 290076730.0, "step": 31985 }, { "entropy": 0.5573181727901101, "epoch": 2.39735725945651, "grad_norm": 0.25638994574546814, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.8338303279131651, "num_tokens": 291525629.0, "step": 31990 }, { "entropy": 0.5821303589269519, "epoch": 2.3977319833903614, "grad_norm": 0.2848249673843384, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.8300261829048395, "num_tokens": 292968842.0, "step": 31995 }, { "entropy": 0.5902868699282408, "epoch": 2.3981067073242137, "grad_norm": 0.21911579370498657, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.8255521982908249, "num_tokens": 294431012.0, "step": 32000 }, { "entropy": 0.5922263987362385, "epoch": 2.398481431258065, "grad_norm": 0.22865869104862213, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.8262888453900814, "num_tokens": 295864300.0, "step": 32005 }, { "entropy": 0.5819376643747092, "epoch": 2.3988561551919174, "grad_norm": 0.22604934871196747, "learning_rate": 0.0002, "loss": 0.6061, "mean_token_accuracy": 0.8267748259007931, "num_tokens": 297321967.0, "step": 32010 }, { "entropy": 0.5844245260581374, "epoch": 2.399230879125769, "grad_norm": 0.24021418392658234, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8267910305410624, "num_tokens": 298792741.0, "step": 32015 }, { "entropy": 0.5573802722617984, "epoch": 2.399605603059621, "grad_norm": 0.23073464632034302, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.8341387439519167, "num_tokens": 300239882.0, "step": 32020 }, { "entropy": 0.5588560152798892, "epoch": 2.399980326993473, "grad_norm": 0.31727033853530884, "learning_rate": 0.0002, "loss": 0.582, "mean_token_accuracy": 0.8329798486083746, "num_tokens": 301687977.0, "step": 32025 }, { "entropy": 0.5853870732709765, "epoch": 2.400355050927325, "grad_norm": 0.2317207157611847, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.8268982347100973, "num_tokens": 303177383.0, "step": 32030 }, { "entropy": 0.5753704661503434, "epoch": 2.4007297748611767, "grad_norm": 0.26180994510650635, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.8267346266657114, "num_tokens": 304626309.0, "step": 32035 }, { "entropy": 0.5631355909630656, "epoch": 2.4011044987950285, "grad_norm": 0.24250009655952454, "learning_rate": 0.0002, "loss": 0.5786, "mean_token_accuracy": 0.8323627218604088, "num_tokens": 306107951.0, "step": 32040 }, { "entropy": 0.5875999221578241, "epoch": 2.4014792227288804, "grad_norm": 0.287447065114975, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.8292304702103138, "num_tokens": 307586391.0, "step": 32045 }, { "entropy": 0.5871063575148583, "epoch": 2.401853946662732, "grad_norm": 0.2413349151611328, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.8311231769621372, "num_tokens": 309023488.0, "step": 32050 }, { "entropy": 0.5900866152718663, "epoch": 2.402228670596584, "grad_norm": 0.27051597833633423, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.8261124335229397, "num_tokens": 310466746.0, "step": 32055 }, { "entropy": 0.5874069649726152, "epoch": 2.402603394530436, "grad_norm": 0.26369619369506836, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.8286989331245422, "num_tokens": 311905010.0, "step": 32060 }, { "entropy": 0.6033893594518304, "epoch": 2.4029781184642878, "grad_norm": 0.23872682452201843, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.8220305010676384, "num_tokens": 313409819.0, "step": 32065 }, { "entropy": 0.5967029836028814, "epoch": 2.4033528423981396, "grad_norm": 0.24295984208583832, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.8249223016202449, "num_tokens": 314916306.0, "step": 32070 }, { "entropy": 0.6081725567579269, "epoch": 2.4037275663319915, "grad_norm": 0.25128042697906494, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.8252796284854412, "num_tokens": 316358332.0, "step": 32075 }, { "entropy": 0.6011153953149915, "epoch": 2.4041022902658433, "grad_norm": 0.25013938546180725, "learning_rate": 0.0002, "loss": 0.6061, "mean_token_accuracy": 0.8263900119811296, "num_tokens": 317823792.0, "step": 32080 }, { "entropy": 0.5876702049747109, "epoch": 2.404477014199695, "grad_norm": 0.23918984830379486, "learning_rate": 0.0002, "loss": 0.6046, "mean_token_accuracy": 0.8258168820291758, "num_tokens": 319310730.0, "step": 32085 }, { "entropy": 0.5825800040736795, "epoch": 2.404851738133547, "grad_norm": 0.24043214321136475, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.8283402059227228, "num_tokens": 320802086.0, "step": 32090 }, { "entropy": 0.5813474424183369, "epoch": 2.405226462067399, "grad_norm": 0.24900749325752258, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8295359693467617, "num_tokens": 322260487.0, "step": 32095 }, { "entropy": 0.5935312679037452, "epoch": 2.4056011860012507, "grad_norm": 0.23494188487529755, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.8232962884008884, "num_tokens": 323686211.0, "step": 32100 }, { "entropy": 0.5808441579341889, "epoch": 2.4059759099351026, "grad_norm": 0.23134995996952057, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.8274944104254246, "num_tokens": 325203189.0, "step": 32105 }, { "entropy": 0.5852665854617953, "epoch": 2.4063506338689544, "grad_norm": 0.2589744031429291, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.8279713205993176, "num_tokens": 326690611.0, "step": 32110 }, { "entropy": 0.5857129996642471, "epoch": 2.4067253578028063, "grad_norm": 0.2384401112794876, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.8241331331431866, "num_tokens": 328223818.0, "step": 32115 }, { "entropy": 0.5799067787826061, "epoch": 2.407100081736658, "grad_norm": 0.2517494857311249, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8270711164921523, "num_tokens": 329674354.0, "step": 32120 }, { "entropy": 0.5795097455382348, "epoch": 2.40747480567051, "grad_norm": 0.23995612561702728, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.8267700769007206, "num_tokens": 331144487.0, "step": 32125 }, { "entropy": 0.5506875064224005, "epoch": 2.407849529604362, "grad_norm": 0.23531244695186615, "learning_rate": 0.0002, "loss": 0.578, "mean_token_accuracy": 0.8301741074770689, "num_tokens": 332577628.0, "step": 32130 }, { "entropy": 0.5653359457850456, "epoch": 2.4082242535382137, "grad_norm": 0.24278409779071808, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.8277476072311402, "num_tokens": 334018810.0, "step": 32135 }, { "entropy": 0.5774102415889502, "epoch": 2.4085989774720655, "grad_norm": 0.23687894642353058, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.8299291417002678, "num_tokens": 335486990.0, "step": 32140 }, { "entropy": 0.5779309028759598, "epoch": 2.4089737014059174, "grad_norm": 0.22440306842327118, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.8279781930148602, "num_tokens": 336908046.0, "step": 32145 }, { "entropy": 0.5856639666482806, "epoch": 2.4093484253397692, "grad_norm": 0.2657519578933716, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.8294507600367069, "num_tokens": 338344710.0, "step": 32150 }, { "entropy": 0.5905918221920728, "epoch": 2.409723149273621, "grad_norm": 0.24739396572113037, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.8269201014190912, "num_tokens": 339830880.0, "step": 32155 }, { "entropy": 0.5918672975152731, "epoch": 2.410097873207473, "grad_norm": 0.22445106506347656, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.8268326353281736, "num_tokens": 341293740.0, "step": 32160 }, { "entropy": 0.5763901820406317, "epoch": 2.410472597141325, "grad_norm": 0.2514171302318573, "learning_rate": 0.0002, "loss": 0.5819, "mean_token_accuracy": 0.8308712854981423, "num_tokens": 342746404.0, "step": 32165 }, { "entropy": 0.580409156344831, "epoch": 2.4108473210751766, "grad_norm": 0.24586130678653717, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.8276447989046574, "num_tokens": 344250863.0, "step": 32170 }, { "entropy": 0.5914193656295538, "epoch": 2.4112220450090285, "grad_norm": 0.28226521611213684, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.8231961023062467, "num_tokens": 345735794.0, "step": 32175 }, { "entropy": 0.5712072433903813, "epoch": 2.4115967689428803, "grad_norm": 0.22928805649280548, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.8275111928582192, "num_tokens": 347186875.0, "step": 32180 }, { "entropy": 0.582858826778829, "epoch": 2.411971492876732, "grad_norm": 0.24864725768566132, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.8240896552801132, "num_tokens": 348666192.0, "step": 32185 }, { "entropy": 0.577405178360641, "epoch": 2.412346216810584, "grad_norm": 0.2384195625782013, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.8281489130109548, "num_tokens": 350098990.0, "step": 32190 }, { "entropy": 0.5650198999792337, "epoch": 2.412720940744436, "grad_norm": 0.23904013633728027, "learning_rate": 0.0002, "loss": 0.5758, "mean_token_accuracy": 0.8288716465234757, "num_tokens": 351542215.0, "step": 32195 }, { "entropy": 0.5714661540463567, "epoch": 2.4130956646782877, "grad_norm": 0.22946636378765106, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.8282988626509905, "num_tokens": 353037320.0, "step": 32200 }, { "entropy": 0.595069614984095, "epoch": 2.4134703886121396, "grad_norm": 0.2332800179719925, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.8258513186126948, "num_tokens": 354527800.0, "step": 32205 }, { "entropy": 0.5890627833083272, "epoch": 2.4138451125459914, "grad_norm": 0.2513478994369507, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.8260656882077455, "num_tokens": 356017165.0, "step": 32210 }, { "entropy": 0.5801958238705993, "epoch": 2.4142198364798433, "grad_norm": 0.22574856877326965, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.8311863735318183, "num_tokens": 357471231.0, "step": 32215 }, { "entropy": 0.5919646695256233, "epoch": 2.414594560413695, "grad_norm": 0.2535550892353058, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.8282137624919415, "num_tokens": 358945051.0, "step": 32220 }, { "entropy": 0.5821955306455493, "epoch": 2.414969284347547, "grad_norm": 0.2682496905326843, "learning_rate": 0.0002, "loss": 0.5674, "mean_token_accuracy": 0.8333765707910061, "num_tokens": 360386009.0, "step": 32225 }, { "entropy": 0.5964642947539687, "epoch": 2.415344008281399, "grad_norm": 0.2354930192232132, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.8233964454382658, "num_tokens": 361877880.0, "step": 32230 }, { "entropy": 0.5872525054961443, "epoch": 2.4157187322152507, "grad_norm": 0.2414322793483734, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.8295182775706053, "num_tokens": 363338915.0, "step": 32235 }, { "entropy": 0.5916944712400436, "epoch": 2.4160934561491025, "grad_norm": 0.2700941264629364, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.8254287123680115, "num_tokens": 364844617.0, "step": 32240 }, { "entropy": 0.5721357394009828, "epoch": 2.4164681800829544, "grad_norm": 0.23617857694625854, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.8270677305758, "num_tokens": 366328775.0, "step": 32245 }, { "entropy": 0.5723654178902506, "epoch": 2.4168429040168062, "grad_norm": 0.26269882917404175, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.8292790003120899, "num_tokens": 367810533.0, "step": 32250 }, { "entropy": 0.5851835820823907, "epoch": 2.417217627950658, "grad_norm": 0.241243377327919, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.8256970435380936, "num_tokens": 369244699.0, "step": 32255 }, { "entropy": 0.5704143105074764, "epoch": 2.41759235188451, "grad_norm": 0.23706014454364777, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.8282625388354063, "num_tokens": 370736880.0, "step": 32260 }, { "entropy": 0.5661629503592849, "epoch": 2.417967075818362, "grad_norm": 0.2465900182723999, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.8304664853960275, "num_tokens": 372194170.0, "step": 32265 }, { "entropy": 0.5721715381368995, "epoch": 2.4183417997522136, "grad_norm": 0.2642858624458313, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.8275947783142328, "num_tokens": 373685923.0, "step": 32270 }, { "entropy": 0.5757311746478081, "epoch": 2.4187165236860655, "grad_norm": 0.23717303574085236, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.8276773083955049, "num_tokens": 375204098.0, "step": 32275 }, { "entropy": 0.5783892767503858, "epoch": 2.4190912476199173, "grad_norm": 0.24272680282592773, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.8316380985081195, "num_tokens": 376652504.0, "step": 32280 }, { "entropy": 0.592614141665399, "epoch": 2.419465971553769, "grad_norm": 0.24596861004829407, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.8237970918416977, "num_tokens": 378120903.0, "step": 32285 }, { "entropy": 0.57255371324718, "epoch": 2.419840695487621, "grad_norm": 0.22712354362010956, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.8293491132557392, "num_tokens": 379594178.0, "step": 32290 }, { "entropy": 0.5666238486766815, "epoch": 2.4202154194214733, "grad_norm": 0.23040080070495605, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.831797493621707, "num_tokens": 381094564.0, "step": 32295 }, { "entropy": 0.5721789095550776, "epoch": 2.4205901433553247, "grad_norm": 0.2369692325592041, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.8278058927506209, "num_tokens": 382510004.0, "step": 32300 }, { "entropy": 0.5924309965223074, "epoch": 2.420964867289177, "grad_norm": 0.25088655948638916, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.8245649050921202, "num_tokens": 383986722.0, "step": 32305 }, { "entropy": 0.588514374755323, "epoch": 2.4213395912230284, "grad_norm": 0.2788129448890686, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.8264981269836426, "num_tokens": 385417336.0, "step": 32310 }, { "entropy": 0.5794274769723415, "epoch": 2.4217143151568807, "grad_norm": 0.35827773809432983, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.8301018919795752, "num_tokens": 386855111.0, "step": 32315 }, { "entropy": 0.5919246286153793, "epoch": 2.422089039090732, "grad_norm": 0.25783368945121765, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.8276551391929388, "num_tokens": 388336792.0, "step": 32320 }, { "entropy": 0.5785694412887097, "epoch": 2.4224637630245844, "grad_norm": 0.22947494685649872, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.8270346194505691, "num_tokens": 389796571.0, "step": 32325 }, { "entropy": 0.585726459696889, "epoch": 2.422838486958436, "grad_norm": 0.24334928393363953, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8266869217157364, "num_tokens": 391291688.0, "step": 32330 }, { "entropy": 0.5817725501954556, "epoch": 2.423213210892288, "grad_norm": 0.23253129422664642, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.8274506241083145, "num_tokens": 392732887.0, "step": 32335 }, { "entropy": 0.5803808711469174, "epoch": 2.42358793482614, "grad_norm": 0.2624197006225586, "learning_rate": 0.0002, "loss": 0.5853, "mean_token_accuracy": 0.8286755170673132, "num_tokens": 394198168.0, "step": 32340 }, { "entropy": 0.584285743162036, "epoch": 2.423962658759992, "grad_norm": 0.2601517140865326, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.8284897193312645, "num_tokens": 395653453.0, "step": 32345 }, { "entropy": 0.5854998031631112, "epoch": 2.4243373826938437, "grad_norm": 0.21751704812049866, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.830429420247674, "num_tokens": 397122346.0, "step": 32350 }, { "entropy": 0.5927421426400542, "epoch": 2.4247121066276955, "grad_norm": 0.26540854573249817, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8283426776528359, "num_tokens": 398599143.0, "step": 32355 }, { "entropy": 0.5987776214256882, "epoch": 2.4250868305615474, "grad_norm": 0.2624434232711792, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.8239257741719485, "num_tokens": 400101186.0, "step": 32360 }, { "entropy": 0.5887712722644209, "epoch": 2.4254615544953992, "grad_norm": 0.2408532202243805, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.8265984911471606, "num_tokens": 401597776.0, "step": 32365 }, { "entropy": 0.5867174845188856, "epoch": 2.425836278429251, "grad_norm": 0.2699785530567169, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8269344840198756, "num_tokens": 403078292.0, "step": 32370 }, { "entropy": 0.5732623152434826, "epoch": 2.426211002363103, "grad_norm": 0.23716013133525848, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.8301095556467771, "num_tokens": 404599443.0, "step": 32375 }, { "entropy": 0.5902482371777296, "epoch": 2.426585726296955, "grad_norm": 0.24543796479701996, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.8250875156372786, "num_tokens": 406044521.0, "step": 32380 }, { "entropy": 0.5744960837066173, "epoch": 2.4269604502308066, "grad_norm": 0.2343248724937439, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.8287379931658506, "num_tokens": 407574816.0, "step": 32385 }, { "entropy": 0.5832854943349958, "epoch": 2.4273351741646585, "grad_norm": 0.27926674485206604, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.8273057971149683, "num_tokens": 409028688.0, "step": 32390 }, { "entropy": 0.5886194586753846, "epoch": 2.4277098980985103, "grad_norm": 0.2410757690668106, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.8263068236410618, "num_tokens": 410517293.0, "step": 32395 }, { "entropy": 0.5803997101262212, "epoch": 2.428084622032362, "grad_norm": 0.2224690318107605, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.8279377941042185, "num_tokens": 412018818.0, "step": 32400 }, { "entropy": 0.5876524347811938, "epoch": 2.428459345966214, "grad_norm": 0.2522510588169098, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.8250468574464321, "num_tokens": 413512394.0, "step": 32405 }, { "entropy": 0.5850795291364193, "epoch": 2.428834069900066, "grad_norm": 0.31541264057159424, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.8283915895968675, "num_tokens": 414997368.0, "step": 32410 }, { "entropy": 0.5750199170783162, "epoch": 2.4292087938339177, "grad_norm": 0.2234661728143692, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.8313505258411169, "num_tokens": 416462804.0, "step": 32415 }, { "entropy": 0.5790883168578148, "epoch": 2.4295835177677696, "grad_norm": 0.20785298943519592, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.8263180840760469, "num_tokens": 417901294.0, "step": 32420 }, { "entropy": 0.5849016554653644, "epoch": 2.4299582417016214, "grad_norm": 0.24939176440238953, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.8225171905010938, "num_tokens": 419375874.0, "step": 32425 }, { "entropy": 0.562196366302669, "epoch": 2.4303329656354733, "grad_norm": 0.2270435392856598, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.8323538042604923, "num_tokens": 420868328.0, "step": 32430 }, { "entropy": 0.583614949695766, "epoch": 2.430707689569325, "grad_norm": 0.22388271987438202, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8252276692539453, "num_tokens": 422378191.0, "step": 32435 }, { "entropy": 0.5945857860147953, "epoch": 2.431082413503177, "grad_norm": 0.22642254829406738, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8237037859857083, "num_tokens": 423890457.0, "step": 32440 }, { "entropy": 0.6005589669570327, "epoch": 2.431457137437029, "grad_norm": 0.2475682646036148, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.8232116773724556, "num_tokens": 425355095.0, "step": 32445 }, { "entropy": 0.570688371732831, "epoch": 2.4318318613708807, "grad_norm": 0.23482239246368408, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.8313535500317812, "num_tokens": 426846381.0, "step": 32450 }, { "entropy": 0.5700773151591421, "epoch": 2.4322065853047325, "grad_norm": 0.272892564535141, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.831304432079196, "num_tokens": 428314355.0, "step": 32455 }, { "entropy": 0.5834876520559191, "epoch": 2.4325813092385844, "grad_norm": 0.24819928407669067, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.8227284729480744, "num_tokens": 429804647.0, "step": 32460 }, { "entropy": 0.5675956018269062, "epoch": 2.4329560331724362, "grad_norm": 0.30217117071151733, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.8288919627666473, "num_tokens": 431302824.0, "step": 32465 }, { "entropy": 0.5650573916733265, "epoch": 2.433330757106288, "grad_norm": 0.22407881915569305, "learning_rate": 0.0002, "loss": 0.5866, "mean_token_accuracy": 0.8312453299760818, "num_tokens": 432750282.0, "step": 32470 }, { "entropy": 0.5816331438720226, "epoch": 2.43370548104014, "grad_norm": 0.2686538100242615, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.8272709347307682, "num_tokens": 434265734.0, "step": 32475 }, { "entropy": 0.5818540519103408, "epoch": 2.434080204973992, "grad_norm": 0.31879302859306335, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.8275761637836695, "num_tokens": 435722457.0, "step": 32480 }, { "entropy": 0.5724137438461184, "epoch": 2.4344549289078437, "grad_norm": 0.24636611342430115, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.8306695502251387, "num_tokens": 437152740.0, "step": 32485 }, { "entropy": 0.5704237760975956, "epoch": 2.4348296528416955, "grad_norm": 0.2112446427345276, "learning_rate": 0.0002, "loss": 0.5789, "mean_token_accuracy": 0.82997259311378, "num_tokens": 438639389.0, "step": 32490 }, { "entropy": 0.576739490404725, "epoch": 2.4352043767755474, "grad_norm": 0.22854185104370117, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8251001831144095, "num_tokens": 440129893.0, "step": 32495 }, { "entropy": 0.5769728474318981, "epoch": 2.435579100709399, "grad_norm": 0.2348562777042389, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.8287102349102498, "num_tokens": 441578326.0, "step": 32500 }, { "entropy": 0.577790530771017, "epoch": 2.435953824643251, "grad_norm": 0.25112083554267883, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.8296837873756886, "num_tokens": 443042298.0, "step": 32505 }, { "entropy": 0.5866988498717547, "epoch": 2.436328548577103, "grad_norm": 0.23276017606258392, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.8268729291856289, "num_tokens": 444525485.0, "step": 32510 }, { "entropy": 0.5691687995567918, "epoch": 2.4367032725109548, "grad_norm": 0.22889722883701324, "learning_rate": 0.0002, "loss": 0.578, "mean_token_accuracy": 0.8317707266658545, "num_tokens": 445996260.0, "step": 32515 }, { "entropy": 0.5994825910776853, "epoch": 2.4370779964448066, "grad_norm": 0.22757753729820251, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.821923616155982, "num_tokens": 447525435.0, "step": 32520 }, { "entropy": 0.6036242669448256, "epoch": 2.4374527203786585, "grad_norm": 0.22880880534648895, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.8274787478148937, "num_tokens": 449006122.0, "step": 32525 }, { "entropy": 0.5845753440633417, "epoch": 2.4378274443125103, "grad_norm": 0.2587302625179291, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.8310748681426048, "num_tokens": 450487872.0, "step": 32530 }, { "entropy": 0.5981196733191609, "epoch": 2.438202168246362, "grad_norm": 0.26860541105270386, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.8271226931363345, "num_tokens": 451963884.0, "step": 32535 }, { "entropy": 0.6027504775673151, "epoch": 2.438576892180214, "grad_norm": 0.23423577845096588, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.8235483836382628, "num_tokens": 453438088.0, "step": 32540 }, { "entropy": 0.5917394392192363, "epoch": 2.438951616114066, "grad_norm": 0.23449848592281342, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.827957397326827, "num_tokens": 454913709.0, "step": 32545 }, { "entropy": 0.6042880786582827, "epoch": 2.4393263400479177, "grad_norm": 0.31212854385375977, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.8237145435065031, "num_tokens": 456401534.0, "step": 32550 }, { "entropy": 0.5882534122094512, "epoch": 2.4397010639817696, "grad_norm": 0.3069530129432678, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.8280180271714925, "num_tokens": 457847845.0, "step": 32555 }, { "entropy": 0.5947522792965174, "epoch": 2.4400757879156214, "grad_norm": 0.2503322958946228, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.8300323057919741, "num_tokens": 459357051.0, "step": 32560 }, { "entropy": 0.5943122200667859, "epoch": 2.4404505118494733, "grad_norm": 0.22629904747009277, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.825160538777709, "num_tokens": 460875814.0, "step": 32565 }, { "entropy": 0.5997674221172928, "epoch": 2.440825235783325, "grad_norm": 0.2802772521972656, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.8267462402582169, "num_tokens": 462319601.0, "step": 32570 }, { "entropy": 0.5978281723335386, "epoch": 2.441199959717177, "grad_norm": 0.22875653207302094, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.8280680377036334, "num_tokens": 463826207.0, "step": 32575 }, { "entropy": 0.5879377637058496, "epoch": 2.441574683651029, "grad_norm": 0.27373361587524414, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.832879013568163, "num_tokens": 465276325.0, "step": 32580 }, { "entropy": 0.586911640316248, "epoch": 2.4419494075848807, "grad_norm": 0.2551930546760559, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.8295128252357244, "num_tokens": 466789516.0, "step": 32585 }, { "entropy": 0.5830727448686958, "epoch": 2.4423241315187325, "grad_norm": 0.253384530544281, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.823928589001298, "num_tokens": 468291846.0, "step": 32590 }, { "entropy": 0.6000771023333072, "epoch": 2.4426988554525844, "grad_norm": 0.2594519555568695, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.822408702597022, "num_tokens": 469776109.0, "step": 32595 }, { "entropy": 0.5855193950235844, "epoch": 2.443073579386436, "grad_norm": 0.24079838395118713, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.8270916070789098, "num_tokens": 471307950.0, "step": 32600 }, { "entropy": 0.5616881089285016, "epoch": 2.443448303320288, "grad_norm": 0.2774029076099396, "learning_rate": 0.0002, "loss": 0.5795, "mean_token_accuracy": 0.8329925682395697, "num_tokens": 472798556.0, "step": 32605 }, { "entropy": 0.5907058231532574, "epoch": 2.4438230272541404, "grad_norm": 0.2337825745344162, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8244540587067604, "num_tokens": 474286759.0, "step": 32610 }, { "entropy": 0.5872817894443869, "epoch": 2.4441977511879918, "grad_norm": 0.22818803787231445, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.8275883447378873, "num_tokens": 475796705.0, "step": 32615 }, { "entropy": 0.6011915381997823, "epoch": 2.444572475121844, "grad_norm": 0.2364863157272339, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.8261902805417776, "num_tokens": 477302411.0, "step": 32620 }, { "entropy": 0.6091198571026325, "epoch": 2.4449471990556955, "grad_norm": 0.29192888736724854, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.8212238565087319, "num_tokens": 478814805.0, "step": 32625 }, { "entropy": 0.580169647373259, "epoch": 2.4453219229895478, "grad_norm": 0.2666081190109253, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.8314008828252554, "num_tokens": 480248550.0, "step": 32630 }, { "entropy": 0.5856858391314745, "epoch": 2.445696646923399, "grad_norm": 0.2502861022949219, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.8260742843151092, "num_tokens": 481662972.0, "step": 32635 }, { "entropy": 0.572227887250483, "epoch": 2.4460713708572515, "grad_norm": 0.2460404634475708, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.8307200066745282, "num_tokens": 483165611.0, "step": 32640 }, { "entropy": 0.5600448912009597, "epoch": 2.4464460947911033, "grad_norm": 0.2700527310371399, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.8320655137300491, "num_tokens": 484556219.0, "step": 32645 }, { "entropy": 0.5750818526372313, "epoch": 2.446820818724955, "grad_norm": 0.29813292622566223, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.8273634657263755, "num_tokens": 486040740.0, "step": 32650 }, { "entropy": 0.5534722840413451, "epoch": 2.447195542658807, "grad_norm": 0.2728685736656189, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.8287228293716907, "num_tokens": 487502646.0, "step": 32655 }, { "entropy": 0.5548222323879599, "epoch": 2.447570266592659, "grad_norm": 0.2202608287334442, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.8239054415374994, "num_tokens": 488984197.0, "step": 32660 }, { "entropy": 0.5484095266088843, "epoch": 2.4479449905265107, "grad_norm": 0.2377728372812271, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.8280187740921974, "num_tokens": 490468663.0, "step": 32665 }, { "entropy": 0.5521008748561144, "epoch": 2.4483197144603626, "grad_norm": 0.25172021985054016, "learning_rate": 0.0002, "loss": 0.6155, "mean_token_accuracy": 0.8246272802352905, "num_tokens": 491968189.0, "step": 32670 }, { "entropy": 0.5389262160286308, "epoch": 2.4486944383942144, "grad_norm": 0.25119054317474365, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8267648201435804, "num_tokens": 493452908.0, "step": 32675 }, { "entropy": 0.5348702093586326, "epoch": 2.4490691623280663, "grad_norm": 0.25215011835098267, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.8325774148106575, "num_tokens": 494921467.0, "step": 32680 }, { "entropy": 0.5497217068448663, "epoch": 2.449443886261918, "grad_norm": 0.26950398087501526, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.8275170274078846, "num_tokens": 496396944.0, "step": 32685 }, { "entropy": 0.5573344713076949, "epoch": 2.44981861019577, "grad_norm": 0.22076350450515747, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.8295442264527082, "num_tokens": 497819581.0, "step": 32690 }, { "entropy": 0.576651168614626, "epoch": 2.450193334129622, "grad_norm": 0.24817928671836853, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.8285971418023109, "num_tokens": 499328956.0, "step": 32695 }, { "entropy": 0.5783043140545487, "epoch": 2.4505680580634737, "grad_norm": 0.2612142264842987, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8280246544629335, "num_tokens": 500785896.0, "step": 32700 }, { "entropy": 0.5702359789982439, "epoch": 2.4509427819973255, "grad_norm": 0.2943398952484131, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.8285211466252804, "num_tokens": 502217353.0, "step": 32705 }, { "entropy": 0.5594785062596201, "epoch": 2.4513175059311774, "grad_norm": 0.2630804181098938, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.831867266818881, "num_tokens": 503691795.0, "step": 32710 }, { "entropy": 0.5718655455857515, "epoch": 2.451692229865029, "grad_norm": 0.3127661645412445, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.8257859040051698, "num_tokens": 505192536.0, "step": 32715 }, { "entropy": 0.5737687857821584, "epoch": 2.452066953798881, "grad_norm": 0.23308707773685455, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.829063156619668, "num_tokens": 506639451.0, "step": 32720 }, { "entropy": 0.5728488173335791, "epoch": 2.452441677732733, "grad_norm": 0.22600507736206055, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.8251374442130327, "num_tokens": 508106772.0, "step": 32725 }, { "entropy": 0.56732636410743, "epoch": 2.4528164016665848, "grad_norm": 0.242203027009964, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.8269155643880367, "num_tokens": 509545387.0, "step": 32730 }, { "entropy": 0.5595388565212488, "epoch": 2.4531911256004366, "grad_norm": 0.22409671545028687, "learning_rate": 0.0002, "loss": 0.5829, "mean_token_accuracy": 0.8307270906865597, "num_tokens": 511009608.0, "step": 32735 }, { "entropy": 0.5846981747075916, "epoch": 2.4535658495342885, "grad_norm": 0.2887248694896698, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.8287704315036535, "num_tokens": 512476996.0, "step": 32740 }, { "entropy": 0.5754891224205494, "epoch": 2.4539405734681403, "grad_norm": 0.27286940813064575, "learning_rate": 0.0002, "loss": 0.5897, "mean_token_accuracy": 0.8259676650166512, "num_tokens": 513940147.0, "step": 32745 }, { "entropy": 0.5831406651064753, "epoch": 2.454315297401992, "grad_norm": 0.24861443042755127, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.8258618906140327, "num_tokens": 515403215.0, "step": 32750 }, { "entropy": 0.5852568246424198, "epoch": 2.454690021335844, "grad_norm": 0.25081026554107666, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.8260011464357376, "num_tokens": 516875657.0, "step": 32755 }, { "entropy": 0.6095270095393062, "epoch": 2.455064745269696, "grad_norm": 0.24420605599880219, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.8255530402064324, "num_tokens": 518347640.0, "step": 32760 }, { "entropy": 0.5811734188348054, "epoch": 2.4554394692035477, "grad_norm": 0.28238141536712646, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.8321223437786103, "num_tokens": 519849471.0, "step": 32765 }, { "entropy": 0.5921874457970262, "epoch": 2.4558141931373996, "grad_norm": 0.2365661859512329, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.8259067311882973, "num_tokens": 521320256.0, "step": 32770 }, { "entropy": 0.579974508099258, "epoch": 2.4561889170712514, "grad_norm": 0.2527240514755249, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.829365074634552, "num_tokens": 522803015.0, "step": 32775 }, { "entropy": 0.5938315041363239, "epoch": 2.4565636410051033, "grad_norm": 0.2397012859582901, "learning_rate": 0.0002, "loss": 0.5968, "mean_token_accuracy": 0.8301792275160551, "num_tokens": 524273364.0, "step": 32780 }, { "entropy": 0.5891179775819182, "epoch": 2.456938364938955, "grad_norm": 0.23417268693447113, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.8270396593958139, "num_tokens": 525722051.0, "step": 32785 }, { "entropy": 0.5908998804166913, "epoch": 2.457313088872807, "grad_norm": 0.24215658009052277, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.8274946060031653, "num_tokens": 527218364.0, "step": 32790 }, { "entropy": 0.5937913179397583, "epoch": 2.457687812806659, "grad_norm": 0.26719456911087036, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.8261671233922243, "num_tokens": 528663189.0, "step": 32795 }, { "entropy": 0.5836660642176866, "epoch": 2.4580625367405107, "grad_norm": 0.25781869888305664, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8307544730603695, "num_tokens": 530129183.0, "step": 32800 }, { "entropy": 0.588035742379725, "epoch": 2.4584372606743625, "grad_norm": 0.21441768109798431, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.8285172212868929, "num_tokens": 531572663.0, "step": 32805 }, { "entropy": 0.5952005295082927, "epoch": 2.4588119846082144, "grad_norm": 0.23640017211437225, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.8235711146146059, "num_tokens": 533059532.0, "step": 32810 }, { "entropy": 0.5882448129355907, "epoch": 2.4591867085420662, "grad_norm": 0.22882315516471863, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.8232187893241644, "num_tokens": 534541683.0, "step": 32815 }, { "entropy": 0.5797942789271474, "epoch": 2.459561432475918, "grad_norm": 0.24327339231967926, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.8243903994560242, "num_tokens": 535979770.0, "step": 32820 }, { "entropy": 0.5632038718089462, "epoch": 2.45993615640977, "grad_norm": 0.23186539113521576, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.828014774248004, "num_tokens": 537415370.0, "step": 32825 }, { "entropy": 0.5573812337592244, "epoch": 2.4603108803436218, "grad_norm": 0.23415488004684448, "learning_rate": 0.0002, "loss": 0.586, "mean_token_accuracy": 0.8321805018931627, "num_tokens": 538874381.0, "step": 32830 }, { "entropy": 0.5753375709056854, "epoch": 2.4606856042774736, "grad_norm": 0.23490814864635468, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.8275979477912188, "num_tokens": 540354268.0, "step": 32835 }, { "entropy": 0.5867151718586683, "epoch": 2.4610603282113255, "grad_norm": 0.2475753128528595, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.824563154950738, "num_tokens": 541852155.0, "step": 32840 }, { "entropy": 0.5925454879179597, "epoch": 2.4614350521451773, "grad_norm": 0.24258790910243988, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.8301088843494654, "num_tokens": 543318379.0, "step": 32845 }, { "entropy": 0.5988032560795545, "epoch": 2.461809776079029, "grad_norm": 0.28493157029151917, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.8234932646155357, "num_tokens": 544784203.0, "step": 32850 }, { "entropy": 0.5846908176317811, "epoch": 2.462184500012881, "grad_norm": 0.25401771068573, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.8245670653879642, "num_tokens": 546226536.0, "step": 32855 }, { "entropy": 0.5706622501835227, "epoch": 2.462559223946733, "grad_norm": 0.26022157073020935, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.8318946745246649, "num_tokens": 547693163.0, "step": 32860 }, { "entropy": 0.581649268232286, "epoch": 2.4629339478805847, "grad_norm": 0.24731549620628357, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8307056810706854, "num_tokens": 549159167.0, "step": 32865 }, { "entropy": 0.5904791904613376, "epoch": 2.4633086718144366, "grad_norm": 0.2325434535741806, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.8268812123686076, "num_tokens": 550656424.0, "step": 32870 }, { "entropy": 0.5866472363471985, "epoch": 2.4636833957482884, "grad_norm": 0.23886068165302277, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.8276804126799107, "num_tokens": 552148781.0, "step": 32875 }, { "entropy": 0.5902085427194834, "epoch": 2.4640581196821403, "grad_norm": 0.2685464918613434, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.8284977097064257, "num_tokens": 553566384.0, "step": 32880 }, { "entropy": 0.5860115859657526, "epoch": 2.464432843615992, "grad_norm": 0.2360668033361435, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8284068796783686, "num_tokens": 555053229.0, "step": 32885 }, { "entropy": 0.5922267081215977, "epoch": 2.464807567549844, "grad_norm": 0.2580331563949585, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.826955109462142, "num_tokens": 556549824.0, "step": 32890 }, { "entropy": 0.587172283604741, "epoch": 2.465182291483696, "grad_norm": 0.29137441515922546, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.827914659306407, "num_tokens": 558040003.0, "step": 32895 }, { "entropy": 0.5874948808923364, "epoch": 2.4655570154175477, "grad_norm": 0.2658793330192566, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.8322279214859009, "num_tokens": 559531672.0, "step": 32900 }, { "entropy": 0.5938275309279561, "epoch": 2.4659317393513995, "grad_norm": 0.2529953122138977, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8248552966862917, "num_tokens": 561013844.0, "step": 32905 }, { "entropy": 0.5948531256988645, "epoch": 2.4663064632852514, "grad_norm": 0.26912981271743774, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8286583665758371, "num_tokens": 562429119.0, "step": 32910 }, { "entropy": 0.5883864542469383, "epoch": 2.4666811872191032, "grad_norm": 0.2481624186038971, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.8280665967613459, "num_tokens": 563915006.0, "step": 32915 }, { "entropy": 0.5964195175096393, "epoch": 2.467055911152955, "grad_norm": 0.2435181885957718, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8249896503984928, "num_tokens": 565407756.0, "step": 32920 }, { "entropy": 0.6155917599797249, "epoch": 2.4674306350868074, "grad_norm": 0.2628183662891388, "learning_rate": 0.0002, "loss": 0.6173, "mean_token_accuracy": 0.8250880345702172, "num_tokens": 566896948.0, "step": 32925 }, { "entropy": 0.6164260381832719, "epoch": 2.467805359020659, "grad_norm": 0.23259082436561584, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.8304353799670935, "num_tokens": 568375222.0, "step": 32930 }, { "entropy": 0.6260314408689738, "epoch": 2.468180082954511, "grad_norm": 0.22230975329875946, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.8271316275000572, "num_tokens": 569814727.0, "step": 32935 }, { "entropy": 0.5988000301644206, "epoch": 2.4685548068883625, "grad_norm": 0.2898625433444977, "learning_rate": 0.0002, "loss": 0.6046, "mean_token_accuracy": 0.8279195442795754, "num_tokens": 571302786.0, "step": 32940 }, { "entropy": 0.6072560992091894, "epoch": 2.468929530822215, "grad_norm": 0.23361678421497345, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.8268382146954536, "num_tokens": 572812379.0, "step": 32945 }, { "entropy": 0.5903033895418048, "epoch": 2.469304254756066, "grad_norm": 0.2590128481388092, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8267424423247576, "num_tokens": 574343632.0, "step": 32950 }, { "entropy": 0.5770324556156993, "epoch": 2.4696789786899185, "grad_norm": 0.26341143250465393, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.8311528574675322, "num_tokens": 575808851.0, "step": 32955 }, { "entropy": 0.5709156587719917, "epoch": 2.4700537026237703, "grad_norm": 0.2576274275779724, "learning_rate": 0.0002, "loss": 0.5837, "mean_token_accuracy": 0.8316735792905092, "num_tokens": 577315229.0, "step": 32960 }, { "entropy": 0.557935967668891, "epoch": 2.470428426557622, "grad_norm": 0.22985468804836273, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.8290133491158486, "num_tokens": 578784298.0, "step": 32965 }, { "entropy": 0.5646339535713196, "epoch": 2.470803150491474, "grad_norm": 0.24500630795955658, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.827968668192625, "num_tokens": 580236717.0, "step": 32970 }, { "entropy": 0.5638533381745219, "epoch": 2.471177874425326, "grad_norm": 0.26632457971572876, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.8306948099285364, "num_tokens": 581684544.0, "step": 32975 }, { "entropy": 0.5614160001277924, "epoch": 2.4715525983591777, "grad_norm": 0.22634975612163544, "learning_rate": 0.0002, "loss": 0.5868, "mean_token_accuracy": 0.8301280718296766, "num_tokens": 583143813.0, "step": 32980 }, { "entropy": 0.5672186955809593, "epoch": 2.4719273222930296, "grad_norm": 0.24828708171844482, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.8267255011945963, "num_tokens": 584596791.0, "step": 32985 }, { "entropy": 0.565141873806715, "epoch": 2.4723020462268814, "grad_norm": 0.2505829930305481, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.8283142119646072, "num_tokens": 586077653.0, "step": 32990 }, { "entropy": 0.5743874449282884, "epoch": 2.4726767701607333, "grad_norm": 0.23511065542697906, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.8263187371194363, "num_tokens": 587547021.0, "step": 32995 }, { "entropy": 0.5806661888957023, "epoch": 2.473051494094585, "grad_norm": 0.2406286597251892, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.8284466899931431, "num_tokens": 588982842.0, "step": 33000 }, { "entropy": 0.5733365742489696, "epoch": 2.473426218028437, "grad_norm": 0.2350441813468933, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.8265797294676304, "num_tokens": 590422803.0, "step": 33005 }, { "entropy": 0.5628069046884775, "epoch": 2.473800941962289, "grad_norm": 0.26756149530410767, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.8295052915811538, "num_tokens": 591900002.0, "step": 33010 }, { "entropy": 0.5620760103687644, "epoch": 2.4741756658961407, "grad_norm": 0.2518260180950165, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.8325334772467613, "num_tokens": 593340593.0, "step": 33015 }, { "entropy": 0.5666407331824302, "epoch": 2.4745503898299925, "grad_norm": 0.256307989358902, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.8301961220800876, "num_tokens": 594832565.0, "step": 33020 }, { "entropy": 0.5787010312080383, "epoch": 2.4749251137638444, "grad_norm": 0.26630398631095886, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.8246053989976645, "num_tokens": 596331551.0, "step": 33025 }, { "entropy": 0.589979424700141, "epoch": 2.4752998376976962, "grad_norm": 0.23538145422935486, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.8253075558692217, "num_tokens": 597834838.0, "step": 33030 }, { "entropy": 0.5820243418216705, "epoch": 2.475674561631548, "grad_norm": 0.2984318733215332, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.8303518902510405, "num_tokens": 599307530.0, "step": 33035 }, { "entropy": 0.5804696848616004, "epoch": 2.4760492855654, "grad_norm": 0.24467399716377258, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.8275602120906115, "num_tokens": 600751323.0, "step": 33040 }, { "entropy": 0.5781412875279784, "epoch": 2.476424009499252, "grad_norm": 0.5887347459793091, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.8270156186074018, "num_tokens": 602221130.0, "step": 33045 }, { "entropy": 0.5725544465705752, "epoch": 2.4767987334331036, "grad_norm": 0.24534523487091064, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.8260160014033318, "num_tokens": 603696767.0, "step": 33050 }, { "entropy": 0.5731052065268158, "epoch": 2.4771734573669555, "grad_norm": 0.2307472676038742, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8267068147659302, "num_tokens": 605154870.0, "step": 33055 }, { "entropy": 0.5678319657221437, "epoch": 2.4775481813008073, "grad_norm": 0.4253060817718506, "learning_rate": 0.0002, "loss": 0.5884, "mean_token_accuracy": 0.8311568185687065, "num_tokens": 606609079.0, "step": 33060 }, { "entropy": 0.5702751809731126, "epoch": 2.477922905234659, "grad_norm": 0.2856251001358032, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8302302841097117, "num_tokens": 608064013.0, "step": 33065 }, { "entropy": 0.5616416381672025, "epoch": 2.478297629168511, "grad_norm": 0.24798090755939484, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.8337597560137511, "num_tokens": 609523726.0, "step": 33070 }, { "entropy": 0.5691877918317914, "epoch": 2.478672353102363, "grad_norm": 0.2322784960269928, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.8303589582443237, "num_tokens": 610983992.0, "step": 33075 }, { "entropy": 0.5774574806913734, "epoch": 2.4790470770362147, "grad_norm": 0.2215331494808197, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.8246217589825392, "num_tokens": 612473617.0, "step": 33080 }, { "entropy": 0.5781062703579665, "epoch": 2.4794218009700666, "grad_norm": 0.2485165148973465, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.821602001786232, "num_tokens": 613960055.0, "step": 33085 }, { "entropy": 0.5663200018927455, "epoch": 2.4797965249039184, "grad_norm": 0.22975130379199982, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.8290147945284844, "num_tokens": 615450859.0, "step": 33090 }, { "entropy": 0.5533722462132573, "epoch": 2.4801712488377703, "grad_norm": 0.2417893260717392, "learning_rate": 0.0002, "loss": 0.5821, "mean_token_accuracy": 0.831939771026373, "num_tokens": 616883108.0, "step": 33095 }, { "entropy": 0.5663565799593926, "epoch": 2.480545972771622, "grad_norm": 0.25028863549232483, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.8259816635400057, "num_tokens": 618411944.0, "step": 33100 }, { "entropy": 0.5735079789534211, "epoch": 2.480920696705474, "grad_norm": 0.23346801102161407, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.8275547631084919, "num_tokens": 619871005.0, "step": 33105 }, { "entropy": 0.5718889106065035, "epoch": 2.481295420639326, "grad_norm": 0.2492133527994156, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.8299093134701252, "num_tokens": 621314355.0, "step": 33110 }, { "entropy": 0.5847709406167269, "epoch": 2.4816701445731777, "grad_norm": 0.2356880158185959, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.8252200298011303, "num_tokens": 622755823.0, "step": 33115 }, { "entropy": 0.5805540746077895, "epoch": 2.4820448685070295, "grad_norm": 0.23990444839000702, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.8293674893677234, "num_tokens": 624197612.0, "step": 33120 }, { "entropy": 0.5726242363452911, "epoch": 2.4824195924408814, "grad_norm": 0.26513671875, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.8269323691725731, "num_tokens": 625650783.0, "step": 33125 }, { "entropy": 0.5755089741200209, "epoch": 2.4827943163747332, "grad_norm": 0.26451584696769714, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.8256840828806162, "num_tokens": 627090122.0, "step": 33130 }, { "entropy": 0.5862360645085574, "epoch": 2.483169040308585, "grad_norm": 0.24491705000400543, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8253689657896757, "num_tokens": 628599435.0, "step": 33135 }, { "entropy": 0.5654121322557331, "epoch": 2.483543764242437, "grad_norm": 0.24395541846752167, "learning_rate": 0.0002, "loss": 0.5904, "mean_token_accuracy": 0.8300512555986643, "num_tokens": 630114603.0, "step": 33140 }, { "entropy": 0.5866551455110312, "epoch": 2.483918488176289, "grad_norm": 0.2509382367134094, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.8246554158627987, "num_tokens": 631606549.0, "step": 33145 }, { "entropy": 0.5588282715529204, "epoch": 2.4842932121101406, "grad_norm": 0.23037342727184296, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.8288352530449629, "num_tokens": 633068472.0, "step": 33150 }, { "entropy": 0.556581013277173, "epoch": 2.4846679360439925, "grad_norm": 0.4700872004032135, "learning_rate": 0.0002, "loss": 0.5884, "mean_token_accuracy": 0.8299404509365559, "num_tokens": 634527888.0, "step": 33155 }, { "entropy": 0.5479594826698303, "epoch": 2.4850426599778443, "grad_norm": 0.2641195058822632, "learning_rate": 0.0002, "loss": 0.5865, "mean_token_accuracy": 0.8315205737948418, "num_tokens": 635983730.0, "step": 33160 }, { "entropy": 0.5754089834168553, "epoch": 2.485417383911696, "grad_norm": 0.225258931517601, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.8279486682265997, "num_tokens": 637459933.0, "step": 33165 }, { "entropy": 0.58854606077075, "epoch": 2.485792107845548, "grad_norm": 0.2726365923881531, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.830618042498827, "num_tokens": 638937560.0, "step": 33170 }, { "entropy": 0.5972503749653697, "epoch": 2.4861668317794, "grad_norm": 0.2546030282974243, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.8228571496903896, "num_tokens": 640449153.0, "step": 33175 }, { "entropy": 0.5685197727754712, "epoch": 2.4865415557132517, "grad_norm": 0.2328704446554184, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.8313365958631038, "num_tokens": 641930321.0, "step": 33180 }, { "entropy": 0.5890532094985247, "epoch": 2.4869162796471036, "grad_norm": 0.23080751299858093, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.829401908442378, "num_tokens": 643393090.0, "step": 33185 }, { "entropy": 0.5830464186146855, "epoch": 2.4872910035809555, "grad_norm": 0.2233700156211853, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.8321016937494278, "num_tokens": 644856146.0, "step": 33190 }, { "entropy": 0.5906804932281375, "epoch": 2.4876657275148073, "grad_norm": 0.24317611753940582, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.8285885032266378, "num_tokens": 646360913.0, "step": 33195 }, { "entropy": 0.5934204598888755, "epoch": 2.488040451448659, "grad_norm": 0.2421817034482956, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.8272854011505842, "num_tokens": 647814103.0, "step": 33200 }, { "entropy": 0.5884794954210519, "epoch": 2.488415175382511, "grad_norm": 0.24687312543392181, "learning_rate": 0.0002, "loss": 0.6006, "mean_token_accuracy": 0.8299336079508066, "num_tokens": 649268465.0, "step": 33205 }, { "entropy": 0.5858507603406906, "epoch": 2.488789899316363, "grad_norm": 0.25086042284965515, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.8308642148971558, "num_tokens": 650790793.0, "step": 33210 }, { "entropy": 0.5668131781741976, "epoch": 2.4891646232502147, "grad_norm": 0.25131645798683167, "learning_rate": 0.0002, "loss": 0.5791, "mean_token_accuracy": 0.831506846472621, "num_tokens": 652238985.0, "step": 33215 }, { "entropy": 0.5686827953904867, "epoch": 2.4895393471840666, "grad_norm": 0.25786638259887695, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.8295129392296076, "num_tokens": 653699925.0, "step": 33220 }, { "entropy": 0.5756217880174518, "epoch": 2.4899140711179184, "grad_norm": 0.2590281069278717, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.832021526247263, "num_tokens": 655162091.0, "step": 33225 }, { "entropy": 0.5934043245390057, "epoch": 2.4902887950517707, "grad_norm": 0.26370757818222046, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.8289054103195668, "num_tokens": 656649384.0, "step": 33230 }, { "entropy": 0.5892185280099511, "epoch": 2.490663518985622, "grad_norm": 0.2813085913658142, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.8278569050133229, "num_tokens": 658110430.0, "step": 33235 }, { "entropy": 0.5964241044595837, "epoch": 2.4910382429194744, "grad_norm": 0.3530297875404358, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.8305644005537033, "num_tokens": 659576233.0, "step": 33240 }, { "entropy": 0.5847392527386546, "epoch": 2.491412966853326, "grad_norm": 0.28385576605796814, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.8292398821562529, "num_tokens": 661040511.0, "step": 33245 }, { "entropy": 0.5982695955783128, "epoch": 2.491787690787178, "grad_norm": 0.22495310008525848, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.8296855960041285, "num_tokens": 662499678.0, "step": 33250 }, { "entropy": 0.6030413679778576, "epoch": 2.4921624147210295, "grad_norm": 0.22794419527053833, "learning_rate": 0.0002, "loss": 0.5783, "mean_token_accuracy": 0.829883512109518, "num_tokens": 663970180.0, "step": 33255 }, { "entropy": 0.61550488807261, "epoch": 2.492537138654882, "grad_norm": 0.25292572379112244, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.823323666676879, "num_tokens": 665462139.0, "step": 33260 }, { "entropy": 0.6100685127079487, "epoch": 2.492911862588733, "grad_norm": 0.2430812567472458, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.8281544376164675, "num_tokens": 666933095.0, "step": 33265 }, { "entropy": 0.6025645688176156, "epoch": 2.4932865865225855, "grad_norm": 0.2378273606300354, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.8257570095360279, "num_tokens": 668418427.0, "step": 33270 }, { "entropy": 0.5828891726210713, "epoch": 2.4936613104564374, "grad_norm": 0.22893370687961578, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.8287880919873715, "num_tokens": 669914146.0, "step": 33275 }, { "entropy": 0.5992822809144854, "epoch": 2.494036034390289, "grad_norm": 0.27643176913261414, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.8260413859039545, "num_tokens": 671365976.0, "step": 33280 }, { "entropy": 0.5851485474035144, "epoch": 2.494410758324141, "grad_norm": 0.2839253544807434, "learning_rate": 0.0002, "loss": 0.5824, "mean_token_accuracy": 0.8325541574507952, "num_tokens": 672810801.0, "step": 33285 }, { "entropy": 0.6044858381152153, "epoch": 2.494785482257993, "grad_norm": 0.37856966257095337, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.8299022264778614, "num_tokens": 674261314.0, "step": 33290 }, { "entropy": 0.5944208588451148, "epoch": 2.4951602061918448, "grad_norm": 0.30457767844200134, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.8286944177001715, "num_tokens": 675738914.0, "step": 33295 }, { "entropy": 0.6000744273886085, "epoch": 2.4955349301256966, "grad_norm": 0.25503578782081604, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.825122520327568, "num_tokens": 677238411.0, "step": 33300 }, { "entropy": 0.6109772795811296, "epoch": 2.4959096540595485, "grad_norm": 0.26205208897590637, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.8274376835674048, "num_tokens": 678729150.0, "step": 33305 }, { "entropy": 0.5948384813964367, "epoch": 2.4962843779934003, "grad_norm": 0.2469872534275055, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.8288163058459759, "num_tokens": 680189531.0, "step": 33310 }, { "entropy": 0.5814574025571346, "epoch": 2.496659101927252, "grad_norm": 0.23565568029880524, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.8285948615521193, "num_tokens": 681657792.0, "step": 33315 }, { "entropy": 0.5945459742099046, "epoch": 2.497033825861104, "grad_norm": 0.23832491040229797, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.8253834713250399, "num_tokens": 683184902.0, "step": 33320 }, { "entropy": 0.5993938069790602, "epoch": 2.497408549794956, "grad_norm": 0.24043938517570496, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.8252529360353946, "num_tokens": 684652066.0, "step": 33325 }, { "entropy": 0.5885632054880261, "epoch": 2.4977832737288077, "grad_norm": 0.2514958679676056, "learning_rate": 0.0002, "loss": 0.5889, "mean_token_accuracy": 0.8258158206939697, "num_tokens": 686136758.0, "step": 33330 }, { "entropy": 0.5868182206526399, "epoch": 2.4981579976626596, "grad_norm": 0.2298382669687271, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8267566725611687, "num_tokens": 687646363.0, "step": 33335 }, { "entropy": 0.5790489472448825, "epoch": 2.4985327215965114, "grad_norm": 0.24854828417301178, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.8271519839763641, "num_tokens": 689106955.0, "step": 33340 }, { "entropy": 0.5714057054370641, "epoch": 2.4989074455303633, "grad_norm": 0.28403714299201965, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.8304348785430193, "num_tokens": 690576281.0, "step": 33345 }, { "entropy": 0.5763123471289873, "epoch": 2.499282169464215, "grad_norm": 0.28285518288612366, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.8279094699770212, "num_tokens": 692000753.0, "step": 33350 }, { "entropy": 0.5935292148962616, "epoch": 2.499656893398067, "grad_norm": 0.23937666416168213, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.8228464625775814, "num_tokens": 693471549.0, "step": 33355 }, { "entropy": 0.5777219662442803, "epoch": 2.500031617331919, "grad_norm": 0.34277287125587463, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.8267708823084832, "num_tokens": 694939616.0, "step": 33360 }, { "entropy": 0.5762029767036438, "epoch": 2.5004063412657707, "grad_norm": 0.3847297728061676, "learning_rate": 0.0002, "loss": 0.5984, "mean_token_accuracy": 0.8267992869019508, "num_tokens": 696444180.0, "step": 33365 }, { "entropy": 0.572658209502697, "epoch": 2.5007810651996225, "grad_norm": 0.26578181982040405, "learning_rate": 0.0002, "loss": 0.583, "mean_token_accuracy": 0.8280684944242239, "num_tokens": 697922912.0, "step": 33370 }, { "entropy": 0.5852980304509401, "epoch": 2.5011557891334744, "grad_norm": 0.22677505016326904, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.8273441892117261, "num_tokens": 699401499.0, "step": 33375 }, { "entropy": 0.5771800626069308, "epoch": 2.501530513067326, "grad_norm": 0.24945765733718872, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8270628478378057, "num_tokens": 700834227.0, "step": 33380 }, { "entropy": 0.5818700706586242, "epoch": 2.501905237001178, "grad_norm": 0.27477550506591797, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.8305540461093187, "num_tokens": 702303581.0, "step": 33385 }, { "entropy": 0.5773191887885332, "epoch": 2.50227996093503, "grad_norm": 0.23467019200325012, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.8279021978378296, "num_tokens": 703762756.0, "step": 33390 }, { "entropy": 0.5774190496653319, "epoch": 2.5026546848688818, "grad_norm": 0.2424738109111786, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.8282549541443587, "num_tokens": 705210928.0, "step": 33395 }, { "entropy": 0.5874887682497502, "epoch": 2.5030294088027336, "grad_norm": 0.2717142105102539, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.8285929713398218, "num_tokens": 706720764.0, "step": 33400 }, { "entropy": 0.5628626922145485, "epoch": 2.5034041327365855, "grad_norm": 0.23578327894210815, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.8332899250090122, "num_tokens": 708200931.0, "step": 33405 }, { "entropy": 0.577278190292418, "epoch": 2.5037788566704373, "grad_norm": 0.24225158989429474, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.8254779875278473, "num_tokens": 709679897.0, "step": 33410 }, { "entropy": 0.5859972257167101, "epoch": 2.504153580604289, "grad_norm": 0.22222235798835754, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.8267557680606842, "num_tokens": 711203473.0, "step": 33415 }, { "entropy": 0.582721590436995, "epoch": 2.504528304538141, "grad_norm": 0.24689006805419922, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8295527413487435, "num_tokens": 712657374.0, "step": 33420 }, { "entropy": 0.5814922001212836, "epoch": 2.504903028471993, "grad_norm": 0.24455153942108154, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.8289216563105584, "num_tokens": 714094495.0, "step": 33425 }, { "entropy": 0.5838152276352048, "epoch": 2.5052777524058447, "grad_norm": 0.23976543545722961, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.8267605841159821, "num_tokens": 715525029.0, "step": 33430 }, { "entropy": 0.585818936675787, "epoch": 2.5056524763396966, "grad_norm": 0.239897221326828, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.8264617763459683, "num_tokens": 717037920.0, "step": 33435 }, { "entropy": 0.5953851358965039, "epoch": 2.5060272002735484, "grad_norm": 0.27385154366493225, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.8294309265911579, "num_tokens": 718498576.0, "step": 33440 }, { "entropy": 0.5821040656417609, "epoch": 2.5064019242074003, "grad_norm": 0.24164700508117676, "learning_rate": 0.0002, "loss": 0.5875, "mean_token_accuracy": 0.8283556777983904, "num_tokens": 719936896.0, "step": 33445 }, { "entropy": 0.5728748928755522, "epoch": 2.506776648141252, "grad_norm": 0.27786746621131897, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.8271965824067593, "num_tokens": 721370996.0, "step": 33450 }, { "entropy": 0.5840030066668987, "epoch": 2.507151372075104, "grad_norm": 0.24481840431690216, "learning_rate": 0.0002, "loss": 0.5931, "mean_token_accuracy": 0.8306849680840969, "num_tokens": 722857729.0, "step": 33455 }, { "entropy": 0.6046760151162743, "epoch": 2.507526096008956, "grad_norm": 0.21624845266342163, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.8244971211999654, "num_tokens": 724316629.0, "step": 33460 }, { "entropy": 0.5949021676555276, "epoch": 2.5079008199428077, "grad_norm": 0.2398478239774704, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.8290462736040354, "num_tokens": 725800109.0, "step": 33465 }, { "entropy": 0.5831798249855638, "epoch": 2.5082755438766595, "grad_norm": 0.2659108638763428, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.8288285426795483, "num_tokens": 727270304.0, "step": 33470 }, { "entropy": 0.5771545674651861, "epoch": 2.5086502678105114, "grad_norm": 0.22204634547233582, "learning_rate": 0.0002, "loss": 0.5716, "mean_token_accuracy": 0.8311407472938299, "num_tokens": 728705668.0, "step": 33475 }, { "entropy": 0.596390699222684, "epoch": 2.509024991744363, "grad_norm": 0.2901824116706848, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.8269239597022533, "num_tokens": 730214996.0, "step": 33480 }, { "entropy": 0.5754032492637634, "epoch": 2.509399715678215, "grad_norm": 0.23589856922626495, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.8320700790733099, "num_tokens": 731649196.0, "step": 33485 }, { "entropy": 0.5760828975588084, "epoch": 2.509774439612067, "grad_norm": 0.2462090402841568, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.8279409736394883, "num_tokens": 733072461.0, "step": 33490 }, { "entropy": 0.5869699796661735, "epoch": 2.5101491635459188, "grad_norm": 0.25018832087516785, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.8279163926839829, "num_tokens": 734531354.0, "step": 33495 }, { "entropy": 0.5939465522766113, "epoch": 2.5105238874797706, "grad_norm": 0.260300874710083, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.8275994416326284, "num_tokens": 736003047.0, "step": 33500 }, { "entropy": 0.5869860984385014, "epoch": 2.5108986114136225, "grad_norm": 0.252108097076416, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.8324296366423368, "num_tokens": 737494022.0, "step": 33505 }, { "entropy": 0.5812731597572565, "epoch": 2.5112733353474743, "grad_norm": 0.25349855422973633, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.8330067858099938, "num_tokens": 738966118.0, "step": 33510 }, { "entropy": 0.5970904976129532, "epoch": 2.511648059281326, "grad_norm": 0.26555997133255005, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.8276342555880547, "num_tokens": 740423620.0, "step": 33515 }, { "entropy": 0.5934262791648507, "epoch": 2.512022783215178, "grad_norm": 0.2817263901233673, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8260124459862709, "num_tokens": 741869562.0, "step": 33520 }, { "entropy": 0.586634180136025, "epoch": 2.5123975071490303, "grad_norm": 0.337506502866745, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.8282528884708882, "num_tokens": 743333916.0, "step": 33525 }, { "entropy": 0.59330994207412, "epoch": 2.5127722310828817, "grad_norm": 0.23006638884544373, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.828413088619709, "num_tokens": 744800850.0, "step": 33530 }, { "entropy": 0.5936797766014934, "epoch": 2.513146955016734, "grad_norm": 0.2808327376842499, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8266960222274065, "num_tokens": 746273327.0, "step": 33535 }, { "entropy": 0.5812141997739673, "epoch": 2.5135216789505854, "grad_norm": 0.2546623647212982, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.828606815263629, "num_tokens": 747730786.0, "step": 33540 }, { "entropy": 0.5904103945940733, "epoch": 2.5138964028844377, "grad_norm": 0.2310667634010315, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.8310226753354073, "num_tokens": 749163186.0, "step": 33545 }, { "entropy": 0.5944708339869976, "epoch": 2.514271126818289, "grad_norm": 0.24240097403526306, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.8264963943511248, "num_tokens": 750655328.0, "step": 33550 }, { "entropy": 0.5861467089504003, "epoch": 2.5146458507521414, "grad_norm": 0.22647495567798615, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.8304572373628616, "num_tokens": 752081580.0, "step": 33555 }, { "entropy": 0.5892071051523089, "epoch": 2.515020574685993, "grad_norm": 0.2500360608100891, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.8256743904203176, "num_tokens": 753555630.0, "step": 33560 }, { "entropy": 0.5880011370405555, "epoch": 2.515395298619845, "grad_norm": 0.26972654461860657, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.8249331429600716, "num_tokens": 755013890.0, "step": 33565 }, { "entropy": 0.581085023842752, "epoch": 2.5157700225536965, "grad_norm": 0.28976428508758545, "learning_rate": 0.0002, "loss": 0.589, "mean_token_accuracy": 0.828641901537776, "num_tokens": 756436851.0, "step": 33570 }, { "entropy": 0.5949817908927798, "epoch": 2.516144746487549, "grad_norm": 0.28493645787239075, "learning_rate": 0.0002, "loss": 0.6039, "mean_token_accuracy": 0.8273966383188963, "num_tokens": 757893287.0, "step": 33575 }, { "entropy": 0.5942776456475258, "epoch": 2.5165194704214002, "grad_norm": 0.2445148527622223, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.8256272573024035, "num_tokens": 759361269.0, "step": 33580 }, { "entropy": 0.6006829690188169, "epoch": 2.5168941943552525, "grad_norm": 0.27229398488998413, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.82598333992064, "num_tokens": 760798753.0, "step": 33585 }, { "entropy": 0.6069116368889809, "epoch": 2.517268918289104, "grad_norm": 0.2398868203163147, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.8278317522257567, "num_tokens": 762265916.0, "step": 33590 }, { "entropy": 0.6006594071164727, "epoch": 2.5176436422229562, "grad_norm": 0.27996015548706055, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.8260716497898102, "num_tokens": 763751340.0, "step": 33595 }, { "entropy": 0.6089663183316588, "epoch": 2.518018366156808, "grad_norm": 0.2303759604692459, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.8222167935222388, "num_tokens": 765240466.0, "step": 33600 }, { "entropy": 0.5958047611638904, "epoch": 2.51839309009066, "grad_norm": 0.34723204374313354, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.8260925628244877, "num_tokens": 766750538.0, "step": 33605 }, { "entropy": 0.6177825532853604, "epoch": 2.5187678140245118, "grad_norm": 0.28743699193000793, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.8268218412995338, "num_tokens": 768220151.0, "step": 33610 }, { "entropy": 0.6025818323716521, "epoch": 2.5191425379583636, "grad_norm": 0.35001325607299805, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.8270637627691031, "num_tokens": 769669775.0, "step": 33615 }, { "entropy": 0.5972505448386073, "epoch": 2.5195172618922155, "grad_norm": 0.24081364274024963, "learning_rate": 0.0002, "loss": 0.583, "mean_token_accuracy": 0.8289439667016267, "num_tokens": 771161027.0, "step": 33620 }, { "entropy": 0.6018792118877172, "epoch": 2.5198919858260673, "grad_norm": 0.27437636256217957, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8273123539984226, "num_tokens": 772650914.0, "step": 33625 }, { "entropy": 0.6099013866856694, "epoch": 2.520266709759919, "grad_norm": 0.3095892369747162, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.8276466339826584, "num_tokens": 774137597.0, "step": 33630 }, { "entropy": 0.6167932054027915, "epoch": 2.520641433693771, "grad_norm": 0.2915824055671692, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.8271701607853175, "num_tokens": 775603234.0, "step": 33635 }, { "entropy": 0.6197208682075143, "epoch": 2.521016157627623, "grad_norm": 0.22930096089839935, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.8297929797321558, "num_tokens": 777106013.0, "step": 33640 }, { "entropy": 0.6230817997828126, "epoch": 2.5213908815614747, "grad_norm": 0.2505619525909424, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.8275652155280113, "num_tokens": 778550015.0, "step": 33645 }, { "entropy": 0.6240850923582911, "epoch": 2.5217656054953266, "grad_norm": 0.2413485199213028, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.828027244284749, "num_tokens": 780019132.0, "step": 33650 }, { "entropy": 0.620860861428082, "epoch": 2.5221403294291784, "grad_norm": 0.2579077184200287, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.8264628753066063, "num_tokens": 781452806.0, "step": 33655 }, { "entropy": 0.601209968328476, "epoch": 2.5225150533630303, "grad_norm": 0.23334844410419464, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.8306892745196819, "num_tokens": 782903483.0, "step": 33660 }, { "entropy": 0.6082924319431186, "epoch": 2.522889777296882, "grad_norm": 0.2386772632598877, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.8276292499154806, "num_tokens": 784366071.0, "step": 33665 }, { "entropy": 0.6160595759749412, "epoch": 2.523264501230734, "grad_norm": 0.2154020369052887, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8281392443925142, "num_tokens": 785846186.0, "step": 33670 }, { "entropy": 0.6087418917566538, "epoch": 2.523639225164586, "grad_norm": 0.2875327467918396, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.8316137306392193, "num_tokens": 787320883.0, "step": 33675 }, { "entropy": 0.6130554070696235, "epoch": 2.5240139490984377, "grad_norm": 0.3065500259399414, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.8287326656281948, "num_tokens": 788755279.0, "step": 33680 }, { "entropy": 0.6267173238098621, "epoch": 2.5243886730322895, "grad_norm": 0.22713994979858398, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.8286103945225477, "num_tokens": 790220934.0, "step": 33685 }, { "entropy": 0.6262736031785607, "epoch": 2.5247633969661414, "grad_norm": 0.24313972890377045, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.8247307077050209, "num_tokens": 791696047.0, "step": 33690 }, { "entropy": 0.6297829143702984, "epoch": 2.5251381208999932, "grad_norm": 0.3223097622394562, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.8276236481964588, "num_tokens": 793155396.0, "step": 33695 }, { "entropy": 0.6386333351954818, "epoch": 2.525512844833845, "grad_norm": 0.2725522518157959, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.8286702327430249, "num_tokens": 794617177.0, "step": 33700 }, { "entropy": 0.6279758233577013, "epoch": 2.525887568767697, "grad_norm": 0.3228202760219574, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.828118659183383, "num_tokens": 796083857.0, "step": 33705 }, { "entropy": 0.6225121412426233, "epoch": 2.526262292701549, "grad_norm": 0.26810145378112793, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.8246933046728373, "num_tokens": 797595069.0, "step": 33710 }, { "entropy": 0.612529325298965, "epoch": 2.5266370166354006, "grad_norm": 0.2321140021085739, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.8248391732573509, "num_tokens": 799061796.0, "step": 33715 }, { "entropy": 0.5949527593329549, "epoch": 2.5270117405692525, "grad_norm": 0.3012702763080597, "learning_rate": 0.0002, "loss": 0.578, "mean_token_accuracy": 0.8315167676657438, "num_tokens": 800526701.0, "step": 33720 }, { "entropy": 0.6020006947219372, "epoch": 2.5273864645031043, "grad_norm": 0.3059597611427307, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.8293010961264372, "num_tokens": 801999833.0, "step": 33725 }, { "entropy": 0.6204636264592409, "epoch": 2.527761188436956, "grad_norm": 0.3676382601261139, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.825869121402502, "num_tokens": 803457488.0, "step": 33730 }, { "entropy": 0.6099863858893514, "epoch": 2.528135912370808, "grad_norm": 0.2625587284564972, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.8273207940161228, "num_tokens": 804912620.0, "step": 33735 }, { "entropy": 0.6081977853551507, "epoch": 2.52851063630466, "grad_norm": 0.23823164403438568, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.8284183625131846, "num_tokens": 806335925.0, "step": 33740 }, { "entropy": 0.6026411710307003, "epoch": 2.5288853602385117, "grad_norm": 0.23132580518722534, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.8269651260226965, "num_tokens": 807755956.0, "step": 33745 }, { "entropy": 0.5936434453353285, "epoch": 2.5292600841723636, "grad_norm": 0.2239101082086563, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8232205253094435, "num_tokens": 809233686.0, "step": 33750 }, { "entropy": 0.5891286930069327, "epoch": 2.5296348081062154, "grad_norm": 0.2330952137708664, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.8283258214592933, "num_tokens": 810685660.0, "step": 33755 }, { "entropy": 0.6174839520826936, "epoch": 2.5300095320400673, "grad_norm": 0.22574947774410248, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.825612973049283, "num_tokens": 812147104.0, "step": 33760 }, { "entropy": 0.608029086329043, "epoch": 2.530384255973919, "grad_norm": 0.2749286890029907, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.8267582394182682, "num_tokens": 813621754.0, "step": 33765 }, { "entropy": 0.6054896580055356, "epoch": 2.530758979907771, "grad_norm": 0.21299001574516296, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.8282095994800329, "num_tokens": 815137337.0, "step": 33770 }, { "entropy": 0.6257111778482795, "epoch": 2.531133703841623, "grad_norm": 0.24819490313529968, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.8227697070688009, "num_tokens": 816606796.0, "step": 33775 }, { "entropy": 0.6049016013741493, "epoch": 2.5315084277754747, "grad_norm": 0.23051999509334564, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8266663860529662, "num_tokens": 818118855.0, "step": 33780 }, { "entropy": 0.6123170601204038, "epoch": 2.5318831517093265, "grad_norm": 0.24217446148395538, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.8304034747183323, "num_tokens": 819586691.0, "step": 33785 }, { "entropy": 0.6033154647797346, "epoch": 2.5322578756431784, "grad_norm": 0.24104399979114532, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.829724257439375, "num_tokens": 821050963.0, "step": 33790 }, { "entropy": 0.5958147374913096, "epoch": 2.5326325995770302, "grad_norm": 0.28752174973487854, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.8293351389467716, "num_tokens": 822527359.0, "step": 33795 }, { "entropy": 0.6142045550048352, "epoch": 2.533007323510882, "grad_norm": 0.25209274888038635, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.82551559060812, "num_tokens": 824004213.0, "step": 33800 }, { "entropy": 0.607014111801982, "epoch": 2.533382047444734, "grad_norm": 0.2563716471195221, "learning_rate": 0.0002, "loss": 0.5794, "mean_token_accuracy": 0.8283923264592886, "num_tokens": 825446719.0, "step": 33805 }, { "entropy": 0.6066041374579072, "epoch": 2.533756771378586, "grad_norm": 0.29253169894218445, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.8307765670120716, "num_tokens": 826921490.0, "step": 33810 }, { "entropy": 0.6063059091567993, "epoch": 2.5341314953124376, "grad_norm": 0.38436979055404663, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.8273189902305603, "num_tokens": 828430177.0, "step": 33815 }, { "entropy": 0.592229519225657, "epoch": 2.5345062192462895, "grad_norm": 0.24674102663993835, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.831123748794198, "num_tokens": 829866119.0, "step": 33820 }, { "entropy": 0.6116991179063916, "epoch": 2.5348809431801413, "grad_norm": 0.24981354176998138, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.8273160122334957, "num_tokens": 831372472.0, "step": 33825 }, { "entropy": 0.6135754158720375, "epoch": 2.5352556671139936, "grad_norm": 0.4661269187927246, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.8303077317774296, "num_tokens": 832812519.0, "step": 33830 }, { "entropy": 0.6146511301398278, "epoch": 2.535630391047845, "grad_norm": 0.2615060806274414, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.8295989897102117, "num_tokens": 834284965.0, "step": 33835 }, { "entropy": 0.6149183677509427, "epoch": 2.5360051149816973, "grad_norm": 0.23631204664707184, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.8264978431165219, "num_tokens": 835835523.0, "step": 33840 }, { "entropy": 0.6169368322938681, "epoch": 2.5363798389155487, "grad_norm": 0.30673059821128845, "learning_rate": 0.0002, "loss": 0.6, "mean_token_accuracy": 0.8284355785697699, "num_tokens": 837294570.0, "step": 33845 }, { "entropy": 0.6109692964702844, "epoch": 2.536754562849401, "grad_norm": 0.25347989797592163, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.832335764542222, "num_tokens": 838745735.0, "step": 33850 }, { "entropy": 0.6027953695505858, "epoch": 2.5371292867832524, "grad_norm": 0.3203732669353485, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.825236838310957, "num_tokens": 840214488.0, "step": 33855 }, { "entropy": 0.5878516329452396, "epoch": 2.5375040107171047, "grad_norm": 0.2956458330154419, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.8269354484975338, "num_tokens": 841674947.0, "step": 33860 }, { "entropy": 0.5958652855828405, "epoch": 2.537878734650956, "grad_norm": 0.29165369272232056, "learning_rate": 0.0002, "loss": 0.5798, "mean_token_accuracy": 0.8332475699484349, "num_tokens": 843099707.0, "step": 33865 }, { "entropy": 0.6034199256449938, "epoch": 2.5382534585848084, "grad_norm": 0.2948721945285797, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.827077854797244, "num_tokens": 844543884.0, "step": 33870 }, { "entropy": 0.6001154001802206, "epoch": 2.53862818251866, "grad_norm": 0.32614514231681824, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.8279913250356913, "num_tokens": 845992385.0, "step": 33875 }, { "entropy": 0.593868600949645, "epoch": 2.539002906452512, "grad_norm": 0.2376624494791031, "learning_rate": 0.0002, "loss": 0.5816, "mean_token_accuracy": 0.8297640398144722, "num_tokens": 847518331.0, "step": 33880 }, { "entropy": 0.6057668589055538, "epoch": 2.5393776303863635, "grad_norm": 0.2562137544155121, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.8229742534458637, "num_tokens": 849008596.0, "step": 33885 }, { "entropy": 0.6061307838186621, "epoch": 2.539752354320216, "grad_norm": 0.3251033425331116, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.82694870531559, "num_tokens": 850502085.0, "step": 33890 }, { "entropy": 0.60466268658638, "epoch": 2.5401270782540672, "grad_norm": 0.25669288635253906, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.8299629960209132, "num_tokens": 851984848.0, "step": 33895 }, { "entropy": 0.6016965242102742, "epoch": 2.5405018021879195, "grad_norm": 0.23387280106544495, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.8299735825508833, "num_tokens": 853458746.0, "step": 33900 }, { "entropy": 0.6159062488004565, "epoch": 2.540876526121771, "grad_norm": 0.2570951581001282, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.8278856705874205, "num_tokens": 854903696.0, "step": 33905 }, { "entropy": 0.6108326511457562, "epoch": 2.5412512500556232, "grad_norm": 0.26510900259017944, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.8282750096172095, "num_tokens": 856390944.0, "step": 33910 }, { "entropy": 0.6155365873128176, "epoch": 2.541625973989475, "grad_norm": 0.2330278754234314, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.825502572581172, "num_tokens": 857849743.0, "step": 33915 }, { "entropy": 0.6147184319794178, "epoch": 2.542000697923327, "grad_norm": 0.24750789999961853, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.8261721588671207, "num_tokens": 859285173.0, "step": 33920 }, { "entropy": 0.6177911072969436, "epoch": 2.542375421857179, "grad_norm": 0.31017759442329407, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.8243754774332046, "num_tokens": 860721092.0, "step": 33925 }, { "entropy": 0.6033464150503278, "epoch": 2.5427501457910306, "grad_norm": 0.25418081879615784, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.8287623275071383, "num_tokens": 862187127.0, "step": 33930 }, { "entropy": 0.5913675524294376, "epoch": 2.5431248697248825, "grad_norm": 0.23985251784324646, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.828140103071928, "num_tokens": 863657028.0, "step": 33935 }, { "entropy": 0.5875145804136992, "epoch": 2.5434995936587343, "grad_norm": 0.24322591722011566, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.8276610773056745, "num_tokens": 865163058.0, "step": 33940 }, { "entropy": 0.5953361175954341, "epoch": 2.543874317592586, "grad_norm": 0.2338140904903412, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.8245298612862826, "num_tokens": 866631544.0, "step": 33945 }, { "entropy": 0.589363539032638, "epoch": 2.544249041526438, "grad_norm": 0.2569054663181305, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.8311261046677828, "num_tokens": 868121685.0, "step": 33950 }, { "entropy": 0.5976879013702273, "epoch": 2.54462376546029, "grad_norm": 0.23954236507415771, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.8268420685082674, "num_tokens": 869579531.0, "step": 33955 }, { "entropy": 0.5959308845922351, "epoch": 2.5449984893941417, "grad_norm": 0.24653734266757965, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.8213685519993306, "num_tokens": 871092446.0, "step": 33960 }, { "entropy": 0.5828568523749709, "epoch": 2.5453732133279936, "grad_norm": 0.25078389048576355, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.8302636995911599, "num_tokens": 872580228.0, "step": 33965 }, { "entropy": 0.5989336671307683, "epoch": 2.5457479372618455, "grad_norm": 0.24856220185756683, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.824337349459529, "num_tokens": 874045310.0, "step": 33970 }, { "entropy": 0.5994239972904325, "epoch": 2.5461226611956973, "grad_norm": 0.3426662087440491, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.8288715459406376, "num_tokens": 875553840.0, "step": 33975 }, { "entropy": 0.628128202445805, "epoch": 2.546497385129549, "grad_norm": 0.28601840138435364, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.8235662896186113, "num_tokens": 877009033.0, "step": 33980 }, { "entropy": 0.601467957906425, "epoch": 2.546872109063401, "grad_norm": 0.29019978642463684, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.8303605798631907, "num_tokens": 878442651.0, "step": 33985 }, { "entropy": 0.6131215866655111, "epoch": 2.547246832997253, "grad_norm": 0.2289908230304718, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.8255802661180496, "num_tokens": 879902792.0, "step": 33990 }, { "entropy": 0.6094000255689025, "epoch": 2.5476215569311047, "grad_norm": 0.26238498091697693, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.8298600677400827, "num_tokens": 881372861.0, "step": 33995 }, { "entropy": 0.6032205944880843, "epoch": 2.5479962808649566, "grad_norm": 0.3208712935447693, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.8280256006866693, "num_tokens": 882838493.0, "step": 34000 } ], "logging_steps": 5, "max_steps": 40032, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.500623162910085e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }