{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9848024316109423,
  "eval_steps": 500,
  "global_step": 411,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00729483282674772,
      "grad_norm": 0.14541301131248474,
      "learning_rate": 2.3809523809523811e-07,
      "loss": 0.7602,
      "step": 1
    },
    {
      "epoch": 0.01458966565349544,
      "grad_norm": 0.1567784547805786,
      "learning_rate": 4.7619047619047623e-07,
      "loss": 0.8215,
      "step": 2
    },
    {
      "epoch": 0.02188449848024316,
      "grad_norm": 0.1584789901971817,
      "learning_rate": 7.142857142857143e-07,
      "loss": 0.8269,
      "step": 3
    },
    {
      "epoch": 0.02917933130699088,
      "grad_norm": 0.157843217253685,
      "learning_rate": 9.523809523809525e-07,
      "loss": 0.7909,
      "step": 4
    },
    {
      "epoch": 0.0364741641337386,
      "grad_norm": 0.1607961654663086,
      "learning_rate": 1.1904761904761906e-06,
      "loss": 0.8198,
      "step": 5
    },
    {
      "epoch": 0.04376899696048632,
      "grad_norm": 0.15348272025585175,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 0.7687,
      "step": 6
    },
    {
      "epoch": 0.05106382978723404,
      "grad_norm": 0.1496104598045349,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.7942,
      "step": 7
    },
    {
      "epoch": 0.05835866261398176,
      "grad_norm": 0.2734036445617676,
      "learning_rate": 1.904761904761905e-06,
      "loss": 0.847,
      "step": 8
    },
    {
      "epoch": 0.06565349544072949,
      "grad_norm": 0.13683773577213287,
      "learning_rate": 2.1428571428571427e-06,
      "loss": 0.754,
      "step": 9
    },
    {
      "epoch": 0.0729483282674772,
      "grad_norm": 0.11306589841842651,
      "learning_rate": 2.380952380952381e-06,
      "loss": 0.6991,
      "step": 10
    },
    {
      "epoch": 0.08024316109422493,
      "grad_norm": 0.12233421206474304,
      "learning_rate": 2.6190476190476192e-06,
      "loss": 0.7829,
      "step": 11
    },
    {
      "epoch": 0.08753799392097264,
      "grad_norm": 0.10262873023748398,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 0.7048,
      "step": 12
    },
    {
      "epoch": 0.09483282674772037,
      "grad_norm": 0.10435234010219574,
      "learning_rate": 3.0952380952380957e-06,
      "loss": 0.768,
      "step": 13
    },
    {
      "epoch": 0.10212765957446808,
      "grad_norm": 0.0735386535525322,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.6147,
      "step": 14
    },
    {
      "epoch": 0.1094224924012158,
      "grad_norm": 0.07339954376220703,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 0.7452,
      "step": 15
    },
    {
      "epoch": 0.11671732522796352,
      "grad_norm": 0.06846445798873901,
      "learning_rate": 3.80952380952381e-06,
      "loss": 0.7383,
      "step": 16
    },
    {
      "epoch": 0.12401215805471125,
      "grad_norm": 0.07185480743646622,
      "learning_rate": 4.047619047619048e-06,
      "loss": 0.7081,
      "step": 17
    },
    {
      "epoch": 0.13130699088145897,
      "grad_norm": 0.06281375139951706,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 0.6171,
      "step": 18
    },
    {
      "epoch": 0.1386018237082067,
      "grad_norm": 0.08302997052669525,
      "learning_rate": 4.523809523809524e-06,
      "loss": 0.6136,
      "step": 19
    },
    {
      "epoch": 0.1458966565349544,
      "grad_norm": 0.07521834969520569,
      "learning_rate": 4.761904761904762e-06,
      "loss": 0.6401,
      "step": 20
    },
    {
      "epoch": 0.15319148936170213,
      "grad_norm": 0.07346966117620468,
      "learning_rate": 5e-06,
      "loss": 0.6575,
      "step": 21
    },
    {
      "epoch": 0.16048632218844985,
      "grad_norm": 0.06220546364784241,
      "learning_rate": 5.2380952380952384e-06,
      "loss": 0.585,
      "step": 22
    },
    {
      "epoch": 0.16778115501519758,
      "grad_norm": 0.06210927292704582,
      "learning_rate": 5.476190476190477e-06,
      "loss": 0.6116,
      "step": 23
    },
    {
      "epoch": 0.17507598784194528,
      "grad_norm": 0.06617508083581924,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 0.6591,
      "step": 24
    },
    {
      "epoch": 0.182370820668693,
      "grad_norm": 0.06115543842315674,
      "learning_rate": 5.9523809523809525e-06,
      "loss": 0.6164,
      "step": 25
    },
    {
      "epoch": 0.18966565349544073,
      "grad_norm": 0.05512455105781555,
      "learning_rate": 6.1904761904761914e-06,
      "loss": 0.6131,
      "step": 26
    },
    {
      "epoch": 0.19696048632218846,
      "grad_norm": 0.05426128953695297,
      "learning_rate": 6.4285714285714295e-06,
      "loss": 0.6233,
      "step": 27
    },
    {
      "epoch": 0.20425531914893616,
      "grad_norm": 0.04495101794600487,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.6017,
      "step": 28
    },
    {
      "epoch": 0.2115501519756839,
      "grad_norm": 0.052700527012348175,
      "learning_rate": 6.9047619047619055e-06,
      "loss": 0.6209,
      "step": 29
    },
    {
      "epoch": 0.2188449848024316,
      "grad_norm": 0.05274520814418793,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 0.562,
      "step": 30
    },
    {
      "epoch": 0.22613981762917934,
      "grad_norm": 0.0418085902929306,
      "learning_rate": 7.380952380952382e-06,
      "loss": 0.5356,
      "step": 31
    },
    {
      "epoch": 0.23343465045592704,
      "grad_norm": 0.04744059965014458,
      "learning_rate": 7.61904761904762e-06,
      "loss": 0.5759,
      "step": 32
    },
    {
      "epoch": 0.24072948328267477,
      "grad_norm": 0.051624756306409836,
      "learning_rate": 7.857142857142858e-06,
      "loss": 0.5989,
      "step": 33
    },
    {
      "epoch": 0.2480243161094225,
      "grad_norm": 0.04632480815052986,
      "learning_rate": 8.095238095238097e-06,
      "loss": 0.6036,
      "step": 34
    },
    {
      "epoch": 0.2553191489361702,
      "grad_norm": 0.040394943207502365,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.5439,
      "step": 35
    },
    {
      "epoch": 0.26261398176291795,
      "grad_norm": 0.047632846981287,
      "learning_rate": 8.571428571428571e-06,
      "loss": 0.6187,
      "step": 36
    },
    {
      "epoch": 0.26990881458966565,
      "grad_norm": 0.04498811811208725,
      "learning_rate": 8.80952380952381e-06,
      "loss": 0.5686,
      "step": 37
    },
    {
      "epoch": 0.2772036474164134,
      "grad_norm": 0.04858787730336189,
      "learning_rate": 9.047619047619049e-06,
      "loss": 0.6224,
      "step": 38
    },
    {
      "epoch": 0.2844984802431611,
      "grad_norm": 0.04534129053354263,
      "learning_rate": 9.285714285714288e-06,
      "loss": 0.576,
      "step": 39
    },
    {
      "epoch": 0.2917933130699088,
      "grad_norm": 0.04878037050366402,
      "learning_rate": 9.523809523809525e-06,
      "loss": 0.5956,
      "step": 40
    },
    {
      "epoch": 0.29908814589665655,
      "grad_norm": 0.044632136821746826,
      "learning_rate": 9.761904761904762e-06,
      "loss": 0.5748,
      "step": 41
    },
    {
      "epoch": 0.30638297872340425,
      "grad_norm": 0.041874803602695465,
      "learning_rate": 1e-05,
      "loss": 0.5752,
      "step": 42
    },
    {
      "epoch": 0.31367781155015195,
      "grad_norm": 0.041942398995161057,
      "learning_rate": 9.999818789066164e-06,
      "loss": 0.5759,
      "step": 43
    },
    {
      "epoch": 0.3209726443768997,
      "grad_norm": 0.045705121010541916,
      "learning_rate": 9.999275169399614e-06,
      "loss": 0.5962,
      "step": 44
    },
    {
      "epoch": 0.3282674772036474,
      "grad_norm": 0.043411824852228165,
      "learning_rate": 9.998369180404283e-06,
      "loss": 0.54,
      "step": 45
    },
    {
      "epoch": 0.33556231003039516,
      "grad_norm": 0.03998137265443802,
      "learning_rate": 9.997100887750215e-06,
      "loss": 0.5874,
      "step": 46
    },
    {
      "epoch": 0.34285714285714286,
      "grad_norm": 0.047370899468660355,
      "learning_rate": 9.995470383368808e-06,
      "loss": 0.6051,
      "step": 47
    },
    {
      "epoch": 0.35015197568389056,
      "grad_norm": 0.04455406963825226,
      "learning_rate": 9.993477785446151e-06,
      "loss": 0.5604,
      "step": 48
    },
    {
      "epoch": 0.3574468085106383,
      "grad_norm": 0.043418001383543015,
      "learning_rate": 9.991123238414455e-06,
      "loss": 0.5555,
      "step": 49
    },
    {
      "epoch": 0.364741641337386,
      "grad_norm": 0.03939136862754822,
      "learning_rate": 9.988406912941591e-06,
      "loss": 0.5493,
      "step": 50
    },
    {
      "epoch": 0.3720364741641337,
      "grad_norm": 0.04485655948519707,
      "learning_rate": 9.985329005918702e-06,
      "loss": 0.5804,
      "step": 51
    },
    {
      "epoch": 0.37933130699088147,
      "grad_norm": 0.0435781255364418,
      "learning_rate": 9.981889740445958e-06,
      "loss": 0.5617,
      "step": 52
    },
    {
      "epoch": 0.38662613981762917,
      "grad_norm": 0.03838958591222763,
      "learning_rate": 9.978089365816357e-06,
      "loss": 0.5481,
      "step": 53
    },
    {
      "epoch": 0.3939209726443769,
      "grad_norm": 0.03926938772201538,
      "learning_rate": 9.973928157497675e-06,
      "loss": 0.5195,
      "step": 54
    },
    {
      "epoch": 0.4012158054711246,
      "grad_norm": 0.049530286341905594,
      "learning_rate": 9.969406417112489e-06,
      "loss": 0.5854,
      "step": 55
    },
    {
      "epoch": 0.4085106382978723,
      "grad_norm": 0.08943431824445724,
      "learning_rate": 9.964524472416319e-06,
      "loss": 0.5706,
      "step": 56
    },
    {
      "epoch": 0.4158054711246201,
      "grad_norm": 0.04114034026861191,
      "learning_rate": 9.959282677273869e-06,
      "loss": 0.4923,
      "step": 57
    },
    {
      "epoch": 0.4231003039513678,
      "grad_norm": 0.03834295645356178,
      "learning_rate": 9.953681411633376e-06,
      "loss": 0.5151,
      "step": 58
    },
    {
      "epoch": 0.43039513677811553,
      "grad_norm": 0.03940470516681671,
      "learning_rate": 9.947721081499068e-06,
      "loss": 0.5274,
      "step": 59
    },
    {
      "epoch": 0.4376899696048632,
      "grad_norm": 0.05276661738753319,
      "learning_rate": 9.941402118901743e-06,
      "loss": 0.5312,
      "step": 60
    },
    {
      "epoch": 0.4449848024316109,
      "grad_norm": 0.04116562008857727,
      "learning_rate": 9.934724981867447e-06,
      "loss": 0.5073,
      "step": 61
    },
    {
      "epoch": 0.4522796352583587,
      "grad_norm": 0.039049182087183,
      "learning_rate": 9.927690154384273e-06,
      "loss": 0.5367,
      "step": 62
    },
    {
      "epoch": 0.4595744680851064,
      "grad_norm": 0.042383261024951935,
      "learning_rate": 9.920298146367287e-06,
      "loss": 0.5232,
      "step": 63
    },
    {
      "epoch": 0.4668693009118541,
      "grad_norm": 0.04153553023934364,
      "learning_rate": 9.912549493621555e-06,
      "loss": 0.5438,
      "step": 64
    },
    {
      "epoch": 0.47416413373860183,
      "grad_norm": 0.04116344451904297,
      "learning_rate": 9.904444757803322e-06,
      "loss": 0.4803,
      "step": 65
    },
    {
      "epoch": 0.48145896656534953,
      "grad_norm": 0.06467548757791519,
      "learning_rate": 9.895984526379282e-06,
      "loss": 0.5554,
      "step": 66
    },
    {
      "epoch": 0.4887537993920973,
      "grad_norm": 0.04420805722475052,
      "learning_rate": 9.887169412584012e-06,
      "loss": 0.5659,
      "step": 67
    },
    {
      "epoch": 0.496048632218845,
      "grad_norm": 0.04072507843375206,
      "learning_rate": 9.878000055375512e-06,
      "loss": 0.486,
      "step": 68
    },
    {
      "epoch": 0.5033434650455927,
      "grad_norm": 0.04508865624666214,
      "learning_rate": 9.868477119388897e-06,
      "loss": 0.5284,
      "step": 69
    },
    {
      "epoch": 0.5106382978723404,
      "grad_norm": 0.04231835529208183,
      "learning_rate": 9.858601294888212e-06,
      "loss": 0.5185,
      "step": 70
    },
    {
      "epoch": 0.5179331306990882,
      "grad_norm": 0.03981684520840645,
      "learning_rate": 9.848373297716414e-06,
      "loss": 0.5246,
      "step": 71
    },
    {
      "epoch": 0.5252279635258359,
      "grad_norm": 0.045293230563402176,
      "learning_rate": 9.837793869243468e-06,
      "loss": 0.5403,
      "step": 72
    },
    {
      "epoch": 0.5325227963525836,
      "grad_norm": 0.0415407195687294,
      "learning_rate": 9.826863776312621e-06,
      "loss": 0.568,
      "step": 73
    },
    {
      "epoch": 0.5398176291793313,
      "grad_norm": 0.04549698531627655,
      "learning_rate": 9.815583811184809e-06,
      "loss": 0.5547,
      "step": 74
    },
    {
      "epoch": 0.547112462006079,
      "grad_norm": 0.03895876184105873,
      "learning_rate": 9.803954791481239e-06,
      "loss": 0.5374,
      "step": 75
    },
    {
      "epoch": 0.5544072948328268,
      "grad_norm": 0.046192847192287445,
      "learning_rate": 9.79197756012412e-06,
      "loss": 0.5561,
      "step": 76
    },
    {
      "epoch": 0.5617021276595745,
      "grad_norm": 0.03921407088637352,
      "learning_rate": 9.779652985275562e-06,
      "loss": 0.5488,
      "step": 77
    },
    {
      "epoch": 0.5689969604863222,
      "grad_norm": 0.037232838571071625,
      "learning_rate": 9.766981960274653e-06,
      "loss": 0.4963,
      "step": 78
    },
    {
      "epoch": 0.5762917933130699,
      "grad_norm": 0.05492810904979706,
      "learning_rate": 9.753965403572703e-06,
      "loss": 0.5621,
      "step": 79
    },
    {
      "epoch": 0.5835866261398176,
      "grad_norm": 0.04202823340892792,
      "learning_rate": 9.740604258666668e-06,
      "loss": 0.5479,
      "step": 80
    },
    {
      "epoch": 0.5908814589665653,
      "grad_norm": 0.04189832881093025,
      "learning_rate": 9.726899494030768e-06,
      "loss": 0.5802,
      "step": 81
    },
    {
      "epoch": 0.5981762917933131,
      "grad_norm": 0.039709825068712234,
      "learning_rate": 9.712852103046281e-06,
      "loss": 0.5166,
      "step": 82
    },
    {
      "epoch": 0.6054711246200608,
      "grad_norm": 0.04080045223236084,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.5289,
      "step": 83
    },
    {
      "epoch": 0.6127659574468085,
      "grad_norm": 0.038535572588443756,
      "learning_rate": 9.68373353965814e-06,
      "loss": 0.5352,
      "step": 84
    },
    {
      "epoch": 0.6200607902735562,
      "grad_norm": 0.04705570638179779,
      "learning_rate": 9.66866447789531e-06,
      "loss": 0.5235,
      "step": 85
    },
    {
      "epoch": 0.6273556231003039,
      "grad_norm": 0.042207516729831696,
      "learning_rate": 9.65325701091256e-06,
      "loss": 0.5147,
      "step": 86
    },
    {
      "epoch": 0.6346504559270517,
      "grad_norm": 0.04210168495774269,
      "learning_rate": 9.637512255510475e-06,
      "loss": 0.5241,
      "step": 87
    },
    {
      "epoch": 0.6419452887537994,
      "grad_norm": 0.03785989060997963,
      "learning_rate": 9.62143135293779e-06,
      "loss": 0.5429,
      "step": 88
    },
    {
      "epoch": 0.6492401215805471,
      "grad_norm": 0.04283512756228447,
      "learning_rate": 9.605015468808651e-06,
      "loss": 0.5242,
      "step": 89
    },
    {
      "epoch": 0.6565349544072948,
      "grad_norm": 0.043273307383060455,
      "learning_rate": 9.588265793018141e-06,
      "loss": 0.5455,
      "step": 90
    },
    {
      "epoch": 0.6638297872340425,
      "grad_norm": 0.04218590632081032,
      "learning_rate": 9.571183539656011e-06,
      "loss": 0.5778,
      "step": 91
    },
    {
      "epoch": 0.6711246200607903,
      "grad_norm": 0.03844400867819786,
      "learning_rate": 9.553769946918698e-06,
      "loss": 0.5233,
      "step": 92
    },
    {
      "epoch": 0.678419452887538,
      "grad_norm": 0.04001948982477188,
      "learning_rate": 9.536026277019562e-06,
      "loss": 0.5156,
      "step": 93
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 0.04228726401925087,
      "learning_rate": 9.517953816097396e-06,
      "loss": 0.5138,
      "step": 94
    },
    {
      "epoch": 0.6930091185410334,
      "grad_norm": 0.03879157081246376,
      "learning_rate": 9.499553874123213e-06,
      "loss": 0.4926,
      "step": 95
    },
    {
      "epoch": 0.7003039513677811,
      "grad_norm": 0.04016513749957085,
      "learning_rate": 9.480827784805278e-06,
      "loss": 0.497,
      "step": 96
    },
    {
      "epoch": 0.7075987841945289,
      "grad_norm": 0.03983764350414276,
      "learning_rate": 9.461776905492446e-06,
      "loss": 0.4852,
      "step": 97
    },
    {
      "epoch": 0.7148936170212766,
      "grad_norm": 0.056514669209718704,
      "learning_rate": 9.442402617075765e-06,
      "loss": 0.5288,
      "step": 98
    },
    {
      "epoch": 0.7221884498480243,
      "grad_norm": 0.046206481754779816,
      "learning_rate": 9.422706323888398e-06,
      "loss": 0.5418,
      "step": 99
    },
    {
      "epoch": 0.729483282674772,
      "grad_norm": 0.0474584735929966,
      "learning_rate": 9.402689453603815e-06,
      "loss": 0.5531,
      "step": 100
    },
    {
      "epoch": 0.7367781155015197,
      "grad_norm": 0.037484850734472275,
      "learning_rate": 9.382353457132318e-06,
      "loss": 0.4869,
      "step": 101
    },
    {
      "epoch": 0.7440729483282674,
      "grad_norm": 0.03749077394604683,
      "learning_rate": 9.361699808515877e-06,
      "loss": 0.5275,
      "step": 102
    },
    {
      "epoch": 0.7513677811550152,
      "grad_norm": 0.038470759987831116,
      "learning_rate": 9.340730004821266e-06,
      "loss": 0.5044,
      "step": 103
    },
    {
      "epoch": 0.7586626139817629,
      "grad_norm": 0.038027700036764145,
      "learning_rate": 9.31944556603157e-06,
      "loss": 0.5025,
      "step": 104
    },
    {
      "epoch": 0.7659574468085106,
      "grad_norm": 0.046422988176345825,
      "learning_rate": 9.297848034936007e-06,
      "loss": 0.5341,
      "step": 105
    },
    {
      "epoch": 0.7732522796352583,
      "grad_norm": 0.043657850474119186,
      "learning_rate": 9.275938977018082e-06,
      "loss": 0.5085,
      "step": 106
    },
    {
      "epoch": 0.780547112462006,
      "grad_norm": 0.04235101863741875,
      "learning_rate": 9.253719980342134e-06,
      "loss": 0.5397,
      "step": 107
    },
    {
      "epoch": 0.7878419452887538,
      "grad_norm": 0.04412844404578209,
      "learning_rate": 9.231192655438222e-06,
      "loss": 0.5522,
      "step": 108
    },
    {
      "epoch": 0.7951367781155015,
      "grad_norm": 0.04110129550099373,
      "learning_rate": 9.208358635185372e-06,
      "loss": 0.5785,
      "step": 109
    },
    {
      "epoch": 0.8024316109422492,
      "grad_norm": 0.03757128119468689,
      "learning_rate": 9.185219574693242e-06,
      "loss": 0.4777,
      "step": 110
    },
    {
      "epoch": 0.8097264437689969,
      "grad_norm": 0.03927014395594597,
      "learning_rate": 9.161777151182137e-06,
      "loss": 0.526,
      "step": 111
    },
    {
      "epoch": 0.8170212765957446,
      "grad_norm": 0.03983665257692337,
      "learning_rate": 9.138033063861436e-06,
      "loss": 0.5138,
      "step": 112
    },
    {
      "epoch": 0.8243161094224924,
      "grad_norm": 0.038819894194602966,
      "learning_rate": 9.113989033806434e-06,
      "loss": 0.494,
      "step": 113
    },
    {
      "epoch": 0.8316109422492401,
      "grad_norm": 0.05275421962141991,
      "learning_rate": 9.089646803833589e-06,
      "loss": 0.539,
      "step": 114
    },
    {
      "epoch": 0.8389057750759878,
      "grad_norm": 0.04078809916973114,
      "learning_rate": 9.06500813837419e-06,
      "loss": 0.4778,
      "step": 115
    },
    {
      "epoch": 0.8462006079027355,
      "grad_norm": 0.040415696799755096,
      "learning_rate": 9.040074823346466e-06,
      "loss": 0.5443,
      "step": 116
    },
    {
      "epoch": 0.8534954407294832,
      "grad_norm": 0.03834336996078491,
      "learning_rate": 9.014848666026138e-06,
      "loss": 0.4945,
      "step": 117
    },
    {
      "epoch": 0.8607902735562311,
      "grad_norm": 0.04293690249323845,
      "learning_rate": 8.989331494915417e-06,
      "loss": 0.5404,
      "step": 118
    },
    {
      "epoch": 0.8680851063829788,
      "grad_norm": 0.04116823151707649,
      "learning_rate": 8.963525159610465e-06,
      "loss": 0.5274,
      "step": 119
    },
    {
      "epoch": 0.8753799392097265,
      "grad_norm": 0.04143698886036873,
      "learning_rate": 8.937431530667329e-06,
      "loss": 0.4916,
      "step": 120
    },
    {
      "epoch": 0.8826747720364742,
      "grad_norm": 0.039400726556777954,
      "learning_rate": 8.911052499466358e-06,
      "loss": 0.5573,
      "step": 121
    },
    {
      "epoch": 0.8899696048632219,
      "grad_norm": 0.035739775747060776,
      "learning_rate": 8.884389978075098e-06,
      "loss": 0.4961,
      "step": 122
    },
    {
      "epoch": 0.8972644376899696,
      "grad_norm": 0.04682913422584534,
      "learning_rate": 8.857445899109716e-06,
      "loss": 0.4712,
      "step": 123
    },
    {
      "epoch": 0.9045592705167174,
      "grad_norm": 0.04531010612845421,
      "learning_rate": 8.83022221559489e-06,
      "loss": 0.5242,
      "step": 124
    },
    {
      "epoch": 0.9118541033434651,
      "grad_norm": 0.04528380185365677,
      "learning_rate": 8.80272090082227e-06,
      "loss": 0.5506,
      "step": 125
    },
    {
      "epoch": 0.9191489361702128,
      "grad_norm": 0.03824332728981972,
      "learning_rate": 8.774943948207427e-06,
      "loss": 0.4581,
      "step": 126
    },
    {
      "epoch": 0.9264437689969605,
      "grad_norm": 0.03896916285157204,
      "learning_rate": 8.746893371145367e-06,
      "loss": 0.5504,
      "step": 127
    },
    {
      "epoch": 0.9337386018237082,
      "grad_norm": 0.03746696934103966,
      "learning_rate": 8.718571202864598e-06,
      "loss": 0.4589,
      "step": 128
    },
    {
      "epoch": 0.941033434650456,
      "grad_norm": 0.04142184555530548,
      "learning_rate": 8.689979496279747e-06,
      "loss": 0.5299,
      "step": 129
    },
    {
      "epoch": 0.9483282674772037,
      "grad_norm": 0.03700762987136841,
      "learning_rate": 8.661120323842751e-06,
      "loss": 0.5159,
      "step": 130
    },
    {
      "epoch": 0.9556231003039514,
      "grad_norm": 0.036684855818748474,
      "learning_rate": 8.631995777392645e-06,
      "loss": 0.4854,
      "step": 131
    },
    {
      "epoch": 0.9629179331306991,
      "grad_norm": 0.06939133256673813,
      "learning_rate": 8.602607968003935e-06,
      "loss": 0.5101,
      "step": 132
    },
    {
      "epoch": 0.9702127659574468,
      "grad_norm": 0.039062708616256714,
      "learning_rate": 8.572959025833573e-06,
      "loss": 0.5005,
      "step": 133
    },
    {
      "epoch": 0.9775075987841946,
      "grad_norm": 0.04555986076593399,
      "learning_rate": 8.543051099966558e-06,
      "loss": 0.5465,
      "step": 134
    },
    {
      "epoch": 0.9848024316109423,
      "grad_norm": 0.04333364591002464,
      "learning_rate": 8.512886358260162e-06,
      "loss": 0.5237,
      "step": 135
    },
    {
      "epoch": 0.99209726443769,
      "grad_norm": 0.04095487669110298,
      "learning_rate": 8.482466987186785e-06,
      "loss": 0.5335,
      "step": 136
    },
    {
      "epoch": 0.9993920972644377,
      "grad_norm": 0.0442386157810688,
      "learning_rate": 8.451795191675488e-06,
      "loss": 0.5107,
      "step": 137
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.0442386157810688,
      "learning_rate": 8.420873194952153e-06,
      "loss": 0.465,
      "step": 138
    },
    {
      "epoch": 1.0072948328267477,
      "grad_norm": 0.1445915699005127,
      "learning_rate": 8.38970323837834e-06,
      "loss": 0.4704,
      "step": 139
    },
    {
      "epoch": 1.0145896656534954,
      "grad_norm": 0.042022328823804855,
      "learning_rate": 8.358287581288824e-06,
      "loss": 0.4282,
      "step": 140
    },
    {
      "epoch": 1.021884498480243,
      "grad_norm": 0.04201134666800499,
      "learning_rate": 8.326628500827826e-06,
      "loss": 0.4539,
      "step": 141
    },
    {
      "epoch": 1.0291793313069908,
      "grad_norm": 0.04877388849854469,
      "learning_rate": 8.294728291783967e-06,
      "loss": 0.4641,
      "step": 142
    },
    {
      "epoch": 1.0364741641337385,
      "grad_norm": 0.046164825558662415,
      "learning_rate": 8.262589266423908e-06,
      "loss": 0.419,
      "step": 143
    },
    {
      "epoch": 1.0437689969604864,
      "grad_norm": 0.041141681373119354,
      "learning_rate": 8.230213754324773e-06,
      "loss": 0.4224,
      "step": 144
    },
    {
      "epoch": 1.0510638297872341,
      "grad_norm": 0.03967837244272232,
      "learning_rate": 8.19760410220527e-06,
      "loss": 0.4268,
      "step": 145
    },
    {
      "epoch": 1.0583586626139818,
      "grad_norm": 0.05634555220603943,
      "learning_rate": 8.16476267375561e-06,
      "loss": 0.4025,
      "step": 146
    },
    {
      "epoch": 1.0656534954407295,
      "grad_norm": 0.041606318205595016,
      "learning_rate": 8.131691849466154e-06,
      "loss": 0.4335,
      "step": 147
    },
    {
      "epoch": 1.0729483282674772,
      "grad_norm": 0.03656647726893425,
      "learning_rate": 8.098394026454886e-06,
      "loss": 0.456,
      "step": 148
    },
    {
      "epoch": 1.080243161094225,
      "grad_norm": 0.041005730628967285,
      "learning_rate": 8.064871618293647e-06,
      "loss": 0.3925,
      "step": 149
    },
    {
      "epoch": 1.0875379939209726,
      "grad_norm": 0.04722120240330696,
      "learning_rate": 8.031127054833192e-06,
      "loss": 0.3981,
      "step": 150
    },
    {
      "epoch": 1.0948328267477203,
      "grad_norm": 0.043071143329143524,
      "learning_rate": 7.997162782027061e-06,
      "loss": 0.4296,
      "step": 151
    },
    {
      "epoch": 1.102127659574468,
      "grad_norm": 0.04518291354179382,
      "learning_rate": 7.962981261754295e-06,
      "loss": 0.4376,
      "step": 152
    },
    {
      "epoch": 1.1094224924012157,
      "grad_norm": 0.04998685419559479,
      "learning_rate": 7.928584971640974e-06,
      "loss": 0.452,
      "step": 153
    },
    {
      "epoch": 1.1167173252279636,
      "grad_norm": 0.04469837248325348,
      "learning_rate": 7.893976404880643e-06,
      "loss": 0.4316,
      "step": 154
    },
    {
      "epoch": 1.1240121580547113,
      "grad_norm": 0.040255557745695114,
      "learning_rate": 7.859158070053578e-06,
      "loss": 0.4378,
      "step": 155
    },
    {
      "epoch": 1.131306990881459,
      "grad_norm": 0.04467206820845604,
      "learning_rate": 7.824132490944968e-06,
      "loss": 0.4215,
      "step": 156
    },
    {
      "epoch": 1.1386018237082067,
      "grad_norm": 0.03889721632003784,
      "learning_rate": 7.788902206361974e-06,
      "loss": 0.4257,
      "step": 157
    },
    {
      "epoch": 1.1458966565349544,
      "grad_norm": 0.04140063747763634,
      "learning_rate": 7.753469769949701e-06,
      "loss": 0.4434,
      "step": 158
    },
    {
      "epoch": 1.1531914893617021,
      "grad_norm": 0.039931997656822205,
      "learning_rate": 7.717837750006106e-06,
      "loss": 0.41,
      "step": 159
    },
    {
      "epoch": 1.1604863221884498,
      "grad_norm": 0.03909624367952347,
      "learning_rate": 7.682008729295834e-06,
      "loss": 0.3904,
      "step": 160
    },
    {
      "epoch": 1.1677811550151975,
      "grad_norm": 0.0401025116443634,
      "learning_rate": 7.645985304863004e-06,
      "loss": 0.4618,
      "step": 161
    },
    {
      "epoch": 1.1750759878419452,
      "grad_norm": 0.04733911529183388,
      "learning_rate": 7.609770087842969e-06,
      "loss": 0.4247,
      "step": 162
    },
    {
      "epoch": 1.182370820668693,
      "grad_norm": 0.037687744945287704,
      "learning_rate": 7.573365703273045e-06,
      "loss": 0.4071,
      "step": 163
    },
    {
      "epoch": 1.1896656534954406,
      "grad_norm": 0.039216116070747375,
      "learning_rate": 7.536774789902246e-06,
      "loss": 0.4259,
      "step": 164
    },
    {
      "epoch": 1.1969604863221885,
      "grad_norm": 0.0408397912979126,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.394,
      "step": 165
    },
    {
      "epoch": 1.2042553191489362,
      "grad_norm": 0.04507288709282875,
      "learning_rate": 7.463043999163919e-06,
      "loss": 0.4605,
      "step": 166
    },
    {
      "epoch": 1.211550151975684,
      "grad_norm": 0.03989469259977341,
      "learning_rate": 7.4259094661265685e-06,
      "loss": 0.4285,
      "step": 167
    },
    {
      "epoch": 1.2188449848024316,
      "grad_norm": 0.0407419353723526,
      "learning_rate": 7.388599092561315e-06,
      "loss": 0.4204,
      "step": 168
    },
    {
      "epoch": 1.2261398176291793,
      "grad_norm": 0.040525760501623154,
      "learning_rate": 7.351115582887212e-06,
      "loss": 0.4253,
      "step": 169
    },
    {
      "epoch": 1.233434650455927,
      "grad_norm": 0.04370498284697533,
      "learning_rate": 7.313461654072974e-06,
      "loss": 0.4071,
      "step": 170
    },
    {
      "epoch": 1.2407294832826747,
      "grad_norm": 0.0392344668507576,
      "learning_rate": 7.2756400354400445e-06,
      "loss": 0.4093,
      "step": 171
    },
    {
      "epoch": 1.2480243161094224,
      "grad_norm": 0.03849213197827339,
      "learning_rate": 7.237653468464756e-06,
      "loss": 0.4157,
      "step": 172
    },
    {
      "epoch": 1.2553191489361701,
      "grad_norm": 0.04228688403964043,
      "learning_rate": 7.199504706579617e-06,
      "loss": 0.482,
      "step": 173
    },
    {
      "epoch": 1.262613981762918,
      "grad_norm": 0.037325162440538406,
      "learning_rate": 7.161196514973735e-06,
      "loss": 0.4224,
      "step": 174
    },
    {
      "epoch": 1.2699088145896655,
      "grad_norm": 0.047044239938259125,
      "learning_rate": 7.122731670392381e-06,
      "loss": 0.4249,
      "step": 175
    },
    {
      "epoch": 1.2772036474164135,
      "grad_norm": 0.04322784021496773,
      "learning_rate": 7.0841129609357165e-06,
      "loss": 0.4051,
      "step": 176
    },
    {
      "epoch": 1.2844984802431612,
      "grad_norm": 0.041998326778411865,
      "learning_rate": 7.045343185856701e-06,
      "loss": 0.4106,
      "step": 177
    },
    {
      "epoch": 1.2917933130699089,
      "grad_norm": 0.040727648884058,
      "learning_rate": 7.006425155358195e-06,
      "loss": 0.4427,
      "step": 178
    },
    {
      "epoch": 1.2990881458966566,
      "grad_norm": 0.04059009999036789,
      "learning_rate": 6.967361690389258e-06,
      "loss": 0.437,
      "step": 179
    },
    {
      "epoch": 1.3063829787234043,
      "grad_norm": 0.042023915797472,
      "learning_rate": 6.92815562244068e-06,
      "loss": 0.4315,
      "step": 180
    },
    {
      "epoch": 1.313677811550152,
      "grad_norm": 0.04910752549767494,
      "learning_rate": 6.888809793339729e-06,
      "loss": 0.4436,
      "step": 181
    },
    {
      "epoch": 1.3209726443768997,
      "grad_norm": 0.04180140420794487,
      "learning_rate": 6.849327055044182e-06,
      "loss": 0.3948,
      "step": 182
    },
    {
      "epoch": 1.3282674772036474,
      "grad_norm": 0.03989269211888313,
      "learning_rate": 6.80971026943559e-06,
      "loss": 0.3929,
      "step": 183
    },
    {
      "epoch": 1.335562310030395,
      "grad_norm": 0.04497074335813522,
      "learning_rate": 6.769962308111839e-06,
      "loss": 0.4429,
      "step": 184
    },
    {
      "epoch": 1.342857142857143,
      "grad_norm": 0.04516409710049629,
      "learning_rate": 6.7300860521790034e-06,
      "loss": 0.4363,
      "step": 185
    },
    {
      "epoch": 1.3501519756838904,
      "grad_norm": 0.041362229734659195,
      "learning_rate": 6.690084392042514e-06,
      "loss": 0.4058,
      "step": 186
    },
    {
      "epoch": 1.3574468085106384,
      "grad_norm": 0.04281953349709511,
      "learning_rate": 6.649960227197648e-06,
      "loss": 0.423,
      "step": 187
    },
    {
      "epoch": 1.364741641337386,
      "grad_norm": 0.046076931059360504,
      "learning_rate": 6.609716466019356e-06,
      "loss": 0.4427,
      "step": 188
    },
    {
      "epoch": 1.3720364741641338,
      "grad_norm": 0.03960058465600014,
      "learning_rate": 6.569356025551454e-06,
      "loss": 0.4193,
      "step": 189
    },
    {
      "epoch": 1.3793313069908815,
      "grad_norm": 0.044169649481773376,
      "learning_rate": 6.5288818312951886e-06,
      "loss": 0.4034,
      "step": 190
    },
    {
      "epoch": 1.3866261398176292,
      "grad_norm": 0.04062066227197647,
      "learning_rate": 6.4882968169971734e-06,
      "loss": 0.4018,
      "step": 191
    },
    {
      "epoch": 1.3939209726443769,
      "grad_norm": 0.04406093806028366,
      "learning_rate": 6.447603924436744e-06,
      "loss": 0.4498,
      "step": 192
    },
    {
      "epoch": 1.4012158054711246,
      "grad_norm": 0.04197722300887108,
      "learning_rate": 6.406806103212725e-06,
      "loss": 0.4356,
      "step": 193
    },
    {
      "epoch": 1.4085106382978723,
      "grad_norm": 0.04061530530452728,
      "learning_rate": 6.365906310529631e-06,
      "loss": 0.4441,
      "step": 194
    },
    {
      "epoch": 1.41580547112462,
      "grad_norm": 0.046513479202985764,
      "learning_rate": 6.32490751098331e-06,
      "loss": 0.4166,
      "step": 195
    },
    {
      "epoch": 1.4231003039513679,
      "grad_norm": 0.03948912024497986,
      "learning_rate": 6.2838126763460635e-06,
      "loss": 0.4478,
      "step": 196
    },
    {
      "epoch": 1.4303951367781156,
      "grad_norm": 0.04548676684498787,
      "learning_rate": 6.2426247853512355e-06,
      "loss": 0.4653,
      "step": 197
    },
    {
      "epoch": 1.4376899696048633,
      "grad_norm": 0.041050177067518234,
      "learning_rate": 6.2013468234773034e-06,
      "loss": 0.3953,
      "step": 198
    },
    {
      "epoch": 1.444984802431611,
      "grad_norm": 0.03936685994267464,
      "learning_rate": 6.1599817827314744e-06,
      "loss": 0.4256,
      "step": 199
    },
    {
      "epoch": 1.4522796352583587,
      "grad_norm": 0.041237395256757736,
      "learning_rate": 6.118532661432812e-06,
      "loss": 0.3892,
      "step": 200
    },
    {
      "epoch": 1.4595744680851064,
      "grad_norm": 0.043174393475055695,
      "learning_rate": 6.077002463994908e-06,
      "loss": 0.4174,
      "step": 201
    },
    {
      "epoch": 1.466869300911854,
      "grad_norm": 0.04198073223233223,
      "learning_rate": 6.035394200708104e-06,
      "loss": 0.4278,
      "step": 202
    },
    {
      "epoch": 1.4741641337386018,
      "grad_norm": 0.045515723526477814,
      "learning_rate": 5.993710887521302e-06,
      "loss": 0.4346,
      "step": 203
    },
    {
      "epoch": 1.4814589665653495,
      "grad_norm": 0.04443354532122612,
      "learning_rate": 5.951955545823342e-06,
      "loss": 0.4116,
      "step": 204
    },
    {
      "epoch": 1.4887537993920974,
      "grad_norm": 0.04223044961690903,
      "learning_rate": 5.910131202224011e-06,
      "loss": 0.3844,
      "step": 205
    },
    {
      "epoch": 1.4960486322188449,
      "grad_norm": 0.04305846244096756,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.4414,
      "step": 206
    },
    {
      "epoch": 1.5033434650455928,
      "grad_norm": 0.04148327186703682,
      "learning_rate": 5.826287640548425e-06,
      "loss": 0.4327,
      "step": 207
    },
    {
      "epoch": 1.5106382978723403,
      "grad_norm": 0.0433870293200016,
      "learning_rate": 5.784274499820214e-06,
      "loss": 0.3787,
      "step": 208
    },
    {
      "epoch": 1.5179331306990882,
      "grad_norm": 0.041102319955825806,
      "learning_rate": 5.742204511446203e-06,
      "loss": 0.4189,
      "step": 209
    },
    {
      "epoch": 1.525227963525836,
      "grad_norm": 0.04175707325339317,
      "learning_rate": 5.7000807248431466e-06,
      "loss": 0.427,
      "step": 210
    },
    {
      "epoch": 1.5325227963525836,
      "grad_norm": 0.04286907613277435,
      "learning_rate": 5.657906193327325e-06,
      "loss": 0.4,
      "step": 211
    },
    {
      "epoch": 1.5398176291793313,
      "grad_norm": 0.04246861860156059,
      "learning_rate": 5.615683973893235e-06,
      "loss": 0.4097,
      "step": 212
    },
    {
      "epoch": 1.547112462006079,
      "grad_norm": 0.03898885101079941,
      "learning_rate": 5.573417126992004e-06,
      "loss": 0.4237,
      "step": 213
    },
    {
      "epoch": 1.554407294832827,
      "grad_norm": 0.04554813727736473,
      "learning_rate": 5.5311087163095475e-06,
      "loss": 0.436,
      "step": 214
    },
    {
      "epoch": 1.5617021276595744,
      "grad_norm": 0.04189833253622055,
      "learning_rate": 5.4887618085445094e-06,
      "loss": 0.4121,
      "step": 215
    },
    {
      "epoch": 1.5689969604863223,
      "grad_norm": 0.05306672677397728,
      "learning_rate": 5.446379473185972e-06,
      "loss": 0.4015,
      "step": 216
    },
    {
      "epoch": 1.5762917933130698,
      "grad_norm": 0.04060041531920433,
      "learning_rate": 5.403964782290962e-06,
      "loss": 0.3967,
      "step": 217
    },
    {
      "epoch": 1.5835866261398177,
      "grad_norm": 0.045451849699020386,
      "learning_rate": 5.361520810261779e-06,
      "loss": 0.4161,
      "step": 218
    },
    {
      "epoch": 1.5908814589665652,
      "grad_norm": 0.043955542147159576,
      "learning_rate": 5.319050633623141e-06,
      "loss": 0.4205,
      "step": 219
    },
    {
      "epoch": 1.598176291793313,
      "grad_norm": 0.040733452886343,
      "learning_rate": 5.276557330799203e-06,
      "loss": 0.4165,
      "step": 220
    },
    {
      "epoch": 1.6054711246200608,
      "grad_norm": 0.04190356284379959,
      "learning_rate": 5.234043981890395e-06,
      "loss": 0.4515,
      "step": 221
    },
    {
      "epoch": 1.6127659574468085,
      "grad_norm": 0.037713076919317245,
      "learning_rate": 5.191513668450178e-06,
      "loss": 0.4131,
      "step": 222
    },
    {
      "epoch": 1.6200607902735562,
      "grad_norm": 0.038250233978033066,
      "learning_rate": 5.1489694732616805e-06,
      "loss": 0.4028,
      "step": 223
    },
    {
      "epoch": 1.627355623100304,
      "grad_norm": 0.039751507341861725,
      "learning_rate": 5.106414480114238e-06,
      "loss": 0.4121,
      "step": 224
    },
    {
      "epoch": 1.6346504559270518,
      "grad_norm": 0.044864848256111145,
      "learning_rate": 5.06385177357987e-06,
      "loss": 0.4708,
      "step": 225
    },
    {
      "epoch": 1.6419452887537993,
      "grad_norm": 0.04169140383601189,
      "learning_rate": 5.021284438789694e-06,
      "loss": 0.425,
      "step": 226
    },
    {
      "epoch": 1.6492401215805472,
      "grad_norm": 0.04238287732005119,
      "learning_rate": 4.9787155612103076e-06,
      "loss": 0.409,
      "step": 227
    },
    {
      "epoch": 1.6565349544072947,
      "grad_norm": 0.03984750807285309,
      "learning_rate": 4.936148226420133e-06,
      "loss": 0.4451,
      "step": 228
    },
    {
      "epoch": 1.6638297872340426,
      "grad_norm": 0.03823258727788925,
      "learning_rate": 4.893585519885764e-06,
      "loss": 0.4318,
      "step": 229
    },
    {
      "epoch": 1.6711246200607903,
      "grad_norm": 0.043166667222976685,
      "learning_rate": 4.851030526738321e-06,
      "loss": 0.4348,
      "step": 230
    },
    {
      "epoch": 1.678419452887538,
      "grad_norm": 0.04118693992495537,
      "learning_rate": 4.808486331549824e-06,
      "loss": 0.435,
      "step": 231
    },
    {
      "epoch": 1.6857142857142857,
      "grad_norm": 0.040095556527376175,
      "learning_rate": 4.765956018109607e-06,
      "loss": 0.4506,
      "step": 232
    },
    {
      "epoch": 1.6930091185410334,
      "grad_norm": 0.04523642733693123,
      "learning_rate": 4.7234426692007985e-06,
      "loss": 0.4394,
      "step": 233
    },
    {
      "epoch": 1.7003039513677811,
      "grad_norm": 0.041244085878133774,
      "learning_rate": 4.680949366376858e-06,
      "loss": 0.4698,
      "step": 234
    },
    {
      "epoch": 1.7075987841945288,
      "grad_norm": 0.04374610632658005,
      "learning_rate": 4.638479189738224e-06,
      "loss": 0.4129,
      "step": 235
    },
    {
      "epoch": 1.7148936170212767,
      "grad_norm": 0.040487710386514664,
      "learning_rate": 4.596035217709039e-06,
      "loss": 0.4362,
      "step": 236
    },
    {
      "epoch": 1.7221884498480242,
      "grad_norm": 0.044370926916599274,
      "learning_rate": 4.553620526814029e-06,
      "loss": 0.4155,
      "step": 237
    },
    {
      "epoch": 1.7294832826747721,
      "grad_norm": 0.04036295786499977,
      "learning_rate": 4.511238191455491e-06,
      "loss": 0.4214,
      "step": 238
    },
    {
      "epoch": 1.7367781155015196,
      "grad_norm": 0.03773313760757446,
      "learning_rate": 4.468891283690454e-06,
      "loss": 0.4298,
      "step": 239
    },
    {
      "epoch": 1.7440729483282675,
      "grad_norm": 0.045683182775974274,
      "learning_rate": 4.426582873007999e-06,
      "loss": 0.4485,
      "step": 240
    },
    {
      "epoch": 1.7513677811550152,
      "grad_norm": 0.04686903581023216,
      "learning_rate": 4.384316026106766e-06,
      "loss": 0.4303,
      "step": 241
    },
    {
      "epoch": 1.758662613981763,
      "grad_norm": 0.045155324041843414,
      "learning_rate": 4.342093806672678e-06,
      "loss": 0.4409,
      "step": 242
    },
    {
      "epoch": 1.7659574468085106,
      "grad_norm": 0.0418829619884491,
      "learning_rate": 4.299919275156857e-06,
      "loss": 0.4149,
      "step": 243
    },
    {
      "epoch": 1.7732522796352583,
      "grad_norm": 0.041985101997852325,
      "learning_rate": 4.2577954885537985e-06,
      "loss": 0.4293,
      "step": 244
    },
    {
      "epoch": 1.780547112462006,
      "grad_norm": 0.042692556977272034,
      "learning_rate": 4.215725500179788e-06,
      "loss": 0.4258,
      "step": 245
    },
    {
      "epoch": 1.7878419452887537,
      "grad_norm": 0.04013342410326004,
      "learning_rate": 4.173712359451576e-06,
      "loss": 0.4015,
      "step": 246
    },
    {
      "epoch": 1.7951367781155017,
      "grad_norm": 0.038998380303382874,
      "learning_rate": 4.131759111665349e-06,
      "loss": 0.4596,
      "step": 247
    },
    {
      "epoch": 1.8024316109422491,
      "grad_norm": 0.039829254150390625,
      "learning_rate": 4.0898687977759895e-06,
      "loss": 0.4128,
      "step": 248
    },
    {
      "epoch": 1.809726443768997,
      "grad_norm": 0.04312862455844879,
      "learning_rate": 4.048044454176658e-06,
      "loss": 0.4243,
      "step": 249
    },
    {
      "epoch": 1.8170212765957445,
      "grad_norm": 0.04533419758081436,
      "learning_rate": 4.0062891124787e-06,
      "loss": 0.4414,
      "step": 250
    },
    {
      "epoch": 1.8243161094224924,
      "grad_norm": 0.0438460148870945,
      "learning_rate": 3.964605799291897e-06,
      "loss": 0.4553,
      "step": 251
    },
    {
      "epoch": 1.8316109422492401,
      "grad_norm": 0.0429726168513298,
      "learning_rate": 3.922997536005094e-06,
      "loss": 0.4311,
      "step": 252
    },
    {
      "epoch": 1.8389057750759878,
      "grad_norm": 0.039493922144174576,
      "learning_rate": 3.88146733856719e-06,
      "loss": 0.4387,
      "step": 253
    },
    {
      "epoch": 1.8462006079027355,
      "grad_norm": 0.04514075070619583,
      "learning_rate": 3.840018217268527e-06,
      "loss": 0.4442,
      "step": 254
    },
    {
      "epoch": 1.8534954407294832,
      "grad_norm": 0.04080420732498169,
      "learning_rate": 3.7986531765226965e-06,
      "loss": 0.3884,
      "step": 255
    },
    {
      "epoch": 1.8607902735562312,
      "grad_norm": 0.04457089304924011,
      "learning_rate": 3.757375214648764e-06,
      "loss": 0.3804,
      "step": 256
    },
    {
      "epoch": 1.8680851063829786,
      "grad_norm": 0.044411323964595795,
      "learning_rate": 3.716187323653939e-06,
      "loss": 0.4164,
      "step": 257
    },
    {
      "epoch": 1.8753799392097266,
      "grad_norm": 0.04527450352907181,
      "learning_rate": 3.675092489016693e-06,
      "loss": 0.4402,
      "step": 258
    },
    {
      "epoch": 1.882674772036474,
      "grad_norm": 0.039380993694067,
      "learning_rate": 3.6340936894703717e-06,
      "loss": 0.4329,
      "step": 259
    },
    {
      "epoch": 1.889969604863222,
      "grad_norm": 0.04769477993249893,
      "learning_rate": 3.593193896787277e-06,
      "loss": 0.4447,
      "step": 260
    },
    {
      "epoch": 1.8972644376899694,
      "grad_norm": 0.04017976298928261,
      "learning_rate": 3.5523960755632573e-06,
      "loss": 0.4066,
      "step": 261
    },
    {
      "epoch": 1.9045592705167174,
      "grad_norm": 0.04179855436086655,
      "learning_rate": 3.5117031830028274e-06,
      "loss": 0.4048,
      "step": 262
    },
    {
      "epoch": 1.911854103343465,
      "grad_norm": 0.041397638618946075,
      "learning_rate": 3.4711181687048114e-06,
      "loss": 0.4296,
      "step": 263
    },
    {
      "epoch": 1.9191489361702128,
      "grad_norm": 0.05517794191837311,
      "learning_rate": 3.4306439744485453e-06,
      "loss": 0.4266,
      "step": 264
    },
    {
      "epoch": 1.9264437689969605,
      "grad_norm": 0.04322275519371033,
      "learning_rate": 3.3902835339806463e-06,
      "loss": 0.4084,
      "step": 265
    },
    {
      "epoch": 1.9337386018237082,
      "grad_norm": 0.04079868271946907,
      "learning_rate": 3.3500397728023536e-06,
      "loss": 0.3987,
      "step": 266
    },
    {
      "epoch": 1.941033434650456,
      "grad_norm": 0.040178705006837845,
      "learning_rate": 3.309915607957487e-06,
      "loss": 0.3508,
      "step": 267
    },
    {
      "epoch": 1.9483282674772036,
      "grad_norm": 0.042311254888772964,
      "learning_rate": 3.2699139478209987e-06,
      "loss": 0.4315,
      "step": 268
    },
    {
      "epoch": 1.9556231003039515,
      "grad_norm": 0.041265442967414856,
      "learning_rate": 3.2300376918881628e-06,
      "loss": 0.4096,
      "step": 269
    },
    {
      "epoch": 1.962917933130699,
      "grad_norm": 0.035929929465055466,
      "learning_rate": 3.19028973056441e-06,
      "loss": 0.3872,
      "step": 270
    },
    {
      "epoch": 1.9702127659574469,
      "grad_norm": 0.04031127318739891,
      "learning_rate": 3.150672944955818e-06,
      "loss": 0.4299,
      "step": 271
    },
    {
      "epoch": 1.9775075987841946,
      "grad_norm": 0.043629132211208344,
      "learning_rate": 3.111190206660273e-06,
      "loss": 0.4371,
      "step": 272
    },
    {
      "epoch": 1.9848024316109423,
      "grad_norm": 0.03935433551669121,
      "learning_rate": 3.0718443775593233e-06,
      "loss": 0.3912,
      "step": 273
    },
    {
      "epoch": 1.99209726443769,
      "grad_norm": 0.04069478437304497,
      "learning_rate": 3.0326383096107424e-06,
      "loss": 0.416,
      "step": 274
    },
    {
      "epoch": 1.9993920972644377,
      "grad_norm": 0.05225847661495209,
      "learning_rate": 2.993574844641807e-06,
      "loss": 0.3656,
      "step": 275
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.05225847661495209,
      "learning_rate": 2.9546568141433007e-06,
      "loss": 0.5271,
      "step": 276
    },
    {
      "epoch": 2.007294832826748,
      "grad_norm": 0.15690822899341583,
      "learning_rate": 2.915887039064287e-06,
      "loss": 0.3677,
      "step": 277
    },
    {
      "epoch": 2.0145896656534954,
      "grad_norm": 0.040792327374219894,
      "learning_rate": 2.8772683296076197e-06,
      "loss": 0.3493,
      "step": 278
    },
    {
      "epoch": 2.0218844984802433,
      "grad_norm": 0.042940009385347366,
      "learning_rate": 2.838803485026265e-06,
      "loss": 0.3622,
      "step": 279
    },
    {
      "epoch": 2.029179331306991,
      "grad_norm": 0.03872222825884819,
      "learning_rate": 2.800495293420384e-06,
      "loss": 0.3204,
      "step": 280
    },
    {
      "epoch": 2.0364741641337387,
      "grad_norm": 0.03756758198142052,
      "learning_rate": 2.762346531535246e-06,
      "loss": 0.3158,
      "step": 281
    },
    {
      "epoch": 2.043768996960486,
      "grad_norm": 0.04466132074594498,
      "learning_rate": 2.724359964559958e-06,
      "loss": 0.3638,
      "step": 282
    },
    {
      "epoch": 2.051063829787234,
      "grad_norm": 0.04124055802822113,
      "learning_rate": 2.686538345927027e-06,
      "loss": 0.3231,
      "step": 283
    },
    {
      "epoch": 2.0583586626139816,
      "grad_norm": 0.04917893931269646,
      "learning_rate": 2.6488844171127903e-06,
      "loss": 0.3683,
      "step": 284
    },
    {
      "epoch": 2.0656534954407295,
      "grad_norm": 0.04118992015719414,
      "learning_rate": 2.611400907438685e-06,
      "loss": 0.294,
      "step": 285
    },
    {
      "epoch": 2.072948328267477,
      "grad_norm": 0.04872892051935196,
      "learning_rate": 2.574090533873431e-06,
      "loss": 0.3156,
      "step": 286
    },
    {
      "epoch": 2.080243161094225,
      "grad_norm": 0.04159025847911835,
      "learning_rate": 2.5369560008360826e-06,
      "loss": 0.3467,
      "step": 287
    },
    {
      "epoch": 2.087537993920973,
      "grad_norm": 0.04014930874109268,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.3303,
      "step": 288
    },
    {
      "epoch": 2.0948328267477203,
      "grad_norm": 0.04120990261435509,
      "learning_rate": 2.4632252100977567e-06,
      "loss": 0.3086,
      "step": 289
    },
    {
      "epoch": 2.1021276595744682,
      "grad_norm": 0.04079623147845268,
      "learning_rate": 2.426634296726955e-06,
      "loss": 0.3364,
      "step": 290
    },
    {
      "epoch": 2.1094224924012157,
      "grad_norm": 0.04097681865096092,
      "learning_rate": 2.3902299121570332e-06,
      "loss": 0.3254,
      "step": 291
    },
    {
      "epoch": 2.1167173252279636,
      "grad_norm": 0.04335152730345726,
      "learning_rate": 2.354014695136997e-06,
      "loss": 0.2875,
      "step": 292
    },
    {
      "epoch": 2.124012158054711,
      "grad_norm": 0.041609250009059906,
      "learning_rate": 2.317991270704167e-06,
      "loss": 0.3585,
      "step": 293
    },
    {
      "epoch": 2.131306990881459,
      "grad_norm": 0.03962058201432228,
      "learning_rate": 2.282162249993895e-06,
      "loss": 0.3094,
      "step": 294
    },
    {
      "epoch": 2.1386018237082065,
      "grad_norm": 0.041423097252845764,
      "learning_rate": 2.2465302300503012e-06,
      "loss": 0.358,
      "step": 295
    },
    {
      "epoch": 2.1458966565349544,
      "grad_norm": 0.044281214475631714,
      "learning_rate": 2.211097793638029e-06,
      "loss": 0.3575,
      "step": 296
    },
    {
      "epoch": 2.153191489361702,
      "grad_norm": 0.04825511574745178,
      "learning_rate": 2.175867509055033e-06,
      "loss": 0.3364,
      "step": 297
    },
    {
      "epoch": 2.16048632218845,
      "grad_norm": 0.04354681074619293,
      "learning_rate": 2.1408419299464245e-06,
      "loss": 0.2979,
      "step": 298
    },
    {
      "epoch": 2.1677811550151977,
      "grad_norm": 0.042697008699178696,
      "learning_rate": 2.106023595119358e-06,
      "loss": 0.3356,
      "step": 299
    },
    {
      "epoch": 2.1750759878419452,
      "grad_norm": 0.047276780009269714,
      "learning_rate": 2.071415028359026e-06,
      "loss": 0.3634,
      "step": 300
    },
    {
      "epoch": 2.182370820668693,
      "grad_norm": 0.041072778403759,
      "learning_rate": 2.037018738245707e-06,
      "loss": 0.3362,
      "step": 301
    },
    {
      "epoch": 2.1896656534954406,
      "grad_norm": 0.04692791774868965,
      "learning_rate": 2.0028372179729405e-06,
      "loss": 0.3511,
      "step": 302
    },
    {
      "epoch": 2.1969604863221885,
      "grad_norm": 0.04538114741444588,
      "learning_rate": 1.9688729451668116e-06,
      "loss": 0.3336,
      "step": 303
    },
    {
      "epoch": 2.204255319148936,
      "grad_norm": 0.044003862887620926,
      "learning_rate": 1.935128381706355e-06,
      "loss": 0.3349,
      "step": 304
    },
    {
      "epoch": 2.211550151975684,
      "grad_norm": 0.045857448130846024,
      "learning_rate": 1.901605973545116e-06,
      "loss": 0.3537,
      "step": 305
    },
    {
      "epoch": 2.2188449848024314,
      "grad_norm": 0.04272821545600891,
      "learning_rate": 1.8683081505338468e-06,
      "loss": 0.3373,
      "step": 306
    },
    {
      "epoch": 2.2261398176291793,
      "grad_norm": 0.04273563250899315,
      "learning_rate": 1.8352373262443918e-06,
      "loss": 0.3436,
      "step": 307
    },
    {
      "epoch": 2.2334346504559273,
      "grad_norm": 0.058361783623695374,
      "learning_rate": 1.8023958977947303e-06,
      "loss": 0.3245,
      "step": 308
    },
    {
      "epoch": 2.2407294832826747,
      "grad_norm": 0.04002346843481064,
      "learning_rate": 1.7697862456752273e-06,
      "loss": 0.3317,
      "step": 309
    },
    {
      "epoch": 2.2480243161094227,
      "grad_norm": 0.039896223694086075,
      "learning_rate": 1.7374107335760937e-06,
      "loss": 0.2976,
      "step": 310
    },
    {
      "epoch": 2.25531914893617,
      "grad_norm": 0.04140615463256836,
      "learning_rate": 1.7052717082160348e-06,
      "loss": 0.3178,
      "step": 311
    },
    {
      "epoch": 2.262613981762918,
      "grad_norm": 0.04329473525285721,
      "learning_rate": 1.6733714991721738e-06,
      "loss": 0.3187,
      "step": 312
    },
    {
      "epoch": 2.2699088145896655,
      "grad_norm": 0.04440492019057274,
      "learning_rate": 1.6417124187111778e-06,
      "loss": 0.3194,
      "step": 313
    },
    {
      "epoch": 2.2772036474164135,
      "grad_norm": 0.03996637463569641,
      "learning_rate": 1.610296761621662e-06,
      "loss": 0.3504,
      "step": 314
    },
    {
      "epoch": 2.284498480243161,
      "grad_norm": 0.04870344325900078,
      "learning_rate": 1.5791268050478487e-06,
      "loss": 0.3599,
      "step": 315
    },
    {
      "epoch": 2.291793313069909,
      "grad_norm": 0.049775756895542145,
      "learning_rate": 1.5482048083245116e-06,
      "loss": 0.3051,
      "step": 316
    },
    {
      "epoch": 2.2990881458966568,
      "grad_norm": 0.04166199639439583,
      "learning_rate": 1.517533012813217e-06,
      "loss": 0.3383,
      "step": 317
    },
    {
      "epoch": 2.3063829787234043,
      "grad_norm": 0.04483890160918236,
      "learning_rate": 1.4871136417398407e-06,
      "loss": 0.3302,
      "step": 318
    },
    {
      "epoch": 2.3136778115501517,
      "grad_norm": 0.04276059940457344,
      "learning_rate": 1.4569489000334435e-06,
      "loss": 0.3615,
      "step": 319
    },
    {
      "epoch": 2.3209726443768997,
      "grad_norm": 0.04011907801032066,
      "learning_rate": 1.427040974166427e-06,
      "loss": 0.3139,
      "step": 320
    },
    {
      "epoch": 2.3282674772036476,
      "grad_norm": 0.043183084577322006,
      "learning_rate": 1.3973920319960654e-06,
      "loss": 0.3327,
      "step": 321
    },
    {
      "epoch": 2.335562310030395,
      "grad_norm": 0.045110031962394714,
      "learning_rate": 1.3680042226073554e-06,
      "loss": 0.3183,
      "step": 322
    },
    {
      "epoch": 2.342857142857143,
      "grad_norm": 0.04653245955705643,
      "learning_rate": 1.3388796761572493e-06,
      "loss": 0.3411,
      "step": 323
    },
    {
      "epoch": 2.3501519756838904,
      "grad_norm": 0.04192928597331047,
      "learning_rate": 1.310020503720254e-06,
      "loss": 0.3722,
      "step": 324
    },
    {
      "epoch": 2.3574468085106384,
      "grad_norm": 0.04330296441912651,
      "learning_rate": 1.2814287971354023e-06,
      "loss": 0.325,
      "step": 325
    },
    {
      "epoch": 2.364741641337386,
      "grad_norm": 0.04404173046350479,
      "learning_rate": 1.253106628854635e-06,
      "loss": 0.3247,
      "step": 326
    },
    {
      "epoch": 2.3720364741641338,
      "grad_norm": 0.04104992374777794,
      "learning_rate": 1.2250560517925747e-06,
      "loss": 0.3079,
      "step": 327
    },
    {
      "epoch": 2.3793313069908812,
      "grad_norm": 0.04262121394276619,
      "learning_rate": 1.197279099177731e-06,
      "loss": 0.3446,
      "step": 328
    },
    {
      "epoch": 2.386626139817629,
      "grad_norm": 0.04929178208112717,
      "learning_rate": 1.1697777844051105e-06,
      "loss": 0.3501,
      "step": 329
    },
    {
      "epoch": 2.393920972644377,
      "grad_norm": 0.04329733923077583,
      "learning_rate": 1.1425541008902852e-06,
      "loss": 0.3213,
      "step": 330
    },
    {
      "epoch": 2.4012158054711246,
      "grad_norm": 0.04333839192986488,
      "learning_rate": 1.1156100219249022e-06,
      "loss": 0.3232,
      "step": 331
    },
    {
      "epoch": 2.4085106382978725,
      "grad_norm": 0.04259442910552025,
      "learning_rate": 1.0889475005336447e-06,
      "loss": 0.3632,
      "step": 332
    },
    {
      "epoch": 2.41580547112462,
      "grad_norm": 0.04376016557216644,
      "learning_rate": 1.0625684693326727e-06,
      "loss": 0.3355,
      "step": 333
    },
    {
      "epoch": 2.423100303951368,
      "grad_norm": 0.04070465639233589,
      "learning_rate": 1.0364748403895368e-06,
      "loss": 0.341,
      "step": 334
    },
    {
      "epoch": 2.4303951367781154,
      "grad_norm": 0.041908472776412964,
      "learning_rate": 1.0106685050845838e-06,
      "loss": 0.3383,
      "step": 335
    },
    {
      "epoch": 2.4376899696048633,
      "grad_norm": 0.04458033666014671,
      "learning_rate": 9.851513339738627e-07,
      "loss": 0.3246,
      "step": 336
    },
    {
      "epoch": 2.4449848024316108,
      "grad_norm": 0.04246847331523895,
      "learning_rate": 9.599251766535344e-07,
      "loss": 0.3418,
      "step": 337
    },
    {
      "epoch": 2.4522796352583587,
      "grad_norm": 0.04456906393170357,
      "learning_rate": 9.349918616258113e-07,
      "loss": 0.3536,
      "step": 338
    },
    {
      "epoch": 2.4595744680851066,
      "grad_norm": 0.041079938411712646,
      "learning_rate": 9.10353196166412e-07,
| "loss": 0.3278, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.466869300911854, | |
| "grad_norm": 0.049959778785705566, | |
| "learning_rate": 8.860109661935673e-07, | |
| "loss": 0.3417, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.474164133738602, | |
| "grad_norm": 0.044310178607702255, | |
| "learning_rate": 8.619669361385663e-07, | |
| "loss": 0.3148, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.4814589665653495, | |
| "grad_norm": 0.04187872260808945, | |
| "learning_rate": 8.382228488178639e-07, | |
| "loss": 0.3392, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.4887537993920974, | |
| "grad_norm": 0.04105791822075844, | |
| "learning_rate": 8.147804253067581e-07, | |
| "loss": 0.3273, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.496048632218845, | |
| "grad_norm": 0.039138007909059525, | |
| "learning_rate": 7.916413648146282e-07, | |
| "loss": 0.3112, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.503343465045593, | |
| "grad_norm": 0.04459141194820404, | |
| "learning_rate": 7.6880734456178e-07, | |
| "loss": 0.3463, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.5106382978723403, | |
| "grad_norm": 0.043715398758649826, | |
| "learning_rate": 7.462800196578662e-07, | |
| "loss": 0.3439, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.517933130699088, | |
| "grad_norm": 0.043532464653253555, | |
| "learning_rate": 7.240610229819195e-07, | |
| "loss": 0.3303, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.525227963525836, | |
| "grad_norm": 0.04273553937673569, | |
| "learning_rate": 7.021519650639952e-07, | |
| "loss": 0.3171, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.5325227963525836, | |
| "grad_norm": 0.05441723391413689, | |
| "learning_rate": 6.805544339684295e-07, | |
| "loss": 0.3239, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.539817629179331, | |
| "grad_norm": 0.04585114121437073, | |
| "learning_rate": 6.592699951787362e-07, | |
| "loss": 0.3378, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.547112462006079, | |
| "grad_norm": 0.04242338612675667, | |
| "learning_rate": 6.383001914841252e-07, | |
| "loss": 0.2992, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.554407294832827, | |
| "grad_norm": 0.046155836433172226, | |
| "learning_rate": 6.17646542867682e-07, | |
| "loss": 0.3503, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.5617021276595744, | |
| "grad_norm": 0.04374154284596443, | |
| "learning_rate": 5.973105463961864e-07, | |
| "loss": 0.3385, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.5689969604863223, | |
| "grad_norm": 0.04297053441405296, | |
| "learning_rate": 5.772936761116027e-07, | |
| "loss": 0.3396, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.57629179331307, | |
| "grad_norm": 0.04214682802557945, | |
| "learning_rate": 5.575973829242365e-07, | |
| "loss": 0.3373, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.5835866261398177, | |
| "grad_norm": 0.04097369685769081, | |
| "learning_rate": 5.382230945075556e-07, | |
| "loss": 0.3386, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.590881458966565, | |
| "grad_norm": 0.042690787464380264, | |
| "learning_rate": 5.191722151947227e-07, | |
| "loss": 0.3319, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.598176291793313, | |
| "grad_norm": 0.040518004447221756, | |
| "learning_rate": 5.004461258767873e-07, | |
| "loss": 0.3174, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.6054711246200606, | |
| "grad_norm": 0.04100370407104492, | |
| "learning_rate": 4.820461839026047e-07, | |
| "loss": 0.34, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.6127659574468085, | |
| "grad_norm": 0.04036758467555046, | |
| "learning_rate": 4.639737229804403e-07, | |
| "loss": 0.3351, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.6200607902735564, | |
| "grad_norm": 0.04206588491797447, | |
| "learning_rate": 4.4623005308130243e-07, | |
| "loss": 0.3244, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.627355623100304, | |
| "grad_norm": 0.04280061274766922, | |
| "learning_rate": 4.2881646034398926e-07, | |
| "loss": 0.3065, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.634650455927052, | |
| "grad_norm": 0.04229553043842316, | |
| "learning_rate": 4.1173420698186027e-07, | |
| "loss": 0.3306, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.6419452887537993, | |
| "grad_norm": 0.044544368982315063, | |
| "learning_rate": 3.9498453119134917e-07, | |
| "loss": 0.3514, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.6492401215805472, | |
| "grad_norm": 0.045995116233825684, | |
| "learning_rate": 3.7856864706221187e-07, | |
| "loss": 0.3498, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.6565349544072947, | |
| "grad_norm": 0.048596058040857315, | |
| "learning_rate": 3.6248774448952695e-07, | |
| "loss": 0.3358, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.6638297872340426, | |
| "grad_norm": 0.04591159150004387, | |
| "learning_rate": 3.467429890874424e-07, | |
| "loss": 0.3129, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.67112462006079, | |
| "grad_norm": 0.041663773357868195, | |
| "learning_rate": 3.313355221046888e-07, | |
| "loss": 0.3213, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.678419452887538, | |
| "grad_norm": 0.04105694591999054, | |
| "learning_rate": 3.1626646034186084e-07, | |
| "loss": 0.345, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.685714285714286, | |
| "grad_norm": 0.044980090111494064, | |
| "learning_rate": 3.015368960704584e-07, | |
| "loss": 0.3265, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.6930091185410334, | |
| "grad_norm": 0.04313720017671585, | |
| "learning_rate": 2.871478969537206e-07, | |
| "loss": 0.3434, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.700303951367781, | |
| "grad_norm": 0.04387833923101425, | |
| "learning_rate": 2.7310050596923323e-07, | |
| "loss": 0.33, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.707598784194529, | |
| "grad_norm": 0.04352449253201485, | |
| "learning_rate": 2.593957413333331e-07, | |
| "loss": 0.3315, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.7148936170212767, | |
| "grad_norm": 0.04267902672290802, | |
| "learning_rate": 2.4603459642729867e-07, | |
| "loss": 0.3574, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 2.722188449848024, | |
| "grad_norm": 0.046177759766578674, | |
| "learning_rate": 2.330180397253473e-07, | |
| "loss": 0.3428, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.729483282674772, | |
| "grad_norm": 0.0405619777739048, | |
| "learning_rate": 2.2034701472443854e-07, | |
| "loss": 0.2903, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 2.7367781155015196, | |
| "grad_norm": 0.04294833540916443, | |
| "learning_rate": 2.0802243987588068e-07, | |
| "loss": 0.3664, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 2.7440729483282675, | |
| "grad_norm": 0.048284079879522324, | |
| "learning_rate": 1.9604520851876196e-07, | |
| "loss": 0.3294, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 2.7513677811550155, | |
| "grad_norm": 0.04744973033666611, | |
| "learning_rate": 1.8441618881519186e-07, | |
| "loss": 0.321, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 2.758662613981763, | |
| "grad_norm": 0.03917940333485603, | |
| "learning_rate": 1.7313622368738014e-07, | |
| "loss": 0.307, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.7659574468085104, | |
| "grad_norm": 0.04587104544043541, | |
| "learning_rate": 1.6220613075653201e-07, | |
| "loss": 0.3464, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 2.7732522796352583, | |
| "grad_norm": 0.042470064014196396, | |
| "learning_rate": 1.51626702283586e-07, | |
| "loss": 0.2907, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 2.7805471124620063, | |
| "grad_norm": 0.048345521092414856, | |
| "learning_rate": 1.4139870511178767e-07, | |
| "loss": 0.3481, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 2.7878419452887537, | |
| "grad_norm": 0.038930460810661316, | |
| "learning_rate": 1.3152288061110518e-07, | |
| "loss": 0.2987, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 2.7951367781155017, | |
| "grad_norm": 0.04472014680504799, | |
| "learning_rate": 1.2199994462448906e-07, | |
| "loss": 0.3612, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 2.802431610942249, | |
| "grad_norm": 0.05124653875827789, | |
| "learning_rate": 1.1283058741598962e-07, | |
| "loss": 0.3051, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 2.809726443768997, | |
| "grad_norm": 0.038610782474279404, | |
| "learning_rate": 1.0401547362071939e-07, | |
| "loss": 0.3362, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 2.8170212765957445, | |
| "grad_norm": 0.042344819754362106, | |
| "learning_rate": 9.555524219667989e-08, | |
| "loss": 0.3206, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 2.8243161094224924, | |
| "grad_norm": 0.046877894550561905, | |
| "learning_rate": 8.745050637844532e-08, | |
| "loss": 0.3332, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 2.83161094224924, | |
| "grad_norm": 0.04104023799300194, | |
| "learning_rate": 7.970185363271432e-08, | |
| "loss": 0.2941, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.838905775075988, | |
| "grad_norm": 0.04339218884706497, | |
| "learning_rate": 7.230984561572729e-08, | |
| "loss": 0.3409, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 2.8462006079027358, | |
| "grad_norm": 0.047086864709854126, | |
| "learning_rate": 6.527501813255344e-08, | |
| "loss": 0.3282, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 2.8534954407294832, | |
| "grad_norm": 0.04140612855553627, | |
| "learning_rate": 5.8597881098257924e-08, | |
| "loss": 0.3706, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 2.860790273556231, | |
| "grad_norm": 0.045086655765771866, | |
| "learning_rate": 5.227891850093314e-08, | |
| "loss": 0.3159, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 2.8680851063829786, | |
| "grad_norm": 0.04669662564992905, | |
| "learning_rate": 4.631858836662562e-08, | |
| "loss": 0.3212, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 2.8753799392097266, | |
| "grad_norm": 0.048425789922475815, | |
| "learning_rate": 4.071732272613149e-08, | |
| "loss": 0.3781, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 2.882674772036474, | |
| "grad_norm": 0.0414654016494751, | |
| "learning_rate": 3.5475527583681005e-08, | |
| "loss": 0.3487, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 2.889969604863222, | |
| "grad_norm": 0.04240646958351135, | |
| "learning_rate": 3.059358288751202e-08, | |
| "loss": 0.3063, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 2.8972644376899694, | |
| "grad_norm": 0.047270409762859344, | |
| "learning_rate": 2.6071842502326526e-08, | |
| "loss": 0.3352, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 2.9045592705167174, | |
| "grad_norm": 0.043454963713884354, | |
| "learning_rate": 2.1910634183644475e-08, | |
| "loss": 0.3442, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.9118541033434653, | |
| "grad_norm": 0.045333363115787506, | |
| "learning_rate": 1.811025955404333e-08, | |
| "loss": 0.3196, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 2.9191489361702128, | |
| "grad_norm": 0.04678984358906746, | |
| "learning_rate": 1.4670994081297796e-08, | |
| "loss": 0.3319, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 2.9264437689969602, | |
| "grad_norm": 0.05119337886571884, | |
| "learning_rate": 1.159308705841078e-08, | |
| "loss": 0.3614, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 2.933738601823708, | |
| "grad_norm": 0.048367924988269806, | |
| "learning_rate": 8.87676158554507e-09, | |
| "loss": 0.3615, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 2.941033434650456, | |
| "grad_norm": 0.04424307495355606, | |
| "learning_rate": 6.5222145538501595e-09, | |
| "loss": 0.3027, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 2.9483282674772036, | |
| "grad_norm": 0.0484929159283638, | |
| "learning_rate": 4.5296166311931125e-09, | |
| "loss": 0.3259, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 2.9556231003039515, | |
| "grad_norm": 0.04274160414934158, | |
| "learning_rate": 2.899112249786229e-09, | |
| "loss": 0.3219, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 2.962917933130699, | |
| "grad_norm": 0.04609229788184166, | |
| "learning_rate": 1.6308195957182028e-09, | |
| "loss": 0.3074, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 2.970212765957447, | |
| "grad_norm": 0.05541510134935379, | |
| "learning_rate": 7.24830600386528e-10, | |
| "loss": 0.3853, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 2.977507598784195, | |
| "grad_norm": 0.0421581007540226, | |
| "learning_rate": 1.812109338367174e-10, | |
| "loss": 0.3192, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.9848024316109423, | |
| "grad_norm": 0.04373352229595184, | |
| "learning_rate": 0.0, | |
| "loss": 0.3766, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 2.9848024316109423, | |
| "step": 411, | |
| "total_flos": 3.260394272163103e+17, | |
| "train_loss": 0.4436677635586175, | |
| "train_runtime": 142049.243, | |
| "train_samples_per_second": 0.139, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 411, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.260394272163103e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
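The block above is the tail of a trainer_state.json as written by the Hugging Face transformers Trainer: each per-step record in "log_history" carries "epoch", "grad_norm", "learning_rate", "loss", and "step", while the closing record drops the per-step fields and carries run-level aggregates instead ("train_loss", "train_runtime", "total_flos"). A minimal sketch for consuming such a file follows, assuming the state is saved as plain JSON under the illustrative name trainer_state.json; only the standard library is used, and the field layout is taken directly from the records above.

import json

# Load the Trainer state dumped at the end of the run.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step records carry "loss"; the final record carries aggregates instead.
step_logs = [rec for rec in state["log_history"] if "loss" in rec]
summary = state["log_history"][-1]

last = step_logs[-1]
print(f"steps logged        : {len(step_logs)} (global_step={state['global_step']})")
print(f"final step loss     : {last['loss']:.4f} at step {last['step']}")
print(f"averaged train_loss : {summary['train_loss']:.4f}")
print(f"final learning rate : {last['learning_rate']:.3e}")
print(f"train runtime (s)   : {summary['train_runtime']:.1f}")

Run against the state above, this reports 411 logged steps, a final step loss of 0.3766 at step 411, an averaged train_loss of 0.4437, a final learning rate of 0.000e+00 (the schedule decays to exactly zero on the last step), and a runtime of 142049.2 s.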