{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9848024316109423, "eval_steps": 500, "global_step": 411, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00729483282674772, "grad_norm": 0.14541301131248474, "learning_rate": 2.3809523809523811e-07, "loss": 0.7602, "step": 1 }, { "epoch": 0.01458966565349544, "grad_norm": 0.1567784547805786, "learning_rate": 4.7619047619047623e-07, "loss": 0.8215, "step": 2 }, { "epoch": 0.02188449848024316, "grad_norm": 0.1584789901971817, "learning_rate": 7.142857142857143e-07, "loss": 0.8269, "step": 3 }, { "epoch": 0.02917933130699088, "grad_norm": 0.157843217253685, "learning_rate": 9.523809523809525e-07, "loss": 0.7909, "step": 4 }, { "epoch": 0.0364741641337386, "grad_norm": 0.1607961654663086, "learning_rate": 1.1904761904761906e-06, "loss": 0.8198, "step": 5 }, { "epoch": 0.04376899696048632, "grad_norm": 0.15348272025585175, "learning_rate": 1.4285714285714286e-06, "loss": 0.7687, "step": 6 }, { "epoch": 0.05106382978723404, "grad_norm": 0.1496104598045349, "learning_rate": 1.6666666666666667e-06, "loss": 0.7942, "step": 7 }, { "epoch": 0.05835866261398176, "grad_norm": 0.2734036445617676, "learning_rate": 1.904761904761905e-06, "loss": 0.847, "step": 8 }, { "epoch": 0.06565349544072949, "grad_norm": 0.13683773577213287, "learning_rate": 2.1428571428571427e-06, "loss": 0.754, "step": 9 }, { "epoch": 0.0729483282674772, "grad_norm": 0.11306589841842651, "learning_rate": 2.380952380952381e-06, "loss": 0.6991, "step": 10 }, { "epoch": 0.08024316109422493, "grad_norm": 0.12233421206474304, "learning_rate": 2.6190476190476192e-06, "loss": 0.7829, "step": 11 }, { "epoch": 0.08753799392097264, "grad_norm": 0.10262873023748398, "learning_rate": 2.8571428571428573e-06, "loss": 0.7048, "step": 12 }, { "epoch": 0.09483282674772037, "grad_norm": 0.10435234010219574, "learning_rate": 3.0952380952380957e-06, "loss": 0.768, "step": 13 }, { "epoch": 0.10212765957446808, "grad_norm": 0.0735386535525322, "learning_rate": 3.3333333333333333e-06, "loss": 0.6147, "step": 14 }, { "epoch": 0.1094224924012158, "grad_norm": 0.07339954376220703, "learning_rate": 3.5714285714285718e-06, "loss": 0.7452, "step": 15 }, { "epoch": 0.11671732522796352, "grad_norm": 0.06846445798873901, "learning_rate": 3.80952380952381e-06, "loss": 0.7383, "step": 16 }, { "epoch": 0.12401215805471125, "grad_norm": 0.07185480743646622, "learning_rate": 4.047619047619048e-06, "loss": 0.7081, "step": 17 }, { "epoch": 0.13130699088145897, "grad_norm": 0.06281375139951706, "learning_rate": 4.2857142857142855e-06, "loss": 0.6171, "step": 18 }, { "epoch": 0.1386018237082067, "grad_norm": 0.08302997052669525, "learning_rate": 4.523809523809524e-06, "loss": 0.6136, "step": 19 }, { "epoch": 0.1458966565349544, "grad_norm": 0.07521834969520569, "learning_rate": 4.761904761904762e-06, "loss": 0.6401, "step": 20 }, { "epoch": 0.15319148936170213, "grad_norm": 0.07346966117620468, "learning_rate": 5e-06, "loss": 0.6575, "step": 21 }, { "epoch": 0.16048632218844985, "grad_norm": 0.06220546364784241, "learning_rate": 5.2380952380952384e-06, "loss": 0.585, "step": 22 }, { "epoch": 0.16778115501519758, "grad_norm": 0.06210927292704582, "learning_rate": 5.476190476190477e-06, "loss": 0.6116, "step": 23 }, { "epoch": 0.17507598784194528, "grad_norm": 0.06617508083581924, "learning_rate": 5.7142857142857145e-06, "loss": 0.6591, "step": 24 }, { "epoch": 0.182370820668693, "grad_norm": 0.06115543842315674, "learning_rate": 5.9523809523809525e-06, "loss": 0.6164, "step": 25 }, { "epoch": 0.18966565349544073, "grad_norm": 0.05512455105781555, "learning_rate": 6.1904761904761914e-06, "loss": 0.6131, "step": 26 }, { "epoch": 0.19696048632218846, "grad_norm": 0.05426128953695297, "learning_rate": 6.4285714285714295e-06, "loss": 0.6233, "step": 27 }, { "epoch": 0.20425531914893616, "grad_norm": 0.04495101794600487, "learning_rate": 6.666666666666667e-06, "loss": 0.6017, "step": 28 }, { "epoch": 0.2115501519756839, "grad_norm": 0.052700527012348175, "learning_rate": 6.9047619047619055e-06, "loss": 0.6209, "step": 29 }, { "epoch": 0.2188449848024316, "grad_norm": 0.05274520814418793, "learning_rate": 7.1428571428571436e-06, "loss": 0.562, "step": 30 }, { "epoch": 0.22613981762917934, "grad_norm": 0.0418085902929306, "learning_rate": 7.380952380952382e-06, "loss": 0.5356, "step": 31 }, { "epoch": 0.23343465045592704, "grad_norm": 0.04744059965014458, "learning_rate": 7.61904761904762e-06, "loss": 0.5759, "step": 32 }, { "epoch": 0.24072948328267477, "grad_norm": 0.051624756306409836, "learning_rate": 7.857142857142858e-06, "loss": 0.5989, "step": 33 }, { "epoch": 0.2480243161094225, "grad_norm": 0.04632480815052986, "learning_rate": 8.095238095238097e-06, "loss": 0.6036, "step": 34 }, { "epoch": 0.2553191489361702, "grad_norm": 0.040394943207502365, "learning_rate": 8.333333333333334e-06, "loss": 0.5439, "step": 35 }, { "epoch": 0.26261398176291795, "grad_norm": 0.047632846981287, "learning_rate": 8.571428571428571e-06, "loss": 0.6187, "step": 36 }, { "epoch": 0.26990881458966565, "grad_norm": 0.04498811811208725, "learning_rate": 8.80952380952381e-06, "loss": 0.5686, "step": 37 }, { "epoch": 0.2772036474164134, "grad_norm": 0.04858787730336189, "learning_rate": 9.047619047619049e-06, "loss": 0.6224, "step": 38 }, { "epoch": 0.2844984802431611, "grad_norm": 0.04534129053354263, "learning_rate": 9.285714285714288e-06, "loss": 0.576, "step": 39 }, { "epoch": 0.2917933130699088, "grad_norm": 0.04878037050366402, "learning_rate": 9.523809523809525e-06, "loss": 0.5956, "step": 40 }, { "epoch": 0.29908814589665655, "grad_norm": 0.044632136821746826, "learning_rate": 9.761904761904762e-06, "loss": 0.5748, "step": 41 }, { "epoch": 0.30638297872340425, "grad_norm": 0.041874803602695465, "learning_rate": 1e-05, "loss": 0.5752, "step": 42 }, { "epoch": 0.31367781155015195, "grad_norm": 0.041942398995161057, "learning_rate": 9.999818789066164e-06, "loss": 0.5759, "step": 43 }, { "epoch": 0.3209726443768997, "grad_norm": 0.045705121010541916, "learning_rate": 9.999275169399614e-06, "loss": 0.5962, "step": 44 }, { "epoch": 0.3282674772036474, "grad_norm": 0.043411824852228165, "learning_rate": 9.998369180404283e-06, "loss": 0.54, "step": 45 }, { "epoch": 0.33556231003039516, "grad_norm": 0.03998137265443802, "learning_rate": 9.997100887750215e-06, "loss": 0.5874, "step": 46 }, { "epoch": 0.34285714285714286, "grad_norm": 0.047370899468660355, "learning_rate": 9.995470383368808e-06, "loss": 0.6051, "step": 47 }, { "epoch": 0.35015197568389056, "grad_norm": 0.04455406963825226, "learning_rate": 9.993477785446151e-06, "loss": 0.5604, "step": 48 }, { "epoch": 0.3574468085106383, "grad_norm": 0.043418001383543015, "learning_rate": 9.991123238414455e-06, "loss": 0.5555, "step": 49 }, { "epoch": 0.364741641337386, "grad_norm": 0.03939136862754822, "learning_rate": 9.988406912941591e-06, "loss": 0.5493, "step": 50 }, { "epoch": 0.3720364741641337, "grad_norm": 0.04485655948519707, "learning_rate": 9.985329005918702e-06, "loss": 0.5804, "step": 51 }, { "epoch": 0.37933130699088147, "grad_norm": 0.0435781255364418, "learning_rate": 9.981889740445958e-06, "loss": 0.5617, "step": 52 }, { "epoch": 0.38662613981762917, "grad_norm": 0.03838958591222763, "learning_rate": 9.978089365816357e-06, "loss": 0.5481, "step": 53 }, { "epoch": 0.3939209726443769, "grad_norm": 0.03926938772201538, "learning_rate": 9.973928157497675e-06, "loss": 0.5195, "step": 54 }, { "epoch": 0.4012158054711246, "grad_norm": 0.049530286341905594, "learning_rate": 9.969406417112489e-06, "loss": 0.5854, "step": 55 }, { "epoch": 0.4085106382978723, "grad_norm": 0.08943431824445724, "learning_rate": 9.964524472416319e-06, "loss": 0.5706, "step": 56 }, { "epoch": 0.4158054711246201, "grad_norm": 0.04114034026861191, "learning_rate": 9.959282677273869e-06, "loss": 0.4923, "step": 57 }, { "epoch": 0.4231003039513678, "grad_norm": 0.03834295645356178, "learning_rate": 9.953681411633376e-06, "loss": 0.5151, "step": 58 }, { "epoch": 0.43039513677811553, "grad_norm": 0.03940470516681671, "learning_rate": 9.947721081499068e-06, "loss": 0.5274, "step": 59 }, { "epoch": 0.4376899696048632, "grad_norm": 0.05276661738753319, "learning_rate": 9.941402118901743e-06, "loss": 0.5312, "step": 60 }, { "epoch": 0.4449848024316109, "grad_norm": 0.04116562008857727, "learning_rate": 9.934724981867447e-06, "loss": 0.5073, "step": 61 }, { "epoch": 0.4522796352583587, "grad_norm": 0.039049182087183, "learning_rate": 9.927690154384273e-06, "loss": 0.5367, "step": 62 }, { "epoch": 0.4595744680851064, "grad_norm": 0.042383261024951935, "learning_rate": 9.920298146367287e-06, "loss": 0.5232, "step": 63 }, { "epoch": 0.4668693009118541, "grad_norm": 0.04153553023934364, "learning_rate": 9.912549493621555e-06, "loss": 0.5438, "step": 64 }, { "epoch": 0.47416413373860183, "grad_norm": 0.04116344451904297, "learning_rate": 9.904444757803322e-06, "loss": 0.4803, "step": 65 }, { "epoch": 0.48145896656534953, "grad_norm": 0.06467548757791519, "learning_rate": 9.895984526379282e-06, "loss": 0.5554, "step": 66 }, { "epoch": 0.4887537993920973, "grad_norm": 0.04420805722475052, "learning_rate": 9.887169412584012e-06, "loss": 0.5659, "step": 67 }, { "epoch": 0.496048632218845, "grad_norm": 0.04072507843375206, "learning_rate": 9.878000055375512e-06, "loss": 0.486, "step": 68 }, { "epoch": 0.5033434650455927, "grad_norm": 0.04508865624666214, "learning_rate": 9.868477119388897e-06, "loss": 0.5284, "step": 69 }, { "epoch": 0.5106382978723404, "grad_norm": 0.04231835529208183, "learning_rate": 9.858601294888212e-06, "loss": 0.5185, "step": 70 }, { "epoch": 0.5179331306990882, "grad_norm": 0.03981684520840645, "learning_rate": 9.848373297716414e-06, "loss": 0.5246, "step": 71 }, { "epoch": 0.5252279635258359, "grad_norm": 0.045293230563402176, "learning_rate": 9.837793869243468e-06, "loss": 0.5403, "step": 72 }, { "epoch": 0.5325227963525836, "grad_norm": 0.0415407195687294, "learning_rate": 9.826863776312621e-06, "loss": 0.568, "step": 73 }, { "epoch": 0.5398176291793313, "grad_norm": 0.04549698531627655, "learning_rate": 9.815583811184809e-06, "loss": 0.5547, "step": 74 }, { "epoch": 0.547112462006079, "grad_norm": 0.03895876184105873, "learning_rate": 9.803954791481239e-06, "loss": 0.5374, "step": 75 }, { "epoch": 0.5544072948328268, "grad_norm": 0.046192847192287445, "learning_rate": 9.79197756012412e-06, "loss": 0.5561, "step": 76 }, { "epoch": 0.5617021276595745, "grad_norm": 0.03921407088637352, "learning_rate": 9.779652985275562e-06, "loss": 0.5488, "step": 77 }, { "epoch": 0.5689969604863222, "grad_norm": 0.037232838571071625, "learning_rate": 9.766981960274653e-06, "loss": 0.4963, "step": 78 }, { "epoch": 0.5762917933130699, "grad_norm": 0.05492810904979706, "learning_rate": 9.753965403572703e-06, "loss": 0.5621, "step": 79 }, { "epoch": 0.5835866261398176, "grad_norm": 0.04202823340892792, "learning_rate": 9.740604258666668e-06, "loss": 0.5479, "step": 80 }, { "epoch": 0.5908814589665653, "grad_norm": 0.04189832881093025, "learning_rate": 9.726899494030768e-06, "loss": 0.5802, "step": 81 }, { "epoch": 0.5981762917933131, "grad_norm": 0.039709825068712234, "learning_rate": 9.712852103046281e-06, "loss": 0.5166, "step": 82 }, { "epoch": 0.6054711246200608, "grad_norm": 0.04080045223236084, "learning_rate": 9.698463103929542e-06, "loss": 0.5289, "step": 83 }, { "epoch": 0.6127659574468085, "grad_norm": 0.038535572588443756, "learning_rate": 9.68373353965814e-06, "loss": 0.5352, "step": 84 }, { "epoch": 0.6200607902735562, "grad_norm": 0.04705570638179779, "learning_rate": 9.66866447789531e-06, "loss": 0.5235, "step": 85 }, { "epoch": 0.6273556231003039, "grad_norm": 0.042207516729831696, "learning_rate": 9.65325701091256e-06, "loss": 0.5147, "step": 86 }, { "epoch": 0.6346504559270517, "grad_norm": 0.04210168495774269, "learning_rate": 9.637512255510475e-06, "loss": 0.5241, "step": 87 }, { "epoch": 0.6419452887537994, "grad_norm": 0.03785989060997963, "learning_rate": 9.62143135293779e-06, "loss": 0.5429, "step": 88 }, { "epoch": 0.6492401215805471, "grad_norm": 0.04283512756228447, "learning_rate": 9.605015468808651e-06, "loss": 0.5242, "step": 89 }, { "epoch": 0.6565349544072948, "grad_norm": 0.043273307383060455, "learning_rate": 9.588265793018141e-06, "loss": 0.5455, "step": 90 }, { "epoch": 0.6638297872340425, "grad_norm": 0.04218590632081032, "learning_rate": 9.571183539656011e-06, "loss": 0.5778, "step": 91 }, { "epoch": 0.6711246200607903, "grad_norm": 0.03844400867819786, "learning_rate": 9.553769946918698e-06, "loss": 0.5233, "step": 92 }, { "epoch": 0.678419452887538, "grad_norm": 0.04001948982477188, "learning_rate": 9.536026277019562e-06, "loss": 0.5156, "step": 93 }, { "epoch": 0.6857142857142857, "grad_norm": 0.04228726401925087, "learning_rate": 9.517953816097396e-06, "loss": 0.5138, "step": 94 }, { "epoch": 0.6930091185410334, "grad_norm": 0.03879157081246376, "learning_rate": 9.499553874123213e-06, "loss": 0.4926, "step": 95 }, { "epoch": 0.7003039513677811, "grad_norm": 0.04016513749957085, "learning_rate": 9.480827784805278e-06, "loss": 0.497, "step": 96 }, { "epoch": 0.7075987841945289, "grad_norm": 0.03983764350414276, "learning_rate": 9.461776905492446e-06, "loss": 0.4852, "step": 97 }, { "epoch": 0.7148936170212766, "grad_norm": 0.056514669209718704, "learning_rate": 9.442402617075765e-06, "loss": 0.5288, "step": 98 }, { "epoch": 0.7221884498480243, "grad_norm": 0.046206481754779816, "learning_rate": 9.422706323888398e-06, "loss": 0.5418, "step": 99 }, { "epoch": 0.729483282674772, "grad_norm": 0.0474584735929966, "learning_rate": 9.402689453603815e-06, "loss": 0.5531, "step": 100 }, { "epoch": 0.7367781155015197, "grad_norm": 0.037484850734472275, "learning_rate": 9.382353457132318e-06, "loss": 0.4869, "step": 101 }, { "epoch": 0.7440729483282674, "grad_norm": 0.03749077394604683, "learning_rate": 9.361699808515877e-06, "loss": 0.5275, "step": 102 }, { "epoch": 0.7513677811550152, "grad_norm": 0.038470759987831116, "learning_rate": 9.340730004821266e-06, "loss": 0.5044, "step": 103 }, { "epoch": 0.7586626139817629, "grad_norm": 0.038027700036764145, "learning_rate": 9.31944556603157e-06, "loss": 0.5025, "step": 104 }, { "epoch": 0.7659574468085106, "grad_norm": 0.046422988176345825, "learning_rate": 9.297848034936007e-06, "loss": 0.5341, "step": 105 }, { "epoch": 0.7732522796352583, "grad_norm": 0.043657850474119186, "learning_rate": 9.275938977018082e-06, "loss": 0.5085, "step": 106 }, { "epoch": 0.780547112462006, "grad_norm": 0.04235101863741875, "learning_rate": 9.253719980342134e-06, "loss": 0.5397, "step": 107 }, { "epoch": 0.7878419452887538, "grad_norm": 0.04412844404578209, "learning_rate": 9.231192655438222e-06, "loss": 0.5522, "step": 108 }, { "epoch": 0.7951367781155015, "grad_norm": 0.04110129550099373, "learning_rate": 9.208358635185372e-06, "loss": 0.5785, "step": 109 }, { "epoch": 0.8024316109422492, "grad_norm": 0.03757128119468689, "learning_rate": 9.185219574693242e-06, "loss": 0.4777, "step": 110 }, { "epoch": 0.8097264437689969, "grad_norm": 0.03927014395594597, "learning_rate": 9.161777151182137e-06, "loss": 0.526, "step": 111 }, { "epoch": 0.8170212765957446, "grad_norm": 0.03983665257692337, "learning_rate": 9.138033063861436e-06, "loss": 0.5138, "step": 112 }, { "epoch": 0.8243161094224924, "grad_norm": 0.038819894194602966, "learning_rate": 9.113989033806434e-06, "loss": 0.494, "step": 113 }, { "epoch": 0.8316109422492401, "grad_norm": 0.05275421962141991, "learning_rate": 9.089646803833589e-06, "loss": 0.539, "step": 114 }, { "epoch": 0.8389057750759878, "grad_norm": 0.04078809916973114, "learning_rate": 9.06500813837419e-06, "loss": 0.4778, "step": 115 }, { "epoch": 0.8462006079027355, "grad_norm": 0.040415696799755096, "learning_rate": 9.040074823346466e-06, "loss": 0.5443, "step": 116 }, { "epoch": 0.8534954407294832, "grad_norm": 0.03834336996078491, "learning_rate": 9.014848666026138e-06, "loss": 0.4945, "step": 117 }, { "epoch": 0.8607902735562311, "grad_norm": 0.04293690249323845, "learning_rate": 8.989331494915417e-06, "loss": 0.5404, "step": 118 }, { "epoch": 0.8680851063829788, "grad_norm": 0.04116823151707649, "learning_rate": 8.963525159610465e-06, "loss": 0.5274, "step": 119 }, { "epoch": 0.8753799392097265, "grad_norm": 0.04143698886036873, "learning_rate": 8.937431530667329e-06, "loss": 0.4916, "step": 120 }, { "epoch": 0.8826747720364742, "grad_norm": 0.039400726556777954, "learning_rate": 8.911052499466358e-06, "loss": 0.5573, "step": 121 }, { "epoch": 0.8899696048632219, "grad_norm": 0.035739775747060776, "learning_rate": 8.884389978075098e-06, "loss": 0.4961, "step": 122 }, { "epoch": 0.8972644376899696, "grad_norm": 0.04682913422584534, "learning_rate": 8.857445899109716e-06, "loss": 0.4712, "step": 123 }, { "epoch": 0.9045592705167174, "grad_norm": 0.04531010612845421, "learning_rate": 8.83022221559489e-06, "loss": 0.5242, "step": 124 }, { "epoch": 0.9118541033434651, "grad_norm": 0.04528380185365677, "learning_rate": 8.80272090082227e-06, "loss": 0.5506, "step": 125 }, { "epoch": 0.9191489361702128, "grad_norm": 0.03824332728981972, "learning_rate": 8.774943948207427e-06, "loss": 0.4581, "step": 126 }, { "epoch": 0.9264437689969605, "grad_norm": 0.03896916285157204, "learning_rate": 8.746893371145367e-06, "loss": 0.5504, "step": 127 }, { "epoch": 0.9337386018237082, "grad_norm": 0.03746696934103966, "learning_rate": 8.718571202864598e-06, "loss": 0.4589, "step": 128 }, { "epoch": 0.941033434650456, "grad_norm": 0.04142184555530548, "learning_rate": 8.689979496279747e-06, "loss": 0.5299, "step": 129 }, { "epoch": 0.9483282674772037, "grad_norm": 0.03700762987136841, "learning_rate": 8.661120323842751e-06, "loss": 0.5159, "step": 130 }, { "epoch": 0.9556231003039514, "grad_norm": 0.036684855818748474, "learning_rate": 8.631995777392645e-06, "loss": 0.4854, "step": 131 }, { "epoch": 0.9629179331306991, "grad_norm": 0.06939133256673813, "learning_rate": 8.602607968003935e-06, "loss": 0.5101, "step": 132 }, { "epoch": 0.9702127659574468, "grad_norm": 0.039062708616256714, "learning_rate": 8.572959025833573e-06, "loss": 0.5005, "step": 133 }, { "epoch": 0.9775075987841946, "grad_norm": 0.04555986076593399, "learning_rate": 8.543051099966558e-06, "loss": 0.5465, "step": 134 }, { "epoch": 0.9848024316109423, "grad_norm": 0.04333364591002464, "learning_rate": 8.512886358260162e-06, "loss": 0.5237, "step": 135 }, { "epoch": 0.99209726443769, "grad_norm": 0.04095487669110298, "learning_rate": 8.482466987186785e-06, "loss": 0.5335, "step": 136 }, { "epoch": 0.9993920972644377, "grad_norm": 0.0442386157810688, "learning_rate": 8.451795191675488e-06, "loss": 0.5107, "step": 137 }, { "epoch": 1.0, "grad_norm": 0.0442386157810688, "learning_rate": 8.420873194952153e-06, "loss": 0.465, "step": 138 }, { "epoch": 1.0072948328267477, "grad_norm": 0.1445915699005127, "learning_rate": 8.38970323837834e-06, "loss": 0.4704, "step": 139 }, { "epoch": 1.0145896656534954, "grad_norm": 0.042022328823804855, "learning_rate": 8.358287581288824e-06, "loss": 0.4282, "step": 140 }, { "epoch": 1.021884498480243, "grad_norm": 0.04201134666800499, "learning_rate": 8.326628500827826e-06, "loss": 0.4539, "step": 141 }, { "epoch": 1.0291793313069908, "grad_norm": 0.04877388849854469, "learning_rate": 8.294728291783967e-06, "loss": 0.4641, "step": 142 }, { "epoch": 1.0364741641337385, "grad_norm": 0.046164825558662415, "learning_rate": 8.262589266423908e-06, "loss": 0.419, "step": 143 }, { "epoch": 1.0437689969604864, "grad_norm": 0.041141681373119354, "learning_rate": 8.230213754324773e-06, "loss": 0.4224, "step": 144 }, { "epoch": 1.0510638297872341, "grad_norm": 0.03967837244272232, "learning_rate": 8.19760410220527e-06, "loss": 0.4268, "step": 145 }, { "epoch": 1.0583586626139818, "grad_norm": 0.05634555220603943, "learning_rate": 8.16476267375561e-06, "loss": 0.4025, "step": 146 }, { "epoch": 1.0656534954407295, "grad_norm": 0.041606318205595016, "learning_rate": 8.131691849466154e-06, "loss": 0.4335, "step": 147 }, { "epoch": 1.0729483282674772, "grad_norm": 0.03656647726893425, "learning_rate": 8.098394026454886e-06, "loss": 0.456, "step": 148 }, { "epoch": 1.080243161094225, "grad_norm": 0.041005730628967285, "learning_rate": 8.064871618293647e-06, "loss": 0.3925, "step": 149 }, { "epoch": 1.0875379939209726, "grad_norm": 0.04722120240330696, "learning_rate": 8.031127054833192e-06, "loss": 0.3981, "step": 150 }, { "epoch": 1.0948328267477203, "grad_norm": 0.043071143329143524, "learning_rate": 7.997162782027061e-06, "loss": 0.4296, "step": 151 }, { "epoch": 1.102127659574468, "grad_norm": 0.04518291354179382, "learning_rate": 7.962981261754295e-06, "loss": 0.4376, "step": 152 }, { "epoch": 1.1094224924012157, "grad_norm": 0.04998685419559479, "learning_rate": 7.928584971640974e-06, "loss": 0.452, "step": 153 }, { "epoch": 1.1167173252279636, "grad_norm": 0.04469837248325348, "learning_rate": 7.893976404880643e-06, "loss": 0.4316, "step": 154 }, { "epoch": 1.1240121580547113, "grad_norm": 0.040255557745695114, "learning_rate": 7.859158070053578e-06, "loss": 0.4378, "step": 155 }, { "epoch": 1.131306990881459, "grad_norm": 0.04467206820845604, "learning_rate": 7.824132490944968e-06, "loss": 0.4215, "step": 156 }, { "epoch": 1.1386018237082067, "grad_norm": 0.03889721632003784, "learning_rate": 7.788902206361974e-06, "loss": 0.4257, "step": 157 }, { "epoch": 1.1458966565349544, "grad_norm": 0.04140063747763634, "learning_rate": 7.753469769949701e-06, "loss": 0.4434, "step": 158 }, { "epoch": 1.1531914893617021, "grad_norm": 0.039931997656822205, "learning_rate": 7.717837750006106e-06, "loss": 0.41, "step": 159 }, { "epoch": 1.1604863221884498, "grad_norm": 0.03909624367952347, "learning_rate": 7.682008729295834e-06, "loss": 0.3904, "step": 160 }, { "epoch": 1.1677811550151975, "grad_norm": 0.0401025116443634, "learning_rate": 7.645985304863004e-06, "loss": 0.4618, "step": 161 }, { "epoch": 1.1750759878419452, "grad_norm": 0.04733911529183388, "learning_rate": 7.609770087842969e-06, "loss": 0.4247, "step": 162 }, { "epoch": 1.182370820668693, "grad_norm": 0.037687744945287704, "learning_rate": 7.573365703273045e-06, "loss": 0.4071, "step": 163 }, { "epoch": 1.1896656534954406, "grad_norm": 0.039216116070747375, "learning_rate": 7.536774789902246e-06, "loss": 0.4259, "step": 164 }, { "epoch": 1.1969604863221885, "grad_norm": 0.0408397912979126, "learning_rate": 7.500000000000001e-06, "loss": 0.394, "step": 165 }, { "epoch": 1.2042553191489362, "grad_norm": 0.04507288709282875, "learning_rate": 7.463043999163919e-06, "loss": 0.4605, "step": 166 }, { "epoch": 1.211550151975684, "grad_norm": 0.03989469259977341, "learning_rate": 7.4259094661265685e-06, "loss": 0.4285, "step": 167 }, { "epoch": 1.2188449848024316, "grad_norm": 0.0407419353723526, "learning_rate": 7.388599092561315e-06, "loss": 0.4204, "step": 168 }, { "epoch": 1.2261398176291793, "grad_norm": 0.040525760501623154, "learning_rate": 7.351115582887212e-06, "loss": 0.4253, "step": 169 }, { "epoch": 1.233434650455927, "grad_norm": 0.04370498284697533, "learning_rate": 7.313461654072974e-06, "loss": 0.4071, "step": 170 }, { "epoch": 1.2407294832826747, "grad_norm": 0.0392344668507576, "learning_rate": 7.2756400354400445e-06, "loss": 0.4093, "step": 171 }, { "epoch": 1.2480243161094224, "grad_norm": 0.03849213197827339, "learning_rate": 7.237653468464756e-06, "loss": 0.4157, "step": 172 }, { "epoch": 1.2553191489361701, "grad_norm": 0.04228688403964043, "learning_rate": 7.199504706579617e-06, "loss": 0.482, "step": 173 }, { "epoch": 1.262613981762918, "grad_norm": 0.037325162440538406, "learning_rate": 7.161196514973735e-06, "loss": 0.4224, "step": 174 }, { "epoch": 1.2699088145896655, "grad_norm": 0.047044239938259125, "learning_rate": 7.122731670392381e-06, "loss": 0.4249, "step": 175 }, { "epoch": 1.2772036474164135, "grad_norm": 0.04322784021496773, "learning_rate": 7.0841129609357165e-06, "loss": 0.4051, "step": 176 }, { "epoch": 1.2844984802431612, "grad_norm": 0.041998326778411865, "learning_rate": 7.045343185856701e-06, "loss": 0.4106, "step": 177 }, { "epoch": 1.2917933130699089, "grad_norm": 0.040727648884058, "learning_rate": 7.006425155358195e-06, "loss": 0.4427, "step": 178 }, { "epoch": 1.2990881458966566, "grad_norm": 0.04059009999036789, "learning_rate": 6.967361690389258e-06, "loss": 0.437, "step": 179 }, { "epoch": 1.3063829787234043, "grad_norm": 0.042023915797472, "learning_rate": 6.92815562244068e-06, "loss": 0.4315, "step": 180 }, { "epoch": 1.313677811550152, "grad_norm": 0.04910752549767494, "learning_rate": 6.888809793339729e-06, "loss": 0.4436, "step": 181 }, { "epoch": 1.3209726443768997, "grad_norm": 0.04180140420794487, "learning_rate": 6.849327055044182e-06, "loss": 0.3948, "step": 182 }, { "epoch": 1.3282674772036474, "grad_norm": 0.03989269211888313, "learning_rate": 6.80971026943559e-06, "loss": 0.3929, "step": 183 }, { "epoch": 1.335562310030395, "grad_norm": 0.04497074335813522, "learning_rate": 6.769962308111839e-06, "loss": 0.4429, "step": 184 }, { "epoch": 1.342857142857143, "grad_norm": 0.04516409710049629, "learning_rate": 6.7300860521790034e-06, "loss": 0.4363, "step": 185 }, { "epoch": 1.3501519756838904, "grad_norm": 0.041362229734659195, "learning_rate": 6.690084392042514e-06, "loss": 0.4058, "step": 186 }, { "epoch": 1.3574468085106384, "grad_norm": 0.04281953349709511, "learning_rate": 6.649960227197648e-06, "loss": 0.423, "step": 187 }, { "epoch": 1.364741641337386, "grad_norm": 0.046076931059360504, "learning_rate": 6.609716466019356e-06, "loss": 0.4427, "step": 188 }, { "epoch": 1.3720364741641338, "grad_norm": 0.03960058465600014, "learning_rate": 6.569356025551454e-06, "loss": 0.4193, "step": 189 }, { "epoch": 1.3793313069908815, "grad_norm": 0.044169649481773376, "learning_rate": 6.5288818312951886e-06, "loss": 0.4034, "step": 190 }, { "epoch": 1.3866261398176292, "grad_norm": 0.04062066227197647, "learning_rate": 6.4882968169971734e-06, "loss": 0.4018, "step": 191 }, { "epoch": 1.3939209726443769, "grad_norm": 0.04406093806028366, "learning_rate": 6.447603924436744e-06, "loss": 0.4498, "step": 192 }, { "epoch": 1.4012158054711246, "grad_norm": 0.04197722300887108, "learning_rate": 6.406806103212725e-06, "loss": 0.4356, "step": 193 }, { "epoch": 1.4085106382978723, "grad_norm": 0.04061530530452728, "learning_rate": 6.365906310529631e-06, "loss": 0.4441, "step": 194 }, { "epoch": 1.41580547112462, "grad_norm": 0.046513479202985764, "learning_rate": 6.32490751098331e-06, "loss": 0.4166, "step": 195 }, { "epoch": 1.4231003039513679, "grad_norm": 0.03948912024497986, "learning_rate": 6.2838126763460635e-06, "loss": 0.4478, "step": 196 }, { "epoch": 1.4303951367781156, "grad_norm": 0.04548676684498787, "learning_rate": 6.2426247853512355e-06, "loss": 0.4653, "step": 197 }, { "epoch": 1.4376899696048633, "grad_norm": 0.041050177067518234, "learning_rate": 6.2013468234773034e-06, "loss": 0.3953, "step": 198 }, { "epoch": 1.444984802431611, "grad_norm": 0.03936685994267464, "learning_rate": 6.1599817827314744e-06, "loss": 0.4256, "step": 199 }, { "epoch": 1.4522796352583587, "grad_norm": 0.041237395256757736, "learning_rate": 6.118532661432812e-06, "loss": 0.3892, "step": 200 }, { "epoch": 1.4595744680851064, "grad_norm": 0.043174393475055695, "learning_rate": 6.077002463994908e-06, "loss": 0.4174, "step": 201 }, { "epoch": 1.466869300911854, "grad_norm": 0.04198073223233223, "learning_rate": 6.035394200708104e-06, "loss": 0.4278, "step": 202 }, { "epoch": 1.4741641337386018, "grad_norm": 0.045515723526477814, "learning_rate": 5.993710887521302e-06, "loss": 0.4346, "step": 203 }, { "epoch": 1.4814589665653495, "grad_norm": 0.04443354532122612, "learning_rate": 5.951955545823342e-06, "loss": 0.4116, "step": 204 }, { "epoch": 1.4887537993920974, "grad_norm": 0.04223044961690903, "learning_rate": 5.910131202224011e-06, "loss": 0.3844, "step": 205 }, { "epoch": 1.4960486322188449, "grad_norm": 0.04305846244096756, "learning_rate": 5.8682408883346535e-06, "loss": 0.4414, "step": 206 }, { "epoch": 1.5033434650455928, "grad_norm": 0.04148327186703682, "learning_rate": 5.826287640548425e-06, "loss": 0.4327, "step": 207 }, { "epoch": 1.5106382978723403, "grad_norm": 0.0433870293200016, "learning_rate": 5.784274499820214e-06, "loss": 0.3787, "step": 208 }, { "epoch": 1.5179331306990882, "grad_norm": 0.041102319955825806, "learning_rate": 5.742204511446203e-06, "loss": 0.4189, "step": 209 }, { "epoch": 1.525227963525836, "grad_norm": 0.04175707325339317, "learning_rate": 5.7000807248431466e-06, "loss": 0.427, "step": 210 }, { "epoch": 1.5325227963525836, "grad_norm": 0.04286907613277435, "learning_rate": 5.657906193327325e-06, "loss": 0.4, "step": 211 }, { "epoch": 1.5398176291793313, "grad_norm": 0.04246861860156059, "learning_rate": 5.615683973893235e-06, "loss": 0.4097, "step": 212 }, { "epoch": 1.547112462006079, "grad_norm": 0.03898885101079941, "learning_rate": 5.573417126992004e-06, "loss": 0.4237, "step": 213 }, { "epoch": 1.554407294832827, "grad_norm": 0.04554813727736473, "learning_rate": 5.5311087163095475e-06, "loss": 0.436, "step": 214 }, { "epoch": 1.5617021276595744, "grad_norm": 0.04189833253622055, "learning_rate": 5.4887618085445094e-06, "loss": 0.4121, "step": 215 }, { "epoch": 1.5689969604863223, "grad_norm": 0.05306672677397728, "learning_rate": 5.446379473185972e-06, "loss": 0.4015, "step": 216 }, { "epoch": 1.5762917933130698, "grad_norm": 0.04060041531920433, "learning_rate": 5.403964782290962e-06, "loss": 0.3967, "step": 217 }, { "epoch": 1.5835866261398177, "grad_norm": 0.045451849699020386, "learning_rate": 5.361520810261779e-06, "loss": 0.4161, "step": 218 }, { "epoch": 1.5908814589665652, "grad_norm": 0.043955542147159576, "learning_rate": 5.319050633623141e-06, "loss": 0.4205, "step": 219 }, { "epoch": 1.598176291793313, "grad_norm": 0.040733452886343, "learning_rate": 5.276557330799203e-06, "loss": 0.4165, "step": 220 }, { "epoch": 1.6054711246200608, "grad_norm": 0.04190356284379959, "learning_rate": 5.234043981890395e-06, "loss": 0.4515, "step": 221 }, { "epoch": 1.6127659574468085, "grad_norm": 0.037713076919317245, "learning_rate": 5.191513668450178e-06, "loss": 0.4131, "step": 222 }, { "epoch": 1.6200607902735562, "grad_norm": 0.038250233978033066, "learning_rate": 5.1489694732616805e-06, "loss": 0.4028, "step": 223 }, { "epoch": 1.627355623100304, "grad_norm": 0.039751507341861725, "learning_rate": 5.106414480114238e-06, "loss": 0.4121, "step": 224 }, { "epoch": 1.6346504559270518, "grad_norm": 0.044864848256111145, "learning_rate": 5.06385177357987e-06, "loss": 0.4708, "step": 225 }, { "epoch": 1.6419452887537993, "grad_norm": 0.04169140383601189, "learning_rate": 5.021284438789694e-06, "loss": 0.425, "step": 226 }, { "epoch": 1.6492401215805472, "grad_norm": 0.04238287732005119, "learning_rate": 4.9787155612103076e-06, "loss": 0.409, "step": 227 }, { "epoch": 1.6565349544072947, "grad_norm": 0.03984750807285309, "learning_rate": 4.936148226420133e-06, "loss": 0.4451, "step": 228 }, { "epoch": 1.6638297872340426, "grad_norm": 0.03823258727788925, "learning_rate": 4.893585519885764e-06, "loss": 0.4318, "step": 229 }, { "epoch": 1.6711246200607903, "grad_norm": 0.043166667222976685, "learning_rate": 4.851030526738321e-06, "loss": 0.4348, "step": 230 }, { "epoch": 1.678419452887538, "grad_norm": 0.04118693992495537, "learning_rate": 4.808486331549824e-06, "loss": 0.435, "step": 231 }, { "epoch": 1.6857142857142857, "grad_norm": 0.040095556527376175, "learning_rate": 4.765956018109607e-06, "loss": 0.4506, "step": 232 }, { "epoch": 1.6930091185410334, "grad_norm": 0.04523642733693123, "learning_rate": 4.7234426692007985e-06, "loss": 0.4394, "step": 233 }, { "epoch": 1.7003039513677811, "grad_norm": 0.041244085878133774, "learning_rate": 4.680949366376858e-06, "loss": 0.4698, "step": 234 }, { "epoch": 1.7075987841945288, "grad_norm": 0.04374610632658005, "learning_rate": 4.638479189738224e-06, "loss": 0.4129, "step": 235 }, { "epoch": 1.7148936170212767, "grad_norm": 0.040487710386514664, "learning_rate": 4.596035217709039e-06, "loss": 0.4362, "step": 236 }, { "epoch": 1.7221884498480242, "grad_norm": 0.044370926916599274, "learning_rate": 4.553620526814029e-06, "loss": 0.4155, "step": 237 }, { "epoch": 1.7294832826747721, "grad_norm": 0.04036295786499977, "learning_rate": 4.511238191455491e-06, "loss": 0.4214, "step": 238 }, { "epoch": 1.7367781155015196, "grad_norm": 0.03773313760757446, "learning_rate": 4.468891283690454e-06, "loss": 0.4298, "step": 239 }, { "epoch": 1.7440729483282675, "grad_norm": 0.045683182775974274, "learning_rate": 4.426582873007999e-06, "loss": 0.4485, "step": 240 }, { "epoch": 1.7513677811550152, "grad_norm": 0.04686903581023216, "learning_rate": 4.384316026106766e-06, "loss": 0.4303, "step": 241 }, { "epoch": 1.758662613981763, "grad_norm": 0.045155324041843414, "learning_rate": 4.342093806672678e-06, "loss": 0.4409, "step": 242 }, { "epoch": 1.7659574468085106, "grad_norm": 0.0418829619884491, "learning_rate": 4.299919275156857e-06, "loss": 0.4149, "step": 243 }, { "epoch": 1.7732522796352583, "grad_norm": 0.041985101997852325, "learning_rate": 4.2577954885537985e-06, "loss": 0.4293, "step": 244 }, { "epoch": 1.780547112462006, "grad_norm": 0.042692556977272034, "learning_rate": 4.215725500179788e-06, "loss": 0.4258, "step": 245 }, { "epoch": 1.7878419452887537, "grad_norm": 0.04013342410326004, "learning_rate": 4.173712359451576e-06, "loss": 0.4015, "step": 246 }, { "epoch": 1.7951367781155017, "grad_norm": 0.038998380303382874, "learning_rate": 4.131759111665349e-06, "loss": 0.4596, "step": 247 }, { "epoch": 1.8024316109422491, "grad_norm": 0.039829254150390625, "learning_rate": 4.0898687977759895e-06, "loss": 0.4128, "step": 248 }, { "epoch": 1.809726443768997, "grad_norm": 0.04312862455844879, "learning_rate": 4.048044454176658e-06, "loss": 0.4243, "step": 249 }, { "epoch": 1.8170212765957445, "grad_norm": 0.04533419758081436, "learning_rate": 4.0062891124787e-06, "loss": 0.4414, "step": 250 }, { "epoch": 1.8243161094224924, "grad_norm": 0.0438460148870945, "learning_rate": 3.964605799291897e-06, "loss": 0.4553, "step": 251 }, { "epoch": 1.8316109422492401, "grad_norm": 0.0429726168513298, "learning_rate": 3.922997536005094e-06, "loss": 0.4311, "step": 252 }, { "epoch": 1.8389057750759878, "grad_norm": 0.039493922144174576, "learning_rate": 3.88146733856719e-06, "loss": 0.4387, "step": 253 }, { "epoch": 1.8462006079027355, "grad_norm": 0.04514075070619583, "learning_rate": 3.840018217268527e-06, "loss": 0.4442, "step": 254 }, { "epoch": 1.8534954407294832, "grad_norm": 0.04080420732498169, "learning_rate": 3.7986531765226965e-06, "loss": 0.3884, "step": 255 }, { "epoch": 1.8607902735562312, "grad_norm": 0.04457089304924011, "learning_rate": 3.757375214648764e-06, "loss": 0.3804, "step": 256 }, { "epoch": 1.8680851063829786, "grad_norm": 0.044411323964595795, "learning_rate": 3.716187323653939e-06, "loss": 0.4164, "step": 257 }, { "epoch": 1.8753799392097266, "grad_norm": 0.04527450352907181, "learning_rate": 3.675092489016693e-06, "loss": 0.4402, "step": 258 }, { "epoch": 1.882674772036474, "grad_norm": 0.039380993694067, "learning_rate": 3.6340936894703717e-06, "loss": 0.4329, "step": 259 }, { "epoch": 1.889969604863222, "grad_norm": 0.04769477993249893, "learning_rate": 3.593193896787277e-06, "loss": 0.4447, "step": 260 }, { "epoch": 1.8972644376899694, "grad_norm": 0.04017976298928261, "learning_rate": 3.5523960755632573e-06, "loss": 0.4066, "step": 261 }, { "epoch": 1.9045592705167174, "grad_norm": 0.04179855436086655, "learning_rate": 3.5117031830028274e-06, "loss": 0.4048, "step": 262 }, { "epoch": 1.911854103343465, "grad_norm": 0.041397638618946075, "learning_rate": 3.4711181687048114e-06, "loss": 0.4296, "step": 263 }, { "epoch": 1.9191489361702128, "grad_norm": 0.05517794191837311, "learning_rate": 3.4306439744485453e-06, "loss": 0.4266, "step": 264 }, { "epoch": 1.9264437689969605, "grad_norm": 0.04322275519371033, "learning_rate": 3.3902835339806463e-06, "loss": 0.4084, "step": 265 }, { "epoch": 1.9337386018237082, "grad_norm": 0.04079868271946907, "learning_rate": 3.3500397728023536e-06, "loss": 0.3987, "step": 266 }, { "epoch": 1.941033434650456, "grad_norm": 0.040178705006837845, "learning_rate": 3.309915607957487e-06, "loss": 0.3508, "step": 267 }, { "epoch": 1.9483282674772036, "grad_norm": 0.042311254888772964, "learning_rate": 3.2699139478209987e-06, "loss": 0.4315, "step": 268 }, { "epoch": 1.9556231003039515, "grad_norm": 0.041265442967414856, "learning_rate": 3.2300376918881628e-06, "loss": 0.4096, "step": 269 }, { "epoch": 1.962917933130699, "grad_norm": 0.035929929465055466, "learning_rate": 3.19028973056441e-06, "loss": 0.3872, "step": 270 }, { "epoch": 1.9702127659574469, "grad_norm": 0.04031127318739891, "learning_rate": 3.150672944955818e-06, "loss": 0.4299, "step": 271 }, { "epoch": 1.9775075987841946, "grad_norm": 0.043629132211208344, "learning_rate": 3.111190206660273e-06, "loss": 0.4371, "step": 272 }, { "epoch": 1.9848024316109423, "grad_norm": 0.03935433551669121, "learning_rate": 3.0718443775593233e-06, "loss": 0.3912, "step": 273 }, { "epoch": 1.99209726443769, "grad_norm": 0.04069478437304497, "learning_rate": 3.0326383096107424e-06, "loss": 0.416, "step": 274 }, { "epoch": 1.9993920972644377, "grad_norm": 0.05225847661495209, "learning_rate": 2.993574844641807e-06, "loss": 0.3656, "step": 275 }, { "epoch": 2.0, "grad_norm": 0.05225847661495209, "learning_rate": 2.9546568141433007e-06, "loss": 0.5271, "step": 276 }, { "epoch": 2.007294832826748, "grad_norm": 0.15690822899341583, "learning_rate": 2.915887039064287e-06, "loss": 0.3677, "step": 277 }, { "epoch": 2.0145896656534954, "grad_norm": 0.040792327374219894, "learning_rate": 2.8772683296076197e-06, "loss": 0.3493, "step": 278 }, { "epoch": 2.0218844984802433, "grad_norm": 0.042940009385347366, "learning_rate": 2.838803485026265e-06, "loss": 0.3622, "step": 279 }, { "epoch": 2.029179331306991, "grad_norm": 0.03872222825884819, "learning_rate": 2.800495293420384e-06, "loss": 0.3204, "step": 280 }, { "epoch": 2.0364741641337387, "grad_norm": 0.03756758198142052, "learning_rate": 2.762346531535246e-06, "loss": 0.3158, "step": 281 }, { "epoch": 2.043768996960486, "grad_norm": 0.04466132074594498, "learning_rate": 2.724359964559958e-06, "loss": 0.3638, "step": 282 }, { "epoch": 2.051063829787234, "grad_norm": 0.04124055802822113, "learning_rate": 2.686538345927027e-06, "loss": 0.3231, "step": 283 }, { "epoch": 2.0583586626139816, "grad_norm": 0.04917893931269646, "learning_rate": 2.6488844171127903e-06, "loss": 0.3683, "step": 284 }, { "epoch": 2.0656534954407295, "grad_norm": 0.04118992015719414, "learning_rate": 2.611400907438685e-06, "loss": 0.294, "step": 285 }, { "epoch": 2.072948328267477, "grad_norm": 0.04872892051935196, "learning_rate": 2.574090533873431e-06, "loss": 0.3156, "step": 286 }, { "epoch": 2.080243161094225, "grad_norm": 0.04159025847911835, "learning_rate": 2.5369560008360826e-06, "loss": 0.3467, "step": 287 }, { "epoch": 2.087537993920973, "grad_norm": 0.04014930874109268, "learning_rate": 2.5000000000000015e-06, "loss": 0.3303, "step": 288 }, { "epoch": 2.0948328267477203, "grad_norm": 0.04120990261435509, "learning_rate": 2.4632252100977567e-06, "loss": 0.3086, "step": 289 }, { "epoch": 2.1021276595744682, "grad_norm": 0.04079623147845268, "learning_rate": 2.426634296726955e-06, "loss": 0.3364, "step": 290 }, { "epoch": 2.1094224924012157, "grad_norm": 0.04097681865096092, "learning_rate": 2.3902299121570332e-06, "loss": 0.3254, "step": 291 }, { "epoch": 2.1167173252279636, "grad_norm": 0.04335152730345726, "learning_rate": 2.354014695136997e-06, "loss": 0.2875, "step": 292 }, { "epoch": 2.124012158054711, "grad_norm": 0.041609250009059906, "learning_rate": 2.317991270704167e-06, "loss": 0.3585, "step": 293 }, { "epoch": 2.131306990881459, "grad_norm": 0.03962058201432228, "learning_rate": 2.282162249993895e-06, "loss": 0.3094, "step": 294 }, { "epoch": 2.1386018237082065, "grad_norm": 0.041423097252845764, "learning_rate": 2.2465302300503012e-06, "loss": 0.358, "step": 295 }, { "epoch": 2.1458966565349544, "grad_norm": 0.044281214475631714, "learning_rate": 2.211097793638029e-06, "loss": 0.3575, "step": 296 }, { "epoch": 2.153191489361702, "grad_norm": 0.04825511574745178, "learning_rate": 2.175867509055033e-06, "loss": 0.3364, "step": 297 }, { "epoch": 2.16048632218845, "grad_norm": 0.04354681074619293, "learning_rate": 2.1408419299464245e-06, "loss": 0.2979, "step": 298 }, { "epoch": 2.1677811550151977, "grad_norm": 0.042697008699178696, "learning_rate": 2.106023595119358e-06, "loss": 0.3356, "step": 299 }, { "epoch": 2.1750759878419452, "grad_norm": 0.047276780009269714, "learning_rate": 2.071415028359026e-06, "loss": 0.3634, "step": 300 }, { "epoch": 2.182370820668693, "grad_norm": 0.041072778403759, "learning_rate": 2.037018738245707e-06, "loss": 0.3362, "step": 301 }, { "epoch": 2.1896656534954406, "grad_norm": 0.04692791774868965, "learning_rate": 2.0028372179729405e-06, "loss": 0.3511, "step": 302 }, { "epoch": 2.1969604863221885, "grad_norm": 0.04538114741444588, "learning_rate": 1.9688729451668116e-06, "loss": 0.3336, "step": 303 }, { "epoch": 2.204255319148936, "grad_norm": 0.044003862887620926, "learning_rate": 1.935128381706355e-06, "loss": 0.3349, "step": 304 }, { "epoch": 2.211550151975684, "grad_norm": 0.045857448130846024, "learning_rate": 1.901605973545116e-06, "loss": 0.3537, "step": 305 }, { "epoch": 2.2188449848024314, "grad_norm": 0.04272821545600891, "learning_rate": 1.8683081505338468e-06, "loss": 0.3373, "step": 306 }, { "epoch": 2.2261398176291793, "grad_norm": 0.04273563250899315, "learning_rate": 1.8352373262443918e-06, "loss": 0.3436, "step": 307 }, { "epoch": 2.2334346504559273, "grad_norm": 0.058361783623695374, "learning_rate": 1.8023958977947303e-06, "loss": 0.3245, "step": 308 }, { "epoch": 2.2407294832826747, "grad_norm": 0.04002346843481064, "learning_rate": 1.7697862456752273e-06, "loss": 0.3317, "step": 309 }, { "epoch": 2.2480243161094227, "grad_norm": 0.039896223694086075, "learning_rate": 1.7374107335760937e-06, "loss": 0.2976, "step": 310 }, { "epoch": 2.25531914893617, "grad_norm": 0.04140615463256836, "learning_rate": 1.7052717082160348e-06, "loss": 0.3178, "step": 311 }, { "epoch": 2.262613981762918, "grad_norm": 0.04329473525285721, "learning_rate": 1.6733714991721738e-06, "loss": 0.3187, "step": 312 }, { "epoch": 2.2699088145896655, "grad_norm": 0.04440492019057274, "learning_rate": 1.6417124187111778e-06, "loss": 0.3194, "step": 313 }, { "epoch": 2.2772036474164135, "grad_norm": 0.03996637463569641, "learning_rate": 1.610296761621662e-06, "loss": 0.3504, "step": 314 }, { "epoch": 2.284498480243161, "grad_norm": 0.04870344325900078, "learning_rate": 1.5791268050478487e-06, "loss": 0.3599, "step": 315 }, { "epoch": 2.291793313069909, "grad_norm": 0.049775756895542145, "learning_rate": 1.5482048083245116e-06, "loss": 0.3051, "step": 316 }, { "epoch": 2.2990881458966568, "grad_norm": 0.04166199639439583, "learning_rate": 1.517533012813217e-06, "loss": 0.3383, "step": 317 }, { "epoch": 2.3063829787234043, "grad_norm": 0.04483890160918236, "learning_rate": 1.4871136417398407e-06, "loss": 0.3302, "step": 318 }, { "epoch": 2.3136778115501517, "grad_norm": 0.04276059940457344, "learning_rate": 1.4569489000334435e-06, "loss": 0.3615, "step": 319 }, { "epoch": 2.3209726443768997, "grad_norm": 0.04011907801032066, "learning_rate": 1.427040974166427e-06, "loss": 0.3139, "step": 320 }, { "epoch": 2.3282674772036476, "grad_norm": 0.043183084577322006, "learning_rate": 1.3973920319960654e-06, "loss": 0.3327, "step": 321 }, { "epoch": 2.335562310030395, "grad_norm": 0.045110031962394714, "learning_rate": 1.3680042226073554e-06, "loss": 0.3183, "step": 322 }, { "epoch": 2.342857142857143, "grad_norm": 0.04653245955705643, "learning_rate": 1.3388796761572493e-06, "loss": 0.3411, "step": 323 }, { "epoch": 2.3501519756838904, "grad_norm": 0.04192928597331047, "learning_rate": 1.310020503720254e-06, "loss": 0.3722, "step": 324 }, { "epoch": 2.3574468085106384, "grad_norm": 0.04330296441912651, "learning_rate": 1.2814287971354023e-06, "loss": 0.325, "step": 325 }, { "epoch": 2.364741641337386, "grad_norm": 0.04404173046350479, "learning_rate": 1.253106628854635e-06, "loss": 0.3247, "step": 326 }, { "epoch": 2.3720364741641338, "grad_norm": 0.04104992374777794, "learning_rate": 1.2250560517925747e-06, "loss": 0.3079, "step": 327 }, { "epoch": 2.3793313069908812, "grad_norm": 0.04262121394276619, "learning_rate": 1.197279099177731e-06, "loss": 0.3446, "step": 328 }, { "epoch": 2.386626139817629, "grad_norm": 0.04929178208112717, "learning_rate": 1.1697777844051105e-06, "loss": 0.3501, "step": 329 }, { "epoch": 2.393920972644377, "grad_norm": 0.04329733923077583, "learning_rate": 1.1425541008902852e-06, "loss": 0.3213, "step": 330 }, { "epoch": 2.4012158054711246, "grad_norm": 0.04333839192986488, "learning_rate": 1.1156100219249022e-06, "loss": 0.3232, "step": 331 }, { "epoch": 2.4085106382978725, "grad_norm": 0.04259442910552025, "learning_rate": 1.0889475005336447e-06, "loss": 0.3632, "step": 332 }, { "epoch": 2.41580547112462, "grad_norm": 0.04376016557216644, "learning_rate": 1.0625684693326727e-06, "loss": 0.3355, "step": 333 }, { "epoch": 2.423100303951368, "grad_norm": 0.04070465639233589, "learning_rate": 1.0364748403895368e-06, "loss": 0.341, "step": 334 }, { "epoch": 2.4303951367781154, "grad_norm": 0.041908472776412964, "learning_rate": 1.0106685050845838e-06, "loss": 0.3383, "step": 335 }, { "epoch": 2.4376899696048633, "grad_norm": 0.04458033666014671, "learning_rate": 9.851513339738627e-07, "loss": 0.3246, "step": 336 }, { "epoch": 2.4449848024316108, "grad_norm": 0.04246847331523895, "learning_rate": 9.599251766535344e-07, "loss": 0.3418, "step": 337 }, { "epoch": 2.4522796352583587, "grad_norm": 0.04456906393170357, "learning_rate": 9.349918616258113e-07, "loss": 0.3536, "step": 338 }, { "epoch": 2.4595744680851066, "grad_norm": 0.041079938411712646, "learning_rate": 9.10353196166412e-07, "loss": 0.3278, "step": 339 }, { "epoch": 2.466869300911854, "grad_norm": 0.049959778785705566, "learning_rate": 8.860109661935673e-07, "loss": 0.3417, "step": 340 }, { "epoch": 2.474164133738602, "grad_norm": 0.044310178607702255, "learning_rate": 8.619669361385663e-07, "loss": 0.3148, "step": 341 }, { "epoch": 2.4814589665653495, "grad_norm": 0.04187872260808945, "learning_rate": 8.382228488178639e-07, "loss": 0.3392, "step": 342 }, { "epoch": 2.4887537993920974, "grad_norm": 0.04105791822075844, "learning_rate": 8.147804253067581e-07, "loss": 0.3273, "step": 343 }, { "epoch": 2.496048632218845, "grad_norm": 0.039138007909059525, "learning_rate": 7.916413648146282e-07, "loss": 0.3112, "step": 344 }, { "epoch": 2.503343465045593, "grad_norm": 0.04459141194820404, "learning_rate": 7.6880734456178e-07, "loss": 0.3463, "step": 345 }, { "epoch": 2.5106382978723403, "grad_norm": 0.043715398758649826, "learning_rate": 7.462800196578662e-07, "loss": 0.3439, "step": 346 }, { "epoch": 2.517933130699088, "grad_norm": 0.043532464653253555, "learning_rate": 7.240610229819195e-07, "loss": 0.3303, "step": 347 }, { "epoch": 2.525227963525836, "grad_norm": 0.04273553937673569, "learning_rate": 7.021519650639952e-07, "loss": 0.3171, "step": 348 }, { "epoch": 2.5325227963525836, "grad_norm": 0.05441723391413689, "learning_rate": 6.805544339684295e-07, "loss": 0.3239, "step": 349 }, { "epoch": 2.539817629179331, "grad_norm": 0.04585114121437073, "learning_rate": 6.592699951787362e-07, "loss": 0.3378, "step": 350 }, { "epoch": 2.547112462006079, "grad_norm": 0.04242338612675667, "learning_rate": 6.383001914841252e-07, "loss": 0.2992, "step": 351 }, { "epoch": 2.554407294832827, "grad_norm": 0.046155836433172226, "learning_rate": 6.17646542867682e-07, "loss": 0.3503, "step": 352 }, { "epoch": 2.5617021276595744, "grad_norm": 0.04374154284596443, "learning_rate": 5.973105463961864e-07, "loss": 0.3385, "step": 353 }, { "epoch": 2.5689969604863223, "grad_norm": 0.04297053441405296, "learning_rate": 5.772936761116027e-07, "loss": 0.3396, "step": 354 }, { "epoch": 2.57629179331307, "grad_norm": 0.04214682802557945, "learning_rate": 5.575973829242365e-07, "loss": 0.3373, "step": 355 }, { "epoch": 2.5835866261398177, "grad_norm": 0.04097369685769081, "learning_rate": 5.382230945075556e-07, "loss": 0.3386, "step": 356 }, { "epoch": 2.590881458966565, "grad_norm": 0.042690787464380264, "learning_rate": 5.191722151947227e-07, "loss": 0.3319, "step": 357 }, { "epoch": 2.598176291793313, "grad_norm": 0.040518004447221756, "learning_rate": 5.004461258767873e-07, "loss": 0.3174, "step": 358 }, { "epoch": 2.6054711246200606, "grad_norm": 0.04100370407104492, "learning_rate": 4.820461839026047e-07, "loss": 0.34, "step": 359 }, { "epoch": 2.6127659574468085, "grad_norm": 0.04036758467555046, "learning_rate": 4.639737229804403e-07, "loss": 0.3351, "step": 360 }, { "epoch": 2.6200607902735564, "grad_norm": 0.04206588491797447, "learning_rate": 4.4623005308130243e-07, "loss": 0.3244, "step": 361 }, { "epoch": 2.627355623100304, "grad_norm": 0.04280061274766922, "learning_rate": 4.2881646034398926e-07, "loss": 0.3065, "step": 362 }, { "epoch": 2.634650455927052, "grad_norm": 0.04229553043842316, "learning_rate": 4.1173420698186027e-07, "loss": 0.3306, "step": 363 }, { "epoch": 2.6419452887537993, "grad_norm": 0.044544368982315063, "learning_rate": 3.9498453119134917e-07, "loss": 0.3514, "step": 364 }, { "epoch": 2.6492401215805472, "grad_norm": 0.045995116233825684, "learning_rate": 3.7856864706221187e-07, "loss": 0.3498, "step": 365 }, { "epoch": 2.6565349544072947, "grad_norm": 0.048596058040857315, "learning_rate": 3.6248774448952695e-07, "loss": 0.3358, "step": 366 }, { "epoch": 2.6638297872340426, "grad_norm": 0.04591159150004387, "learning_rate": 3.467429890874424e-07, "loss": 0.3129, "step": 367 }, { "epoch": 2.67112462006079, "grad_norm": 0.041663773357868195, "learning_rate": 3.313355221046888e-07, "loss": 0.3213, "step": 368 }, { "epoch": 2.678419452887538, "grad_norm": 0.04105694591999054, "learning_rate": 3.1626646034186084e-07, "loss": 0.345, "step": 369 }, { "epoch": 2.685714285714286, "grad_norm": 0.044980090111494064, "learning_rate": 3.015368960704584e-07, "loss": 0.3265, "step": 370 }, { "epoch": 2.6930091185410334, "grad_norm": 0.04313720017671585, "learning_rate": 2.871478969537206e-07, "loss": 0.3434, "step": 371 }, { "epoch": 2.700303951367781, "grad_norm": 0.04387833923101425, "learning_rate": 2.7310050596923323e-07, "loss": 0.33, "step": 372 }, { "epoch": 2.707598784194529, "grad_norm": 0.04352449253201485, "learning_rate": 2.593957413333331e-07, "loss": 0.3315, "step": 373 }, { "epoch": 2.7148936170212767, "grad_norm": 0.04267902672290802, "learning_rate": 2.4603459642729867e-07, "loss": 0.3574, "step": 374 }, { "epoch": 2.722188449848024, "grad_norm": 0.046177759766578674, "learning_rate": 2.330180397253473e-07, "loss": 0.3428, "step": 375 }, { "epoch": 2.729483282674772, "grad_norm": 0.0405619777739048, "learning_rate": 2.2034701472443854e-07, "loss": 0.2903, "step": 376 }, { "epoch": 2.7367781155015196, "grad_norm": 0.04294833540916443, "learning_rate": 2.0802243987588068e-07, "loss": 0.3664, "step": 377 }, { "epoch": 2.7440729483282675, "grad_norm": 0.048284079879522324, "learning_rate": 1.9604520851876196e-07, "loss": 0.3294, "step": 378 }, { "epoch": 2.7513677811550155, "grad_norm": 0.04744973033666611, "learning_rate": 1.8441618881519186e-07, "loss": 0.321, "step": 379 }, { "epoch": 2.758662613981763, "grad_norm": 0.03917940333485603, "learning_rate": 1.7313622368738014e-07, "loss": 0.307, "step": 380 }, { "epoch": 2.7659574468085104, "grad_norm": 0.04587104544043541, "learning_rate": 1.6220613075653201e-07, "loss": 0.3464, "step": 381 }, { "epoch": 2.7732522796352583, "grad_norm": 0.042470064014196396, "learning_rate": 1.51626702283586e-07, "loss": 0.2907, "step": 382 }, { "epoch": 2.7805471124620063, "grad_norm": 0.048345521092414856, "learning_rate": 1.4139870511178767e-07, "loss": 0.3481, "step": 383 }, { "epoch": 2.7878419452887537, "grad_norm": 0.038930460810661316, "learning_rate": 1.3152288061110518e-07, "loss": 0.2987, "step": 384 }, { "epoch": 2.7951367781155017, "grad_norm": 0.04472014680504799, "learning_rate": 1.2199994462448906e-07, "loss": 0.3612, "step": 385 }, { "epoch": 2.802431610942249, "grad_norm": 0.05124653875827789, "learning_rate": 1.1283058741598962e-07, "loss": 0.3051, "step": 386 }, { "epoch": 2.809726443768997, "grad_norm": 0.038610782474279404, "learning_rate": 1.0401547362071939e-07, "loss": 0.3362, "step": 387 }, { "epoch": 2.8170212765957445, "grad_norm": 0.042344819754362106, "learning_rate": 9.555524219667989e-08, "loss": 0.3206, "step": 388 }, { "epoch": 2.8243161094224924, "grad_norm": 0.046877894550561905, "learning_rate": 8.745050637844532e-08, "loss": 0.3332, "step": 389 }, { "epoch": 2.83161094224924, "grad_norm": 0.04104023799300194, "learning_rate": 7.970185363271432e-08, "loss": 0.2941, "step": 390 }, { "epoch": 2.838905775075988, "grad_norm": 0.04339218884706497, "learning_rate": 7.230984561572729e-08, "loss": 0.3409, "step": 391 }, { "epoch": 2.8462006079027358, "grad_norm": 0.047086864709854126, "learning_rate": 6.527501813255344e-08, "loss": 0.3282, "step": 392 }, { "epoch": 2.8534954407294832, "grad_norm": 0.04140612855553627, "learning_rate": 5.8597881098257924e-08, "loss": 0.3706, "step": 393 }, { "epoch": 2.860790273556231, "grad_norm": 0.045086655765771866, "learning_rate": 5.227891850093314e-08, "loss": 0.3159, "step": 394 }, { "epoch": 2.8680851063829786, "grad_norm": 0.04669662564992905, "learning_rate": 4.631858836662562e-08, "loss": 0.3212, "step": 395 }, { "epoch": 2.8753799392097266, "grad_norm": 0.048425789922475815, "learning_rate": 4.071732272613149e-08, "loss": 0.3781, "step": 396 }, { "epoch": 2.882674772036474, "grad_norm": 0.0414654016494751, "learning_rate": 3.5475527583681005e-08, "loss": 0.3487, "step": 397 }, { "epoch": 2.889969604863222, "grad_norm": 0.04240646958351135, "learning_rate": 3.059358288751202e-08, "loss": 0.3063, "step": 398 }, { "epoch": 2.8972644376899694, "grad_norm": 0.047270409762859344, "learning_rate": 2.6071842502326526e-08, "loss": 0.3352, "step": 399 }, { "epoch": 2.9045592705167174, "grad_norm": 0.043454963713884354, "learning_rate": 2.1910634183644475e-08, "loss": 0.3442, "step": 400 }, { "epoch": 2.9118541033434653, "grad_norm": 0.045333363115787506, "learning_rate": 1.811025955404333e-08, "loss": 0.3196, "step": 401 }, { "epoch": 2.9191489361702128, "grad_norm": 0.04678984358906746, "learning_rate": 1.4670994081297796e-08, "loss": 0.3319, "step": 402 }, { "epoch": 2.9264437689969602, "grad_norm": 0.05119337886571884, "learning_rate": 1.159308705841078e-08, "loss": 0.3614, "step": 403 }, { "epoch": 2.933738601823708, "grad_norm": 0.048367924988269806, "learning_rate": 8.87676158554507e-09, "loss": 0.3615, "step": 404 }, { "epoch": 2.941033434650456, "grad_norm": 0.04424307495355606, "learning_rate": 6.5222145538501595e-09, "loss": 0.3027, "step": 405 }, { "epoch": 2.9483282674772036, "grad_norm": 0.0484929159283638, "learning_rate": 4.5296166311931125e-09, "loss": 0.3259, "step": 406 }, { "epoch": 2.9556231003039515, "grad_norm": 0.04274160414934158, "learning_rate": 2.899112249786229e-09, "loss": 0.3219, "step": 407 }, { "epoch": 2.962917933130699, "grad_norm": 0.04609229788184166, "learning_rate": 1.6308195957182028e-09, "loss": 0.3074, "step": 408 }, { "epoch": 2.970212765957447, "grad_norm": 0.05541510134935379, "learning_rate": 7.24830600386528e-10, "loss": 0.3853, "step": 409 }, { "epoch": 2.977507598784195, "grad_norm": 0.0421581007540226, "learning_rate": 1.812109338367174e-10, "loss": 0.3192, "step": 410 }, { "epoch": 2.9848024316109423, "grad_norm": 0.04373352229595184, "learning_rate": 0.0, "loss": 0.3766, "step": 411 }, { "epoch": 2.9848024316109423, "step": 411, "total_flos": 3.260394272163103e+17, "train_loss": 0.4436677635586175, "train_runtime": 142049.243, "train_samples_per_second": 0.139, "train_steps_per_second": 0.003 } ], "logging_steps": 1, "max_steps": 411, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.260394272163103e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }