{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9848024316109423,
"eval_steps": 500,
"global_step": 411,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00729483282674772,
"grad_norm": 0.14541301131248474,
"learning_rate": 2.3809523809523811e-07,
"loss": 0.7602,
"step": 1
},
{
"epoch": 0.01458966565349544,
"grad_norm": 0.1567784547805786,
"learning_rate": 4.7619047619047623e-07,
"loss": 0.8215,
"step": 2
},
{
"epoch": 0.02188449848024316,
"grad_norm": 0.1584789901971817,
"learning_rate": 7.142857142857143e-07,
"loss": 0.8269,
"step": 3
},
{
"epoch": 0.02917933130699088,
"grad_norm": 0.157843217253685,
"learning_rate": 9.523809523809525e-07,
"loss": 0.7909,
"step": 4
},
{
"epoch": 0.0364741641337386,
"grad_norm": 0.1607961654663086,
"learning_rate": 1.1904761904761906e-06,
"loss": 0.8198,
"step": 5
},
{
"epoch": 0.04376899696048632,
"grad_norm": 0.15348272025585175,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.7687,
"step": 6
},
{
"epoch": 0.05106382978723404,
"grad_norm": 0.1496104598045349,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.7942,
"step": 7
},
{
"epoch": 0.05835866261398176,
"grad_norm": 0.2734036445617676,
"learning_rate": 1.904761904761905e-06,
"loss": 0.847,
"step": 8
},
{
"epoch": 0.06565349544072949,
"grad_norm": 0.13683773577213287,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.754,
"step": 9
},
{
"epoch": 0.0729483282674772,
"grad_norm": 0.11306589841842651,
"learning_rate": 2.380952380952381e-06,
"loss": 0.6991,
"step": 10
},
{
"epoch": 0.08024316109422493,
"grad_norm": 0.12233421206474304,
"learning_rate": 2.6190476190476192e-06,
"loss": 0.7829,
"step": 11
},
{
"epoch": 0.08753799392097264,
"grad_norm": 0.10262873023748398,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.7048,
"step": 12
},
{
"epoch": 0.09483282674772037,
"grad_norm": 0.10435234010219574,
"learning_rate": 3.0952380952380957e-06,
"loss": 0.768,
"step": 13
},
{
"epoch": 0.10212765957446808,
"grad_norm": 0.0735386535525322,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6147,
"step": 14
},
{
"epoch": 0.1094224924012158,
"grad_norm": 0.07339954376220703,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.7452,
"step": 15
},
{
"epoch": 0.11671732522796352,
"grad_norm": 0.06846445798873901,
"learning_rate": 3.80952380952381e-06,
"loss": 0.7383,
"step": 16
},
{
"epoch": 0.12401215805471125,
"grad_norm": 0.07185480743646622,
"learning_rate": 4.047619047619048e-06,
"loss": 0.7081,
"step": 17
},
{
"epoch": 0.13130699088145897,
"grad_norm": 0.06281375139951706,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.6171,
"step": 18
},
{
"epoch": 0.1386018237082067,
"grad_norm": 0.08302997052669525,
"learning_rate": 4.523809523809524e-06,
"loss": 0.6136,
"step": 19
},
{
"epoch": 0.1458966565349544,
"grad_norm": 0.07521834969520569,
"learning_rate": 4.761904761904762e-06,
"loss": 0.6401,
"step": 20
},
{
"epoch": 0.15319148936170213,
"grad_norm": 0.07346966117620468,
"learning_rate": 5e-06,
"loss": 0.6575,
"step": 21
},
{
"epoch": 0.16048632218844985,
"grad_norm": 0.06220546364784241,
"learning_rate": 5.2380952380952384e-06,
"loss": 0.585,
"step": 22
},
{
"epoch": 0.16778115501519758,
"grad_norm": 0.06210927292704582,
"learning_rate": 5.476190476190477e-06,
"loss": 0.6116,
"step": 23
},
{
"epoch": 0.17507598784194528,
"grad_norm": 0.06617508083581924,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.6591,
"step": 24
},
{
"epoch": 0.182370820668693,
"grad_norm": 0.06115543842315674,
"learning_rate": 5.9523809523809525e-06,
"loss": 0.6164,
"step": 25
},
{
"epoch": 0.18966565349544073,
"grad_norm": 0.05512455105781555,
"learning_rate": 6.1904761904761914e-06,
"loss": 0.6131,
"step": 26
},
{
"epoch": 0.19696048632218846,
"grad_norm": 0.05426128953695297,
"learning_rate": 6.4285714285714295e-06,
"loss": 0.6233,
"step": 27
},
{
"epoch": 0.20425531914893616,
"grad_norm": 0.04495101794600487,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6017,
"step": 28
},
{
"epoch": 0.2115501519756839,
"grad_norm": 0.052700527012348175,
"learning_rate": 6.9047619047619055e-06,
"loss": 0.6209,
"step": 29
},
{
"epoch": 0.2188449848024316,
"grad_norm": 0.05274520814418793,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.562,
"step": 30
},
{
"epoch": 0.22613981762917934,
"grad_norm": 0.0418085902929306,
"learning_rate": 7.380952380952382e-06,
"loss": 0.5356,
"step": 31
},
{
"epoch": 0.23343465045592704,
"grad_norm": 0.04744059965014458,
"learning_rate": 7.61904761904762e-06,
"loss": 0.5759,
"step": 32
},
{
"epoch": 0.24072948328267477,
"grad_norm": 0.051624756306409836,
"learning_rate": 7.857142857142858e-06,
"loss": 0.5989,
"step": 33
},
{
"epoch": 0.2480243161094225,
"grad_norm": 0.04632480815052986,
"learning_rate": 8.095238095238097e-06,
"loss": 0.6036,
"step": 34
},
{
"epoch": 0.2553191489361702,
"grad_norm": 0.040394943207502365,
"learning_rate": 8.333333333333334e-06,
"loss": 0.5439,
"step": 35
},
{
"epoch": 0.26261398176291795,
"grad_norm": 0.047632846981287,
"learning_rate": 8.571428571428571e-06,
"loss": 0.6187,
"step": 36
},
{
"epoch": 0.26990881458966565,
"grad_norm": 0.04498811811208725,
"learning_rate": 8.80952380952381e-06,
"loss": 0.5686,
"step": 37
},
{
"epoch": 0.2772036474164134,
"grad_norm": 0.04858787730336189,
"learning_rate": 9.047619047619049e-06,
"loss": 0.6224,
"step": 38
},
{
"epoch": 0.2844984802431611,
"grad_norm": 0.04534129053354263,
"learning_rate": 9.285714285714288e-06,
"loss": 0.576,
"step": 39
},
{
"epoch": 0.2917933130699088,
"grad_norm": 0.04878037050366402,
"learning_rate": 9.523809523809525e-06,
"loss": 0.5956,
"step": 40
},
{
"epoch": 0.29908814589665655,
"grad_norm": 0.044632136821746826,
"learning_rate": 9.761904761904762e-06,
"loss": 0.5748,
"step": 41
},
{
"epoch": 0.30638297872340425,
"grad_norm": 0.041874803602695465,
"learning_rate": 1e-05,
"loss": 0.5752,
"step": 42
},
{
"epoch": 0.31367781155015195,
"grad_norm": 0.041942398995161057,
"learning_rate": 9.999818789066164e-06,
"loss": 0.5759,
"step": 43
},
{
"epoch": 0.3209726443768997,
"grad_norm": 0.045705121010541916,
"learning_rate": 9.999275169399614e-06,
"loss": 0.5962,
"step": 44
},
{
"epoch": 0.3282674772036474,
"grad_norm": 0.043411824852228165,
"learning_rate": 9.998369180404283e-06,
"loss": 0.54,
"step": 45
},
{
"epoch": 0.33556231003039516,
"grad_norm": 0.03998137265443802,
"learning_rate": 9.997100887750215e-06,
"loss": 0.5874,
"step": 46
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.047370899468660355,
"learning_rate": 9.995470383368808e-06,
"loss": 0.6051,
"step": 47
},
{
"epoch": 0.35015197568389056,
"grad_norm": 0.04455406963825226,
"learning_rate": 9.993477785446151e-06,
"loss": 0.5604,
"step": 48
},
{
"epoch": 0.3574468085106383,
"grad_norm": 0.043418001383543015,
"learning_rate": 9.991123238414455e-06,
"loss": 0.5555,
"step": 49
},
{
"epoch": 0.364741641337386,
"grad_norm": 0.03939136862754822,
"learning_rate": 9.988406912941591e-06,
"loss": 0.5493,
"step": 50
},
{
"epoch": 0.3720364741641337,
"grad_norm": 0.04485655948519707,
"learning_rate": 9.985329005918702e-06,
"loss": 0.5804,
"step": 51
},
{
"epoch": 0.37933130699088147,
"grad_norm": 0.0435781255364418,
"learning_rate": 9.981889740445958e-06,
"loss": 0.5617,
"step": 52
},
{
"epoch": 0.38662613981762917,
"grad_norm": 0.03838958591222763,
"learning_rate": 9.978089365816357e-06,
"loss": 0.5481,
"step": 53
},
{
"epoch": 0.3939209726443769,
"grad_norm": 0.03926938772201538,
"learning_rate": 9.973928157497675e-06,
"loss": 0.5195,
"step": 54
},
{
"epoch": 0.4012158054711246,
"grad_norm": 0.049530286341905594,
"learning_rate": 9.969406417112489e-06,
"loss": 0.5854,
"step": 55
},
{
"epoch": 0.4085106382978723,
"grad_norm": 0.08943431824445724,
"learning_rate": 9.964524472416319e-06,
"loss": 0.5706,
"step": 56
},
{
"epoch": 0.4158054711246201,
"grad_norm": 0.04114034026861191,
"learning_rate": 9.959282677273869e-06,
"loss": 0.4923,
"step": 57
},
{
"epoch": 0.4231003039513678,
"grad_norm": 0.03834295645356178,
"learning_rate": 9.953681411633376e-06,
"loss": 0.5151,
"step": 58
},
{
"epoch": 0.43039513677811553,
"grad_norm": 0.03940470516681671,
"learning_rate": 9.947721081499068e-06,
"loss": 0.5274,
"step": 59
},
{
"epoch": 0.4376899696048632,
"grad_norm": 0.05276661738753319,
"learning_rate": 9.941402118901743e-06,
"loss": 0.5312,
"step": 60
},
{
"epoch": 0.4449848024316109,
"grad_norm": 0.04116562008857727,
"learning_rate": 9.934724981867447e-06,
"loss": 0.5073,
"step": 61
},
{
"epoch": 0.4522796352583587,
"grad_norm": 0.039049182087183,
"learning_rate": 9.927690154384273e-06,
"loss": 0.5367,
"step": 62
},
{
"epoch": 0.4595744680851064,
"grad_norm": 0.042383261024951935,
"learning_rate": 9.920298146367287e-06,
"loss": 0.5232,
"step": 63
},
{
"epoch": 0.4668693009118541,
"grad_norm": 0.04153553023934364,
"learning_rate": 9.912549493621555e-06,
"loss": 0.5438,
"step": 64
},
{
"epoch": 0.47416413373860183,
"grad_norm": 0.04116344451904297,
"learning_rate": 9.904444757803322e-06,
"loss": 0.4803,
"step": 65
},
{
"epoch": 0.48145896656534953,
"grad_norm": 0.06467548757791519,
"learning_rate": 9.895984526379282e-06,
"loss": 0.5554,
"step": 66
},
{
"epoch": 0.4887537993920973,
"grad_norm": 0.04420805722475052,
"learning_rate": 9.887169412584012e-06,
"loss": 0.5659,
"step": 67
},
{
"epoch": 0.496048632218845,
"grad_norm": 0.04072507843375206,
"learning_rate": 9.878000055375512e-06,
"loss": 0.486,
"step": 68
},
{
"epoch": 0.5033434650455927,
"grad_norm": 0.04508865624666214,
"learning_rate": 9.868477119388897e-06,
"loss": 0.5284,
"step": 69
},
{
"epoch": 0.5106382978723404,
"grad_norm": 0.04231835529208183,
"learning_rate": 9.858601294888212e-06,
"loss": 0.5185,
"step": 70
},
{
"epoch": 0.5179331306990882,
"grad_norm": 0.03981684520840645,
"learning_rate": 9.848373297716414e-06,
"loss": 0.5246,
"step": 71
},
{
"epoch": 0.5252279635258359,
"grad_norm": 0.045293230563402176,
"learning_rate": 9.837793869243468e-06,
"loss": 0.5403,
"step": 72
},
{
"epoch": 0.5325227963525836,
"grad_norm": 0.0415407195687294,
"learning_rate": 9.826863776312621e-06,
"loss": 0.568,
"step": 73
},
{
"epoch": 0.5398176291793313,
"grad_norm": 0.04549698531627655,
"learning_rate": 9.815583811184809e-06,
"loss": 0.5547,
"step": 74
},
{
"epoch": 0.547112462006079,
"grad_norm": 0.03895876184105873,
"learning_rate": 9.803954791481239e-06,
"loss": 0.5374,
"step": 75
},
{
"epoch": 0.5544072948328268,
"grad_norm": 0.046192847192287445,
"learning_rate": 9.79197756012412e-06,
"loss": 0.5561,
"step": 76
},
{
"epoch": 0.5617021276595745,
"grad_norm": 0.03921407088637352,
"learning_rate": 9.779652985275562e-06,
"loss": 0.5488,
"step": 77
},
{
"epoch": 0.5689969604863222,
"grad_norm": 0.037232838571071625,
"learning_rate": 9.766981960274653e-06,
"loss": 0.4963,
"step": 78
},
{
"epoch": 0.5762917933130699,
"grad_norm": 0.05492810904979706,
"learning_rate": 9.753965403572703e-06,
"loss": 0.5621,
"step": 79
},
{
"epoch": 0.5835866261398176,
"grad_norm": 0.04202823340892792,
"learning_rate": 9.740604258666668e-06,
"loss": 0.5479,
"step": 80
},
{
"epoch": 0.5908814589665653,
"grad_norm": 0.04189832881093025,
"learning_rate": 9.726899494030768e-06,
"loss": 0.5802,
"step": 81
},
{
"epoch": 0.5981762917933131,
"grad_norm": 0.039709825068712234,
"learning_rate": 9.712852103046281e-06,
"loss": 0.5166,
"step": 82
},
{
"epoch": 0.6054711246200608,
"grad_norm": 0.04080045223236084,
"learning_rate": 9.698463103929542e-06,
"loss": 0.5289,
"step": 83
},
{
"epoch": 0.6127659574468085,
"grad_norm": 0.038535572588443756,
"learning_rate": 9.68373353965814e-06,
"loss": 0.5352,
"step": 84
},
{
"epoch": 0.6200607902735562,
"grad_norm": 0.04705570638179779,
"learning_rate": 9.66866447789531e-06,
"loss": 0.5235,
"step": 85
},
{
"epoch": 0.6273556231003039,
"grad_norm": 0.042207516729831696,
"learning_rate": 9.65325701091256e-06,
"loss": 0.5147,
"step": 86
},
{
"epoch": 0.6346504559270517,
"grad_norm": 0.04210168495774269,
"learning_rate": 9.637512255510475e-06,
"loss": 0.5241,
"step": 87
},
{
"epoch": 0.6419452887537994,
"grad_norm": 0.03785989060997963,
"learning_rate": 9.62143135293779e-06,
"loss": 0.5429,
"step": 88
},
{
"epoch": 0.6492401215805471,
"grad_norm": 0.04283512756228447,
"learning_rate": 9.605015468808651e-06,
"loss": 0.5242,
"step": 89
},
{
"epoch": 0.6565349544072948,
"grad_norm": 0.043273307383060455,
"learning_rate": 9.588265793018141e-06,
"loss": 0.5455,
"step": 90
},
{
"epoch": 0.6638297872340425,
"grad_norm": 0.04218590632081032,
"learning_rate": 9.571183539656011e-06,
"loss": 0.5778,
"step": 91
},
{
"epoch": 0.6711246200607903,
"grad_norm": 0.03844400867819786,
"learning_rate": 9.553769946918698e-06,
"loss": 0.5233,
"step": 92
},
{
"epoch": 0.678419452887538,
"grad_norm": 0.04001948982477188,
"learning_rate": 9.536026277019562e-06,
"loss": 0.5156,
"step": 93
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.04228726401925087,
"learning_rate": 9.517953816097396e-06,
"loss": 0.5138,
"step": 94
},
{
"epoch": 0.6930091185410334,
"grad_norm": 0.03879157081246376,
"learning_rate": 9.499553874123213e-06,
"loss": 0.4926,
"step": 95
},
{
"epoch": 0.7003039513677811,
"grad_norm": 0.04016513749957085,
"learning_rate": 9.480827784805278e-06,
"loss": 0.497,
"step": 96
},
{
"epoch": 0.7075987841945289,
"grad_norm": 0.03983764350414276,
"learning_rate": 9.461776905492446e-06,
"loss": 0.4852,
"step": 97
},
{
"epoch": 0.7148936170212766,
"grad_norm": 0.056514669209718704,
"learning_rate": 9.442402617075765e-06,
"loss": 0.5288,
"step": 98
},
{
"epoch": 0.7221884498480243,
"grad_norm": 0.046206481754779816,
"learning_rate": 9.422706323888398e-06,
"loss": 0.5418,
"step": 99
},
{
"epoch": 0.729483282674772,
"grad_norm": 0.0474584735929966,
"learning_rate": 9.402689453603815e-06,
"loss": 0.5531,
"step": 100
},
{
"epoch": 0.7367781155015197,
"grad_norm": 0.037484850734472275,
"learning_rate": 9.382353457132318e-06,
"loss": 0.4869,
"step": 101
},
{
"epoch": 0.7440729483282674,
"grad_norm": 0.03749077394604683,
"learning_rate": 9.361699808515877e-06,
"loss": 0.5275,
"step": 102
},
{
"epoch": 0.7513677811550152,
"grad_norm": 0.038470759987831116,
"learning_rate": 9.340730004821266e-06,
"loss": 0.5044,
"step": 103
},
{
"epoch": 0.7586626139817629,
"grad_norm": 0.038027700036764145,
"learning_rate": 9.31944556603157e-06,
"loss": 0.5025,
"step": 104
},
{
"epoch": 0.7659574468085106,
"grad_norm": 0.046422988176345825,
"learning_rate": 9.297848034936007e-06,
"loss": 0.5341,
"step": 105
},
{
"epoch": 0.7732522796352583,
"grad_norm": 0.043657850474119186,
"learning_rate": 9.275938977018082e-06,
"loss": 0.5085,
"step": 106
},
{
"epoch": 0.780547112462006,
"grad_norm": 0.04235101863741875,
"learning_rate": 9.253719980342134e-06,
"loss": 0.5397,
"step": 107
},
{
"epoch": 0.7878419452887538,
"grad_norm": 0.04412844404578209,
"learning_rate": 9.231192655438222e-06,
"loss": 0.5522,
"step": 108
},
{
"epoch": 0.7951367781155015,
"grad_norm": 0.04110129550099373,
"learning_rate": 9.208358635185372e-06,
"loss": 0.5785,
"step": 109
},
{
"epoch": 0.8024316109422492,
"grad_norm": 0.03757128119468689,
"learning_rate": 9.185219574693242e-06,
"loss": 0.4777,
"step": 110
},
{
"epoch": 0.8097264437689969,
"grad_norm": 0.03927014395594597,
"learning_rate": 9.161777151182137e-06,
"loss": 0.526,
"step": 111
},
{
"epoch": 0.8170212765957446,
"grad_norm": 0.03983665257692337,
"learning_rate": 9.138033063861436e-06,
"loss": 0.5138,
"step": 112
},
{
"epoch": 0.8243161094224924,
"grad_norm": 0.038819894194602966,
"learning_rate": 9.113989033806434e-06,
"loss": 0.494,
"step": 113
},
{
"epoch": 0.8316109422492401,
"grad_norm": 0.05275421962141991,
"learning_rate": 9.089646803833589e-06,
"loss": 0.539,
"step": 114
},
{
"epoch": 0.8389057750759878,
"grad_norm": 0.04078809916973114,
"learning_rate": 9.06500813837419e-06,
"loss": 0.4778,
"step": 115
},
{
"epoch": 0.8462006079027355,
"grad_norm": 0.040415696799755096,
"learning_rate": 9.040074823346466e-06,
"loss": 0.5443,
"step": 116
},
{
"epoch": 0.8534954407294832,
"grad_norm": 0.03834336996078491,
"learning_rate": 9.014848666026138e-06,
"loss": 0.4945,
"step": 117
},
{
"epoch": 0.8607902735562311,
"grad_norm": 0.04293690249323845,
"learning_rate": 8.989331494915417e-06,
"loss": 0.5404,
"step": 118
},
{
"epoch": 0.8680851063829788,
"grad_norm": 0.04116823151707649,
"learning_rate": 8.963525159610465e-06,
"loss": 0.5274,
"step": 119
},
{
"epoch": 0.8753799392097265,
"grad_norm": 0.04143698886036873,
"learning_rate": 8.937431530667329e-06,
"loss": 0.4916,
"step": 120
},
{
"epoch": 0.8826747720364742,
"grad_norm": 0.039400726556777954,
"learning_rate": 8.911052499466358e-06,
"loss": 0.5573,
"step": 121
},
{
"epoch": 0.8899696048632219,
"grad_norm": 0.035739775747060776,
"learning_rate": 8.884389978075098e-06,
"loss": 0.4961,
"step": 122
},
{
"epoch": 0.8972644376899696,
"grad_norm": 0.04682913422584534,
"learning_rate": 8.857445899109716e-06,
"loss": 0.4712,
"step": 123
},
{
"epoch": 0.9045592705167174,
"grad_norm": 0.04531010612845421,
"learning_rate": 8.83022221559489e-06,
"loss": 0.5242,
"step": 124
},
{
"epoch": 0.9118541033434651,
"grad_norm": 0.04528380185365677,
"learning_rate": 8.80272090082227e-06,
"loss": 0.5506,
"step": 125
},
{
"epoch": 0.9191489361702128,
"grad_norm": 0.03824332728981972,
"learning_rate": 8.774943948207427e-06,
"loss": 0.4581,
"step": 126
},
{
"epoch": 0.9264437689969605,
"grad_norm": 0.03896916285157204,
"learning_rate": 8.746893371145367e-06,
"loss": 0.5504,
"step": 127
},
{
"epoch": 0.9337386018237082,
"grad_norm": 0.03746696934103966,
"learning_rate": 8.718571202864598e-06,
"loss": 0.4589,
"step": 128
},
{
"epoch": 0.941033434650456,
"grad_norm": 0.04142184555530548,
"learning_rate": 8.689979496279747e-06,
"loss": 0.5299,
"step": 129
},
{
"epoch": 0.9483282674772037,
"grad_norm": 0.03700762987136841,
"learning_rate": 8.661120323842751e-06,
"loss": 0.5159,
"step": 130
},
{
"epoch": 0.9556231003039514,
"grad_norm": 0.036684855818748474,
"learning_rate": 8.631995777392645e-06,
"loss": 0.4854,
"step": 131
},
{
"epoch": 0.9629179331306991,
"grad_norm": 0.06939133256673813,
"learning_rate": 8.602607968003935e-06,
"loss": 0.5101,
"step": 132
},
{
"epoch": 0.9702127659574468,
"grad_norm": 0.039062708616256714,
"learning_rate": 8.572959025833573e-06,
"loss": 0.5005,
"step": 133
},
{
"epoch": 0.9775075987841946,
"grad_norm": 0.04555986076593399,
"learning_rate": 8.543051099966558e-06,
"loss": 0.5465,
"step": 134
},
{
"epoch": 0.9848024316109423,
"grad_norm": 0.04333364591002464,
"learning_rate": 8.512886358260162e-06,
"loss": 0.5237,
"step": 135
},
{
"epoch": 0.99209726443769,
"grad_norm": 0.04095487669110298,
"learning_rate": 8.482466987186785e-06,
"loss": 0.5335,
"step": 136
},
{
"epoch": 0.9993920972644377,
"grad_norm": 0.0442386157810688,
"learning_rate": 8.451795191675488e-06,
"loss": 0.5107,
"step": 137
},
{
"epoch": 1.0,
"grad_norm": 0.0442386157810688,
"learning_rate": 8.420873194952153e-06,
"loss": 0.465,
"step": 138
},
{
"epoch": 1.0072948328267477,
"grad_norm": 0.1445915699005127,
"learning_rate": 8.38970323837834e-06,
"loss": 0.4704,
"step": 139
},
{
"epoch": 1.0145896656534954,
"grad_norm": 0.042022328823804855,
"learning_rate": 8.358287581288824e-06,
"loss": 0.4282,
"step": 140
},
{
"epoch": 1.021884498480243,
"grad_norm": 0.04201134666800499,
"learning_rate": 8.326628500827826e-06,
"loss": 0.4539,
"step": 141
},
{
"epoch": 1.0291793313069908,
"grad_norm": 0.04877388849854469,
"learning_rate": 8.294728291783967e-06,
"loss": 0.4641,
"step": 142
},
{
"epoch": 1.0364741641337385,
"grad_norm": 0.046164825558662415,
"learning_rate": 8.262589266423908e-06,
"loss": 0.419,
"step": 143
},
{
"epoch": 1.0437689969604864,
"grad_norm": 0.041141681373119354,
"learning_rate": 8.230213754324773e-06,
"loss": 0.4224,
"step": 144
},
{
"epoch": 1.0510638297872341,
"grad_norm": 0.03967837244272232,
"learning_rate": 8.19760410220527e-06,
"loss": 0.4268,
"step": 145
},
{
"epoch": 1.0583586626139818,
"grad_norm": 0.05634555220603943,
"learning_rate": 8.16476267375561e-06,
"loss": 0.4025,
"step": 146
},
{
"epoch": 1.0656534954407295,
"grad_norm": 0.041606318205595016,
"learning_rate": 8.131691849466154e-06,
"loss": 0.4335,
"step": 147
},
{
"epoch": 1.0729483282674772,
"grad_norm": 0.03656647726893425,
"learning_rate": 8.098394026454886e-06,
"loss": 0.456,
"step": 148
},
{
"epoch": 1.080243161094225,
"grad_norm": 0.041005730628967285,
"learning_rate": 8.064871618293647e-06,
"loss": 0.3925,
"step": 149
},
{
"epoch": 1.0875379939209726,
"grad_norm": 0.04722120240330696,
"learning_rate": 8.031127054833192e-06,
"loss": 0.3981,
"step": 150
},
{
"epoch": 1.0948328267477203,
"grad_norm": 0.043071143329143524,
"learning_rate": 7.997162782027061e-06,
"loss": 0.4296,
"step": 151
},
{
"epoch": 1.102127659574468,
"grad_norm": 0.04518291354179382,
"learning_rate": 7.962981261754295e-06,
"loss": 0.4376,
"step": 152
},
{
"epoch": 1.1094224924012157,
"grad_norm": 0.04998685419559479,
"learning_rate": 7.928584971640974e-06,
"loss": 0.452,
"step": 153
},
{
"epoch": 1.1167173252279636,
"grad_norm": 0.04469837248325348,
"learning_rate": 7.893976404880643e-06,
"loss": 0.4316,
"step": 154
},
{
"epoch": 1.1240121580547113,
"grad_norm": 0.040255557745695114,
"learning_rate": 7.859158070053578e-06,
"loss": 0.4378,
"step": 155
},
{
"epoch": 1.131306990881459,
"grad_norm": 0.04467206820845604,
"learning_rate": 7.824132490944968e-06,
"loss": 0.4215,
"step": 156
},
{
"epoch": 1.1386018237082067,
"grad_norm": 0.03889721632003784,
"learning_rate": 7.788902206361974e-06,
"loss": 0.4257,
"step": 157
},
{
"epoch": 1.1458966565349544,
"grad_norm": 0.04140063747763634,
"learning_rate": 7.753469769949701e-06,
"loss": 0.4434,
"step": 158
},
{
"epoch": 1.1531914893617021,
"grad_norm": 0.039931997656822205,
"learning_rate": 7.717837750006106e-06,
"loss": 0.41,
"step": 159
},
{
"epoch": 1.1604863221884498,
"grad_norm": 0.03909624367952347,
"learning_rate": 7.682008729295834e-06,
"loss": 0.3904,
"step": 160
},
{
"epoch": 1.1677811550151975,
"grad_norm": 0.0401025116443634,
"learning_rate": 7.645985304863004e-06,
"loss": 0.4618,
"step": 161
},
{
"epoch": 1.1750759878419452,
"grad_norm": 0.04733911529183388,
"learning_rate": 7.609770087842969e-06,
"loss": 0.4247,
"step": 162
},
{
"epoch": 1.182370820668693,
"grad_norm": 0.037687744945287704,
"learning_rate": 7.573365703273045e-06,
"loss": 0.4071,
"step": 163
},
{
"epoch": 1.1896656534954406,
"grad_norm": 0.039216116070747375,
"learning_rate": 7.536774789902246e-06,
"loss": 0.4259,
"step": 164
},
{
"epoch": 1.1969604863221885,
"grad_norm": 0.0408397912979126,
"learning_rate": 7.500000000000001e-06,
"loss": 0.394,
"step": 165
},
{
"epoch": 1.2042553191489362,
"grad_norm": 0.04507288709282875,
"learning_rate": 7.463043999163919e-06,
"loss": 0.4605,
"step": 166
},
{
"epoch": 1.211550151975684,
"grad_norm": 0.03989469259977341,
"learning_rate": 7.4259094661265685e-06,
"loss": 0.4285,
"step": 167
},
{
"epoch": 1.2188449848024316,
"grad_norm": 0.0407419353723526,
"learning_rate": 7.388599092561315e-06,
"loss": 0.4204,
"step": 168
},
{
"epoch": 1.2261398176291793,
"grad_norm": 0.040525760501623154,
"learning_rate": 7.351115582887212e-06,
"loss": 0.4253,
"step": 169
},
{
"epoch": 1.233434650455927,
"grad_norm": 0.04370498284697533,
"learning_rate": 7.313461654072974e-06,
"loss": 0.4071,
"step": 170
},
{
"epoch": 1.2407294832826747,
"grad_norm": 0.0392344668507576,
"learning_rate": 7.2756400354400445e-06,
"loss": 0.4093,
"step": 171
},
{
"epoch": 1.2480243161094224,
"grad_norm": 0.03849213197827339,
"learning_rate": 7.237653468464756e-06,
"loss": 0.4157,
"step": 172
},
{
"epoch": 1.2553191489361701,
"grad_norm": 0.04228688403964043,
"learning_rate": 7.199504706579617e-06,
"loss": 0.482,
"step": 173
},
{
"epoch": 1.262613981762918,
"grad_norm": 0.037325162440538406,
"learning_rate": 7.161196514973735e-06,
"loss": 0.4224,
"step": 174
},
{
"epoch": 1.2699088145896655,
"grad_norm": 0.047044239938259125,
"learning_rate": 7.122731670392381e-06,
"loss": 0.4249,
"step": 175
},
{
"epoch": 1.2772036474164135,
"grad_norm": 0.04322784021496773,
"learning_rate": 7.0841129609357165e-06,
"loss": 0.4051,
"step": 176
},
{
"epoch": 1.2844984802431612,
"grad_norm": 0.041998326778411865,
"learning_rate": 7.045343185856701e-06,
"loss": 0.4106,
"step": 177
},
{
"epoch": 1.2917933130699089,
"grad_norm": 0.040727648884058,
"learning_rate": 7.006425155358195e-06,
"loss": 0.4427,
"step": 178
},
{
"epoch": 1.2990881458966566,
"grad_norm": 0.04059009999036789,
"learning_rate": 6.967361690389258e-06,
"loss": 0.437,
"step": 179
},
{
"epoch": 1.3063829787234043,
"grad_norm": 0.042023915797472,
"learning_rate": 6.92815562244068e-06,
"loss": 0.4315,
"step": 180
},
{
"epoch": 1.313677811550152,
"grad_norm": 0.04910752549767494,
"learning_rate": 6.888809793339729e-06,
"loss": 0.4436,
"step": 181
},
{
"epoch": 1.3209726443768997,
"grad_norm": 0.04180140420794487,
"learning_rate": 6.849327055044182e-06,
"loss": 0.3948,
"step": 182
},
{
"epoch": 1.3282674772036474,
"grad_norm": 0.03989269211888313,
"learning_rate": 6.80971026943559e-06,
"loss": 0.3929,
"step": 183
},
{
"epoch": 1.335562310030395,
"grad_norm": 0.04497074335813522,
"learning_rate": 6.769962308111839e-06,
"loss": 0.4429,
"step": 184
},
{
"epoch": 1.342857142857143,
"grad_norm": 0.04516409710049629,
"learning_rate": 6.7300860521790034e-06,
"loss": 0.4363,
"step": 185
},
{
"epoch": 1.3501519756838904,
"grad_norm": 0.041362229734659195,
"learning_rate": 6.690084392042514e-06,
"loss": 0.4058,
"step": 186
},
{
"epoch": 1.3574468085106384,
"grad_norm": 0.04281953349709511,
"learning_rate": 6.649960227197648e-06,
"loss": 0.423,
"step": 187
},
{
"epoch": 1.364741641337386,
"grad_norm": 0.046076931059360504,
"learning_rate": 6.609716466019356e-06,
"loss": 0.4427,
"step": 188
},
{
"epoch": 1.3720364741641338,
"grad_norm": 0.03960058465600014,
"learning_rate": 6.569356025551454e-06,
"loss": 0.4193,
"step": 189
},
{
"epoch": 1.3793313069908815,
"grad_norm": 0.044169649481773376,
"learning_rate": 6.5288818312951886e-06,
"loss": 0.4034,
"step": 190
},
{
"epoch": 1.3866261398176292,
"grad_norm": 0.04062066227197647,
"learning_rate": 6.4882968169971734e-06,
"loss": 0.4018,
"step": 191
},
{
"epoch": 1.3939209726443769,
"grad_norm": 0.04406093806028366,
"learning_rate": 6.447603924436744e-06,
"loss": 0.4498,
"step": 192
},
{
"epoch": 1.4012158054711246,
"grad_norm": 0.04197722300887108,
"learning_rate": 6.406806103212725e-06,
"loss": 0.4356,
"step": 193
},
{
"epoch": 1.4085106382978723,
"grad_norm": 0.04061530530452728,
"learning_rate": 6.365906310529631e-06,
"loss": 0.4441,
"step": 194
},
{
"epoch": 1.41580547112462,
"grad_norm": 0.046513479202985764,
"learning_rate": 6.32490751098331e-06,
"loss": 0.4166,
"step": 195
},
{
"epoch": 1.4231003039513679,
"grad_norm": 0.03948912024497986,
"learning_rate": 6.2838126763460635e-06,
"loss": 0.4478,
"step": 196
},
{
"epoch": 1.4303951367781156,
"grad_norm": 0.04548676684498787,
"learning_rate": 6.2426247853512355e-06,
"loss": 0.4653,
"step": 197
},
{
"epoch": 1.4376899696048633,
"grad_norm": 0.041050177067518234,
"learning_rate": 6.2013468234773034e-06,
"loss": 0.3953,
"step": 198
},
{
"epoch": 1.444984802431611,
"grad_norm": 0.03936685994267464,
"learning_rate": 6.1599817827314744e-06,
"loss": 0.4256,
"step": 199
},
{
"epoch": 1.4522796352583587,
"grad_norm": 0.041237395256757736,
"learning_rate": 6.118532661432812e-06,
"loss": 0.3892,
"step": 200
},
{
"epoch": 1.4595744680851064,
"grad_norm": 0.043174393475055695,
"learning_rate": 6.077002463994908e-06,
"loss": 0.4174,
"step": 201
},
{
"epoch": 1.466869300911854,
"grad_norm": 0.04198073223233223,
"learning_rate": 6.035394200708104e-06,
"loss": 0.4278,
"step": 202
},
{
"epoch": 1.4741641337386018,
"grad_norm": 0.045515723526477814,
"learning_rate": 5.993710887521302e-06,
"loss": 0.4346,
"step": 203
},
{
"epoch": 1.4814589665653495,
"grad_norm": 0.04443354532122612,
"learning_rate": 5.951955545823342e-06,
"loss": 0.4116,
"step": 204
},
{
"epoch": 1.4887537993920974,
"grad_norm": 0.04223044961690903,
"learning_rate": 5.910131202224011e-06,
"loss": 0.3844,
"step": 205
},
{
"epoch": 1.4960486322188449,
"grad_norm": 0.04305846244096756,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.4414,
"step": 206
},
{
"epoch": 1.5033434650455928,
"grad_norm": 0.04148327186703682,
"learning_rate": 5.826287640548425e-06,
"loss": 0.4327,
"step": 207
},
{
"epoch": 1.5106382978723403,
"grad_norm": 0.0433870293200016,
"learning_rate": 5.784274499820214e-06,
"loss": 0.3787,
"step": 208
},
{
"epoch": 1.5179331306990882,
"grad_norm": 0.041102319955825806,
"learning_rate": 5.742204511446203e-06,
"loss": 0.4189,
"step": 209
},
{
"epoch": 1.525227963525836,
"grad_norm": 0.04175707325339317,
"learning_rate": 5.7000807248431466e-06,
"loss": 0.427,
"step": 210
},
{
"epoch": 1.5325227963525836,
"grad_norm": 0.04286907613277435,
"learning_rate": 5.657906193327325e-06,
"loss": 0.4,
"step": 211
},
{
"epoch": 1.5398176291793313,
"grad_norm": 0.04246861860156059,
"learning_rate": 5.615683973893235e-06,
"loss": 0.4097,
"step": 212
},
{
"epoch": 1.547112462006079,
"grad_norm": 0.03898885101079941,
"learning_rate": 5.573417126992004e-06,
"loss": 0.4237,
"step": 213
},
{
"epoch": 1.554407294832827,
"grad_norm": 0.04554813727736473,
"learning_rate": 5.5311087163095475e-06,
"loss": 0.436,
"step": 214
},
{
"epoch": 1.5617021276595744,
"grad_norm": 0.04189833253622055,
"learning_rate": 5.4887618085445094e-06,
"loss": 0.4121,
"step": 215
},
{
"epoch": 1.5689969604863223,
"grad_norm": 0.05306672677397728,
"learning_rate": 5.446379473185972e-06,
"loss": 0.4015,
"step": 216
},
{
"epoch": 1.5762917933130698,
"grad_norm": 0.04060041531920433,
"learning_rate": 5.403964782290962e-06,
"loss": 0.3967,
"step": 217
},
{
"epoch": 1.5835866261398177,
"grad_norm": 0.045451849699020386,
"learning_rate": 5.361520810261779e-06,
"loss": 0.4161,
"step": 218
},
{
"epoch": 1.5908814589665652,
"grad_norm": 0.043955542147159576,
"learning_rate": 5.319050633623141e-06,
"loss": 0.4205,
"step": 219
},
{
"epoch": 1.598176291793313,
"grad_norm": 0.040733452886343,
"learning_rate": 5.276557330799203e-06,
"loss": 0.4165,
"step": 220
},
{
"epoch": 1.6054711246200608,
"grad_norm": 0.04190356284379959,
"learning_rate": 5.234043981890395e-06,
"loss": 0.4515,
"step": 221
},
{
"epoch": 1.6127659574468085,
"grad_norm": 0.037713076919317245,
"learning_rate": 5.191513668450178e-06,
"loss": 0.4131,
"step": 222
},
{
"epoch": 1.6200607902735562,
"grad_norm": 0.038250233978033066,
"learning_rate": 5.1489694732616805e-06,
"loss": 0.4028,
"step": 223
},
{
"epoch": 1.627355623100304,
"grad_norm": 0.039751507341861725,
"learning_rate": 5.106414480114238e-06,
"loss": 0.4121,
"step": 224
},
{
"epoch": 1.6346504559270518,
"grad_norm": 0.044864848256111145,
"learning_rate": 5.06385177357987e-06,
"loss": 0.4708,
"step": 225
},
{
"epoch": 1.6419452887537993,
"grad_norm": 0.04169140383601189,
"learning_rate": 5.021284438789694e-06,
"loss": 0.425,
"step": 226
},
{
"epoch": 1.6492401215805472,
"grad_norm": 0.04238287732005119,
"learning_rate": 4.9787155612103076e-06,
"loss": 0.409,
"step": 227
},
{
"epoch": 1.6565349544072947,
"grad_norm": 0.03984750807285309,
"learning_rate": 4.936148226420133e-06,
"loss": 0.4451,
"step": 228
},
{
"epoch": 1.6638297872340426,
"grad_norm": 0.03823258727788925,
"learning_rate": 4.893585519885764e-06,
"loss": 0.4318,
"step": 229
},
{
"epoch": 1.6711246200607903,
"grad_norm": 0.043166667222976685,
"learning_rate": 4.851030526738321e-06,
"loss": 0.4348,
"step": 230
},
{
"epoch": 1.678419452887538,
"grad_norm": 0.04118693992495537,
"learning_rate": 4.808486331549824e-06,
"loss": 0.435,
"step": 231
},
{
"epoch": 1.6857142857142857,
"grad_norm": 0.040095556527376175,
"learning_rate": 4.765956018109607e-06,
"loss": 0.4506,
"step": 232
},
{
"epoch": 1.6930091185410334,
"grad_norm": 0.04523642733693123,
"learning_rate": 4.7234426692007985e-06,
"loss": 0.4394,
"step": 233
},
{
"epoch": 1.7003039513677811,
"grad_norm": 0.041244085878133774,
"learning_rate": 4.680949366376858e-06,
"loss": 0.4698,
"step": 234
},
{
"epoch": 1.7075987841945288,
"grad_norm": 0.04374610632658005,
"learning_rate": 4.638479189738224e-06,
"loss": 0.4129,
"step": 235
},
{
"epoch": 1.7148936170212767,
"grad_norm": 0.040487710386514664,
"learning_rate": 4.596035217709039e-06,
"loss": 0.4362,
"step": 236
},
{
"epoch": 1.7221884498480242,
"grad_norm": 0.044370926916599274,
"learning_rate": 4.553620526814029e-06,
"loss": 0.4155,
"step": 237
},
{
"epoch": 1.7294832826747721,
"grad_norm": 0.04036295786499977,
"learning_rate": 4.511238191455491e-06,
"loss": 0.4214,
"step": 238
},
{
"epoch": 1.7367781155015196,
"grad_norm": 0.03773313760757446,
"learning_rate": 4.468891283690454e-06,
"loss": 0.4298,
"step": 239
},
{
"epoch": 1.7440729483282675,
"grad_norm": 0.045683182775974274,
"learning_rate": 4.426582873007999e-06,
"loss": 0.4485,
"step": 240
},
{
"epoch": 1.7513677811550152,
"grad_norm": 0.04686903581023216,
"learning_rate": 4.384316026106766e-06,
"loss": 0.4303,
"step": 241
},
{
"epoch": 1.758662613981763,
"grad_norm": 0.045155324041843414,
"learning_rate": 4.342093806672678e-06,
"loss": 0.4409,
"step": 242
},
{
"epoch": 1.7659574468085106,
"grad_norm": 0.0418829619884491,
"learning_rate": 4.299919275156857e-06,
"loss": 0.4149,
"step": 243
},
{
"epoch": 1.7732522796352583,
"grad_norm": 0.041985101997852325,
"learning_rate": 4.2577954885537985e-06,
"loss": 0.4293,
"step": 244
},
{
"epoch": 1.780547112462006,
"grad_norm": 0.042692556977272034,
"learning_rate": 4.215725500179788e-06,
"loss": 0.4258,
"step": 245
},
{
"epoch": 1.7878419452887537,
"grad_norm": 0.04013342410326004,
"learning_rate": 4.173712359451576e-06,
"loss": 0.4015,
"step": 246
},
{
"epoch": 1.7951367781155017,
"grad_norm": 0.038998380303382874,
"learning_rate": 4.131759111665349e-06,
"loss": 0.4596,
"step": 247
},
{
"epoch": 1.8024316109422491,
"grad_norm": 0.039829254150390625,
"learning_rate": 4.0898687977759895e-06,
"loss": 0.4128,
"step": 248
},
{
"epoch": 1.809726443768997,
"grad_norm": 0.04312862455844879,
"learning_rate": 4.048044454176658e-06,
"loss": 0.4243,
"step": 249
},
{
"epoch": 1.8170212765957445,
"grad_norm": 0.04533419758081436,
"learning_rate": 4.0062891124787e-06,
"loss": 0.4414,
"step": 250
},
{
"epoch": 1.8243161094224924,
"grad_norm": 0.0438460148870945,
"learning_rate": 3.964605799291897e-06,
"loss": 0.4553,
"step": 251
},
{
"epoch": 1.8316109422492401,
"grad_norm": 0.0429726168513298,
"learning_rate": 3.922997536005094e-06,
"loss": 0.4311,
"step": 252
},
{
"epoch": 1.8389057750759878,
"grad_norm": 0.039493922144174576,
"learning_rate": 3.88146733856719e-06,
"loss": 0.4387,
"step": 253
},
{
"epoch": 1.8462006079027355,
"grad_norm": 0.04514075070619583,
"learning_rate": 3.840018217268527e-06,
"loss": 0.4442,
"step": 254
},
{
"epoch": 1.8534954407294832,
"grad_norm": 0.04080420732498169,
"learning_rate": 3.7986531765226965e-06,
"loss": 0.3884,
"step": 255
},
{
"epoch": 1.8607902735562312,
"grad_norm": 0.04457089304924011,
"learning_rate": 3.757375214648764e-06,
"loss": 0.3804,
"step": 256
},
{
"epoch": 1.8680851063829786,
"grad_norm": 0.044411323964595795,
"learning_rate": 3.716187323653939e-06,
"loss": 0.4164,
"step": 257
},
{
"epoch": 1.8753799392097266,
"grad_norm": 0.04527450352907181,
"learning_rate": 3.675092489016693e-06,
"loss": 0.4402,
"step": 258
},
{
"epoch": 1.882674772036474,
"grad_norm": 0.039380993694067,
"learning_rate": 3.6340936894703717e-06,
"loss": 0.4329,
"step": 259
},
{
"epoch": 1.889969604863222,
"grad_norm": 0.04769477993249893,
"learning_rate": 3.593193896787277e-06,
"loss": 0.4447,
"step": 260
},
{
"epoch": 1.8972644376899694,
"grad_norm": 0.04017976298928261,
"learning_rate": 3.5523960755632573e-06,
"loss": 0.4066,
"step": 261
},
{
"epoch": 1.9045592705167174,
"grad_norm": 0.04179855436086655,
"learning_rate": 3.5117031830028274e-06,
"loss": 0.4048,
"step": 262
},
{
"epoch": 1.911854103343465,
"grad_norm": 0.041397638618946075,
"learning_rate": 3.4711181687048114e-06,
"loss": 0.4296,
"step": 263
},
{
"epoch": 1.9191489361702128,
"grad_norm": 0.05517794191837311,
"learning_rate": 3.4306439744485453e-06,
"loss": 0.4266,
"step": 264
},
{
"epoch": 1.9264437689969605,
"grad_norm": 0.04322275519371033,
"learning_rate": 3.3902835339806463e-06,
"loss": 0.4084,
"step": 265
},
{
"epoch": 1.9337386018237082,
"grad_norm": 0.04079868271946907,
"learning_rate": 3.3500397728023536e-06,
"loss": 0.3987,
"step": 266
},
{
"epoch": 1.941033434650456,
"grad_norm": 0.040178705006837845,
"learning_rate": 3.309915607957487e-06,
"loss": 0.3508,
"step": 267
},
{
"epoch": 1.9483282674772036,
"grad_norm": 0.042311254888772964,
"learning_rate": 3.2699139478209987e-06,
"loss": 0.4315,
"step": 268
},
{
"epoch": 1.9556231003039515,
"grad_norm": 0.041265442967414856,
"learning_rate": 3.2300376918881628e-06,
"loss": 0.4096,
"step": 269
},
{
"epoch": 1.962917933130699,
"grad_norm": 0.035929929465055466,
"learning_rate": 3.19028973056441e-06,
"loss": 0.3872,
"step": 270
},
{
"epoch": 1.9702127659574469,
"grad_norm": 0.04031127318739891,
"learning_rate": 3.150672944955818e-06,
"loss": 0.4299,
"step": 271
},
{
"epoch": 1.9775075987841946,
"grad_norm": 0.043629132211208344,
"learning_rate": 3.111190206660273e-06,
"loss": 0.4371,
"step": 272
},
{
"epoch": 1.9848024316109423,
"grad_norm": 0.03935433551669121,
"learning_rate": 3.0718443775593233e-06,
"loss": 0.3912,
"step": 273
},
{
"epoch": 1.99209726443769,
"grad_norm": 0.04069478437304497,
"learning_rate": 3.0326383096107424e-06,
"loss": 0.416,
"step": 274
},
{
"epoch": 1.9993920972644377,
"grad_norm": 0.05225847661495209,
"learning_rate": 2.993574844641807e-06,
"loss": 0.3656,
"step": 275
},
{
"epoch": 2.0,
"grad_norm": 0.05225847661495209,
"learning_rate": 2.9546568141433007e-06,
"loss": 0.5271,
"step": 276
},
{
"epoch": 2.007294832826748,
"grad_norm": 0.15690822899341583,
"learning_rate": 2.915887039064287e-06,
"loss": 0.3677,
"step": 277
},
{
"epoch": 2.0145896656534954,
"grad_norm": 0.040792327374219894,
"learning_rate": 2.8772683296076197e-06,
"loss": 0.3493,
"step": 278
},
{
"epoch": 2.0218844984802433,
"grad_norm": 0.042940009385347366,
"learning_rate": 2.838803485026265e-06,
"loss": 0.3622,
"step": 279
},
{
"epoch": 2.029179331306991,
"grad_norm": 0.03872222825884819,
"learning_rate": 2.800495293420384e-06,
"loss": 0.3204,
"step": 280
},
{
"epoch": 2.0364741641337387,
"grad_norm": 0.03756758198142052,
"learning_rate": 2.762346531535246e-06,
"loss": 0.3158,
"step": 281
},
{
"epoch": 2.043768996960486,
"grad_norm": 0.04466132074594498,
"learning_rate": 2.724359964559958e-06,
"loss": 0.3638,
"step": 282
},
{
"epoch": 2.051063829787234,
"grad_norm": 0.04124055802822113,
"learning_rate": 2.686538345927027e-06,
"loss": 0.3231,
"step": 283
},
{
"epoch": 2.0583586626139816,
"grad_norm": 0.04917893931269646,
"learning_rate": 2.6488844171127903e-06,
"loss": 0.3683,
"step": 284
},
{
"epoch": 2.0656534954407295,
"grad_norm": 0.04118992015719414,
"learning_rate": 2.611400907438685e-06,
"loss": 0.294,
"step": 285
},
{
"epoch": 2.072948328267477,
"grad_norm": 0.04872892051935196,
"learning_rate": 2.574090533873431e-06,
"loss": 0.3156,
"step": 286
},
{
"epoch": 2.080243161094225,
"grad_norm": 0.04159025847911835,
"learning_rate": 2.5369560008360826e-06,
"loss": 0.3467,
"step": 287
},
{
"epoch": 2.087537993920973,
"grad_norm": 0.04014930874109268,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.3303,
"step": 288
},
{
"epoch": 2.0948328267477203,
"grad_norm": 0.04120990261435509,
"learning_rate": 2.4632252100977567e-06,
"loss": 0.3086,
"step": 289
},
{
"epoch": 2.1021276595744682,
"grad_norm": 0.04079623147845268,
"learning_rate": 2.426634296726955e-06,
"loss": 0.3364,
"step": 290
},
{
"epoch": 2.1094224924012157,
"grad_norm": 0.04097681865096092,
"learning_rate": 2.3902299121570332e-06,
"loss": 0.3254,
"step": 291
},
{
"epoch": 2.1167173252279636,
"grad_norm": 0.04335152730345726,
"learning_rate": 2.354014695136997e-06,
"loss": 0.2875,
"step": 292
},
{
"epoch": 2.124012158054711,
"grad_norm": 0.041609250009059906,
"learning_rate": 2.317991270704167e-06,
"loss": 0.3585,
"step": 293
},
{
"epoch": 2.131306990881459,
"grad_norm": 0.03962058201432228,
"learning_rate": 2.282162249993895e-06,
"loss": 0.3094,
"step": 294
},
{
"epoch": 2.1386018237082065,
"grad_norm": 0.041423097252845764,
"learning_rate": 2.2465302300503012e-06,
"loss": 0.358,
"step": 295
},
{
"epoch": 2.1458966565349544,
"grad_norm": 0.044281214475631714,
"learning_rate": 2.211097793638029e-06,
"loss": 0.3575,
"step": 296
},
{
"epoch": 2.153191489361702,
"grad_norm": 0.04825511574745178,
"learning_rate": 2.175867509055033e-06,
"loss": 0.3364,
"step": 297
},
{
"epoch": 2.16048632218845,
"grad_norm": 0.04354681074619293,
"learning_rate": 2.1408419299464245e-06,
"loss": 0.2979,
"step": 298
},
{
"epoch": 2.1677811550151977,
"grad_norm": 0.042697008699178696,
"learning_rate": 2.106023595119358e-06,
"loss": 0.3356,
"step": 299
},
{
"epoch": 2.1750759878419452,
"grad_norm": 0.047276780009269714,
"learning_rate": 2.071415028359026e-06,
"loss": 0.3634,
"step": 300
},
{
"epoch": 2.182370820668693,
"grad_norm": 0.041072778403759,
"learning_rate": 2.037018738245707e-06,
"loss": 0.3362,
"step": 301
},
{
"epoch": 2.1896656534954406,
"grad_norm": 0.04692791774868965,
"learning_rate": 2.0028372179729405e-06,
"loss": 0.3511,
"step": 302
},
{
"epoch": 2.1969604863221885,
"grad_norm": 0.04538114741444588,
"learning_rate": 1.9688729451668116e-06,
"loss": 0.3336,
"step": 303
},
{
"epoch": 2.204255319148936,
"grad_norm": 0.044003862887620926,
"learning_rate": 1.935128381706355e-06,
"loss": 0.3349,
"step": 304
},
{
"epoch": 2.211550151975684,
"grad_norm": 0.045857448130846024,
"learning_rate": 1.901605973545116e-06,
"loss": 0.3537,
"step": 305
},
{
"epoch": 2.2188449848024314,
"grad_norm": 0.04272821545600891,
"learning_rate": 1.8683081505338468e-06,
"loss": 0.3373,
"step": 306
},
{
"epoch": 2.2261398176291793,
"grad_norm": 0.04273563250899315,
"learning_rate": 1.8352373262443918e-06,
"loss": 0.3436,
"step": 307
},
{
"epoch": 2.2334346504559273,
"grad_norm": 0.058361783623695374,
"learning_rate": 1.8023958977947303e-06,
"loss": 0.3245,
"step": 308
},
{
"epoch": 2.2407294832826747,
"grad_norm": 0.04002346843481064,
"learning_rate": 1.7697862456752273e-06,
"loss": 0.3317,
"step": 309
},
{
"epoch": 2.2480243161094227,
"grad_norm": 0.039896223694086075,
"learning_rate": 1.7374107335760937e-06,
"loss": 0.2976,
"step": 310
},
{
"epoch": 2.25531914893617,
"grad_norm": 0.04140615463256836,
"learning_rate": 1.7052717082160348e-06,
"loss": 0.3178,
"step": 311
},
{
"epoch": 2.262613981762918,
"grad_norm": 0.04329473525285721,
"learning_rate": 1.6733714991721738e-06,
"loss": 0.3187,
"step": 312
},
{
"epoch": 2.2699088145896655,
"grad_norm": 0.04440492019057274,
"learning_rate": 1.6417124187111778e-06,
"loss": 0.3194,
"step": 313
},
{
"epoch": 2.2772036474164135,
"grad_norm": 0.03996637463569641,
"learning_rate": 1.610296761621662e-06,
"loss": 0.3504,
"step": 314
},
{
"epoch": 2.284498480243161,
"grad_norm": 0.04870344325900078,
"learning_rate": 1.5791268050478487e-06,
"loss": 0.3599,
"step": 315
},
{
"epoch": 2.291793313069909,
"grad_norm": 0.049775756895542145,
"learning_rate": 1.5482048083245116e-06,
"loss": 0.3051,
"step": 316
},
{
"epoch": 2.2990881458966568,
"grad_norm": 0.04166199639439583,
"learning_rate": 1.517533012813217e-06,
"loss": 0.3383,
"step": 317
},
{
"epoch": 2.3063829787234043,
"grad_norm": 0.04483890160918236,
"learning_rate": 1.4871136417398407e-06,
"loss": 0.3302,
"step": 318
},
{
"epoch": 2.3136778115501517,
"grad_norm": 0.04276059940457344,
"learning_rate": 1.4569489000334435e-06,
"loss": 0.3615,
"step": 319
},
{
"epoch": 2.3209726443768997,
"grad_norm": 0.04011907801032066,
"learning_rate": 1.427040974166427e-06,
"loss": 0.3139,
"step": 320
},
{
"epoch": 2.3282674772036476,
"grad_norm": 0.043183084577322006,
"learning_rate": 1.3973920319960654e-06,
"loss": 0.3327,
"step": 321
},
{
"epoch": 2.335562310030395,
"grad_norm": 0.045110031962394714,
"learning_rate": 1.3680042226073554e-06,
"loss": 0.3183,
"step": 322
},
{
"epoch": 2.342857142857143,
"grad_norm": 0.04653245955705643,
"learning_rate": 1.3388796761572493e-06,
"loss": 0.3411,
"step": 323
},
{
"epoch": 2.3501519756838904,
"grad_norm": 0.04192928597331047,
"learning_rate": 1.310020503720254e-06,
"loss": 0.3722,
"step": 324
},
{
"epoch": 2.3574468085106384,
"grad_norm": 0.04330296441912651,
"learning_rate": 1.2814287971354023e-06,
"loss": 0.325,
"step": 325
},
{
"epoch": 2.364741641337386,
"grad_norm": 0.04404173046350479,
"learning_rate": 1.253106628854635e-06,
"loss": 0.3247,
"step": 326
},
{
"epoch": 2.3720364741641338,
"grad_norm": 0.04104992374777794,
"learning_rate": 1.2250560517925747e-06,
"loss": 0.3079,
"step": 327
},
{
"epoch": 2.3793313069908812,
"grad_norm": 0.04262121394276619,
"learning_rate": 1.197279099177731e-06,
"loss": 0.3446,
"step": 328
},
{
"epoch": 2.386626139817629,
"grad_norm": 0.04929178208112717,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.3501,
"step": 329
},
{
"epoch": 2.393920972644377,
"grad_norm": 0.04329733923077583,
"learning_rate": 1.1425541008902852e-06,
"loss": 0.3213,
"step": 330
},
{
"epoch": 2.4012158054711246,
"grad_norm": 0.04333839192986488,
"learning_rate": 1.1156100219249022e-06,
"loss": 0.3232,
"step": 331
},
{
"epoch": 2.4085106382978725,
"grad_norm": 0.04259442910552025,
"learning_rate": 1.0889475005336447e-06,
"loss": 0.3632,
"step": 332
},
{
"epoch": 2.41580547112462,
"grad_norm": 0.04376016557216644,
"learning_rate": 1.0625684693326727e-06,
"loss": 0.3355,
"step": 333
},
{
"epoch": 2.423100303951368,
"grad_norm": 0.04070465639233589,
"learning_rate": 1.0364748403895368e-06,
"loss": 0.341,
"step": 334
},
{
"epoch": 2.4303951367781154,
"grad_norm": 0.041908472776412964,
"learning_rate": 1.0106685050845838e-06,
"loss": 0.3383,
"step": 335
},
{
"epoch": 2.4376899696048633,
"grad_norm": 0.04458033666014671,
"learning_rate": 9.851513339738627e-07,
"loss": 0.3246,
"step": 336
},
{
"epoch": 2.4449848024316108,
"grad_norm": 0.04246847331523895,
"learning_rate": 9.599251766535344e-07,
"loss": 0.3418,
"step": 337
},
{
"epoch": 2.4522796352583587,
"grad_norm": 0.04456906393170357,
"learning_rate": 9.349918616258113e-07,
"loss": 0.3536,
"step": 338
},
{
"epoch": 2.4595744680851066,
"grad_norm": 0.041079938411712646,
"learning_rate": 9.10353196166412e-07,
"loss": 0.3278,
"step": 339
},
{
"epoch": 2.466869300911854,
"grad_norm": 0.049959778785705566,
"learning_rate": 8.860109661935673e-07,
"loss": 0.3417,
"step": 340
},
{
"epoch": 2.474164133738602,
"grad_norm": 0.044310178607702255,
"learning_rate": 8.619669361385663e-07,
"loss": 0.3148,
"step": 341
},
{
"epoch": 2.4814589665653495,
"grad_norm": 0.04187872260808945,
"learning_rate": 8.382228488178639e-07,
"loss": 0.3392,
"step": 342
},
{
"epoch": 2.4887537993920974,
"grad_norm": 0.04105791822075844,
"learning_rate": 8.147804253067581e-07,
"loss": 0.3273,
"step": 343
},
{
"epoch": 2.496048632218845,
"grad_norm": 0.039138007909059525,
"learning_rate": 7.916413648146282e-07,
"loss": 0.3112,
"step": 344
},
{
"epoch": 2.503343465045593,
"grad_norm": 0.04459141194820404,
"learning_rate": 7.6880734456178e-07,
"loss": 0.3463,
"step": 345
},
{
"epoch": 2.5106382978723403,
"grad_norm": 0.043715398758649826,
"learning_rate": 7.462800196578662e-07,
"loss": 0.3439,
"step": 346
},
{
"epoch": 2.517933130699088,
"grad_norm": 0.043532464653253555,
"learning_rate": 7.240610229819195e-07,
"loss": 0.3303,
"step": 347
},
{
"epoch": 2.525227963525836,
"grad_norm": 0.04273553937673569,
"learning_rate": 7.021519650639952e-07,
"loss": 0.3171,
"step": 348
},
{
"epoch": 2.5325227963525836,
"grad_norm": 0.05441723391413689,
"learning_rate": 6.805544339684295e-07,
"loss": 0.3239,
"step": 349
},
{
"epoch": 2.539817629179331,
"grad_norm": 0.04585114121437073,
"learning_rate": 6.592699951787362e-07,
"loss": 0.3378,
"step": 350
},
{
"epoch": 2.547112462006079,
"grad_norm": 0.04242338612675667,
"learning_rate": 6.383001914841252e-07,
"loss": 0.2992,
"step": 351
},
{
"epoch": 2.554407294832827,
"grad_norm": 0.046155836433172226,
"learning_rate": 6.17646542867682e-07,
"loss": 0.3503,
"step": 352
},
{
"epoch": 2.5617021276595744,
"grad_norm": 0.04374154284596443,
"learning_rate": 5.973105463961864e-07,
"loss": 0.3385,
"step": 353
},
{
"epoch": 2.5689969604863223,
"grad_norm": 0.04297053441405296,
"learning_rate": 5.772936761116027e-07,
"loss": 0.3396,
"step": 354
},
{
"epoch": 2.57629179331307,
"grad_norm": 0.04214682802557945,
"learning_rate": 5.575973829242365e-07,
"loss": 0.3373,
"step": 355
},
{
"epoch": 2.5835866261398177,
"grad_norm": 0.04097369685769081,
"learning_rate": 5.382230945075556e-07,
"loss": 0.3386,
"step": 356
},
{
"epoch": 2.590881458966565,
"grad_norm": 0.042690787464380264,
"learning_rate": 5.191722151947227e-07,
"loss": 0.3319,
"step": 357
},
{
"epoch": 2.598176291793313,
"grad_norm": 0.040518004447221756,
"learning_rate": 5.004461258767873e-07,
"loss": 0.3174,
"step": 358
},
{
"epoch": 2.6054711246200606,
"grad_norm": 0.04100370407104492,
"learning_rate": 4.820461839026047e-07,
"loss": 0.34,
"step": 359
},
{
"epoch": 2.6127659574468085,
"grad_norm": 0.04036758467555046,
"learning_rate": 4.639737229804403e-07,
"loss": 0.3351,
"step": 360
},
{
"epoch": 2.6200607902735564,
"grad_norm": 0.04206588491797447,
"learning_rate": 4.4623005308130243e-07,
"loss": 0.3244,
"step": 361
},
{
"epoch": 2.627355623100304,
"grad_norm": 0.04280061274766922,
"learning_rate": 4.2881646034398926e-07,
"loss": 0.3065,
"step": 362
},
{
"epoch": 2.634650455927052,
"grad_norm": 0.04229553043842316,
"learning_rate": 4.1173420698186027e-07,
"loss": 0.3306,
"step": 363
},
{
"epoch": 2.6419452887537993,
"grad_norm": 0.044544368982315063,
"learning_rate": 3.9498453119134917e-07,
"loss": 0.3514,
"step": 364
},
{
"epoch": 2.6492401215805472,
"grad_norm": 0.045995116233825684,
"learning_rate": 3.7856864706221187e-07,
"loss": 0.3498,
"step": 365
},
{
"epoch": 2.6565349544072947,
"grad_norm": 0.048596058040857315,
"learning_rate": 3.6248774448952695e-07,
"loss": 0.3358,
"step": 366
},
{
"epoch": 2.6638297872340426,
"grad_norm": 0.04591159150004387,
"learning_rate": 3.467429890874424e-07,
"loss": 0.3129,
"step": 367
},
{
"epoch": 2.67112462006079,
"grad_norm": 0.041663773357868195,
"learning_rate": 3.313355221046888e-07,
"loss": 0.3213,
"step": 368
},
{
"epoch": 2.678419452887538,
"grad_norm": 0.04105694591999054,
"learning_rate": 3.1626646034186084e-07,
"loss": 0.345,
"step": 369
},
{
"epoch": 2.685714285714286,
"grad_norm": 0.044980090111494064,
"learning_rate": 3.015368960704584e-07,
"loss": 0.3265,
"step": 370
},
{
"epoch": 2.6930091185410334,
"grad_norm": 0.04313720017671585,
"learning_rate": 2.871478969537206e-07,
"loss": 0.3434,
"step": 371
},
{
"epoch": 2.700303951367781,
"grad_norm": 0.04387833923101425,
"learning_rate": 2.7310050596923323e-07,
"loss": 0.33,
"step": 372
},
{
"epoch": 2.707598784194529,
"grad_norm": 0.04352449253201485,
"learning_rate": 2.593957413333331e-07,
"loss": 0.3315,
"step": 373
},
{
"epoch": 2.7148936170212767,
"grad_norm": 0.04267902672290802,
"learning_rate": 2.4603459642729867e-07,
"loss": 0.3574,
"step": 374
},
{
"epoch": 2.722188449848024,
"grad_norm": 0.046177759766578674,
"learning_rate": 2.330180397253473e-07,
"loss": 0.3428,
"step": 375
},
{
"epoch": 2.729483282674772,
"grad_norm": 0.0405619777739048,
"learning_rate": 2.2034701472443854e-07,
"loss": 0.2903,
"step": 376
},
{
"epoch": 2.7367781155015196,
"grad_norm": 0.04294833540916443,
"learning_rate": 2.0802243987588068e-07,
"loss": 0.3664,
"step": 377
},
{
"epoch": 2.7440729483282675,
"grad_norm": 0.048284079879522324,
"learning_rate": 1.9604520851876196e-07,
"loss": 0.3294,
"step": 378
},
{
"epoch": 2.7513677811550155,
"grad_norm": 0.04744973033666611,
"learning_rate": 1.8441618881519186e-07,
"loss": 0.321,
"step": 379
},
{
"epoch": 2.758662613981763,
"grad_norm": 0.03917940333485603,
"learning_rate": 1.7313622368738014e-07,
"loss": 0.307,
"step": 380
},
{
"epoch": 2.7659574468085104,
"grad_norm": 0.04587104544043541,
"learning_rate": 1.6220613075653201e-07,
"loss": 0.3464,
"step": 381
},
{
"epoch": 2.7732522796352583,
"grad_norm": 0.042470064014196396,
"learning_rate": 1.51626702283586e-07,
"loss": 0.2907,
"step": 382
},
{
"epoch": 2.7805471124620063,
"grad_norm": 0.048345521092414856,
"learning_rate": 1.4139870511178767e-07,
"loss": 0.3481,
"step": 383
},
{
"epoch": 2.7878419452887537,
"grad_norm": 0.038930460810661316,
"learning_rate": 1.3152288061110518e-07,
"loss": 0.2987,
"step": 384
},
{
"epoch": 2.7951367781155017,
"grad_norm": 0.04472014680504799,
"learning_rate": 1.2199994462448906e-07,
"loss": 0.3612,
"step": 385
},
{
"epoch": 2.802431610942249,
"grad_norm": 0.05124653875827789,
"learning_rate": 1.1283058741598962e-07,
"loss": 0.3051,
"step": 386
},
{
"epoch": 2.809726443768997,
"grad_norm": 0.038610782474279404,
"learning_rate": 1.0401547362071939e-07,
"loss": 0.3362,
"step": 387
},
{
"epoch": 2.8170212765957445,
"grad_norm": 0.042344819754362106,
"learning_rate": 9.555524219667989e-08,
"loss": 0.3206,
"step": 388
},
{
"epoch": 2.8243161094224924,
"grad_norm": 0.046877894550561905,
"learning_rate": 8.745050637844532e-08,
"loss": 0.3332,
"step": 389
},
{
"epoch": 2.83161094224924,
"grad_norm": 0.04104023799300194,
"learning_rate": 7.970185363271432e-08,
"loss": 0.2941,
"step": 390
},
{
"epoch": 2.838905775075988,
"grad_norm": 0.04339218884706497,
"learning_rate": 7.230984561572729e-08,
"loss": 0.3409,
"step": 391
},
{
"epoch": 2.8462006079027358,
"grad_norm": 0.047086864709854126,
"learning_rate": 6.527501813255344e-08,
"loss": 0.3282,
"step": 392
},
{
"epoch": 2.8534954407294832,
"grad_norm": 0.04140612855553627,
"learning_rate": 5.8597881098257924e-08,
"loss": 0.3706,
"step": 393
},
{
"epoch": 2.860790273556231,
"grad_norm": 0.045086655765771866,
"learning_rate": 5.227891850093314e-08,
"loss": 0.3159,
"step": 394
},
{
"epoch": 2.8680851063829786,
"grad_norm": 0.04669662564992905,
"learning_rate": 4.631858836662562e-08,
"loss": 0.3212,
"step": 395
},
{
"epoch": 2.8753799392097266,
"grad_norm": 0.048425789922475815,
"learning_rate": 4.071732272613149e-08,
"loss": 0.3781,
"step": 396
},
{
"epoch": 2.882674772036474,
"grad_norm": 0.0414654016494751,
"learning_rate": 3.5475527583681005e-08,
"loss": 0.3487,
"step": 397
},
{
"epoch": 2.889969604863222,
"grad_norm": 0.04240646958351135,
"learning_rate": 3.059358288751202e-08,
"loss": 0.3063,
"step": 398
},
{
"epoch": 2.8972644376899694,
"grad_norm": 0.047270409762859344,
"learning_rate": 2.6071842502326526e-08,
"loss": 0.3352,
"step": 399
},
{
"epoch": 2.9045592705167174,
"grad_norm": 0.043454963713884354,
"learning_rate": 2.1910634183644475e-08,
"loss": 0.3442,
"step": 400
},
{
"epoch": 2.9118541033434653,
"grad_norm": 0.045333363115787506,
"learning_rate": 1.811025955404333e-08,
"loss": 0.3196,
"step": 401
},
{
"epoch": 2.9191489361702128,
"grad_norm": 0.04678984358906746,
"learning_rate": 1.4670994081297796e-08,
"loss": 0.3319,
"step": 402
},
{
"epoch": 2.9264437689969602,
"grad_norm": 0.05119337886571884,
"learning_rate": 1.159308705841078e-08,
"loss": 0.3614,
"step": 403
},
{
"epoch": 2.933738601823708,
"grad_norm": 0.048367924988269806,
"learning_rate": 8.87676158554507e-09,
"loss": 0.3615,
"step": 404
},
{
"epoch": 2.941033434650456,
"grad_norm": 0.04424307495355606,
"learning_rate": 6.5222145538501595e-09,
"loss": 0.3027,
"step": 405
},
{
"epoch": 2.9483282674772036,
"grad_norm": 0.0484929159283638,
"learning_rate": 4.5296166311931125e-09,
"loss": 0.3259,
"step": 406
},
{
"epoch": 2.9556231003039515,
"grad_norm": 0.04274160414934158,
"learning_rate": 2.899112249786229e-09,
"loss": 0.3219,
"step": 407
},
{
"epoch": 2.962917933130699,
"grad_norm": 0.04609229788184166,
"learning_rate": 1.6308195957182028e-09,
"loss": 0.3074,
"step": 408
},
{
"epoch": 2.970212765957447,
"grad_norm": 0.05541510134935379,
"learning_rate": 7.24830600386528e-10,
"loss": 0.3853,
"step": 409
},
{
"epoch": 2.977507598784195,
"grad_norm": 0.0421581007540226,
"learning_rate": 1.812109338367174e-10,
"loss": 0.3192,
"step": 410
},
{
"epoch": 2.9848024316109423,
"grad_norm": 0.04373352229595184,
"learning_rate": 0.0,
"loss": 0.3766,
"step": 411
},
{
"epoch": 2.9848024316109423,
"step": 411,
"total_flos": 3.260394272163103e+17,
"train_loss": 0.4436677635586175,
"train_runtime": 142049.243,
"train_samples_per_second": 0.139,
"train_steps_per_second": 0.003
}
],
"logging_steps": 1,
"max_steps": 411,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.260394272163103e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}