{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004,
"grad_norm": 8.721762657165527,
"learning_rate": 4.347826086956522e-07,
"loss": 2.3644,
"step": 1
},
{
"epoch": 0.008,
"grad_norm": 8.021437644958496,
"learning_rate": 8.695652173913044e-07,
"loss": 2.3718,
"step": 2
},
{
"epoch": 0.012,
"grad_norm": 9.153054237365723,
"learning_rate": 1.3043478260869566e-06,
"loss": 2.2777,
"step": 3
},
{
"epoch": 0.016,
"grad_norm": 8.955615043640137,
"learning_rate": 1.7391304347826088e-06,
"loss": 2.1531,
"step": 4
},
{
"epoch": 0.02,
"grad_norm": 8.832795143127441,
"learning_rate": 2.173913043478261e-06,
"loss": 2.4567,
"step": 5
},
{
"epoch": 0.024,
"grad_norm": 9.694704055786133,
"learning_rate": 2.6086956521739132e-06,
"loss": 2.36,
"step": 6
},
{
"epoch": 0.028,
"grad_norm": 7.578924179077148,
"learning_rate": 3.043478260869566e-06,
"loss": 2.1872,
"step": 7
},
{
"epoch": 0.032,
"grad_norm": 6.530836582183838,
"learning_rate": 3.4782608695652175e-06,
"loss": 2.1603,
"step": 8
},
{
"epoch": 0.036,
"grad_norm": 6.5283589363098145,
"learning_rate": 3.91304347826087e-06,
"loss": 2.2485,
"step": 9
},
{
"epoch": 0.04,
"grad_norm": 9.349092483520508,
"learning_rate": 4.347826086956522e-06,
"loss": 2.1946,
"step": 10
},
{
"epoch": 0.044,
"grad_norm": 8.240315437316895,
"learning_rate": 4.782608695652174e-06,
"loss": 2.248,
"step": 11
},
{
"epoch": 0.048,
"grad_norm": 6.106252670288086,
"learning_rate": 5.2173913043478265e-06,
"loss": 2.0142,
"step": 12
},
{
"epoch": 0.052,
"grad_norm": 5.051333904266357,
"learning_rate": 5.652173913043479e-06,
"loss": 2.3068,
"step": 13
},
{
"epoch": 0.056,
"grad_norm": 3.2932288646698,
"learning_rate": 6.086956521739132e-06,
"loss": 1.9645,
"step": 14
},
{
"epoch": 0.06,
"grad_norm": 3.7527170181274414,
"learning_rate": 6.521739130434783e-06,
"loss": 1.9759,
"step": 15
},
{
"epoch": 0.064,
"grad_norm": 2.721696138381958,
"learning_rate": 6.956521739130435e-06,
"loss": 1.9302,
"step": 16
},
{
"epoch": 0.068,
"grad_norm": 2.36800479888916,
"learning_rate": 7.391304347826087e-06,
"loss": 2.205,
"step": 17
},
{
"epoch": 0.072,
"grad_norm": 2.5590853691101074,
"learning_rate": 7.82608695652174e-06,
"loss": 2.119,
"step": 18
},
{
"epoch": 0.076,
"grad_norm": 2.4806504249572754,
"learning_rate": 8.260869565217392e-06,
"loss": 2.0396,
"step": 19
},
{
"epoch": 0.08,
"grad_norm": 2.451159954071045,
"learning_rate": 8.695652173913044e-06,
"loss": 1.9451,
"step": 20
},
{
"epoch": 0.084,
"grad_norm": 2.3489480018615723,
"learning_rate": 9.130434782608697e-06,
"loss": 1.9104,
"step": 21
},
{
"epoch": 0.088,
"grad_norm": 2.2822914123535156,
"learning_rate": 9.565217391304349e-06,
"loss": 1.8797,
"step": 22
},
{
"epoch": 0.092,
"grad_norm": 2.1923317909240723,
"learning_rate": 1e-05,
"loss": 1.9217,
"step": 23
},
{
"epoch": 0.096,
"grad_norm": 2.112422227859497,
"learning_rate": 9.999953315763929e-06,
"loss": 2.1099,
"step": 24
},
{
"epoch": 0.1,
"grad_norm": 1.9636746644973755,
"learning_rate": 9.999813263927483e-06,
"loss": 1.9496,
"step": 25
},
{
"epoch": 0.104,
"grad_norm": 1.8546597957611084,
"learning_rate": 9.999579847105947e-06,
"loss": 1.8579,
"step": 26
},
{
"epoch": 0.108,
"grad_norm": 1.958949327468872,
"learning_rate": 9.999253069658074e-06,
"loss": 1.8821,
"step": 27
},
{
"epoch": 0.112,
"grad_norm": 1.889803409576416,
"learning_rate": 9.99883293768601e-06,
"loss": 1.6971,
"step": 28
},
{
"epoch": 0.116,
"grad_norm": 1.947962999343872,
"learning_rate": 9.998319459035168e-06,
"loss": 1.9231,
"step": 29
},
{
"epoch": 0.12,
"grad_norm": 2.027719259262085,
"learning_rate": 9.997712643294093e-06,
"loss": 1.953,
"step": 30
},
{
"epoch": 0.124,
"grad_norm": 1.8483015298843384,
"learning_rate": 9.997012501794273e-06,
"loss": 1.8959,
"step": 31
},
{
"epoch": 0.128,
"grad_norm": 1.7306790351867676,
"learning_rate": 9.996219047609943e-06,
"loss": 1.8583,
"step": 32
},
{
"epoch": 0.132,
"grad_norm": 1.8961213827133179,
"learning_rate": 9.995332295557818e-06,
"loss": 1.8368,
"step": 33
},
{
"epoch": 0.136,
"grad_norm": 1.9631264209747314,
"learning_rate": 9.994352262196839e-06,
"loss": 1.9679,
"step": 34
},
{
"epoch": 0.14,
"grad_norm": 1.846217155456543,
"learning_rate": 9.993278965827844e-06,
"loss": 1.839,
"step": 35
},
{
"epoch": 0.144,
"grad_norm": 2.0129947662353516,
"learning_rate": 9.992112426493247e-06,
"loss": 1.855,
"step": 36
},
{
"epoch": 0.148,
"grad_norm": 1.9122753143310547,
"learning_rate": 9.990852665976648e-06,
"loss": 1.9006,
"step": 37
},
{
"epoch": 0.152,
"grad_norm": 2.18493914604187,
"learning_rate": 9.989499707802424e-06,
"loss": 1.8405,
"step": 38
},
{
"epoch": 0.156,
"grad_norm": 1.7727043628692627,
"learning_rate": 9.988053577235306e-06,
"loss": 1.8223,
"step": 39
},
{
"epoch": 0.16,
"grad_norm": 1.9057114124298096,
"learning_rate": 9.986514301279894e-06,
"loss": 1.6839,
"step": 40
},
{
"epoch": 0.164,
"grad_norm": 1.7055038213729858,
"learning_rate": 9.984881908680157e-06,
"loss": 1.9267,
"step": 41
},
{
"epoch": 0.168,
"grad_norm": 1.7604912519454956,
"learning_rate": 9.983156429918895e-06,
"loss": 1.6166,
"step": 42
},
{
"epoch": 0.172,
"grad_norm": 1.7344841957092285,
"learning_rate": 9.981337897217171e-06,
"loss": 1.7582,
"step": 43
},
{
"epoch": 0.176,
"grad_norm": 1.7479348182678223,
"learning_rate": 9.979426344533712e-06,
"loss": 1.7273,
"step": 44
},
{
"epoch": 0.18,
"grad_norm": 1.7066375017166138,
"learning_rate": 9.977421807564264e-06,
"loss": 1.6157,
"step": 45
},
{
"epoch": 0.184,
"grad_norm": 1.7395350933074951,
"learning_rate": 9.97532432374094e-06,
"loss": 1.7595,
"step": 46
},
{
"epoch": 0.188,
"grad_norm": 1.7139127254486084,
"learning_rate": 9.973133932231514e-06,
"loss": 1.847,
"step": 47
},
{
"epoch": 0.192,
"grad_norm": 1.7039334774017334,
"learning_rate": 9.970850673938684e-06,
"loss": 1.6266,
"step": 48
},
{
"epoch": 0.196,
"grad_norm": 1.553138017654419,
"learning_rate": 9.96847459149932e-06,
"loss": 1.7547,
"step": 49
},
{
"epoch": 0.2,
"grad_norm": 1.660720705986023,
"learning_rate": 9.966005729283658e-06,
"loss": 1.6159,
"step": 50
},
{
"epoch": 0.204,
"grad_norm": 1.7627906799316406,
"learning_rate": 9.963444133394478e-06,
"loss": 1.6653,
"step": 51
},
{
"epoch": 0.208,
"grad_norm": 1.6014618873596191,
"learning_rate": 9.960789851666237e-06,
"loss": 1.6848,
"step": 52
},
{
"epoch": 0.212,
"grad_norm": 1.5535660982131958,
"learning_rate": 9.958042933664186e-06,
"loss": 1.7494,
"step": 53
},
{
"epoch": 0.216,
"grad_norm": 1.5631558895111084,
"learning_rate": 9.955203430683425e-06,
"loss": 1.724,
"step": 54
},
{
"epoch": 0.22,
"grad_norm": 1.472652554512024,
"learning_rate": 9.952271395747969e-06,
"loss": 1.6804,
"step": 55
},
{
"epoch": 0.224,
"grad_norm": 1.414918303489685,
"learning_rate": 9.949246883609743e-06,
"loss": 1.7572,
"step": 56
},
{
"epoch": 0.228,
"grad_norm": 1.3761672973632812,
"learning_rate": 9.94612995074756e-06,
"loss": 1.6243,
"step": 57
},
{
"epoch": 0.232,
"grad_norm": 1.333956241607666,
"learning_rate": 9.942920655366075e-06,
"loss": 1.6858,
"step": 58
},
{
"epoch": 0.236,
"grad_norm": 1.2539465427398682,
"learning_rate": 9.939619057394687e-06,
"loss": 1.6089,
"step": 59
},
{
"epoch": 0.24,
"grad_norm": 1.3787896633148193,
"learning_rate": 9.936225218486428e-06,
"loss": 1.8374,
"step": 60
},
{
"epoch": 0.244,
"grad_norm": 1.2240163087844849,
"learning_rate": 9.93273920201681e-06,
"loss": 1.6729,
"step": 61
},
{
"epoch": 0.248,
"grad_norm": 1.2668211460113525,
"learning_rate": 9.929161073082636e-06,
"loss": 1.7367,
"step": 62
},
{
"epoch": 0.252,
"grad_norm": 1.2384607791900635,
"learning_rate": 9.925490898500796e-06,
"loss": 1.598,
"step": 63
},
{
"epoch": 0.256,
"grad_norm": 1.2151755094528198,
"learning_rate": 9.921728746807008e-06,
"loss": 1.7472,
"step": 64
},
{
"epoch": 0.26,
"grad_norm": 1.1657401323318481,
"learning_rate": 9.917874688254542e-06,
"loss": 1.5475,
"step": 65
},
{
"epoch": 0.264,
"grad_norm": 1.2198437452316284,
"learning_rate": 9.913928794812909e-06,
"loss": 1.7793,
"step": 66
},
{
"epoch": 0.268,
"grad_norm": 1.2214503288269043,
"learning_rate": 9.90989114016652e-06,
"loss": 1.7146,
"step": 67
},
{
"epoch": 0.272,
"grad_norm": 1.204345464706421,
"learning_rate": 9.905761799713302e-06,
"loss": 1.6711,
"step": 68
},
{
"epoch": 0.276,
"grad_norm": 1.137200117111206,
"learning_rate": 9.901540850563295e-06,
"loss": 1.7723,
"step": 69
},
{
"epoch": 0.28,
"grad_norm": 1.1586705446243286,
"learning_rate": 9.89722837153722e-06,
"loss": 1.6051,
"step": 70
},
{
"epoch": 0.284,
"grad_norm": 1.1941063404083252,
"learning_rate": 9.892824443164987e-06,
"loss": 1.79,
"step": 71
},
{
"epoch": 0.288,
"grad_norm": 1.3017503023147583,
"learning_rate": 9.88832914768421e-06,
"loss": 1.7233,
"step": 72
},
{
"epoch": 0.292,
"grad_norm": 1.219022512435913,
"learning_rate": 9.883742569038663e-06,
"loss": 1.7442,
"step": 73
},
{
"epoch": 0.296,
"grad_norm": 1.2379837036132812,
"learning_rate": 9.879064792876717e-06,
"loss": 1.7228,
"step": 74
},
{
"epoch": 0.3,
"grad_norm": 1.121887445449829,
"learning_rate": 9.874295906549728e-06,
"loss": 1.6269,
"step": 75
},
{
"epoch": 0.304,
"grad_norm": 1.2706581354141235,
"learning_rate": 9.869435999110428e-06,
"loss": 1.797,
"step": 76
},
{
"epoch": 0.308,
"grad_norm": 1.1910197734832764,
"learning_rate": 9.864485161311242e-06,
"loss": 1.7846,
"step": 77
},
{
"epoch": 0.312,
"grad_norm": 1.0924195051193237,
"learning_rate": 9.859443485602603e-06,
"loss": 1.4966,
"step": 78
},
{
"epoch": 0.316,
"grad_norm": 1.2084602117538452,
"learning_rate": 9.85431106613122e-06,
"loss": 1.6557,
"step": 79
},
{
"epoch": 0.32,
"grad_norm": 1.1837743520736694,
"learning_rate": 9.849087998738328e-06,
"loss": 1.8381,
"step": 80
},
{
"epoch": 0.324,
"grad_norm": 1.155611515045166,
"learning_rate": 9.84377438095789e-06,
"loss": 1.6823,
"step": 81
},
{
"epoch": 0.328,
"grad_norm": 1.1229243278503418,
"learning_rate": 9.838370312014783e-06,
"loss": 1.691,
"step": 82
},
{
"epoch": 0.332,
"grad_norm": 1.1645359992980957,
"learning_rate": 9.832875892822937e-06,
"loss": 1.6568,
"step": 83
},
{
"epoch": 0.336,
"grad_norm": 1.2119394540786743,
"learning_rate": 9.827291225983458e-06,
"loss": 1.8006,
"step": 84
},
{
"epoch": 0.34,
"grad_norm": 1.0529983043670654,
"learning_rate": 9.821616415782708e-06,
"loss": 1.4368,
"step": 85
},
{
"epoch": 0.344,
"grad_norm": 1.2436493635177612,
"learning_rate": 9.815851568190358e-06,
"loss": 1.7318,
"step": 86
},
{
"epoch": 0.348,
"grad_norm": 1.1215393543243408,
"learning_rate": 9.80999679085741e-06,
"loss": 1.5843,
"step": 87
},
{
"epoch": 0.352,
"grad_norm": 1.2209582328796387,
"learning_rate": 9.80405219311419e-06,
"loss": 1.6051,
"step": 88
},
{
"epoch": 0.356,
"grad_norm": 1.217036485671997,
"learning_rate": 9.798017885968295e-06,
"loss": 1.6264,
"step": 89
},
{
"epoch": 0.36,
"grad_norm": 1.0814294815063477,
"learning_rate": 9.791893982102537e-06,
"loss": 1.6134,
"step": 90
},
{
"epoch": 0.364,
"grad_norm": 1.034320592880249,
"learning_rate": 9.785680595872824e-06,
"loss": 1.3687,
"step": 91
},
{
"epoch": 0.368,
"grad_norm": 1.1603397130966187,
"learning_rate": 9.77937784330603e-06,
"loss": 1.6229,
"step": 92
},
{
"epoch": 0.372,
"grad_norm": 1.3108525276184082,
"learning_rate": 9.772985842097832e-06,
"loss": 1.6952,
"step": 93
},
{
"epoch": 0.376,
"grad_norm": 1.2516248226165771,
"learning_rate": 9.766504711610507e-06,
"loss": 1.7227,
"step": 94
},
{
"epoch": 0.38,
"grad_norm": 1.2021214962005615,
"learning_rate": 9.759934572870706e-06,
"loss": 1.56,
"step": 95
},
{
"epoch": 0.384,
"grad_norm": 1.317078709602356,
"learning_rate": 9.753275548567192e-06,
"loss": 1.7596,
"step": 96
},
{
"epoch": 0.388,
"grad_norm": 1.2354626655578613,
"learning_rate": 9.74652776304855e-06,
"loss": 1.7374,
"step": 97
},
{
"epoch": 0.392,
"grad_norm": 1.1409403085708618,
"learning_rate": 9.739691342320866e-06,
"loss": 1.5335,
"step": 98
},
{
"epoch": 0.396,
"grad_norm": 1.2714933156967163,
"learning_rate": 9.732766414045368e-06,
"loss": 1.7353,
"step": 99
},
{
"epoch": 0.4,
"grad_norm": 1.250701665878296,
"learning_rate": 9.725753107536053e-06,
"loss": 1.6501,
"step": 100
},
{
"epoch": 0.404,
"grad_norm": 1.0238255262374878,
"learning_rate": 9.718651553757266e-06,
"loss": 1.3938,
"step": 101
},
{
"epoch": 0.408,
"grad_norm": 1.1466556787490845,
"learning_rate": 9.711461885321247e-06,
"loss": 1.6982,
"step": 102
},
{
"epoch": 0.412,
"grad_norm": 1.132495403289795,
"learning_rate": 9.704184236485672e-06,
"loss": 1.5833,
"step": 103
},
{
"epoch": 0.416,
"grad_norm": 1.2290033102035522,
"learning_rate": 9.696818743151128e-06,
"loss": 1.616,
"step": 104
},
{
"epoch": 0.42,
"grad_norm": 1.0731315612792969,
"learning_rate": 9.68936554285859e-06,
"loss": 1.4403,
"step": 105
},
{
"epoch": 0.424,
"grad_norm": 1.17851984500885,
"learning_rate": 9.68182477478684e-06,
"loss": 1.6653,
"step": 106
},
{
"epoch": 0.428,
"grad_norm": 1.2172232866287231,
"learning_rate": 9.67419657974988e-06,
"loss": 1.5907,
"step": 107
},
{
"epoch": 0.432,
"grad_norm": 1.2712275981903076,
"learning_rate": 9.66648110019429e-06,
"loss": 1.6898,
"step": 108
},
{
"epoch": 0.436,
"grad_norm": 1.1592316627502441,
"learning_rate": 9.658678480196579e-06,
"loss": 1.596,
"step": 109
},
{
"epoch": 0.44,
"grad_norm": 1.132724404335022,
"learning_rate": 9.650788865460487e-06,
"loss": 1.5729,
"step": 110
},
{
"epoch": 0.444,
"grad_norm": 1.1037116050720215,
"learning_rate": 9.642812403314272e-06,
"loss": 1.6798,
"step": 111
},
{
"epoch": 0.448,
"grad_norm": 1.1293030977249146,
"learning_rate": 9.634749242707948e-06,
"loss": 1.6263,
"step": 112
},
{
"epoch": 0.452,
"grad_norm": 1.1488841772079468,
"learning_rate": 9.626599534210514e-06,
"loss": 1.7044,
"step": 113
},
{
"epoch": 0.456,
"grad_norm": 1.1456992626190186,
"learning_rate": 9.618363430007134e-06,
"loss": 1.637,
"step": 114
},
{
"epoch": 0.46,
"grad_norm": 1.131826400756836,
"learning_rate": 9.610041083896304e-06,
"loss": 1.5681,
"step": 115
},
{
"epoch": 0.464,
"grad_norm": 1.0559884309768677,
"learning_rate": 9.60163265128697e-06,
"loss": 1.521,
"step": 116
},
{
"epoch": 0.468,
"grad_norm": 1.1588243246078491,
"learning_rate": 9.593138289195634e-06,
"loss": 1.5827,
"step": 117
},
{
"epoch": 0.472,
"grad_norm": 1.1131339073181152,
"learning_rate": 9.584558156243418e-06,
"loss": 1.6897,
"step": 118
},
{
"epoch": 0.476,
"grad_norm": 1.130588173866272,
"learning_rate": 9.575892412653102e-06,
"loss": 1.483,
"step": 119
},
{
"epoch": 0.48,
"grad_norm": 1.136372685432434,
"learning_rate": 9.567141220246136e-06,
"loss": 1.5949,
"step": 120
},
{
"epoch": 0.484,
"grad_norm": 1.1033318042755127,
"learning_rate": 9.55830474243961e-06,
"loss": 1.6146,
"step": 121
},
{
"epoch": 0.488,
"grad_norm": 1.1151621341705322,
"learning_rate": 9.549383144243213e-06,
"loss": 1.6518,
"step": 122
},
{
"epoch": 0.492,
"grad_norm": 1.2058076858520508,
"learning_rate": 9.540376592256142e-06,
"loss": 1.6342,
"step": 123
},
{
"epoch": 0.496,
"grad_norm": 1.1145142316818237,
"learning_rate": 9.531285254663997e-06,
"loss": 1.6838,
"step": 124
},
{
"epoch": 0.5,
"grad_norm": 1.1947275400161743,
"learning_rate": 9.522109301235637e-06,
"loss": 1.5949,
"step": 125
},
{
"epoch": 0.504,
"grad_norm": 1.171634554862976,
"learning_rate": 9.512848903320017e-06,
"loss": 1.6209,
"step": 126
},
{
"epoch": 0.508,
"grad_norm": 1.0351903438568115,
"learning_rate": 9.503504233842973e-06,
"loss": 1.4613,
"step": 127
},
{
"epoch": 0.512,
"grad_norm": 0.9954378604888916,
"learning_rate": 9.494075467304007e-06,
"loss": 1.4866,
"step": 128
},
{
"epoch": 0.516,
"grad_norm": 1.1588574647903442,
"learning_rate": 9.484562779773027e-06,
"loss": 1.5944,
"step": 129
},
{
"epoch": 0.52,
"grad_norm": 1.1795620918273926,
"learning_rate": 9.474966348887055e-06,
"loss": 1.6563,
"step": 130
},
{
"epoch": 0.524,
"grad_norm": 1.1187045574188232,
"learning_rate": 9.465286353846905e-06,
"loss": 1.561,
"step": 131
},
{
"epoch": 0.528,
"grad_norm": 1.0760747194290161,
"learning_rate": 9.455522975413846e-06,
"loss": 1.4359,
"step": 132
},
{
"epoch": 0.532,
"grad_norm": 1.1250879764556885,
"learning_rate": 9.445676395906226e-06,
"loss": 1.579,
"step": 133
},
{
"epoch": 0.536,
"grad_norm": 1.14565908908844,
"learning_rate": 9.435746799196061e-06,
"loss": 1.6176,
"step": 134
},
{
"epoch": 0.54,
"grad_norm": 1.1458956003189087,
"learning_rate": 9.425734370705606e-06,
"loss": 1.635,
"step": 135
},
{
"epoch": 0.544,
"grad_norm": 1.1934502124786377,
"learning_rate": 9.415639297403891e-06,
"loss": 1.6134,
"step": 136
},
{
"epoch": 0.548,
"grad_norm": 1.0963486433029175,
"learning_rate": 9.40546176780323e-06,
"loss": 1.6283,
"step": 137
},
{
"epoch": 0.552,
"grad_norm": 1.089300513267517,
"learning_rate": 9.395201971955701e-06,
"loss": 1.6247,
"step": 138
},
{
"epoch": 0.556,
"grad_norm": 1.1173641681671143,
"learning_rate": 9.384860101449598e-06,
"loss": 1.6772,
"step": 139
},
{
"epoch": 0.56,
"grad_norm": 1.139262080192566,
"learning_rate": 9.374436349405847e-06,
"loss": 1.7146,
"step": 140
},
{
"epoch": 0.564,
"grad_norm": 1.1153786182403564,
"learning_rate": 9.36393091047441e-06,
"loss": 1.5462,
"step": 141
},
{
"epoch": 0.568,
"grad_norm": 1.1419354677200317,
"learning_rate": 9.353343980830644e-06,
"loss": 1.5408,
"step": 142
},
{
"epoch": 0.572,
"grad_norm": 1.2222529649734497,
"learning_rate": 9.342675758171638e-06,
"loss": 1.6216,
"step": 143
},
{
"epoch": 0.576,
"grad_norm": 1.1209079027175903,
"learning_rate": 9.331926441712522e-06,
"loss": 1.4746,
"step": 144
},
{
"epoch": 0.58,
"grad_norm": 1.0842020511627197,
"learning_rate": 9.32109623218275e-06,
"loss": 1.6418,
"step": 145
},
{
"epoch": 0.584,
"grad_norm": 1.1870988607406616,
"learning_rate": 9.310185331822338e-06,
"loss": 1.675,
"step": 146
},
{
"epoch": 0.588,
"grad_norm": 1.1172236204147339,
"learning_rate": 9.299193944378112e-06,
"loss": 1.6609,
"step": 147
},
{
"epoch": 0.592,
"grad_norm": 1.12311851978302,
"learning_rate": 9.28812227509988e-06,
"loss": 1.5276,
"step": 148
},
{
"epoch": 0.596,
"grad_norm": 1.1734628677368164,
"learning_rate": 9.27697053073661e-06,
"loss": 1.5647,
"step": 149
},
{
"epoch": 0.6,
"grad_norm": 1.156967043876648,
"learning_rate": 9.26573891953257e-06,
"loss": 1.594,
"step": 150
},
{
"epoch": 0.604,
"grad_norm": 1.249269962310791,
"learning_rate": 9.254427651223434e-06,
"loss": 1.6792,
"step": 151
},
{
"epoch": 0.608,
"grad_norm": 1.0864014625549316,
"learning_rate": 9.243036937032373e-06,
"loss": 1.4722,
"step": 152
},
{
"epoch": 0.612,
"grad_norm": 1.1315494775772095,
"learning_rate": 9.2315669896661e-06,
"loss": 1.6805,
"step": 153
},
{
"epoch": 0.616,
"grad_norm": 1.224092960357666,
"learning_rate": 9.220018023310908e-06,
"loss": 1.6779,
"step": 154
},
{
"epoch": 0.62,
"grad_norm": 1.1818524599075317,
"learning_rate": 9.208390253628667e-06,
"loss": 1.7031,
"step": 155
},
{
"epoch": 0.624,
"grad_norm": 1.1494390964508057,
"learning_rate": 9.196683897752794e-06,
"loss": 1.6261,
"step": 156
},
{
"epoch": 0.628,
"grad_norm": 1.0383175611495972,
"learning_rate": 9.184899174284201e-06,
"loss": 1.4572,
"step": 157
},
{
"epoch": 0.632,
"grad_norm": 1.1242049932479858,
"learning_rate": 9.173036303287215e-06,
"loss": 1.5738,
"step": 158
},
{
"epoch": 0.636,
"grad_norm": 1.0787436962127686,
"learning_rate": 9.16109550628546e-06,
"loss": 1.5038,
"step": 159
},
{
"epoch": 0.64,
"grad_norm": 1.0636624097824097,
"learning_rate": 9.149077006257734e-06,
"loss": 1.5432,
"step": 160
},
{
"epoch": 0.644,
"grad_norm": 1.1137751340866089,
"learning_rate": 9.136981027633834e-06,
"loss": 1.527,
"step": 161
},
{
"epoch": 0.648,
"grad_norm": 1.1208561658859253,
"learning_rate": 9.124807796290366e-06,
"loss": 1.6232,
"step": 162
},
{
"epoch": 0.652,
"grad_norm": 1.1151702404022217,
"learning_rate": 9.112557539546535e-06,
"loss": 1.6379,
"step": 163
},
{
"epoch": 0.656,
"grad_norm": 1.083153247833252,
"learning_rate": 9.100230486159893e-06,
"loss": 1.6621,
"step": 164
},
{
"epoch": 0.66,
"grad_norm": 1.1297736167907715,
"learning_rate": 9.087826866322065e-06,
"loss": 1.4921,
"step": 165
},
{
"epoch": 0.664,
"grad_norm": 1.0946112871170044,
"learning_rate": 9.075346911654456e-06,
"loss": 1.5721,
"step": 166
},
{
"epoch": 0.668,
"grad_norm": 1.1505929231643677,
"learning_rate": 9.062790855203932e-06,
"loss": 1.6374,
"step": 167
},
{
"epoch": 0.672,
"grad_norm": 1.0435446500778198,
"learning_rate": 9.050158931438451e-06,
"loss": 1.4502,
"step": 168
},
{
"epoch": 0.676,
"grad_norm": 1.010459065437317,
"learning_rate": 9.037451376242696e-06,
"loss": 1.422,
"step": 169
},
{
"epoch": 0.68,
"grad_norm": 1.1430541276931763,
"learning_rate": 9.024668426913671e-06,
"loss": 1.5824,
"step": 170
},
{
"epoch": 0.684,
"grad_norm": 1.1869382858276367,
"learning_rate": 9.011810322156269e-06,
"loss": 1.6054,
"step": 171
},
{
"epoch": 0.688,
"grad_norm": 1.0964624881744385,
"learning_rate": 8.998877302078803e-06,
"loss": 1.5688,
"step": 172
},
{
"epoch": 0.692,
"grad_norm": 1.1466578245162964,
"learning_rate": 8.985869608188545e-06,
"loss": 1.6124,
"step": 173
},
{
"epoch": 0.696,
"grad_norm": 1.1571404933929443,
"learning_rate": 8.97278748338719e-06,
"loss": 1.596,
"step": 174
},
{
"epoch": 0.7,
"grad_norm": 1.113532304763794,
"learning_rate": 8.95963117196634e-06,
"loss": 1.5243,
"step": 175
},
{
"epoch": 0.704,
"grad_norm": 1.1856666803359985,
"learning_rate": 8.946400919602933e-06,
"loss": 1.6771,
"step": 176
},
{
"epoch": 0.708,
"grad_norm": 1.1261265277862549,
"learning_rate": 8.933096973354665e-06,
"loss": 1.536,
"step": 177
},
{
"epoch": 0.712,
"grad_norm": 1.05485999584198,
"learning_rate": 8.919719581655357e-06,
"loss": 1.4499,
"step": 178
},
{
"epoch": 0.716,
"grad_norm": 1.1247637271881104,
"learning_rate": 8.906268994310339e-06,
"loss": 1.6361,
"step": 179
},
{
"epoch": 0.72,
"grad_norm": 1.078016757965088,
"learning_rate": 8.892745462491763e-06,
"loss": 1.5291,
"step": 180
},
{
"epoch": 0.724,
"grad_norm": 1.1105468273162842,
"learning_rate": 8.879149238733932e-06,
"loss": 1.6288,
"step": 181
},
{
"epoch": 0.728,
"grad_norm": 1.1930553913116455,
"learning_rate": 8.865480576928578e-06,
"loss": 1.6732,
"step": 182
},
{
"epoch": 0.732,
"grad_norm": 1.2254596948623657,
"learning_rate": 8.851739732320109e-06,
"loss": 1.7128,
"step": 183
},
{
"epoch": 0.736,
"grad_norm": 1.1654362678527832,
"learning_rate": 8.83792696150086e-06,
"loss": 1.684,
"step": 184
},
{
"epoch": 0.74,
"grad_norm": 1.0579456090927124,
"learning_rate": 8.824042522406295e-06,
"loss": 1.5169,
"step": 185
},
{
"epoch": 0.744,
"grad_norm": 1.1619398593902588,
"learning_rate": 8.810086674310184e-06,
"loss": 1.5584,
"step": 186
},
{
"epoch": 0.748,
"grad_norm": 1.146817922592163,
"learning_rate": 8.796059677819773e-06,
"loss": 1.5477,
"step": 187
},
{
"epoch": 0.752,
"grad_norm": 1.172471523284912,
"learning_rate": 8.781961794870903e-06,
"loss": 1.5598,
"step": 188
},
{
"epoch": 0.756,
"grad_norm": 1.0627797842025757,
"learning_rate": 8.767793288723137e-06,
"loss": 1.435,
"step": 189
},
{
"epoch": 0.76,
"grad_norm": 1.0832802057266235,
"learning_rate": 8.753554423954828e-06,
"loss": 1.5646,
"step": 190
},
{
"epoch": 0.764,
"grad_norm": 1.174402117729187,
"learning_rate": 8.739245466458187e-06,
"loss": 1.6775,
"step": 191
},
{
"epoch": 0.768,
"grad_norm": 1.0686759948730469,
"learning_rate": 8.72486668343431e-06,
"loss": 1.5038,
"step": 192
},
{
"epoch": 0.772,
"grad_norm": 1.1129603385925293,
"learning_rate": 8.7104183433882e-06,
"loss": 1.6307,
"step": 193
},
{
"epoch": 0.776,
"grad_norm": 1.1385724544525146,
"learning_rate": 8.695900716123744e-06,
"loss": 1.6571,
"step": 194
},
{
"epoch": 0.78,
"grad_norm": 1.1305854320526123,
"learning_rate": 8.681314072738678e-06,
"loss": 1.5723,
"step": 195
},
{
"epoch": 0.784,
"grad_norm": 1.0489332675933838,
"learning_rate": 8.666658685619523e-06,
"loss": 1.4929,
"step": 196
},
{
"epoch": 0.788,
"grad_norm": 1.0210576057434082,
"learning_rate": 8.651934828436497e-06,
"loss": 1.49,
"step": 197
},
{
"epoch": 0.792,
"grad_norm": 1.0986818075180054,
"learning_rate": 8.637142776138415e-06,
"loss": 1.5993,
"step": 198
},
{
"epoch": 0.796,
"grad_norm": 1.1953257322311401,
"learning_rate": 8.622282804947537e-06,
"loss": 1.6812,
"step": 199
},
{
"epoch": 0.8,
"grad_norm": 1.0130163431167603,
"learning_rate": 8.607355192354425e-06,
"loss": 1.4005,
"step": 200
},
{
"epoch": 0.804,
"grad_norm": 1.1401948928833008,
"learning_rate": 8.592360217112759e-06,
"loss": 1.561,
"step": 201
},
{
"epoch": 0.808,
"grad_norm": 0.987301230430603,
"learning_rate": 8.57729815923412e-06,
"loss": 1.4176,
"step": 202
},
{
"epoch": 0.812,
"grad_norm": 1.0538125038146973,
"learning_rate": 8.562169299982776e-06,
"loss": 1.5314,
"step": 203
},
{
"epoch": 0.816,
"grad_norm": 1.0923703908920288,
"learning_rate": 8.546973921870421e-06,
"loss": 1.6202,
"step": 204
},
{
"epoch": 0.82,
"grad_norm": 1.0940260887145996,
"learning_rate": 8.531712308650904e-06,
"loss": 1.5696,
"step": 205
},
{
"epoch": 0.824,
"grad_norm": 1.1966121196746826,
"learning_rate": 8.516384745314926e-06,
"loss": 1.5481,
"step": 206
},
{
"epoch": 0.828,
"grad_norm": 1.1779595613479614,
"learning_rate": 8.50099151808472e-06,
"loss": 1.7129,
"step": 207
},
{
"epoch": 0.832,
"grad_norm": 1.1340051889419556,
"learning_rate": 8.485532914408712e-06,
"loss": 1.5981,
"step": 208
},
{
"epoch": 0.836,
"grad_norm": 1.1155678033828735,
"learning_rate": 8.470009222956138e-06,
"loss": 1.5525,
"step": 209
},
{
"epoch": 0.84,
"grad_norm": 1.070615291595459,
"learning_rate": 8.45442073361167e-06,
"loss": 1.4464,
"step": 210
},
{
"epoch": 0.844,
"grad_norm": 1.1107968091964722,
"learning_rate": 8.438767737469995e-06,
"loss": 1.5798,
"step": 211
},
{
"epoch": 0.848,
"grad_norm": 1.063432216644287,
"learning_rate": 8.42305052683038e-06,
"loss": 1.4508,
"step": 212
},
{
"epoch": 0.852,
"grad_norm": 1.0105587244033813,
"learning_rate": 8.407269395191216e-06,
"loss": 1.3754,
"step": 213
},
{
"epoch": 0.856,
"grad_norm": 1.0751416683197021,
"learning_rate": 8.391424637244528e-06,
"loss": 1.4729,
"step": 214
},
{
"epoch": 0.86,
"grad_norm": 1.0858322381973267,
"learning_rate": 8.375516548870489e-06,
"loss": 1.5444,
"step": 215
},
{
"epoch": 0.864,
"grad_norm": 1.1059919595718384,
"learning_rate": 8.359545427131876e-06,
"loss": 1.6031,
"step": 216
},
{
"epoch": 0.868,
"grad_norm": 1.1045112609863281,
"learning_rate": 8.343511570268541e-06,
"loss": 1.5575,
"step": 217
},
{
"epoch": 0.872,
"grad_norm": 1.1263505220413208,
"learning_rate": 8.327415277691824e-06,
"loss": 1.7264,
"step": 218
},
{
"epoch": 0.876,
"grad_norm": 0.9825512766838074,
"learning_rate": 8.311256849978974e-06,
"loss": 1.4317,
"step": 219
},
{
"epoch": 0.88,
"grad_norm": 1.1695114374160767,
"learning_rate": 8.295036588867533e-06,
"loss": 1.6651,
"step": 220
},
{
"epoch": 0.884,
"grad_norm": 1.1236516237258911,
"learning_rate": 8.278754797249702e-06,
"loss": 1.4861,
"step": 221
},
{
"epoch": 0.888,
"grad_norm": 1.051235556602478,
"learning_rate": 8.262411779166681e-06,
"loss": 1.4007,
"step": 222
},
{
"epoch": 0.892,
"grad_norm": 1.0410888195037842,
"learning_rate": 8.246007839802997e-06,
"loss": 1.4743,
"step": 223
},
{
"epoch": 0.896,
"grad_norm": 1.1544384956359863,
"learning_rate": 8.229543285480797e-06,
"loss": 1.6406,
"step": 224
},
{
"epoch": 0.9,
"grad_norm": 1.0394105911254883,
"learning_rate": 8.213018423654144e-06,
"loss": 1.4245,
"step": 225
},
{
"epoch": 0.904,
"grad_norm": 1.1784744262695312,
"learning_rate": 8.196433562903252e-06,
"loss": 1.6525,
"step": 226
},
{
"epoch": 0.908,
"grad_norm": 1.1253184080123901,
"learning_rate": 8.179789012928747e-06,
"loss": 1.6654,
"step": 227
},
{
"epoch": 0.912,
"grad_norm": 1.069104790687561,
"learning_rate": 8.163085084545867e-06,
"loss": 1.5051,
"step": 228
},
{
"epoch": 0.916,
"grad_norm": 1.159909725189209,
"learning_rate": 8.146322089678668e-06,
"loss": 1.6088,
"step": 229
},
{
"epoch": 0.92,
"grad_norm": 1.1648914813995361,
"learning_rate": 8.129500341354192e-06,
"loss": 1.6361,
"step": 230
},
{
"epoch": 0.924,
"grad_norm": 1.210420846939087,
"learning_rate": 8.11262015369663e-06,
"loss": 1.5875,
"step": 231
},
{
"epoch": 0.928,
"grad_norm": 1.0989888906478882,
"learning_rate": 8.095681841921441e-06,
"loss": 1.4767,
"step": 232
},
{
"epoch": 0.932,
"grad_norm": 1.1087747812271118,
"learning_rate": 8.07868572232949e-06,
"loss": 1.6273,
"step": 233
},
{
"epoch": 0.936,
"grad_norm": 1.084105372428894,
"learning_rate": 8.061632112301122e-06,
"loss": 1.4738,
"step": 234
},
{
"epoch": 0.94,
"grad_norm": 0.9671852588653564,
"learning_rate": 8.044521330290235e-06,
"loss": 1.4113,
"step": 235
},
{
"epoch": 0.944,
"grad_norm": 1.1065040826797485,
"learning_rate": 8.027353695818345e-06,
"loss": 1.638,
"step": 236
},
{
"epoch": 0.948,
"grad_norm": 1.0876044034957886,
"learning_rate": 8.010129529468614e-06,
"loss": 1.6306,
"step": 237
},
{
"epoch": 0.952,
"grad_norm": 0.9932905435562134,
"learning_rate": 7.992849152879857e-06,
"loss": 1.4351,
"step": 238
},
{
"epoch": 0.956,
"grad_norm": 1.1644636392593384,
"learning_rate": 7.97551288874055e-06,
"loss": 1.6443,
"step": 239
},
{
"epoch": 0.96,
"grad_norm": 1.0954689979553223,
"learning_rate": 7.95812106078279e-06,
"loss": 1.5673,
"step": 240
},
{
"epoch": 0.964,
"grad_norm": 1.0761327743530273,
"learning_rate": 7.940673993776258e-06,
"loss": 1.6377,
"step": 241
},
{
"epoch": 0.968,
"grad_norm": 1.1261701583862305,
"learning_rate": 7.923172013522153e-06,
"loss": 1.4997,
"step": 242
},
{
"epoch": 0.972,
"grad_norm": 1.02078378200531,
"learning_rate": 7.905615446847107e-06,
"loss": 1.5016,
"step": 243
},
{
"epoch": 0.976,
"grad_norm": 1.1258933544158936,
"learning_rate": 7.888004621597079e-06,
"loss": 1.6324,
"step": 244
},
{
"epoch": 0.98,
"grad_norm": 1.1257457733154297,
"learning_rate": 7.87033986663124e-06,
"loss": 1.6362,
"step": 245
},
{
"epoch": 0.984,
"grad_norm": 0.9650360345840454,
"learning_rate": 7.852621511815825e-06,
"loss": 1.3149,
"step": 246
},
{
"epoch": 0.988,
"grad_norm": 1.1130248308181763,
"learning_rate": 7.834849888017979e-06,
"loss": 1.4682,
"step": 247
},
{
"epoch": 0.992,
"grad_norm": 1.1328444480895996,
"learning_rate": 7.817025327099574e-06,
"loss": 1.6527,
"step": 248
},
{
"epoch": 0.996,
"grad_norm": 1.121050477027893,
"learning_rate": 7.799148161911013e-06,
"loss": 1.5897,
"step": 249
},
{
"epoch": 1.0,
"grad_norm": 1.1452186107635498,
"learning_rate": 7.781218726285014e-06,
"loss": 1.6552,
"step": 250
},
{
"epoch": 1.004,
"grad_norm": 1.1528308391571045,
"learning_rate": 7.763237355030384e-06,
"loss": 1.3685,
"step": 251
},
{
"epoch": 1.008,
"grad_norm": 1.1793309450149536,
"learning_rate": 7.745204383925753e-06,
"loss": 1.4457,
"step": 252
},
{
"epoch": 1.012,
"grad_norm": 1.0134292840957642,
"learning_rate": 7.727120149713313e-06,
"loss": 1.2752,
"step": 253
},
{
"epoch": 1.016,
"grad_norm": 1.0486788749694824,
"learning_rate": 7.708984990092528e-06,
"loss": 1.3084,
"step": 254
},
{
"epoch": 1.02,
"grad_norm": 0.9185591340065002,
"learning_rate": 7.690799243713825e-06,
"loss": 1.2601,
"step": 255
},
{
"epoch": 1.024,
"grad_norm": 0.9956145286560059,
"learning_rate": 7.672563250172278e-06,
"loss": 1.3297,
"step": 256
},
{
"epoch": 1.028,
"grad_norm": 1.016332745552063,
"learning_rate": 7.654277350001255e-06,
"loss": 1.3232,
"step": 257
},
{
"epoch": 1.032,
"grad_norm": 0.9748119711875916,
"learning_rate": 7.635941884666072e-06,
"loss": 1.3335,
"step": 258
},
{
"epoch": 1.036,
"grad_norm": 1.0344090461730957,
"learning_rate": 7.617557196557601e-06,
"loss": 1.3389,
"step": 259
},
{
"epoch": 1.04,
"grad_norm": 1.0641359090805054,
"learning_rate": 7.599123628985894e-06,
"loss": 1.335,
"step": 260
},
{
"epoch": 1.044,
"grad_norm": 0.8922262191772461,
"learning_rate": 7.580641526173758e-06,
"loss": 1.1117,
"step": 261
},
{
"epoch": 1.048,
"grad_norm": 1.0811434984207153,
"learning_rate": 7.5621112332503325e-06,
"loss": 1.2934,
"step": 262
},
{
"epoch": 1.052,
"grad_norm": 1.0639779567718506,
"learning_rate": 7.543533096244644e-06,
"loss": 1.3728,
"step": 263
},
{
"epoch": 1.056,
"grad_norm": 0.9719051122665405,
"learning_rate": 7.524907462079149e-06,
"loss": 1.2263,
"step": 264
},
{
"epoch": 1.06,
"grad_norm": 1.0336952209472656,
"learning_rate": 7.506234678563248e-06,
"loss": 1.4345,
"step": 265
},
{
"epoch": 1.064,
"grad_norm": 1.0820680856704712,
"learning_rate": 7.487515094386792e-06,
"loss": 1.2944,
"step": 266
},
{
"epoch": 1.068,
"grad_norm": 1.0462226867675781,
"learning_rate": 7.468749059113578e-06,
"loss": 1.3447,
"step": 267
},
{
"epoch": 1.072,
"grad_norm": 1.0974781513214111,
"learning_rate": 7.449936923174813e-06,
"loss": 1.4944,
"step": 268
},
{
"epoch": 1.076,
"grad_norm": 0.9548144340515137,
"learning_rate": 7.431079037862575e-06,
"loss": 1.3046,
"step": 269
},
{
"epoch": 1.08,
"grad_norm": 0.9887881875038147,
"learning_rate": 7.412175755323254e-06,
"loss": 1.2941,
"step": 270
},
{
"epoch": 1.084,
"grad_norm": 1.031455159187317,
"learning_rate": 7.39322742855097e-06,
"loss": 1.3129,
"step": 271
},
{
"epoch": 1.088,
"grad_norm": 0.9989827871322632,
"learning_rate": 7.374234411380987e-06,
"loss": 1.2758,
"step": 272
},
{
"epoch": 1.092,
"grad_norm": 1.0414865016937256,
"learning_rate": 7.355197058483103e-06,
"loss": 1.2781,
"step": 273
},
{
"epoch": 1.096,
"grad_norm": 1.0758367776870728,
"learning_rate": 7.336115725355033e-06,
"loss": 1.2932,
"step": 274
},
{
"epoch": 1.1,
"grad_norm": 0.9883400797843933,
"learning_rate": 7.316990768315757e-06,
"loss": 1.3513,
"step": 275
},
{
"epoch": 1.104,
"grad_norm": 1.07712984085083,
"learning_rate": 7.297822544498879e-06,
"loss": 1.2704,
"step": 276
},
{
"epoch": 1.108,
"grad_norm": 0.9456796050071716,
"learning_rate": 7.2786114118459564e-06,
"loss": 1.2326,
"step": 277
},
{
"epoch": 1.112,
"grad_norm": 0.8864812254905701,
"learning_rate": 7.259357729099805e-06,
"loss": 1.1043,
"step": 278
},
{
"epoch": 1.116,
"grad_norm": 1.09107506275177,
"learning_rate": 7.240061855797818e-06,
"loss": 1.3225,
"step": 279
},
{
"epoch": 1.12,
"grad_norm": 0.9345561861991882,
"learning_rate": 7.220724152265234e-06,
"loss": 1.2604,
"step": 280
},
{
"epoch": 1.124,
"grad_norm": 0.9008955359458923,
"learning_rate": 7.201344979608423e-06,
"loss": 1.1904,
"step": 281
},
{
"epoch": 1.1280000000000001,
"grad_norm": 1.0470987558364868,
"learning_rate": 7.181924699708127e-06,
"loss": 1.4107,
"step": 282
},
{
"epoch": 1.1320000000000001,
"grad_norm": 1.0424591302871704,
"learning_rate": 7.162463675212726e-06,
"loss": 1.3053,
"step": 283
},
{
"epoch": 1.1360000000000001,
"grad_norm": 0.9431308507919312,
"learning_rate": 7.142962269531439e-06,
"loss": 1.2126,
"step": 284
},
{
"epoch": 1.1400000000000001,
"grad_norm": 1.0488431453704834,
"learning_rate": 7.12342084682756e-06,
"loss": 1.3095,
"step": 285
},
{
"epoch": 1.144,
"grad_norm": 1.0937862396240234,
"learning_rate": 7.1038397720116445e-06,
"loss": 1.4235,
"step": 286
},
{
"epoch": 1.148,
"grad_norm": 1.033434510231018,
"learning_rate": 7.084219410734701e-06,
"loss": 1.3203,
"step": 287
},
{
"epoch": 1.152,
"grad_norm": 1.036313772201538,
"learning_rate": 7.064560129381359e-06,
"loss": 1.2603,
"step": 288
},
{
"epoch": 1.156,
"grad_norm": 1.0411490201950073,
"learning_rate": 7.0448622950630305e-06,
"loss": 1.2209,
"step": 289
},
{
"epoch": 1.16,
"grad_norm": 1.007789969444275,
"learning_rate": 7.025126275611058e-06,
"loss": 1.23,
"step": 290
},
{
"epoch": 1.164,
"grad_norm": 1.0624102354049683,
"learning_rate": 7.0053524395698345e-06,
"loss": 1.3177,
"step": 291
},
{
"epoch": 1.168,
"grad_norm": 0.9933396577835083,
"learning_rate": 6.985541156189932e-06,
"loss": 1.2976,
"step": 292
},
{
"epoch": 1.172,
"grad_norm": 1.1039681434631348,
"learning_rate": 6.965692795421206e-06,
"loss": 1.4093,
"step": 293
},
{
"epoch": 1.176,
"grad_norm": 1.0943962335586548,
"learning_rate": 6.945807727905876e-06,
"loss": 1.3604,
"step": 294
},
{
"epoch": 1.18,
"grad_norm": 1.1898424625396729,
"learning_rate": 6.925886324971619e-06,
"loss": 1.4917,
"step": 295
},
{
"epoch": 1.184,
"grad_norm": 1.181394338607788,
"learning_rate": 6.905928958624627e-06,
"loss": 1.4781,
"step": 296
},
{
"epoch": 1.188,
"grad_norm": 1.0346969366073608,
"learning_rate": 6.885936001542658e-06,
"loss": 1.3176,
"step": 297
},
{
"epoch": 1.192,
"grad_norm": 1.0920255184173584,
"learning_rate": 6.865907827068085e-06,
"loss": 1.3288,
"step": 298
},
{
"epoch": 1.196,
"grad_norm": 1.0821036100387573,
"learning_rate": 6.845844809200918e-06,
"loss": 1.3529,
"step": 299
},
{
"epoch": 1.2,
"grad_norm": 1.0587323904037476,
"learning_rate": 6.82574732259182e-06,
"loss": 1.3266,
"step": 300
},
{
"epoch": 1.204,
"grad_norm": 1.0351879596710205,
"learning_rate": 6.805615742535117e-06,
"loss": 1.3113,
"step": 301
},
{
"epoch": 1.208,
"grad_norm": 1.1247234344482422,
"learning_rate": 6.785450444961783e-06,
"loss": 1.3628,
"step": 302
},
{
"epoch": 1.212,
"grad_norm": 0.9813088774681091,
"learning_rate": 6.765251806432423e-06,
"loss": 1.2814,
"step": 303
},
{
"epoch": 1.216,
"grad_norm": 0.9769037961959839,
"learning_rate": 6.7450202041302404e-06,
"loss": 1.2868,
"step": 304
},
{
"epoch": 1.22,
"grad_norm": 1.0350134372711182,
"learning_rate": 6.724756015853994e-06,
"loss": 1.2903,
"step": 305
},
{
"epoch": 1.224,
"grad_norm": 0.9875354170799255,
"learning_rate": 6.704459620010945e-06,
"loss": 1.388,
"step": 306
},
{
"epoch": 1.228,
"grad_norm": 1.156988501548767,
"learning_rate": 6.684131395609784e-06,
"loss": 1.4401,
"step": 307
},
{
"epoch": 1.232,
"grad_norm": 1.0350157022476196,
"learning_rate": 6.663771722253567e-06,
"loss": 1.3627,
"step": 308
},
{
"epoch": 1.236,
"grad_norm": 1.099228858947754,
"learning_rate": 6.643380980132608e-06,
"loss": 1.4149,
"step": 309
},
{
"epoch": 1.24,
"grad_norm": 0.9770938754081726,
"learning_rate": 6.622959550017397e-06,
"loss": 1.2515,
"step": 310
},
{
"epoch": 1.244,
"grad_norm": 0.9782474637031555,
"learning_rate": 6.602507813251478e-06,
"loss": 1.2768,
"step": 311
},
{
"epoch": 1.248,
"grad_norm": 1.0403162240982056,
"learning_rate": 6.5820261517443365e-06,
"loss": 1.3037,
"step": 312
},
{
"epoch": 1.252,
"grad_norm": 1.0825715065002441,
"learning_rate": 6.561514947964258e-06,
"loss": 1.3063,
"step": 313
},
{
"epoch": 1.256,
"grad_norm": 0.9935280680656433,
"learning_rate": 6.540974584931199e-06,
"loss": 1.2613,
"step": 314
},
{
"epoch": 1.26,
"grad_norm": 1.0176899433135986,
"learning_rate": 6.520405446209615e-06,
"loss": 1.3171,
"step": 315
},
{
"epoch": 1.264,
"grad_norm": 1.0019985437393188,
"learning_rate": 6.4998079159013236e-06,
"loss": 1.2588,
"step": 316
},
{
"epoch": 1.268,
"grad_norm": 0.966647207736969,
"learning_rate": 6.479182378638308e-06,
"loss": 1.2583,
"step": 317
},
{
"epoch": 1.272,
"grad_norm": 1.0037215948104858,
"learning_rate": 6.458529219575551e-06,
"loss": 1.1899,
"step": 318
},
{
"epoch": 1.276,
"grad_norm": 1.1131309270858765,
"learning_rate": 6.437848824383832e-06,
"loss": 1.4575,
"step": 319
},
{
"epoch": 1.28,
"grad_norm": 0.9740872383117676,
"learning_rate": 6.417141579242532e-06,
"loss": 1.2475,
"step": 320
},
{
"epoch": 1.284,
"grad_norm": 1.0149080753326416,
"learning_rate": 6.396407870832419e-06,
"loss": 1.2164,
"step": 321
},
{
"epoch": 1.288,
"grad_norm": 0.9826841950416565,
"learning_rate": 6.375648086328431e-06,
"loss": 1.217,
"step": 322
},
{
"epoch": 1.292,
"grad_norm": 1.0243743658065796,
"learning_rate": 6.354862613392436e-06,
"loss": 1.3661,
"step": 323
},
{
"epoch": 1.296,
"grad_norm": 1.0462455749511719,
"learning_rate": 6.334051840166006e-06,
"loss": 1.2995,
"step": 324
},
{
"epoch": 1.3,
"grad_norm": 1.05907142162323,
"learning_rate": 6.313216155263161e-06,
"loss": 1.4628,
"step": 325
},
{
"epoch": 1.304,
"grad_norm": 1.0152286291122437,
"learning_rate": 6.292355947763114e-06,
"loss": 1.2515,
"step": 326
},
{
"epoch": 1.308,
"grad_norm": 0.9773034453392029,
"learning_rate": 6.271471607203006e-06,
"loss": 1.2588,
"step": 327
},
{
"epoch": 1.312,
"grad_norm": 0.9034001231193542,
"learning_rate": 6.25056352357063e-06,
"loss": 1.1689,
"step": 328
},
{
"epoch": 1.316,
"grad_norm": 1.0075790882110596,
"learning_rate": 6.2296320872971515e-06,
"loss": 1.3249,
"step": 329
},
{
"epoch": 1.32,
"grad_norm": 1.0259512662887573,
"learning_rate": 6.208677689249816e-06,
"loss": 1.3706,
"step": 330
},
{
"epoch": 1.324,
"grad_norm": 1.1131854057312012,
"learning_rate": 6.187700720724648e-06,
"loss": 1.3719,
"step": 331
},
{
"epoch": 1.328,
"grad_norm": 0.9339361786842346,
"learning_rate": 6.16670157343915e-06,
"loss": 1.1862,
"step": 332
},
{
"epoch": 1.332,
"grad_norm": 1.0028992891311646,
"learning_rate": 6.14568063952498e-06,
"loss": 1.2042,
"step": 333
},
{
"epoch": 1.336,
"grad_norm": 1.0733510255813599,
"learning_rate": 6.124638311520634e-06,
"loss": 1.3601,
"step": 334
},
{
"epoch": 1.34,
"grad_norm": 1.1401326656341553,
"learning_rate": 6.103574982364118e-06,
"loss": 1.3899,
"step": 335
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.9714714884757996,
"learning_rate": 6.082491045385601e-06,
"loss": 1.2551,
"step": 336
},
{
"epoch": 1.3479999999999999,
"grad_norm": 0.9906610250473022,
"learning_rate": 6.061386894300082e-06,
"loss": 1.2254,
"step": 337
},
{
"epoch": 1.3519999999999999,
"grad_norm": 1.0056389570236206,
"learning_rate": 6.0402629232000275e-06,
"loss": 1.3712,
"step": 338
},
{
"epoch": 1.3559999999999999,
"grad_norm": 1.0775071382522583,
"learning_rate": 6.01911952654802e-06,
"loss": 1.4343,
"step": 339
},
{
"epoch": 1.3599999999999999,
"grad_norm": 1.0412591695785522,
"learning_rate": 5.997957099169388e-06,
"loss": 1.3622,
"step": 340
},
{
"epoch": 1.3639999999999999,
"grad_norm": 1.0673794746398926,
"learning_rate": 5.976776036244833e-06,
"loss": 1.3336,
"step": 341
},
{
"epoch": 1.3679999999999999,
"grad_norm": 1.104533076286316,
"learning_rate": 5.955576733303053e-06,
"loss": 1.3961,
"step": 342
},
{
"epoch": 1.3719999999999999,
"grad_norm": 0.98575758934021,
"learning_rate": 5.9343595862133515e-06,
"loss": 1.267,
"step": 343
},
{
"epoch": 1.376,
"grad_norm": 1.0667227506637573,
"learning_rate": 5.91312499117825e-06,
"loss": 1.3001,
"step": 344
},
{
"epoch": 1.38,
"grad_norm": 1.032029151916504,
"learning_rate": 5.891873344726089e-06,
"loss": 1.3165,
"step": 345
},
{
"epoch": 1.384,
"grad_norm": 1.0932317972183228,
"learning_rate": 5.87060504370362e-06,
"loss": 1.3691,
"step": 346
},
{
"epoch": 1.388,
"grad_norm": 1.015009880065918,
"learning_rate": 5.849320485268597e-06,
"loss": 1.301,
"step": 347
},
{
"epoch": 1.392,
"grad_norm": 1.0828276872634888,
"learning_rate": 5.828020066882361e-06,
"loss": 1.3237,
"step": 348
},
{
"epoch": 1.396,
"grad_norm": 0.9718666076660156,
"learning_rate": 5.806704186302413e-06,
"loss": 1.1994,
"step": 349
},
{
"epoch": 1.4,
"grad_norm": 0.9991141557693481,
"learning_rate": 5.7853732415749985e-06,
"loss": 1.2379,
"step": 350
},
{
"epoch": 1.404,
"grad_norm": 1.0330079793930054,
"learning_rate": 5.764027631027659e-06,
"loss": 1.3069,
"step": 351
},
{
"epoch": 1.408,
"grad_norm": 1.040109395980835,
"learning_rate": 5.7426677532618e-06,
"loss": 1.2538,
"step": 352
},
{
"epoch": 1.412,
"grad_norm": 0.9935119152069092,
"learning_rate": 5.721294007145256e-06,
"loss": 1.2556,
"step": 353
},
{
"epoch": 1.416,
"grad_norm": 1.0362998247146606,
"learning_rate": 5.69990679180483e-06,
"loss": 1.3782,
"step": 354
},
{
"epoch": 1.42,
"grad_norm": 0.9767175316810608,
"learning_rate": 5.678506506618845e-06,
"loss": 1.2515,
"step": 355
},
{
"epoch": 1.424,
"grad_norm": 1.1435551643371582,
"learning_rate": 5.657093551209687e-06,
"loss": 1.2987,
"step": 356
},
{
"epoch": 1.428,
"grad_norm": 1.121378779411316,
"learning_rate": 5.635668325436343e-06,
"loss": 1.3855,
"step": 357
},
{
"epoch": 1.432,
"grad_norm": 0.9980752468109131,
"learning_rate": 5.614231229386933e-06,
"loss": 1.2792,
"step": 358
},
{
"epoch": 1.436,
"grad_norm": 1.0650511980056763,
"learning_rate": 5.592782663371237e-06,
"loss": 1.3812,
"step": 359
},
{
"epoch": 1.44,
"grad_norm": 1.0521976947784424,
"learning_rate": 5.571323027913221e-06,
"loss": 1.2471,
"step": 360
},
{
"epoch": 1.444,
"grad_norm": 1.0416388511657715,
"learning_rate": 5.549852723743564e-06,
"loss": 1.3824,
"step": 361
},
{
"epoch": 1.448,
"grad_norm": 0.9701342582702637,
"learning_rate": 5.528372151792161e-06,
"loss": 1.2666,
"step": 362
},
{
"epoch": 1.452,
"grad_norm": 1.0128072500228882,
"learning_rate": 5.506881713180652e-06,
"loss": 1.3477,
"step": 363
},
{
"epoch": 1.456,
"grad_norm": 1.0900636911392212,
"learning_rate": 5.485381809214921e-06,
"loss": 1.4304,
"step": 364
},
{
"epoch": 1.46,
"grad_norm": 1.0411545038223267,
"learning_rate": 5.463872841377601e-06,
"loss": 1.3721,
"step": 365
},
{
"epoch": 1.464,
"grad_norm": 1.113276720046997,
"learning_rate": 5.44235521132059e-06,
"loss": 1.4026,
"step": 366
},
{
"epoch": 1.468,
"grad_norm": 1.1396021842956543,
"learning_rate": 5.420829320857532e-06,
"loss": 1.3881,
"step": 367
},
{
"epoch": 1.472,
"grad_norm": 0.9877294301986694,
"learning_rate": 5.39929557195633e-06,
"loss": 1.2725,
"step": 368
},
{
"epoch": 1.476,
"grad_norm": 1.0698078870773315,
"learning_rate": 5.377754366731633e-06,
"loss": 1.382,
"step": 369
},
{
"epoch": 1.48,
"grad_norm": 0.9980776309967041,
"learning_rate": 5.35620610743732e-06,
"loss": 1.2647,
"step": 370
},
{
"epoch": 1.484,
"grad_norm": 1.0515851974487305,
"learning_rate": 5.334651196459003e-06,
"loss": 1.2589,
"step": 371
},
{
"epoch": 1.488,
"grad_norm": 1.0758516788482666,
"learning_rate": 5.3130900363065055e-06,
"loss": 1.3933,
"step": 372
},
{
"epoch": 1.492,
"grad_norm": 1.057415246963501,
"learning_rate": 5.291523029606339e-06,
"loss": 1.2339,
"step": 373
},
{
"epoch": 1.496,
"grad_norm": 1.135566234588623,
"learning_rate": 5.269950579094199e-06,
"loss": 1.4152,
"step": 374
},
{
"epoch": 1.5,
"grad_norm": 0.9283575415611267,
"learning_rate": 5.248373087607434e-06,
"loss": 1.214,
"step": 375
},
{
"epoch": 1.504,
"grad_norm": 0.9984523057937622,
"learning_rate": 5.22679095807753e-06,
"loss": 1.2959,
"step": 376
},
{
"epoch": 1.508,
"grad_norm": 1.0446697473526,
"learning_rate": 5.2052045935225725e-06,
"loss": 1.3313,
"step": 377
},
{
"epoch": 1.512,
"grad_norm": 1.0381207466125488,
"learning_rate": 5.183614397039741e-06,
"loss": 1.2513,
"step": 378
},
{
"epoch": 1.516,
"grad_norm": 1.2146896123886108,
"learning_rate": 5.162020771797768e-06,
"loss": 1.33,
"step": 379
},
{
"epoch": 1.52,
"grad_norm": 1.1130037307739258,
"learning_rate": 5.1404241210294095e-06,
"loss": 1.3833,
"step": 380
},
{
"epoch": 1.524,
"grad_norm": 1.0213799476623535,
"learning_rate": 5.118824848023926e-06,
"loss": 1.3537,
"step": 381
},
{
"epoch": 1.528,
"grad_norm": 0.9932836890220642,
"learning_rate": 5.097223356119538e-06,
"loss": 1.2763,
"step": 382
},
{
"epoch": 1.532,
"grad_norm": 1.0798388719558716,
"learning_rate": 5.07562004869591e-06,
"loss": 1.4094,
"step": 383
},
{
"epoch": 1.536,
"grad_norm": 1.0010972023010254,
"learning_rate": 5.054015329166596e-06,
"loss": 1.1387,
"step": 384
},
{
"epoch": 1.54,
"grad_norm": 1.0118602514266968,
"learning_rate": 5.032409600971533e-06,
"loss": 1.3698,
"step": 385
},
{
"epoch": 1.544,
"grad_norm": 1.062326192855835,
"learning_rate": 5.010803267569483e-06,
"loss": 1.3475,
"step": 386
},
{
"epoch": 1.548,
"grad_norm": 1.0971441268920898,
"learning_rate": 4.989196732430518e-06,
"loss": 1.2936,
"step": 387
},
{
"epoch": 1.552,
"grad_norm": 1.0156275033950806,
"learning_rate": 4.967590399028468e-06,
"loss": 1.3193,
"step": 388
},
{
"epoch": 1.556,
"grad_norm": 1.0805950164794922,
"learning_rate": 4.9459846708334044e-06,
"loss": 1.3318,
"step": 389
},
{
"epoch": 1.56,
"grad_norm": 1.0072593688964844,
"learning_rate": 4.924379951304094e-06,
"loss": 1.2832,
"step": 390
},
{
"epoch": 1.564,
"grad_norm": 1.0336650609970093,
"learning_rate": 4.902776643880461e-06,
"loss": 1.296,
"step": 391
},
{
"epoch": 1.568,
"grad_norm": 1.0790278911590576,
"learning_rate": 4.881175151976075e-06,
"loss": 1.3453,
"step": 392
},
{
"epoch": 1.572,
"grad_norm": 1.0593476295471191,
"learning_rate": 4.859575878970592e-06,
"loss": 1.2425,
"step": 393
},
{
"epoch": 1.576,
"grad_norm": 1.05976140499115,
"learning_rate": 4.837979228202234e-06,
"loss": 1.3116,
"step": 394
},
{
"epoch": 1.58,
"grad_norm": 1.0560014247894287,
"learning_rate": 4.81638560296026e-06,
"loss": 1.3104,
"step": 395
},
{
"epoch": 1.584,
"grad_norm": 1.143796682357788,
"learning_rate": 4.794795406477429e-06,
"loss": 1.3772,
"step": 396
},
{
"epoch": 1.588,
"grad_norm": 1.3215272426605225,
"learning_rate": 4.773209041922472e-06,
"loss": 1.4062,
"step": 397
},
{
"epoch": 1.592,
"grad_norm": 1.0039423704147339,
"learning_rate": 4.7516269123925665e-06,
"loss": 1.2597,
"step": 398
},
{
"epoch": 1.596,
"grad_norm": 1.0940923690795898,
"learning_rate": 4.730049420905801e-06,
"loss": 1.3308,
"step": 399
},
{
"epoch": 1.6,
"grad_norm": 1.047290325164795,
"learning_rate": 4.708476970393662e-06,
"loss": 1.2995,
"step": 400
},
{
"epoch": 1.604,
"grad_norm": 1.112776279449463,
"learning_rate": 4.686909963693498e-06,
"loss": 1.3655,
"step": 401
},
{
"epoch": 1.608,
"grad_norm": 0.8907529711723328,
"learning_rate": 4.6653488035409975e-06,
"loss": 1.1678,
"step": 402
},
{
"epoch": 1.612,
"grad_norm": 1.1391288042068481,
"learning_rate": 4.643793892562682e-06,
"loss": 1.3858,
"step": 403
},
{
"epoch": 1.616,
"grad_norm": 1.0218274593353271,
"learning_rate": 4.622245633268371e-06,
"loss": 1.3211,
"step": 404
},
{
"epoch": 1.62,
"grad_norm": 1.048348069190979,
"learning_rate": 4.60070442804367e-06,
"loss": 1.3769,
"step": 405
},
{
"epoch": 1.624,
"grad_norm": 1.020076870918274,
"learning_rate": 4.5791706791424694e-06,
"loss": 1.3184,
"step": 406
},
{
"epoch": 1.6280000000000001,
"grad_norm": 1.0310548543930054,
"learning_rate": 4.557644788679413e-06,
"loss": 1.1922,
"step": 407
},
{
"epoch": 1.6320000000000001,
"grad_norm": 1.0457245111465454,
"learning_rate": 4.536127158622401e-06,
"loss": 1.2502,
"step": 408
},
{
"epoch": 1.6360000000000001,
"grad_norm": 0.9524244666099548,
"learning_rate": 4.514618190785081e-06,
"loss": 1.2723,
"step": 409
},
{
"epoch": 1.6400000000000001,
"grad_norm": 1.1064085960388184,
"learning_rate": 4.493118286819348e-06,
"loss": 1.379,
"step": 410
},
{
"epoch": 1.6440000000000001,
"grad_norm": 1.084433674812317,
"learning_rate": 4.47162784820784e-06,
"loss": 1.411,
"step": 411
},
{
"epoch": 1.6480000000000001,
"grad_norm": 1.0625848770141602,
"learning_rate": 4.450147276256439e-06,
"loss": 1.4082,
"step": 412
},
{
"epoch": 1.6520000000000001,
"grad_norm": 0.9723708629608154,
"learning_rate": 4.42867697208678e-06,
"loss": 1.2489,
"step": 413
},
{
"epoch": 1.6560000000000001,
"grad_norm": 1.0396859645843506,
"learning_rate": 4.407217336628765e-06,
"loss": 1.1836,
"step": 414
},
{
"epoch": 1.6600000000000001,
"grad_norm": 1.0983256101608276,
"learning_rate": 4.385768770613069e-06,
"loss": 1.3824,
"step": 415
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.939993679523468,
"learning_rate": 4.3643316745636574e-06,
"loss": 1.242,
"step": 416
},
{
"epoch": 1.6680000000000001,
"grad_norm": 1.333936095237732,
"learning_rate": 4.342906448790315e-06,
"loss": 1.2763,
"step": 417
},
{
"epoch": 1.6720000000000002,
"grad_norm": 1.037348747253418,
"learning_rate": 4.321493493381157e-06,
"loss": 1.2989,
"step": 418
},
{
"epoch": 1.6760000000000002,
"grad_norm": 0.913204550743103,
"learning_rate": 4.300093208195171e-06,
"loss": 1.203,
"step": 419
},
{
"epoch": 1.6800000000000002,
"grad_norm": 1.032127857208252,
"learning_rate": 4.278705992854745e-06,
"loss": 1.2121,
"step": 420
},
{
"epoch": 1.6840000000000002,
"grad_norm": 1.0506740808486938,
"learning_rate": 4.257332246738201e-06,
"loss": 1.4022,
"step": 421
},
{
"epoch": 1.688,
"grad_norm": 1.056723713874817,
"learning_rate": 4.235972368972343e-06,
"loss": 1.3774,
"step": 422
},
{
"epoch": 1.692,
"grad_norm": 0.9607878923416138,
"learning_rate": 4.214626758425003e-06,
"loss": 1.2494,
"step": 423
},
{
"epoch": 1.696,
"grad_norm": 0.9732138514518738,
"learning_rate": 4.193295813697587e-06,
"loss": 1.2858,
"step": 424
},
{
"epoch": 1.7,
"grad_norm": 1.030493140220642,
"learning_rate": 4.171979933117641e-06,
"loss": 1.4126,
"step": 425
},
{
"epoch": 1.704,
"grad_norm": 0.9867613911628723,
"learning_rate": 4.150679514731405e-06,
"loss": 1.3348,
"step": 426
},
{
"epoch": 1.708,
"grad_norm": 0.9390940070152283,
"learning_rate": 4.12939495629638e-06,
"loss": 1.1345,
"step": 427
},
{
"epoch": 1.712,
"grad_norm": 1.0708876848220825,
"learning_rate": 4.108126655273912e-06,
"loss": 1.2774,
"step": 428
},
{
"epoch": 1.716,
"grad_norm": 1.0339386463165283,
"learning_rate": 4.086875008821752e-06,
"loss": 1.3666,
"step": 429
},
{
"epoch": 1.72,
"grad_norm": 1.047210693359375,
"learning_rate": 4.06564041378665e-06,
"loss": 1.3901,
"step": 430
},
{
"epoch": 1.724,
"grad_norm": 1.0303080081939697,
"learning_rate": 4.04442326669695e-06,
"loss": 1.2938,
"step": 431
},
{
"epoch": 1.728,
"grad_norm": 1.1503710746765137,
"learning_rate": 4.023223963755168e-06,
"loss": 1.4221,
"step": 432
},
{
"epoch": 1.732,
"grad_norm": 1.0227370262145996,
"learning_rate": 4.002042900830613e-06,
"loss": 1.3698,
"step": 433
},
{
"epoch": 1.736,
"grad_norm": 1.0578244924545288,
"learning_rate": 3.980880473451982e-06,
"loss": 1.2995,
"step": 434
},
{
"epoch": 1.74,
"grad_norm": 0.9922149181365967,
"learning_rate": 3.959737076799974e-06,
"loss": 1.2021,
"step": 435
},
{
"epoch": 1.744,
"grad_norm": 0.9406126141548157,
"learning_rate": 3.93861310569992e-06,
"loss": 1.1236,
"step": 436
},
{
"epoch": 1.748,
"grad_norm": 0.9856680631637573,
"learning_rate": 3.917508954614401e-06,
"loss": 1.2002,
"step": 437
},
{
"epoch": 1.752,
"grad_norm": 1.0986454486846924,
"learning_rate": 3.896425017635884e-06,
"loss": 1.3789,
"step": 438
},
{
"epoch": 1.756,
"grad_norm": 1.0303860902786255,
"learning_rate": 3.875361688479367e-06,
"loss": 1.3061,
"step": 439
},
{
"epoch": 1.76,
"grad_norm": 1.013808012008667,
"learning_rate": 3.854319360475022e-06,
"loss": 1.2924,
"step": 440
},
{
"epoch": 1.764,
"grad_norm": 1.0553696155548096,
"learning_rate": 3.833298426560851e-06,
"loss": 1.3652,
"step": 441
},
{
"epoch": 1.768,
"grad_norm": 1.0112295150756836,
"learning_rate": 3.8122992792753534e-06,
"loss": 1.3082,
"step": 442
},
{
"epoch": 1.772,
"grad_norm": 1.0582830905914307,
"learning_rate": 3.7913223107501847e-06,
"loss": 1.3769,
"step": 443
},
{
"epoch": 1.776,
"grad_norm": 1.0347267389297485,
"learning_rate": 3.7703679127028497e-06,
"loss": 1.425,
"step": 444
},
{
"epoch": 1.78,
"grad_norm": 1.0205029249191284,
"learning_rate": 3.7494364764293722e-06,
"loss": 1.3091,
"step": 445
},
{
"epoch": 1.784,
"grad_norm": 1.0478813648223877,
"learning_rate": 3.728528392796995e-06,
"loss": 1.341,
"step": 446
},
{
"epoch": 1.788,
"grad_norm": 0.9804567694664001,
"learning_rate": 3.707644052236887e-06,
"loss": 1.2878,
"step": 447
},
{
"epoch": 1.792,
"grad_norm": 1.0110281705856323,
"learning_rate": 3.6867838447368414e-06,
"loss": 1.2472,
"step": 448
},
{
"epoch": 1.796,
"grad_norm": 1.0572271347045898,
"learning_rate": 3.6659481598339952e-06,
"loss": 1.3878,
"step": 449
},
{
"epoch": 1.8,
"grad_norm": 1.082832932472229,
"learning_rate": 3.6451373866075657e-06,
"loss": 1.366,
"step": 450
},
{
"epoch": 1.804,
"grad_norm": 1.0754969120025635,
"learning_rate": 3.624351913671571e-06,
"loss": 1.3644,
"step": 451
},
{
"epoch": 1.808,
"grad_norm": 1.0205718278884888,
"learning_rate": 3.6035921291675815e-06,
"loss": 1.2874,
"step": 452
},
{
"epoch": 1.812,
"grad_norm": 1.0364340543746948,
"learning_rate": 3.5828584207574698e-06,
"loss": 1.313,
"step": 453
},
{
"epoch": 1.8159999999999998,
"grad_norm": 1.0769734382629395,
"learning_rate": 3.5621511756161686e-06,
"loss": 1.3368,
"step": 454
},
{
"epoch": 1.8199999999999998,
"grad_norm": 1.0174931287765503,
"learning_rate": 3.54147078042445e-06,
"loss": 1.2763,
"step": 455
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.94427889585495,
"learning_rate": 3.520817621361693e-06,
"loss": 1.2327,
"step": 456
},
{
"epoch": 1.8279999999999998,
"grad_norm": 0.9798959493637085,
"learning_rate": 3.500192084098677e-06,
"loss": 1.2847,
"step": 457
},
{
"epoch": 1.8319999999999999,
"grad_norm": 0.9966975450515747,
"learning_rate": 3.4795945537903852e-06,
"loss": 1.2827,
"step": 458
},
{
"epoch": 1.8359999999999999,
"grad_norm": 1.0126770734786987,
"learning_rate": 3.459025415068804e-06,
"loss": 1.3422,
"step": 459
},
{
"epoch": 1.8399999999999999,
"grad_norm": 1.0910636186599731,
"learning_rate": 3.4384850520357416e-06,
"loss": 1.3603,
"step": 460
},
{
"epoch": 1.8439999999999999,
"grad_norm": 0.9798678755760193,
"learning_rate": 3.4179738482556648e-06,
"loss": 1.2606,
"step": 461
},
{
"epoch": 1.8479999999999999,
"grad_norm": 1.047108769416809,
"learning_rate": 3.3974921867485238e-06,
"loss": 1.2898,
"step": 462
},
{
"epoch": 1.8519999999999999,
"grad_norm": 1.0191044807434082,
"learning_rate": 3.377040449982604e-06,
"loss": 1.1954,
"step": 463
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.9370830655097961,
"learning_rate": 3.356619019867394e-06,
"loss": 1.219,
"step": 464
},
{
"epoch": 1.8599999999999999,
"grad_norm": 0.9407516121864319,
"learning_rate": 3.336228277746435e-06,
"loss": 1.2451,
"step": 465
},
{
"epoch": 1.8639999999999999,
"grad_norm": 0.9665749669075012,
"learning_rate": 3.3158686043902166e-06,
"loss": 1.2389,
"step": 466
},
{
"epoch": 1.8679999999999999,
"grad_norm": 1.0265816450119019,
"learning_rate": 3.2955403799890567e-06,
"loss": 1.3495,
"step": 467
},
{
"epoch": 1.8719999999999999,
"grad_norm": 1.0662939548492432,
"learning_rate": 3.2752439841460063e-06,
"loss": 1.3986,
"step": 468
},
{
"epoch": 1.876,
"grad_norm": 1.049228549003601,
"learning_rate": 3.254979795869761e-06,
"loss": 1.3497,
"step": 469
},
{
"epoch": 1.88,
"grad_norm": 1.0802751779556274,
"learning_rate": 3.234748193567579e-06,
"loss": 1.4039,
"step": 470
},
{
"epoch": 1.884,
"grad_norm": 0.9889898300170898,
"learning_rate": 3.214549555038218e-06,
"loss": 1.2641,
"step": 471
},
{
"epoch": 1.888,
"grad_norm": 0.987068235874176,
"learning_rate": 3.194384257464884e-06,
"loss": 1.2589,
"step": 472
},
{
"epoch": 1.892,
"grad_norm": 1.0221607685089111,
"learning_rate": 3.1742526774081822e-06,
"loss": 1.2261,
"step": 473
},
{
"epoch": 1.896,
"grad_norm": 1.0248719453811646,
"learning_rate": 3.154155190799084e-06,
"loss": 1.2705,
"step": 474
},
{
"epoch": 1.9,
"grad_norm": 1.0018503665924072,
"learning_rate": 3.1340921729319173e-06,
"loss": 1.3734,
"step": 475
},
{
"epoch": 1.904,
"grad_norm": 0.9709426760673523,
"learning_rate": 3.1140639984573428e-06,
"loss": 1.2281,
"step": 476
},
{
"epoch": 1.908,
"grad_norm": 1.056033968925476,
"learning_rate": 3.094071041375375e-06,
"loss": 1.3789,
"step": 477
},
{
"epoch": 1.912,
"grad_norm": 1.0520853996276855,
"learning_rate": 3.0741136750283816e-06,
"loss": 1.4122,
"step": 478
},
{
"epoch": 1.916,
"grad_norm": 1.0225236415863037,
"learning_rate": 3.054192272094125e-06,
"loss": 1.2237,
"step": 479
},
{
"epoch": 1.92,
"grad_norm": 1.04264235496521,
"learning_rate": 3.0343072045787956e-06,
"loss": 1.1914,
"step": 480
},
{
"epoch": 1.924,
"grad_norm": 1.0835708379745483,
"learning_rate": 3.0144588438100693e-06,
"loss": 1.3565,
"step": 481
},
{
"epoch": 1.928,
"grad_norm": 1.0434730052947998,
"learning_rate": 2.994647560430167e-06,
"loss": 1.2601,
"step": 482
},
{
"epoch": 1.932,
"grad_norm": 0.9777680039405823,
"learning_rate": 2.974873724388945e-06,
"loss": 1.2813,
"step": 483
},
{
"epoch": 1.936,
"grad_norm": 1.076163411140442,
"learning_rate": 2.955137704936971e-06,
"loss": 1.3366,
"step": 484
},
{
"epoch": 1.94,
"grad_norm": 1.0271539688110352,
"learning_rate": 2.9354398706186427e-06,
"loss": 1.3005,
"step": 485
},
{
"epoch": 1.944,
"grad_norm": 1.1193921566009521,
"learning_rate": 2.915780589265301e-06,
"loss": 1.3266,
"step": 486
},
{
"epoch": 1.948,
"grad_norm": 1.030122995376587,
"learning_rate": 2.896160227988357e-06,
"loss": 1.2339,
"step": 487
},
{
"epoch": 1.952,
"grad_norm": 1.071089267730713,
"learning_rate": 2.876579153172441e-06,
"loss": 1.2899,
"step": 488
},
{
"epoch": 1.956,
"grad_norm": 0.9699506759643555,
"learning_rate": 2.8570377304685627e-06,
"loss": 1.256,
"step": 489
},
{
"epoch": 1.96,
"grad_norm": 0.9563548564910889,
"learning_rate": 2.8375363247872756e-06,
"loss": 1.2437,
"step": 490
},
{
"epoch": 1.964,
"grad_norm": 0.9940220713615417,
"learning_rate": 2.8180753002918735e-06,
"loss": 1.3382,
"step": 491
},
{
"epoch": 1.968,
"grad_norm": 0.9009513258934021,
"learning_rate": 2.7986550203915807e-06,
"loss": 1.1687,
"step": 492
},
{
"epoch": 1.972,
"grad_norm": 1.0347154140472412,
"learning_rate": 2.779275847734766e-06,
"loss": 1.3601,
"step": 493
},
{
"epoch": 1.976,
"grad_norm": 0.9507591128349304,
"learning_rate": 2.7599381442021833e-06,
"loss": 1.2592,
"step": 494
},
{
"epoch": 1.98,
"grad_norm": 1.239804983139038,
"learning_rate": 2.7406422709001956e-06,
"loss": 1.3886,
"step": 495
},
{
"epoch": 1.984,
"grad_norm": 1.0007840394973755,
"learning_rate": 2.721388588154045e-06,
"loss": 1.3827,
"step": 496
},
{
"epoch": 1.988,
"grad_norm": 0.930361807346344,
"learning_rate": 2.7021774555011214e-06,
"loss": 1.2384,
"step": 497
},
{
"epoch": 1.992,
"grad_norm": 0.9760609865188599,
"learning_rate": 2.6830092316842448e-06,
"loss": 1.3209,
"step": 498
},
{
"epoch": 1.996,
"grad_norm": 0.9475801587104797,
"learning_rate": 2.6638842746449672e-06,
"loss": 1.2661,
"step": 499
},
{
"epoch": 2.0,
"grad_norm": 1.0552971363067627,
"learning_rate": 2.6448029415168964e-06,
"loss": 1.517,
"step": 500
},
{
"epoch": 2.004,
"grad_norm": 0.9212646484375,
"learning_rate": 2.6257655886190147e-06,
"loss": 1.0196,
"step": 501
},
{
"epoch": 2.008,
"grad_norm": 1.021751880645752,
"learning_rate": 2.6067725714490307e-06,
"loss": 1.2333,
"step": 502
},
{
"epoch": 2.012,
"grad_norm": 1.0663317441940308,
"learning_rate": 2.5878242446767466e-06,
"loss": 1.2303,
"step": 503
},
{
"epoch": 2.016,
"grad_norm": 0.9968892335891724,
"learning_rate": 2.5689209621374257e-06,
"loss": 1.2098,
"step": 504
},
{
"epoch": 2.02,
"grad_norm": 1.1156091690063477,
"learning_rate": 2.5500630768251895e-06,
"loss": 1.2579,
"step": 505
},
{
"epoch": 2.024,
"grad_norm": 1.05716073513031,
"learning_rate": 2.5312509408864248e-06,
"loss": 1.2526,
"step": 506
},
{
"epoch": 2.028,
"grad_norm": 0.8975000977516174,
"learning_rate": 2.5124849056132094e-06,
"loss": 1.0011,
"step": 507
},
{
"epoch": 2.032,
"grad_norm": 1.0314711332321167,
"learning_rate": 2.493765321436755e-06,
"loss": 1.1876,
"step": 508
},
{
"epoch": 2.036,
"grad_norm": 0.9085214734077454,
"learning_rate": 2.475092537920853e-06,
"loss": 1.046,
"step": 509
},
{
"epoch": 2.04,
"grad_norm": 0.9011939764022827,
"learning_rate": 2.456466903755357e-06,
"loss": 1.0411,
"step": 510
},
{
"epoch": 2.044,
"grad_norm": 0.9715794920921326,
"learning_rate": 2.4378887667496696e-06,
"loss": 1.1214,
"step": 511
},
{
"epoch": 2.048,
"grad_norm": 1.0464131832122803,
"learning_rate": 2.4193584738262426e-06,
"loss": 1.1207,
"step": 512
},
{
"epoch": 2.052,
"grad_norm": 0.9019450545310974,
"learning_rate": 2.400876371014107e-06,
"loss": 1.092,
"step": 513
},
{
"epoch": 2.056,
"grad_norm": 0.9838923811912537,
"learning_rate": 2.3824428034424e-06,
"loss": 1.1026,
"step": 514
},
{
"epoch": 2.06,
"grad_norm": 0.9204623103141785,
"learning_rate": 2.3640581153339293e-06,
"loss": 1.053,
"step": 515
},
{
"epoch": 2.064,
"grad_norm": 1.0160547494888306,
"learning_rate": 2.3457226499987456e-06,
"loss": 1.165,
"step": 516
},
{
"epoch": 2.068,
"grad_norm": 1.0242973566055298,
"learning_rate": 2.3274367498277246e-06,
"loss": 1.206,
"step": 517
},
{
"epoch": 2.072,
"grad_norm": 0.9955620765686035,
"learning_rate": 2.3092007562861756e-06,
"loss": 1.1202,
"step": 518
},
{
"epoch": 2.076,
"grad_norm": 0.8941048979759216,
"learning_rate": 2.291015009907474e-06,
"loss": 1.0354,
"step": 519
},
{
"epoch": 2.08,
"grad_norm": 1.1226130723953247,
"learning_rate": 2.2728798502866887e-06,
"loss": 1.1285,
"step": 520
},
{
"epoch": 2.084,
"grad_norm": 1.056740164756775,
"learning_rate": 2.2547956160742473e-06,
"loss": 1.2142,
"step": 521
},
{
"epoch": 2.088,
"grad_norm": 1.0686098337173462,
"learning_rate": 2.2367626449696168e-06,
"loss": 1.1753,
"step": 522
},
{
"epoch": 2.092,
"grad_norm": 1.008289098739624,
"learning_rate": 2.2187812737149856e-06,
"loss": 1.0863,
"step": 523
},
{
"epoch": 2.096,
"grad_norm": 0.9350313544273376,
"learning_rate": 2.2008518380889892e-06,
"loss": 1.1109,
"step": 524
},
{
"epoch": 2.1,
"grad_norm": 1.014533519744873,
"learning_rate": 2.182974672900428e-06,
"loss": 1.1973,
"step": 525
},
{
"epoch": 2.104,
"grad_norm": 0.9320296049118042,
"learning_rate": 2.1651501119820212e-06,
"loss": 1.0792,
"step": 526
},
{
"epoch": 2.108,
"grad_norm": 1.096917986869812,
"learning_rate": 2.1473784881841753e-06,
"loss": 1.149,
"step": 527
},
{
"epoch": 2.112,
"grad_norm": 1.0773597955703735,
"learning_rate": 2.129660133368761e-06,
"loss": 1.1394,
"step": 528
},
{
"epoch": 2.116,
"grad_norm": 1.0159255266189575,
"learning_rate": 2.1119953784029207e-06,
"loss": 1.1256,
"step": 529
},
{
"epoch": 2.12,
"grad_norm": 0.973598301410675,
"learning_rate": 2.0943845531528932e-06,
"loss": 1.158,
"step": 530
},
{
"epoch": 2.124,
"grad_norm": 0.9755054712295532,
"learning_rate": 2.0768279864778475e-06,
"loss": 1.098,
"step": 531
},
{
"epoch": 2.128,
"grad_norm": 0.9478545784950256,
"learning_rate": 2.059326006223743e-06,
"loss": 1.1034,
"step": 532
},
{
"epoch": 2.132,
"grad_norm": 1.013749599456787,
"learning_rate": 2.0418789392172113e-06,
"loss": 1.1003,
"step": 533
},
{
"epoch": 2.136,
"grad_norm": 0.9972010254859924,
"learning_rate": 2.0244871112594523e-06,
"loss": 1.1003,
"step": 534
},
{
"epoch": 2.14,
"grad_norm": 1.010685682296753,
"learning_rate": 2.007150847120145e-06,
"loss": 1.1207,
"step": 535
},
{
"epoch": 2.144,
"grad_norm": 1.0307255983352661,
"learning_rate": 1.98987047053139e-06,
"loss": 1.14,
"step": 536
},
{
"epoch": 2.148,
"grad_norm": 1.0822032690048218,
"learning_rate": 1.972646304181656e-06,
"loss": 1.1319,
"step": 537
},
{
"epoch": 2.152,
"grad_norm": 0.954828679561615,
"learning_rate": 1.9554786697097668e-06,
"loss": 1.1214,
"step": 538
},
{
"epoch": 2.156,
"grad_norm": 0.9321712255477905,
"learning_rate": 1.9383678876988797e-06,
"loss": 1.0817,
"step": 539
},
{
"epoch": 2.16,
"grad_norm": 1.0241217613220215,
"learning_rate": 1.921314277670509e-06,
"loss": 1.1557,
"step": 540
},
{
"epoch": 2.164,
"grad_norm": 0.9904546141624451,
"learning_rate": 1.9043181580785597e-06,
"loss": 1.0604,
"step": 541
},
{
"epoch": 2.168,
"grad_norm": 1.0958540439605713,
"learning_rate": 1.8873798463033742e-06,
"loss": 1.1658,
"step": 542
},
{
"epoch": 2.172,
"grad_norm": 1.0296180248260498,
"learning_rate": 1.870499658645809e-06,
"loss": 1.1648,
"step": 543
},
{
"epoch": 2.176,
"grad_norm": 0.9077207446098328,
"learning_rate": 1.8536779103213336e-06,
"loss": 1.0262,
"step": 544
},
{
"epoch": 2.18,
"grad_norm": 0.9854039549827576,
"learning_rate": 1.8369149154541333e-06,
"loss": 1.237,
"step": 545
},
{
"epoch": 2.184,
"grad_norm": 1.0560005903244019,
"learning_rate": 1.8202109870712542e-06,
"loss": 1.2776,
"step": 546
},
{
"epoch": 2.188,
"grad_norm": 1.0411219596862793,
"learning_rate": 1.8035664370967493e-06,
"loss": 1.1759,
"step": 547
},
{
"epoch": 2.192,
"grad_norm": 1.038360357284546,
"learning_rate": 1.7869815763458576e-06,
"loss": 1.0802,
"step": 548
},
{
"epoch": 2.196,
"grad_norm": 1.0684030055999756,
"learning_rate": 1.7704567145192036e-06,
"loss": 1.1016,
"step": 549
},
{
"epoch": 2.2,
"grad_norm": 1.0059784650802612,
"learning_rate": 1.753992160197006e-06,
"loss": 1.0647,
"step": 550
},
{
"epoch": 2.204,
"grad_norm": 1.005402684211731,
"learning_rate": 1.73758822083332e-06,
"loss": 1.0965,
"step": 551
},
{
"epoch": 2.208,
"grad_norm": 1.0324487686157227,
"learning_rate": 1.721245202750299e-06,
"loss": 1.1498,
"step": 552
},
{
"epoch": 2.212,
"grad_norm": 0.966227650642395,
"learning_rate": 1.7049634111324687e-06,
"loss": 1.1032,
"step": 553
},
{
"epoch": 2.216,
"grad_norm": 1.0336990356445312,
"learning_rate": 1.6887431500210272e-06,
"loss": 1.1718,
"step": 554
},
{
"epoch": 2.22,
"grad_norm": 0.9618580937385559,
"learning_rate": 1.6725847223081776e-06,
"loss": 1.0375,
"step": 555
},
{
"epoch": 2.224,
"grad_norm": 0.9805987477302551,
"learning_rate": 1.6564884297314593e-06,
"loss": 1.1808,
"step": 556
},
{
"epoch": 2.228,
"grad_norm": 1.0079127550125122,
"learning_rate": 1.6404545728681232e-06,
"loss": 1.1657,
"step": 557
},
{
"epoch": 2.232,
"grad_norm": 1.0422035455703735,
"learning_rate": 1.624483451129512e-06,
"loss": 1.1185,
"step": 558
},
{
"epoch": 2.2359999999999998,
"grad_norm": 1.0370445251464844,
"learning_rate": 1.6085753627554728e-06,
"loss": 1.1599,
"step": 559
},
{
"epoch": 2.24,
"grad_norm": 1.0785090923309326,
"learning_rate": 1.5927306048087855e-06,
"loss": 1.2284,
"step": 560
},
{
"epoch": 2.2439999999999998,
"grad_norm": 1.0310076475143433,
"learning_rate": 1.5769494731696206e-06,
"loss": 1.1048,
"step": 561
},
{
"epoch": 2.248,
"grad_norm": 1.0009162425994873,
"learning_rate": 1.5612322625300064e-06,
"loss": 1.0906,
"step": 562
},
{
"epoch": 2.252,
"grad_norm": 0.9988439679145813,
"learning_rate": 1.5455792663883329e-06,
"loss": 1.1897,
"step": 563
},
{
"epoch": 2.2560000000000002,
"grad_norm": 0.971535325050354,
"learning_rate": 1.529990777043866e-06,
"loss": 1.0518,
"step": 564
},
{
"epoch": 2.26,
"grad_norm": 1.0020225048065186,
"learning_rate": 1.5144670855912908e-06,
"loss": 1.1513,
"step": 565
},
{
"epoch": 2.2640000000000002,
"grad_norm": 0.9475902915000916,
"learning_rate": 1.499008481915281e-06,
"loss": 1.0735,
"step": 566
},
{
"epoch": 2.268,
"grad_norm": 1.0750163793563843,
"learning_rate": 1.483615254685075e-06,
"loss": 1.186,
"step": 567
},
{
"epoch": 2.2720000000000002,
"grad_norm": 1.0703922510147095,
"learning_rate": 1.4682876913490973e-06,
"loss": 1.2119,
"step": 568
},
{
"epoch": 2.276,
"grad_norm": 1.05789053440094,
"learning_rate": 1.4530260781295813e-06,
"loss": 1.1711,
"step": 569
},
{
"epoch": 2.2800000000000002,
"grad_norm": 0.971713662147522,
"learning_rate": 1.437830700017226e-06,
"loss": 1.1174,
"step": 570
},
{
"epoch": 2.284,
"grad_norm": 0.9409313201904297,
"learning_rate": 1.4227018407658822e-06,
"loss": 1.1378,
"step": 571
},
{
"epoch": 2.288,
"grad_norm": 0.9503111243247986,
"learning_rate": 1.4076397828872441e-06,
"loss": 1.0972,
"step": 572
},
{
"epoch": 2.292,
"grad_norm": 0.9956273436546326,
"learning_rate": 1.392644807645575e-06,
"loss": 1.1251,
"step": 573
},
{
"epoch": 2.296,
"grad_norm": 0.9246674180030823,
"learning_rate": 1.3777171950524648e-06,
"loss": 1.0709,
"step": 574
},
{
"epoch": 2.3,
"grad_norm": 1.0414146184921265,
"learning_rate": 1.3628572238615878e-06,
"loss": 1.1007,
"step": 575
},
{
"epoch": 2.304,
"grad_norm": 1.0711561441421509,
"learning_rate": 1.3480651715635035e-06,
"loss": 1.1467,
"step": 576
},
{
"epoch": 2.308,
"grad_norm": 1.0547200441360474,
"learning_rate": 1.333341314380479e-06,
"loss": 1.1004,
"step": 577
},
{
"epoch": 2.312,
"grad_norm": 1.016107439994812,
"learning_rate": 1.3186859272613222e-06,
"loss": 1.1453,
"step": 578
},
{
"epoch": 2.316,
"grad_norm": 1.0614742040634155,
"learning_rate": 1.3040992838762562e-06,
"loss": 1.2197,
"step": 579
},
{
"epoch": 2.32,
"grad_norm": 0.9451265335083008,
"learning_rate": 1.2895816566118014e-06,
"loss": 0.9909,
"step": 580
},
{
"epoch": 2.324,
"grad_norm": 1.0505961179733276,
"learning_rate": 1.275133316565691e-06,
"loss": 1.1956,
"step": 581
},
{
"epoch": 2.328,
"grad_norm": 1.0458132028579712,
"learning_rate": 1.2607545335418154e-06,
"loss": 1.1305,
"step": 582
},
{
"epoch": 2.332,
"grad_norm": 0.9635487198829651,
"learning_rate": 1.2464455760451733e-06,
"loss": 1.0698,
"step": 583
},
{
"epoch": 2.336,
"grad_norm": 0.8457169532775879,
"learning_rate": 1.2322067112768632e-06,
"loss": 0.9367,
"step": 584
},
{
"epoch": 2.34,
"grad_norm": 1.0260626077651978,
"learning_rate": 1.2180382051290974e-06,
"loss": 1.0588,
"step": 585
},
{
"epoch": 2.344,
"grad_norm": 0.9955061078071594,
"learning_rate": 1.2039403221802297e-06,
"loss": 1.0821,
"step": 586
},
{
"epoch": 2.348,
"grad_norm": 0.973564624786377,
"learning_rate": 1.189913325689816e-06,
"loss": 1.0893,
"step": 587
},
{
"epoch": 2.352,
"grad_norm": 1.0348793268203735,
"learning_rate": 1.175957477593706e-06,
"loss": 1.063,
"step": 588
},
{
"epoch": 2.356,
"grad_norm": 1.0029393434524536,
"learning_rate": 1.1620730384991407e-06,
"loss": 1.084,
"step": 589
},
{
"epoch": 2.36,
"grad_norm": 1.012460708618164,
"learning_rate": 1.1482602676798933e-06,
"loss": 1.0967,
"step": 590
},
{
"epoch": 2.364,
"grad_norm": 0.9073032736778259,
"learning_rate": 1.1345194230714235e-06,
"loss": 1.0183,
"step": 591
},
{
"epoch": 2.368,
"grad_norm": 1.0121479034423828,
"learning_rate": 1.120850761266068e-06,
"loss": 1.1363,
"step": 592
},
{
"epoch": 2.372,
"grad_norm": 1.0043262243270874,
"learning_rate": 1.107254537508239e-06,
"loss": 1.1416,
"step": 593
},
{
"epoch": 2.376,
"grad_norm": 1.0971993207931519,
"learning_rate": 1.0937310056896643e-06,
"loss": 1.1151,
"step": 594
},
{
"epoch": 2.38,
"grad_norm": 0.9937849044799805,
"learning_rate": 1.080280418344643e-06,
"loss": 1.1716,
"step": 595
},
{
"epoch": 2.384,
"grad_norm": 0.9772449135780334,
"learning_rate": 1.0669030266453367e-06,
"loss": 1.0893,
"step": 596
},
{
"epoch": 2.388,
"grad_norm": 1.0142498016357422,
"learning_rate": 1.053599080397068e-06,
"loss": 1.1527,
"step": 597
},
{
"epoch": 2.392,
"grad_norm": 1.081161618232727,
"learning_rate": 1.0403688280336626e-06,
"loss": 1.1253,
"step": 598
},
{
"epoch": 2.396,
"grad_norm": 1.1796064376831055,
"learning_rate": 1.027212516612814e-06,
"loss": 1.2003,
"step": 599
},
{
"epoch": 2.4,
"grad_norm": 1.039437174797058,
"learning_rate": 1.014130391811457e-06,
"loss": 1.1491,
"step": 600
},
{
"epoch": 2.404,
"grad_norm": 1.0628715753555298,
"learning_rate": 1.001122697921197e-06,
"loss": 1.2069,
"step": 601
},
{
"epoch": 2.408,
"grad_norm": 1.0313056707382202,
"learning_rate": 9.881896778437328e-07,
"loss": 1.1649,
"step": 602
},
{
"epoch": 2.412,
"grad_norm": 1.0078644752502441,
"learning_rate": 9.753315730863284e-07,
"loss": 1.1739,
"step": 603
},
{
"epoch": 2.416,
"grad_norm": 1.0151255130767822,
"learning_rate": 9.625486237573046e-07,
"loss": 1.1084,
"step": 604
},
{
"epoch": 2.42,
"grad_norm": 0.9998011589050293,
"learning_rate": 9.498410685615511e-07,
"loss": 1.1366,
"step": 605
},
{
"epoch": 2.424,
"grad_norm": 0.9374569058418274,
"learning_rate": 9.372091447960685e-07,
"loss": 1.0163,
"step": 606
},
{
"epoch": 2.428,
"grad_norm": 1.0558018684387207,
"learning_rate": 9.24653088345544e-07,
"loss": 1.169,
"step": 607
},
{
"epoch": 2.432,
"grad_norm": 1.0993043184280396,
"learning_rate": 9.121731336779377e-07,
"loss": 1.1611,
"step": 608
},
{
"epoch": 2.436,
"grad_norm": 0.933655321598053,
"learning_rate": 8.99769513840108e-07,
"loss": 1.0638,
"step": 609
},
{
"epoch": 2.44,
"grad_norm": 1.012734293937683,
"learning_rate": 8.874424604534643e-07,
"loss": 1.1495,
"step": 610
},
{
"epoch": 2.444,
"grad_norm": 0.9672722816467285,
"learning_rate": 8.751922037096328e-07,
"loss": 1.1275,
"step": 611
},
{
"epoch": 2.448,
"grad_norm": 0.990340530872345,
"learning_rate": 8.630189723661663e-07,
"loss": 1.1968,
"step": 612
},
{
"epoch": 2.452,
"grad_norm": 0.9619189500808716,
"learning_rate": 8.509229937422664e-07,
"loss": 1.0906,
"step": 613
},
{
"epoch": 2.456,
"grad_norm": 0.9421491026878357,
"learning_rate": 8.389044937145397e-07,
"loss": 1.0867,
"step": 614
},
{
"epoch": 2.46,
"grad_norm": 1.013745903968811,
"learning_rate": 8.269636967127864e-07,
"loss": 1.1615,
"step": 615
},
{
"epoch": 2.464,
"grad_norm": 1.1921918392181396,
"learning_rate": 8.151008257158e-07,
"loss": 1.1394,
"step": 616
},
{
"epoch": 2.468,
"grad_norm": 1.0029451847076416,
"learning_rate": 8.033161022472063e-07,
"loss": 1.1518,
"step": 617
},
{
"epoch": 2.472,
"grad_norm": 0.956015944480896,
"learning_rate": 7.916097463713335e-07,
"loss": 1.0743,
"step": 618
},
{
"epoch": 2.476,
"grad_norm": 1.0313812494277954,
"learning_rate": 7.799819766890926e-07,
"loss": 1.1318,
"step": 619
},
{
"epoch": 2.48,
"grad_norm": 0.9998788237571716,
"learning_rate": 7.684330103339016e-07,
"loss": 1.1073,
"step": 620
},
{
"epoch": 2.484,
"grad_norm": 1.022858738899231,
"learning_rate": 7.569630629676294e-07,
"loss": 1.1129,
"step": 621
},
{
"epoch": 2.488,
"grad_norm": 0.9638151526451111,
"learning_rate": 7.455723487765664e-07,
"loss": 1.1429,
"step": 622
},
{
"epoch": 2.492,
"grad_norm": 1.0429561138153076,
"learning_rate": 7.342610804674316e-07,
"loss": 1.1634,
"step": 623
},
{
"epoch": 2.496,
"grad_norm": 1.027001142501831,
"learning_rate": 7.230294692633922e-07,
"loss": 1.157,
"step": 624
},
{
"epoch": 2.5,
"grad_norm": 1.0432196855545044,
"learning_rate": 7.118777249001213e-07,
"loss": 1.1634,
"step": 625
},
{
"epoch": 2.504,
"grad_norm": 1.0033257007598877,
"learning_rate": 7.008060556218893e-07,
"loss": 1.1552,
"step": 626
},
{
"epoch": 2.508,
"grad_norm": 1.0317420959472656,
"learning_rate": 6.898146681776629e-07,
"loss": 1.07,
"step": 627
},
{
"epoch": 2.512,
"grad_norm": 0.9765068888664246,
"learning_rate": 6.789037678172522e-07,
"loss": 1.2058,
"step": 628
},
{
"epoch": 2.516,
"grad_norm": 0.9833831787109375,
"learning_rate": 6.680735582874781e-07,
"loss": 1.1085,
"step": 629
},
{
"epoch": 2.52,
"grad_norm": 1.0484850406646729,
"learning_rate": 6.573242418283632e-07,
"loss": 1.1234,
"step": 630
},
{
"epoch": 2.524,
"grad_norm": 0.9526020884513855,
"learning_rate": 6.466560191693566e-07,
"loss": 1.0921,
"step": 631
},
{
"epoch": 2.528,
"grad_norm": 0.9717855453491211,
"learning_rate": 6.360690895255916e-07,
"loss": 1.038,
"step": 632
},
{
"epoch": 2.532,
"grad_norm": 0.9675845503807068,
"learning_rate": 6.255636505941548e-07,
"loss": 1.1378,
"step": 633
},
{
"epoch": 2.536,
"grad_norm": 0.9222082495689392,
"learning_rate": 6.151398985504043e-07,
"loss": 1.0006,
"step": 634
},
{
"epoch": 2.54,
"grad_norm": 1.0398544073104858,
"learning_rate": 6.047980280443e-07,
"loss": 1.2004,
"step": 635
},
{
"epoch": 2.544,
"grad_norm": 0.9954432845115662,
"learning_rate": 5.945382321967696e-07,
"loss": 1.1346,
"step": 636
},
{
"epoch": 2.548,
"grad_norm": 1.0304733514785767,
"learning_rate": 5.84360702596109e-07,
"loss": 1.1619,
"step": 637
},
{
"epoch": 2.552,
"grad_norm": 1.0341770648956299,
"learning_rate": 5.742656292943943e-07,
"loss": 1.2535,
"step": 638
},
{
"epoch": 2.556,
"grad_norm": 0.9743322134017944,
"learning_rate": 5.642532008039392e-07,
"loss": 1.1206,
"step": 639
},
{
"epoch": 2.56,
"grad_norm": 1.0595154762268066,
"learning_rate": 5.543236040937744e-07,
"loss": 1.1708,
"step": 640
},
{
"epoch": 2.564,
"grad_norm": 1.0400642156600952,
"learning_rate": 5.444770245861553e-07,
"loss": 1.1639,
"step": 641
},
{
"epoch": 2.568,
"grad_norm": 0.8947452902793884,
"learning_rate": 5.347136461530966e-07,
"loss": 1.1171,
"step": 642
},
{
"epoch": 2.572,
"grad_norm": 0.9468121528625488,
"learning_rate": 5.250336511129462e-07,
"loss": 1.0481,
"step": 643
},
{
"epoch": 2.576,
"grad_norm": 0.9436845183372498,
"learning_rate": 5.15437220226972e-07,
"loss": 1.1367,
"step": 644
},
{
"epoch": 2.58,
"grad_norm": 0.9462328553199768,
"learning_rate": 5.059245326959927e-07,
"loss": 1.0463,
"step": 645
},
{
"epoch": 2.584,
"grad_norm": 1.0117089748382568,
"learning_rate": 4.964957661570285e-07,
"loss": 1.1258,
"step": 646
},
{
"epoch": 2.588,
"grad_norm": 1.0454022884368896,
"learning_rate": 4.871510966799847e-07,
"loss": 1.1556,
"step": 647
},
{
"epoch": 2.592,
"grad_norm": 0.9381544589996338,
"learning_rate": 4.778906987643633e-07,
"loss": 1.0831,
"step": 648
},
{
"epoch": 2.596,
"grad_norm": 1.0206154584884644,
"learning_rate": 4.6871474533600413e-07,
"loss": 1.1627,
"step": 649
},
{
"epoch": 2.6,
"grad_norm": 0.9210219979286194,
"learning_rate": 4.5962340774385936e-07,
"loss": 1.1038,
"step": 650
},
{
"epoch": 2.604,
"grad_norm": 0.9466139674186707,
"learning_rate": 4.506168557567886e-07,
"loss": 1.0451,
"step": 651
},
{
"epoch": 2.608,
"grad_norm": 0.9177185297012329,
"learning_rate": 4.4169525756039164e-07,
"loss": 1.0839,
"step": 652
},
{
"epoch": 2.612,
"grad_norm": 1.0056695938110352,
"learning_rate": 4.328587797538658e-07,
"loss": 1.1586,
"step": 653
},
{
"epoch": 2.616,
"grad_norm": 1.0050727128982544,
"learning_rate": 4.2410758734689915e-07,
"loss": 1.1436,
"step": 654
},
{
"epoch": 2.62,
"grad_norm": 0.9479018449783325,
"learning_rate": 4.1544184375658326e-07,
"loss": 1.0661,
"step": 655
},
{
"epoch": 2.624,
"grad_norm": 1.0148154497146606,
"learning_rate": 4.0686171080436767e-07,
"loss": 1.139,
"step": 656
},
{
"epoch": 2.628,
"grad_norm": 0.9281206727027893,
"learning_rate": 3.983673487130313e-07,
"loss": 1.1202,
"step": 657
},
{
"epoch": 2.632,
"grad_norm": 1.0474183559417725,
"learning_rate": 3.8995891610369707e-07,
"loss": 1.175,
"step": 658
},
{
"epoch": 2.636,
"grad_norm": 0.9884467124938965,
"learning_rate": 3.8163656999286647e-07,
"loss": 1.0796,
"step": 659
},
{
"epoch": 2.64,
"grad_norm": 1.0385491847991943,
"learning_rate": 3.734004657894874e-07,
"loss": 1.1317,
"step": 660
},
{
"epoch": 2.644,
"grad_norm": 1.0091276168823242,
"learning_rate": 3.6525075729205274e-07,
"loss": 1.1528,
"step": 661
},
{
"epoch": 2.648,
"grad_norm": 0.9969792366027832,
"learning_rate": 3.5718759668572913e-07,
"loss": 1.0808,
"step": 662
},
{
"epoch": 2.652,
"grad_norm": 1.0289039611816406,
"learning_rate": 3.4921113453951385e-07,
"loss": 1.0928,
"step": 663
},
{
"epoch": 2.656,
"grad_norm": 0.9001720547676086,
"learning_rate": 3.4132151980342255e-07,
"loss": 0.9865,
"step": 664
},
{
"epoch": 2.66,
"grad_norm": 0.9682496786117554,
"learning_rate": 3.335188998057115e-07,
"loss": 1.1555,
"step": 665
},
{
"epoch": 2.664,
"grad_norm": 0.9241328239440918,
"learning_rate": 3.2580342025012204e-07,
"loss": 1.0991,
"step": 666
},
{
"epoch": 2.668,
"grad_norm": 0.8437192440032959,
"learning_rate": 3.1817522521316034e-07,
"loss": 1.0199,
"step": 667
},
{
"epoch": 2.672,
"grad_norm": 0.9753063917160034,
"learning_rate": 3.106344571414116e-07,
"loss": 1.1078,
"step": 668
},
{
"epoch": 2.676,
"grad_norm": 0.987506091594696,
"learning_rate": 3.0318125684887233e-07,
"loss": 1.0607,
"step": 669
},
{
"epoch": 2.68,
"grad_norm": 0.9966980218887329,
"learning_rate": 2.958157635143294e-07,
"loss": 1.1868,
"step": 670
},
{
"epoch": 2.684,
"grad_norm": 0.9446958899497986,
"learning_rate": 2.8853811467875413e-07,
"loss": 1.0376,
"step": 671
},
{
"epoch": 2.6879999999999997,
"grad_norm": 1.0555483102798462,
"learning_rate": 2.813484462427357e-07,
"loss": 1.1838,
"step": 672
},
{
"epoch": 2.692,
"grad_norm": 0.9516134858131409,
"learning_rate": 2.7424689246394685e-07,
"loss": 1.0569,
"step": 673
},
{
"epoch": 2.6959999999999997,
"grad_norm": 0.9849783182144165,
"learning_rate": 2.672335859546332e-07,
"loss": 1.0446,
"step": 674
},
{
"epoch": 2.7,
"grad_norm": 1.0225852727890015,
"learning_rate": 2.6030865767913527e-07,
"loss": 1.197,
"step": 675
},
{
"epoch": 2.7039999999999997,
"grad_norm": 1.0124677419662476,
"learning_rate": 2.534722369514503e-07,
"loss": 1.1335,
"step": 676
},
{
"epoch": 2.708,
"grad_norm": 0.9324035048484802,
"learning_rate": 2.467244514328082e-07,
"loss": 1.0847,
"step": 677
},
{
"epoch": 2.7119999999999997,
"grad_norm": 0.9841536283493042,
"learning_rate": 2.400654271292946e-07,
"loss": 1.1351,
"step": 678
},
{
"epoch": 2.716,
"grad_norm": 1.1448701620101929,
"learning_rate": 2.334952883894942e-07,
"loss": 1.3025,
"step": 679
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.978304922580719,
"learning_rate": 2.270141579021695e-07,
"loss": 1.1859,
"step": 680
},
{
"epoch": 2.724,
"grad_norm": 1.0148320198059082,
"learning_rate": 2.2062215669397201e-07,
"loss": 1.2038,
"step": 681
},
{
"epoch": 2.7279999999999998,
"grad_norm": 1.0819321870803833,
"learning_rate": 2.1431940412717843e-07,
"loss": 1.2272,
"step": 682
},
{
"epoch": 2.732,
"grad_norm": 0.9233974814414978,
"learning_rate": 2.081060178974642e-07,
"loss": 1.047,
"step": 683
},
{
"epoch": 2.7359999999999998,
"grad_norm": 0.9904933571815491,
"learning_rate": 2.019821140317052e-07,
"loss": 1.1776,
"step": 684
},
{
"epoch": 2.74,
"grad_norm": 1.0356868505477905,
"learning_rate": 1.9594780688581172e-07,
"loss": 1.2435,
"step": 685
},
{
"epoch": 2.7439999999999998,
"grad_norm": 1.0266258716583252,
"learning_rate": 1.900032091425902e-07,
"loss": 1.2242,
"step": 686
},
{
"epoch": 2.748,
"grad_norm": 1.131581425666809,
"learning_rate": 1.8414843180964316e-07,
"loss": 1.2531,
"step": 687
},
{
"epoch": 2.752,
"grad_norm": 1.0443313121795654,
"learning_rate": 1.7838358421729375e-07,
"loss": 1.1,
"step": 688
},
{
"epoch": 2.7560000000000002,
"grad_norm": 0.9902170300483704,
"learning_rate": 1.7270877401654283e-07,
"loss": 1.0773,
"step": 689
},
{
"epoch": 2.76,
"grad_norm": 0.9638949632644653,
"learning_rate": 1.6712410717706406e-07,
"loss": 1.1367,
"step": 690
},
{
"epoch": 2.7640000000000002,
"grad_norm": 0.9663549661636353,
"learning_rate": 1.616296879852175e-07,
"loss": 1.1683,
"step": 691
},
{
"epoch": 2.768,
"grad_norm": 1.010324478149414,
"learning_rate": 1.562256190421102e-07,
"loss": 1.1175,
"step": 692
},
{
"epoch": 2.7720000000000002,
"grad_norm": 0.9264208078384399,
"learning_rate": 1.5091200126167328e-07,
"loss": 1.0529,
"step": 693
},
{
"epoch": 2.776,
"grad_norm": 1.1368553638458252,
"learning_rate": 1.4568893386878057e-07,
"loss": 1.163,
"step": 694
},
{
"epoch": 2.7800000000000002,
"grad_norm": 1.0288695096969604,
"learning_rate": 1.405565143973986e-07,
"loss": 1.0929,
"step": 695
},
{
"epoch": 2.784,
"grad_norm": 1.0272082090377808,
"learning_rate": 1.3551483868875836e-07,
"loss": 1.2123,
"step": 696
},
{
"epoch": 2.7880000000000003,
"grad_norm": 0.9298689961433411,
"learning_rate": 1.30564000889572e-07,
"loss": 1.0406,
"step": 697
},
{
"epoch": 2.792,
"grad_norm": 1.0057530403137207,
"learning_rate": 1.257040934502729e-07,
"loss": 1.1671,
"step": 698
},
{
"epoch": 2.7960000000000003,
"grad_norm": 0.9338884353637695,
"learning_rate": 1.209352071232861e-07,
"loss": 1.121,
"step": 699
},
{
"epoch": 2.8,
"grad_norm": 0.9351827502250671,
"learning_rate": 1.162574309613379e-07,
"loss": 1.0767,
"step": 700
},
{
"epoch": 2.8040000000000003,
"grad_norm": 0.9718006253242493,
"learning_rate": 1.1167085231579111e-07,
"loss": 1.0954,
"step": 701
},
{
"epoch": 2.808,
"grad_norm": 1.032774567604065,
"learning_rate": 1.0717555683501413e-07,
"loss": 1.1855,
"step": 702
},
{
"epoch": 2.8120000000000003,
"grad_norm": 1.0385111570358276,
"learning_rate": 1.027716284627811e-07,
"loss": 1.1956,
"step": 703
},
{
"epoch": 2.816,
"grad_norm": 1.0782874822616577,
"learning_rate": 9.845914943670432e-08,
"loss": 1.1951,
"step": 704
},
{
"epoch": 2.82,
"grad_norm": 1.0683598518371582,
"learning_rate": 9.423820028669983e-08,
"loss": 1.2507,
"step": 705
},
{
"epoch": 2.824,
"grad_norm": 0.9356297850608826,
"learning_rate": 9.010885983348094e-08,
"loss": 1.1028,
"step": 706
},
{
"epoch": 2.828,
"grad_norm": 0.9524960517883301,
"learning_rate": 8.607120518709156e-08,
"loss": 1.0402,
"step": 707
},
{
"epoch": 2.832,
"grad_norm": 0.9845964312553406,
"learning_rate": 8.212531174545957e-08,
"loss": 1.1571,
"step": 708
},
{
"epoch": 2.836,
"grad_norm": 0.9666380882263184,
"learning_rate": 7.827125319299301e-08,
"loss": 1.0773,
"step": 709
},
{
"epoch": 2.84,
"grad_norm": 0.9575058817863464,
"learning_rate": 7.450910149920499e-08,
"loss": 1.0366,
"step": 710
},
{
"epoch": 2.844,
"grad_norm": 1.0145996809005737,
"learning_rate": 7.083892691736428e-08,
"loss": 1.1288,
"step": 711
},
{
"epoch": 2.848,
"grad_norm": 1.0520100593566895,
"learning_rate": 6.726079798319185e-08,
"loss": 1.183,
"step": 712
},
{
"epoch": 2.852,
"grad_norm": 0.9477232694625854,
"learning_rate": 6.377478151357308e-08,
"loss": 1.0467,
"step": 713
},
{
"epoch": 2.856,
"grad_norm": 1.0197796821594238,
"learning_rate": 6.038094260531425e-08,
"loss": 1.2143,
"step": 714
},
{
"epoch": 2.86,
"grad_norm": 1.029517650604248,
"learning_rate": 5.707934463392628e-08,
"loss": 1.1159,
"step": 715
},
{
"epoch": 2.864,
"grad_norm": 1.0391532182693481,
"learning_rate": 5.387004925244077e-08,
"loss": 1.1501,
"step": 716
},
{
"epoch": 2.868,
"grad_norm": 0.9292702674865723,
"learning_rate": 5.0753116390258594e-08,
"loss": 1.1139,
"step": 717
},
{
"epoch": 2.872,
"grad_norm": 0.9527564644813538,
"learning_rate": 4.772860425203252e-08,
"loss": 1.0644,
"step": 718
},
{
"epoch": 2.876,
"grad_norm": 1.0512701272964478,
"learning_rate": 4.479656931657694e-08,
"loss": 1.2117,
"step": 719
},
{
"epoch": 2.88,
"grad_norm": 1.0136529207229614,
"learning_rate": 4.195706633581709e-08,
"loss": 1.1464,
"step": 720
},
{
"epoch": 2.884,
"grad_norm": 1.0794901847839355,
"learning_rate": 3.9210148333763135e-08,
"loss": 1.21,
"step": 721
},
{
"epoch": 2.888,
"grad_norm": 1.0426453351974487,
"learning_rate": 3.655586660552324e-08,
"loss": 1.112,
"step": 722
},
{
"epoch": 2.892,
"grad_norm": 0.9325045943260193,
"learning_rate": 3.39942707163432e-08,
"loss": 1.0395,
"step": 723
},
{
"epoch": 2.896,
"grad_norm": 0.9541572332382202,
"learning_rate": 3.152540850068164e-08,
"loss": 1.0698,
"step": 724
},
{
"epoch": 2.9,
"grad_norm": 0.9602929353713989,
"learning_rate": 2.9149326061317373e-08,
"loss": 1.1259,
"step": 725
},
{
"epoch": 2.904,
"grad_norm": 0.8852847218513489,
"learning_rate": 2.686606776848788e-08,
"loss": 1.0172,
"step": 726
},
{
"epoch": 2.908,
"grad_norm": 1.061636209487915,
"learning_rate": 2.4675676259059976e-08,
"loss": 1.1733,
"step": 727
},
{
"epoch": 2.912,
"grad_norm": 1.0787949562072754,
"learning_rate": 2.2578192435736555e-08,
"loss": 1.1884,
"step": 728
},
{
"epoch": 2.916,
"grad_norm": 1.0201940536499023,
"learning_rate": 2.0573655466289423e-08,
"loss": 1.085,
"step": 729
},
{
"epoch": 2.92,
"grad_norm": 0.987388014793396,
"learning_rate": 1.866210278282876e-08,
"loss": 1.1002,
"step": 730
},
{
"epoch": 2.924,
"grad_norm": 1.0137457847595215,
"learning_rate": 1.684357008110593e-08,
"loss": 1.1844,
"step": 731
},
{
"epoch": 2.928,
"grad_norm": 0.9481896162033081,
"learning_rate": 1.5118091319843985e-08,
"loss": 1.1112,
"step": 732
},
{
"epoch": 2.932,
"grad_norm": 0.9766185283660889,
"learning_rate": 1.3485698720107077e-08,
"loss": 1.0849,
"step": 733
},
{
"epoch": 2.936,
"grad_norm": 1.0705715417861938,
"learning_rate": 1.1946422764695376e-08,
"loss": 1.2248,
"step": 734
},
{
"epoch": 2.94,
"grad_norm": 1.0736720561981201,
"learning_rate": 1.0500292197577756e-08,
"loss": 1.0905,
"step": 735
},
{
"epoch": 2.944,
"grad_norm": 1.007784366607666,
"learning_rate": 9.147334023354437e-09,
"loss": 1.168,
"step": 736
},
{
"epoch": 2.948,
"grad_norm": 1.0883458852767944,
"learning_rate": 7.887573506752954e-09,
"loss": 1.1768,
"step": 737
},
{
"epoch": 2.952,
"grad_norm": 0.9906151294708252,
"learning_rate": 6.7210341721563044e-09,
"loss": 1.0216,
"step": 738
},
{
"epoch": 2.956,
"grad_norm": 1.0448439121246338,
"learning_rate": 5.647737803163855e-09,
"loss": 1.1375,
"step": 739
},
{
"epoch": 2.96,
"grad_norm": 1.081884503364563,
"learning_rate": 4.667704442183341e-09,
"loss": 1.1288,
"step": 740
},
{
"epoch": 2.964,
"grad_norm": 1.0683910846710205,
"learning_rate": 3.780952390058379e-09,
"loss": 1.2389,
"step": 741
},
{
"epoch": 2.968,
"grad_norm": 1.0774167776107788,
"learning_rate": 2.98749820572708e-09,
"loss": 1.1979,
"step": 742
},
{
"epoch": 2.972,
"grad_norm": 1.0830549001693726,
"learning_rate": 2.2873567059084056e-09,
"loss": 1.2507,
"step": 743
},
{
"epoch": 2.976,
"grad_norm": 1.0182136297225952,
"learning_rate": 1.680540964832389e-09,
"loss": 1.1204,
"step": 744
},
{
"epoch": 2.98,
"grad_norm": 1.1047148704528809,
"learning_rate": 1.1670623139903303e-09,
"loss": 1.288,
"step": 745
},
{
"epoch": 2.984,
"grad_norm": 0.9929722547531128,
"learning_rate": 7.469303419255225e-10,
"loss": 1.1117,
"step": 746
},
{
"epoch": 2.988,
"grad_norm": 1.0203157663345337,
"learning_rate": 4.2015289405339386e-10,
"loss": 1.1739,
"step": 747
},
{
"epoch": 2.992,
"grad_norm": 0.9244707822799683,
"learning_rate": 1.8673607251717963e-10,
"loss": 1.0712,
"step": 748
},
{
"epoch": 2.996,
"grad_norm": 0.9131047129631042,
"learning_rate": 4.66842360713482e-11,
"loss": 1.0141,
"step": 749
},
{
"epoch": 3.0,
"grad_norm": 0.9466648697853088,
"learning_rate": 0.0,
"loss": 1.0966,
"step": 750
},
{
"epoch": 3.0,
"step": 750,
"total_flos": 119426084241408.0,
"train_loss": 1.3708130646546681,
"train_runtime": 12975.2994,
"train_samples_per_second": 1.85,
"train_steps_per_second": 0.058
}
],
"logging_steps": 1,
"max_steps": 750,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 119426084241408.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}