{ "best_global_step": 33708, "best_metric": 0.7930276463834468, "best_model_checkpoint": "runs/clip_fusion_mmhshateful\\checkpoint-33708", "epoch": 3.0, "eval_steps": 500, "global_step": 33708, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0044499822000712, "grad_norm": 4.786144733428955, "learning_rate": 1.0456062291434927e-07, "loss": 1.1855, "step": 50 }, { "epoch": 0.0088999644001424, "grad_norm": 5.20658016204834, "learning_rate": 2.1579532814238045e-07, "loss": 1.1604, "step": 100 }, { "epoch": 0.013349946600213599, "grad_norm": 4.505102634429932, "learning_rate": 3.2703003337041164e-07, "loss": 1.116, "step": 150 }, { "epoch": 0.0177999288002848, "grad_norm": 7.0701165199279785, "learning_rate": 4.3826473859844274e-07, "loss": 1.4264, "step": 200 }, { "epoch": 0.022249911000355997, "grad_norm": 12.02456283569336, "learning_rate": 5.49499443826474e-07, "loss": 1.401, "step": 250 }, { "epoch": 0.026699893200427198, "grad_norm": 12.082966804504395, "learning_rate": 6.607341490545051e-07, "loss": 1.1857, "step": 300 }, { "epoch": 0.0311498754004984, "grad_norm": 12.752875328063965, "learning_rate": 7.719688542825363e-07, "loss": 1.0223, "step": 350 }, { "epoch": 0.0355998576005696, "grad_norm": 13.606916427612305, "learning_rate": 8.832035595105674e-07, "loss": 1.1318, "step": 400 }, { "epoch": 0.0400498398006408, "grad_norm": 13.681859970092773, "learning_rate": 9.92213570634038e-07, "loss": 1.0702, "step": 450 }, { "epoch": 0.044499822000711994, "grad_norm": 14.764453887939453, "learning_rate": 1.103448275862069e-06, "loss": 1.0199, "step": 500 }, { "epoch": 0.048949804200783195, "grad_norm": 13.899822235107422, "learning_rate": 1.2146829810901001e-06, "loss": 1.1305, "step": 550 }, { "epoch": 0.053399786400854396, "grad_norm": 65.35102844238281, "learning_rate": 1.3259176863181315e-06, "loss": 1.0908, "step": 600 }, { "epoch": 0.0578497686009256, "grad_norm": 17.795137405395508, "learning_rate": 1.4371523915461624e-06, "loss": 1.242, "step": 650 }, { "epoch": 0.0622997508009968, "grad_norm": 7.715653419494629, "learning_rate": 1.5483870967741937e-06, "loss": 0.9389, "step": 700 }, { "epoch": 0.066749733001068, "grad_norm": 7.7139129638671875, "learning_rate": 1.6596218020022248e-06, "loss": 1.0202, "step": 750 }, { "epoch": 0.0711997152011392, "grad_norm": 12.150534629821777, "learning_rate": 1.7708565072302561e-06, "loss": 1.0869, "step": 800 }, { "epoch": 0.0756496974012104, "grad_norm": 7.825719356536865, "learning_rate": 1.8820912124582872e-06, "loss": 0.9803, "step": 850 }, { "epoch": 0.0800996796012816, "grad_norm": 225.07461547851562, "learning_rate": 1.993325917686318e-06, "loss": 1.0503, "step": 900 }, { "epoch": 0.08454966180135279, "grad_norm": 4.734207630157471, "learning_rate": 2.1045606229143494e-06, "loss": 1.2465, "step": 950 }, { "epoch": 0.08899964400142399, "grad_norm": 16.512815475463867, "learning_rate": 2.2157953281423803e-06, "loss": 1.0681, "step": 1000 }, { "epoch": 0.09344962620149519, "grad_norm": 25.41790771484375, "learning_rate": 2.3270300333704117e-06, "loss": 1.078, "step": 1050 }, { "epoch": 0.09789960840156639, "grad_norm": Infinity, "learning_rate": 2.438264738598443e-06, "loss": 1.1562, "step": 1100 }, { "epoch": 0.10234959060163759, "grad_norm": 6.678137302398682, "learning_rate": 2.5472747497219134e-06, "loss": 1.1279, "step": 1150 }, { "epoch": 0.10679957280170879, "grad_norm": 9.758398056030273, "learning_rate": 2.6585094549499447e-06, "loss": 0.9545, "step": 1200 }, { "epoch": 0.11124955500177999, "grad_norm": 5.447558879852295, "learning_rate": 2.7697441601779756e-06, "loss": 1.0267, "step": 1250 }, { "epoch": 0.1156995372018512, "grad_norm": 12.266983032226562, "learning_rate": 2.8809788654060065e-06, "loss": 1.3001, "step": 1300 }, { "epoch": 0.1201495194019224, "grad_norm": 125.02411651611328, "learning_rate": 2.9922135706340383e-06, "loss": 1.2592, "step": 1350 }, { "epoch": 0.1245995016019936, "grad_norm": 5.480194568634033, "learning_rate": 3.103448275862069e-06, "loss": 1.129, "step": 1400 }, { "epoch": 0.1290494838020648, "grad_norm": 6.594211578369141, "learning_rate": 3.2146829810901005e-06, "loss": 1.0413, "step": 1450 }, { "epoch": 0.133499466002136, "grad_norm": 9.544859886169434, "learning_rate": 3.3259176863181314e-06, "loss": 1.2025, "step": 1500 }, { "epoch": 0.1379494482022072, "grad_norm": 6.9253315925598145, "learning_rate": 3.4371523915461623e-06, "loss": 1.1236, "step": 1550 }, { "epoch": 0.1423994304022784, "grad_norm": 33.036712646484375, "learning_rate": 3.548387096774194e-06, "loss": 1.1385, "step": 1600 }, { "epoch": 0.1468494126023496, "grad_norm": 9.286153793334961, "learning_rate": 3.659621802002225e-06, "loss": 1.1375, "step": 1650 }, { "epoch": 0.1512993948024208, "grad_norm": 7.031552314758301, "learning_rate": 3.7708565072302562e-06, "loss": 1.1384, "step": 1700 }, { "epoch": 0.155749377002492, "grad_norm": 3.8519108295440674, "learning_rate": 3.8820912124582876e-06, "loss": 1.0544, "step": 1750 }, { "epoch": 0.1601993592025632, "grad_norm": 11.542702674865723, "learning_rate": 3.993325917686319e-06, "loss": 1.1411, "step": 1800 }, { "epoch": 0.1646493414026344, "grad_norm": 34.22543716430664, "learning_rate": 4.104560622914349e-06, "loss": 1.0, "step": 1850 }, { "epoch": 0.16909932360270558, "grad_norm": 61.1249885559082, "learning_rate": 4.21357063403782e-06, "loss": 1.1662, "step": 1900 }, { "epoch": 0.17354930580277678, "grad_norm": 3.2967917919158936, "learning_rate": 4.324805339265851e-06, "loss": 1.164, "step": 1950 }, { "epoch": 0.17799928800284798, "grad_norm": 20.234724044799805, "learning_rate": 4.436040044493882e-06, "loss": 1.1389, "step": 2000 }, { "epoch": 0.18244927020291918, "grad_norm": 30.73896598815918, "learning_rate": 4.547274749721914e-06, "loss": 1.0945, "step": 2050 }, { "epoch": 0.18689925240299038, "grad_norm": 29.284366607666016, "learning_rate": 4.658509454949945e-06, "loss": 1.1803, "step": 2100 }, { "epoch": 0.19134923460306158, "grad_norm": 4.33050012588501, "learning_rate": 4.7697441601779755e-06, "loss": 1.0196, "step": 2150 }, { "epoch": 0.19579921680313278, "grad_norm": 6.787428379058838, "learning_rate": 4.880978865406007e-06, "loss": 1.1267, "step": 2200 }, { "epoch": 0.20024919900320398, "grad_norm": 38.97026062011719, "learning_rate": 4.992213570634038e-06, "loss": 1.2827, "step": 2250 }, { "epoch": 0.20469918120327518, "grad_norm": 45.926292419433594, "learning_rate": 5.1034482758620695e-06, "loss": 1.18, "step": 2300 }, { "epoch": 0.20914916340334638, "grad_norm": 9.873788833618164, "learning_rate": 5.2146829810901e-06, "loss": 1.0475, "step": 2350 }, { "epoch": 0.21359914560341758, "grad_norm": 7.516901969909668, "learning_rate": 5.325917686318131e-06, "loss": 1.1606, "step": 2400 }, { "epoch": 0.21804912780348878, "grad_norm": 13.822782516479492, "learning_rate": 5.4371523915461635e-06, "loss": 1.1432, "step": 2450 }, { "epoch": 0.22249911000355999, "grad_norm": 4.585656642913818, "learning_rate": 5.548387096774194e-06, "loss": 0.9673, "step": 2500 }, { "epoch": 0.2269490922036312, "grad_norm": 7.065394878387451, "learning_rate": 5.659621802002225e-06, "loss": 1.1697, "step": 2550 }, { "epoch": 0.2313990744037024, "grad_norm": 3.246696949005127, "learning_rate": 5.770856507230256e-06, "loss": 1.3254, "step": 2600 }, { "epoch": 0.2358490566037736, "grad_norm": 11.029773712158203, "learning_rate": 5.882091212458287e-06, "loss": 0.8898, "step": 2650 }, { "epoch": 0.2402990388038448, "grad_norm": 2.3388359546661377, "learning_rate": 5.993325917686319e-06, "loss": 1.1269, "step": 2700 }, { "epoch": 0.244749021003916, "grad_norm": 7.323708534240723, "learning_rate": 6.10456062291435e-06, "loss": 1.2046, "step": 2750 }, { "epoch": 0.2491990032039872, "grad_norm": 9.747910499572754, "learning_rate": 6.215795328142381e-06, "loss": 1.4386, "step": 2800 }, { "epoch": 0.2536489854040584, "grad_norm": 33.742584228515625, "learning_rate": 6.3270300333704115e-06, "loss": 1.223, "step": 2850 }, { "epoch": 0.2580989676041296, "grad_norm": 6.078275203704834, "learning_rate": 6.438264738598443e-06, "loss": 1.1825, "step": 2900 }, { "epoch": 0.2625489498042008, "grad_norm": 3.2230539321899414, "learning_rate": 6.549499443826475e-06, "loss": 1.2174, "step": 2950 }, { "epoch": 0.266998932004272, "grad_norm": 9.313528060913086, "learning_rate": 6.6607341490545054e-06, "loss": 0.9551, "step": 3000 }, { "epoch": 0.2714489142043432, "grad_norm": 2.7404236793518066, "learning_rate": 6.771968854282537e-06, "loss": 0.8545, "step": 3050 }, { "epoch": 0.2758988964044144, "grad_norm": 8.062790870666504, "learning_rate": 6.883203559510568e-06, "loss": 1.1268, "step": 3100 }, { "epoch": 0.2803488786044856, "grad_norm": 8.285247802734375, "learning_rate": 6.9944382647385985e-06, "loss": 1.0328, "step": 3150 }, { "epoch": 0.2847988608045568, "grad_norm": 6.987884998321533, "learning_rate": 7.105672969966631e-06, "loss": 1.3254, "step": 3200 }, { "epoch": 0.289248843004628, "grad_norm": 2.6899290084838867, "learning_rate": 7.216907675194661e-06, "loss": 1.2222, "step": 3250 }, { "epoch": 0.2936988252046992, "grad_norm": 5.234805107116699, "learning_rate": 7.3281423804226925e-06, "loss": 1.134, "step": 3300 }, { "epoch": 0.2981488074047704, "grad_norm": 10.498394012451172, "learning_rate": 7.439377085650724e-06, "loss": 1.0192, "step": 3350 }, { "epoch": 0.3025987896048416, "grad_norm": 10.286867141723633, "learning_rate": 7.550611790878754e-06, "loss": 1.3577, "step": 3400 }, { "epoch": 0.3070487718049128, "grad_norm": 4.571293354034424, "learning_rate": 7.661846496106786e-06, "loss": 1.2335, "step": 3450 }, { "epoch": 0.311498754004984, "grad_norm": 4.046992778778076, "learning_rate": 7.773081201334817e-06, "loss": 1.2942, "step": 3500 }, { "epoch": 0.3159487362050552, "grad_norm": 2.246840715408325, "learning_rate": 7.884315906562849e-06, "loss": 1.2665, "step": 3550 }, { "epoch": 0.3203987184051264, "grad_norm": 4.0541300773620605, "learning_rate": 7.99555061179088e-06, "loss": 1.1772, "step": 3600 }, { "epoch": 0.3248487006051976, "grad_norm": 6.168578147888184, "learning_rate": 8.10678531701891e-06, "loss": 1.1702, "step": 3650 }, { "epoch": 0.3292986828052688, "grad_norm": 29.416217803955078, "learning_rate": 8.218020022246942e-06, "loss": 1.3389, "step": 3700 }, { "epoch": 0.33374866500534, "grad_norm": 5.4274091720581055, "learning_rate": 8.329254727474973e-06, "loss": 1.0503, "step": 3750 }, { "epoch": 0.33819864720541115, "grad_norm": 13.374832153320312, "learning_rate": 8.440489432703005e-06, "loss": 1.2227, "step": 3800 }, { "epoch": 0.34264862940548235, "grad_norm": 4.1568522453308105, "learning_rate": 8.551724137931035e-06, "loss": 1.094, "step": 3850 }, { "epoch": 0.34709861160555355, "grad_norm": 5.195131301879883, "learning_rate": 8.662958843159066e-06, "loss": 1.3611, "step": 3900 }, { "epoch": 0.35154859380562475, "grad_norm": 6.58644437789917, "learning_rate": 8.774193548387098e-06, "loss": 1.1347, "step": 3950 }, { "epoch": 0.35599857600569595, "grad_norm": 7.275071620941162, "learning_rate": 8.885428253615128e-06, "loss": 1.1572, "step": 4000 }, { "epoch": 0.36044855820576716, "grad_norm": 2.4575366973876953, "learning_rate": 8.99666295884316e-06, "loss": 1.174, "step": 4050 }, { "epoch": 0.36489854040583836, "grad_norm": 20.692773818969727, "learning_rate": 9.107897664071191e-06, "loss": 1.0272, "step": 4100 }, { "epoch": 0.36934852260590956, "grad_norm": 24.649314880371094, "learning_rate": 9.216907675194662e-06, "loss": 1.2972, "step": 4150 }, { "epoch": 0.37379850480598076, "grad_norm": 1.798013687133789, "learning_rate": 9.328142380422693e-06, "loss": 1.1485, "step": 4200 }, { "epoch": 0.37824848700605196, "grad_norm": 3.385685920715332, "learning_rate": 9.439377085650723e-06, "loss": 1.4068, "step": 4250 }, { "epoch": 0.38269846920612316, "grad_norm": 185.74266052246094, "learning_rate": 9.550611790878755e-06, "loss": 1.1712, "step": 4300 }, { "epoch": 0.38714845140619436, "grad_norm": 2.422161102294922, "learning_rate": 9.661846496106786e-06, "loss": 1.399, "step": 4350 }, { "epoch": 0.39159843360626556, "grad_norm": 7.63890266418457, "learning_rate": 9.773081201334818e-06, "loss": 1.2974, "step": 4400 }, { "epoch": 0.39604841580633676, "grad_norm": 2.540107488632202, "learning_rate": 9.884315906562849e-06, "loss": 1.1431, "step": 4450 }, { "epoch": 0.40049839800640796, "grad_norm": 184.6052703857422, "learning_rate": 9.995550611790879e-06, "loss": 1.1364, "step": 4500 }, { "epoch": 0.40494838020647916, "grad_norm": 2.693044424057007, "learning_rate": 9.999992203896911e-06, "loss": 1.2756, "step": 4550 }, { "epoch": 0.40939836240655036, "grad_norm": 7.036505222320557, "learning_rate": 9.99996750272898e-06, "loss": 1.087, "step": 4600 }, { "epoch": 0.41384834460662157, "grad_norm": 2.3637502193450928, "learning_rate": 9.999925883044886e-06, "loss": 1.3409, "step": 4650 }, { "epoch": 0.41829832680669277, "grad_norm": 26.791290283203125, "learning_rate": 9.99986734498546e-06, "loss": 1.0476, "step": 4700 }, { "epoch": 0.42274830900676397, "grad_norm": 2.446568489074707, "learning_rate": 9.999791888748777e-06, "loss": 1.2879, "step": 4750 }, { "epoch": 0.42719829120683517, "grad_norm": 1.7970316410064697, "learning_rate": 9.999699514590162e-06, "loss": 0.835, "step": 4800 }, { "epoch": 0.43164827340690637, "grad_norm": 2.94173526763916, "learning_rate": 9.99959022282218e-06, "loss": 1.1942, "step": 4850 }, { "epoch": 0.43609825560697757, "grad_norm": 6.907817363739014, "learning_rate": 9.99946401381465e-06, "loss": 1.4082, "step": 4900 }, { "epoch": 0.44054823780704877, "grad_norm": 4.467363357543945, "learning_rate": 9.999320887994623e-06, "loss": 1.1742, "step": 4950 }, { "epoch": 0.44499822000711997, "grad_norm": 3.6518967151641846, "learning_rate": 9.999160845846401e-06, "loss": 1.3119, "step": 5000 }, { "epoch": 0.44944820220719117, "grad_norm": 4.708242416381836, "learning_rate": 9.99898388791152e-06, "loss": 1.1216, "step": 5050 }, { "epoch": 0.4538981844072624, "grad_norm": 1.7582530975341797, "learning_rate": 9.99879001478876e-06, "loss": 1.2195, "step": 5100 }, { "epoch": 0.4583481666073336, "grad_norm": 24.896326065063477, "learning_rate": 9.998579227134133e-06, "loss": 1.1893, "step": 5150 }, { "epoch": 0.4627981488074048, "grad_norm": 4.394145965576172, "learning_rate": 9.998351525660887e-06, "loss": 1.0771, "step": 5200 }, { "epoch": 0.467248131007476, "grad_norm": 5.524062633514404, "learning_rate": 9.9981069111395e-06, "loss": 1.4627, "step": 5250 }, { "epoch": 0.4716981132075472, "grad_norm": 2.9294071197509766, "learning_rate": 9.997845384397679e-06, "loss": 1.1926, "step": 5300 }, { "epoch": 0.4761480954076184, "grad_norm": 2.470787763595581, "learning_rate": 9.997566946320363e-06, "loss": 1.058, "step": 5350 }, { "epoch": 0.4805980776076896, "grad_norm": 41.667259216308594, "learning_rate": 9.997271597849707e-06, "loss": 1.2937, "step": 5400 }, { "epoch": 0.4850480598077608, "grad_norm": 25.816415786743164, "learning_rate": 9.99695933998509e-06, "loss": 1.0348, "step": 5450 }, { "epoch": 0.489498042007832, "grad_norm": 4.734984874725342, "learning_rate": 9.996630173783106e-06, "loss": 1.2221, "step": 5500 }, { "epoch": 0.4939480242079032, "grad_norm": 4.017815589904785, "learning_rate": 9.996284100357564e-06, "loss": 1.152, "step": 5550 }, { "epoch": 0.4983980064079744, "grad_norm": 5.8167524337768555, "learning_rate": 9.99592112087948e-06, "loss": 1.3604, "step": 5600 }, { "epoch": 0.5028479886080456, "grad_norm": 2.3638503551483154, "learning_rate": 9.995541236577078e-06, "loss": 1.0037, "step": 5650 }, { "epoch": 0.5072979708081168, "grad_norm": 2.891145706176758, "learning_rate": 9.995144448735782e-06, "loss": 1.1916, "step": 5700 }, { "epoch": 0.511747953008188, "grad_norm": 3.616471767425537, "learning_rate": 9.994730758698212e-06, "loss": 1.3284, "step": 5750 }, { "epoch": 0.5161979352082592, "grad_norm": 1.981928825378418, "learning_rate": 9.994300167864183e-06, "loss": 1.3387, "step": 5800 }, { "epoch": 0.5206479174083304, "grad_norm": 4.858288764953613, "learning_rate": 9.993852677690695e-06, "loss": 0.9181, "step": 5850 }, { "epoch": 0.5250978996084016, "grad_norm": 5.251379489898682, "learning_rate": 9.993388289691933e-06, "loss": 1.2167, "step": 5900 }, { "epoch": 0.5295478818084728, "grad_norm": 15.533602714538574, "learning_rate": 9.992907005439254e-06, "loss": 1.5023, "step": 5950 }, { "epoch": 0.533997864008544, "grad_norm": 5.341337203979492, "learning_rate": 9.992408826561197e-06, "loss": 1.3035, "step": 6000 }, { "epoch": 0.5384478462086152, "grad_norm": 4.640894412994385, "learning_rate": 9.991893754743462e-06, "loss": 1.1636, "step": 6050 }, { "epoch": 0.5428978284086864, "grad_norm": 3.204641103744507, "learning_rate": 9.991361791728908e-06, "loss": 1.234, "step": 6100 }, { "epoch": 0.5473478106087576, "grad_norm": 39.53562545776367, "learning_rate": 9.990812939317552e-06, "loss": 1.0594, "step": 6150 }, { "epoch": 0.5517977928088288, "grad_norm": 2.1653263568878174, "learning_rate": 9.990247199366561e-06, "loss": 1.1363, "step": 6200 }, { "epoch": 0.5562477750089, "grad_norm": 18.60838508605957, "learning_rate": 9.989664573790242e-06, "loss": 1.104, "step": 6250 }, { "epoch": 0.5606977572089712, "grad_norm": 4.904402256011963, "learning_rate": 9.989065064560043e-06, "loss": 1.1985, "step": 6300 }, { "epoch": 0.5651477394090424, "grad_norm": 22.430341720581055, "learning_rate": 9.988448673704535e-06, "loss": 1.1599, "step": 6350 }, { "epoch": 0.5695977216091136, "grad_norm": 2.3300557136535645, "learning_rate": 9.987815403309417e-06, "loss": 1.2128, "step": 6400 }, { "epoch": 0.5740477038091848, "grad_norm": 16.416412353515625, "learning_rate": 9.987165255517501e-06, "loss": 1.0455, "step": 6450 }, { "epoch": 0.578497686009256, "grad_norm": 2.1213927268981934, "learning_rate": 9.986498232528709e-06, "loss": 0.9009, "step": 6500 }, { "epoch": 0.5829476682093272, "grad_norm": 1.6562855243682861, "learning_rate": 9.985814336600063e-06, "loss": 1.2225, "step": 6550 }, { "epoch": 0.5873976504093984, "grad_norm": 63.272544860839844, "learning_rate": 9.985113570045682e-06, "loss": 0.9986, "step": 6600 }, { "epoch": 0.5918476326094696, "grad_norm": 5.0370683670043945, "learning_rate": 9.984395935236763e-06, "loss": 1.2009, "step": 6650 }, { "epoch": 0.5962976148095408, "grad_norm": 2.671168088912964, "learning_rate": 9.983661434601588e-06, "loss": 1.1226, "step": 6700 }, { "epoch": 0.600747597009612, "grad_norm": 20.33568572998047, "learning_rate": 9.982910070625505e-06, "loss": 1.3099, "step": 6750 }, { "epoch": 0.6051975792096832, "grad_norm": 2.655186414718628, "learning_rate": 9.98214184585092e-06, "loss": 1.0238, "step": 6800 }, { "epoch": 0.6096475614097544, "grad_norm": 2.3275833129882812, "learning_rate": 9.9813567628773e-06, "loss": 1.0833, "step": 6850 }, { "epoch": 0.6140975436098256, "grad_norm": 1.5307073593139648, "learning_rate": 9.980554824361146e-06, "loss": 1.3424, "step": 6900 }, { "epoch": 0.6185475258098968, "grad_norm": 5.340336799621582, "learning_rate": 9.979736033015998e-06, "loss": 1.2753, "step": 6950 }, { "epoch": 0.622997508009968, "grad_norm": 3.586345672607422, "learning_rate": 9.978900391612422e-06, "loss": 1.1633, "step": 7000 }, { "epoch": 0.6274474902100392, "grad_norm": 8.149271011352539, "learning_rate": 9.978047902978e-06, "loss": 1.2048, "step": 7050 }, { "epoch": 0.6318974724101104, "grad_norm": 3.370352268218994, "learning_rate": 9.977178569997317e-06, "loss": 0.8993, "step": 7100 }, { "epoch": 0.6363474546101816, "grad_norm": 5.327269554138184, "learning_rate": 9.976292395611957e-06, "loss": 1.2034, "step": 7150 }, { "epoch": 0.6407974368102528, "grad_norm": 1.5556652545928955, "learning_rate": 9.975389382820492e-06, "loss": 1.0849, "step": 7200 }, { "epoch": 0.645247419010324, "grad_norm": 2.147937536239624, "learning_rate": 9.97446953467847e-06, "loss": 1.2608, "step": 7250 }, { "epoch": 0.6496974012103952, "grad_norm": 3.057896852493286, "learning_rate": 9.9735328542984e-06, "loss": 1.0354, "step": 7300 }, { "epoch": 0.6541473834104664, "grad_norm": 1.395318627357483, "learning_rate": 9.972579344849757e-06, "loss": 1.2001, "step": 7350 }, { "epoch": 0.6585973656105376, "grad_norm": 4.244902610778809, "learning_rate": 9.97160900955895e-06, "loss": 1.0765, "step": 7400 }, { "epoch": 0.6630473478106088, "grad_norm": 7.125218391418457, "learning_rate": 9.97062185170933e-06, "loss": 1.1104, "step": 7450 }, { "epoch": 0.66749733001068, "grad_norm": 2.4416606426239014, "learning_rate": 9.969617874641166e-06, "loss": 1.1251, "step": 7500 }, { "epoch": 0.6719473122107511, "grad_norm": 3.541290044784546, "learning_rate": 9.968597081751642e-06, "loss": 1.1731, "step": 7550 }, { "epoch": 0.6763972944108223, "grad_norm": 46.99223709106445, "learning_rate": 9.967580393338561e-06, "loss": 1.4722, "step": 7600 }, { "epoch": 0.6808472766108935, "grad_norm": 1.1745175123214722, "learning_rate": 9.966526315367801e-06, "loss": 1.1045, "step": 7650 }, { "epoch": 0.6852972588109647, "grad_norm": 2.0251762866973877, "learning_rate": 9.96545543203667e-06, "loss": 1.2423, "step": 7700 }, { "epoch": 0.6897472410110359, "grad_norm": 2.456342935562134, "learning_rate": 9.964367746968741e-06, "loss": 1.0161, "step": 7750 }, { "epoch": 0.6941972232111071, "grad_norm": 2.9943089485168457, "learning_rate": 9.96326326384444e-06, "loss": 1.1466, "step": 7800 }, { "epoch": 0.6986472054111783, "grad_norm": 1.5178414583206177, "learning_rate": 9.962141986401036e-06, "loss": 1.1385, "step": 7850 }, { "epoch": 0.7030971876112495, "grad_norm": 2.009186267852783, "learning_rate": 9.96100391843262e-06, "loss": 1.2702, "step": 7900 }, { "epoch": 0.7075471698113207, "grad_norm": 3.1253960132598877, "learning_rate": 9.959849063790107e-06, "loss": 1.0517, "step": 7950 }, { "epoch": 0.7119971520113919, "grad_norm": 4.601151466369629, "learning_rate": 9.958677426381204e-06, "loss": 1.0766, "step": 8000 }, { "epoch": 0.7164471342114631, "grad_norm": 2.2349307537078857, "learning_rate": 9.957489010170412e-06, "loss": 1.1042, "step": 8050 }, { "epoch": 0.7208971164115343, "grad_norm": 3.8279824256896973, "learning_rate": 9.956283819179004e-06, "loss": 1.272, "step": 8100 }, { "epoch": 0.7253470986116055, "grad_norm": 1.91068434715271, "learning_rate": 9.955061857485014e-06, "loss": 1.1951, "step": 8150 }, { "epoch": 0.7297970808116767, "grad_norm": 2.437605619430542, "learning_rate": 9.953823129223229e-06, "loss": 1.1582, "step": 8200 }, { "epoch": 0.7342470630117479, "grad_norm": 2.7856671810150146, "learning_rate": 9.952567638585157e-06, "loss": 1.0706, "step": 8250 }, { "epoch": 0.7386970452118191, "grad_norm": 65.03971099853516, "learning_rate": 9.951295389819042e-06, "loss": 1.5961, "step": 8300 }, { "epoch": 0.7431470274118903, "grad_norm": 2.078338384628296, "learning_rate": 9.95000638722982e-06, "loss": 1.1698, "step": 8350 }, { "epoch": 0.7475970096119615, "grad_norm": 5.004459857940674, "learning_rate": 9.948700635179121e-06, "loss": 1.196, "step": 8400 }, { "epoch": 0.7520469918120327, "grad_norm": 3.9566500186920166, "learning_rate": 9.947378138085253e-06, "loss": 1.1133, "step": 8450 }, { "epoch": 0.7564969740121039, "grad_norm": 2.2350332736968994, "learning_rate": 9.946038900423182e-06, "loss": 1.2413, "step": 8500 }, { "epoch": 0.7609469562121751, "grad_norm": 12.273381233215332, "learning_rate": 9.94468292672452e-06, "loss": 1.4454, "step": 8550 }, { "epoch": 0.7653969384122463, "grad_norm": 4.170358180999756, "learning_rate": 9.94331022157751e-06, "loss": 1.2241, "step": 8600 }, { "epoch": 0.7698469206123175, "grad_norm": 2.163768768310547, "learning_rate": 9.941920789627009e-06, "loss": 0.95, "step": 8650 }, { "epoch": 0.7742969028123887, "grad_norm": 3.350728750228882, "learning_rate": 9.940514635574473e-06, "loss": 1.129, "step": 8700 }, { "epoch": 0.7787468850124599, "grad_norm": 3.2991418838500977, "learning_rate": 9.939091764177938e-06, "loss": 1.0522, "step": 8750 }, { "epoch": 0.7831968672125311, "grad_norm": 27.37785530090332, "learning_rate": 9.937652180252013e-06, "loss": 1.2875, "step": 8800 }, { "epoch": 0.7876468494126023, "grad_norm": 1.4893865585327148, "learning_rate": 9.936195888667855e-06, "loss": 0.974, "step": 8850 }, { "epoch": 0.7920968316126735, "grad_norm": 1.5383645296096802, "learning_rate": 9.934722894353154e-06, "loss": 1.3581, "step": 8900 }, { "epoch": 0.7965468138127447, "grad_norm": 3.010911464691162, "learning_rate": 9.933233202292117e-06, "loss": 1.0246, "step": 8950 }, { "epoch": 0.8009967960128159, "grad_norm": 2.668351650238037, "learning_rate": 9.931726817525453e-06, "loss": 1.1922, "step": 9000 }, { "epoch": 0.8054467782128871, "grad_norm": 3.1990578174591064, "learning_rate": 9.930203745150352e-06, "loss": 0.9458, "step": 9050 }, { "epoch": 0.8098967604129583, "grad_norm": 3.6666007041931152, "learning_rate": 9.928663990320479e-06, "loss": 1.0693, "step": 9100 }, { "epoch": 0.8143467426130295, "grad_norm": 84.4525375366211, "learning_rate": 9.927107558245933e-06, "loss": 1.2878, "step": 9150 }, { "epoch": 0.8187967248131007, "grad_norm": 1.7887487411499023, "learning_rate": 9.925534454193259e-06, "loss": 1.1697, "step": 9200 }, { "epoch": 0.8232467070131719, "grad_norm": 3.0660512447357178, "learning_rate": 9.923944683485405e-06, "loss": 1.105, "step": 9250 }, { "epoch": 0.8276966892132431, "grad_norm": 6.017421722412109, "learning_rate": 9.92233825150172e-06, "loss": 1.3024, "step": 9300 }, { "epoch": 0.8321466714133143, "grad_norm": 5.540665149688721, "learning_rate": 9.920715163677927e-06, "loss": 0.9632, "step": 9350 }, { "epoch": 0.8365966536133855, "grad_norm": 20.97583770751953, "learning_rate": 9.91907542550611e-06, "loss": 1.0405, "step": 9400 }, { "epoch": 0.8410466358134567, "grad_norm": 5.126645088195801, "learning_rate": 9.917419042534692e-06, "loss": 1.4297, "step": 9450 }, { "epoch": 0.8454966180135279, "grad_norm": 2.5642127990722656, "learning_rate": 9.91574602036842e-06, "loss": 0.9677, "step": 9500 }, { "epoch": 0.8499466002135991, "grad_norm": 32.117130279541016, "learning_rate": 9.914056364668338e-06, "loss": 1.2197, "step": 9550 }, { "epoch": 0.8543965824136703, "grad_norm": 1.9256842136383057, "learning_rate": 9.91235008115178e-06, "loss": 1.0439, "step": 9600 }, { "epoch": 0.8588465646137415, "grad_norm": 1.141822338104248, "learning_rate": 9.910627175592337e-06, "loss": 1.088, "step": 9650 }, { "epoch": 0.8632965468138127, "grad_norm": 2.8787155151367188, "learning_rate": 9.90888765381985e-06, "loss": 1.4034, "step": 9700 }, { "epoch": 0.8677465290138839, "grad_norm": 3.679314613342285, "learning_rate": 9.907166807105236e-06, "loss": 1.1288, "step": 9750 }, { "epoch": 0.8721965112139551, "grad_norm": 4.225368976593018, "learning_rate": 9.905394402650154e-06, "loss": 1.2115, "step": 9800 }, { "epoch": 0.8766464934140263, "grad_norm": 1.8613295555114746, "learning_rate": 9.903605399688295e-06, "loss": 1.4082, "step": 9850 }, { "epoch": 0.8810964756140975, "grad_norm": 4.124039649963379, "learning_rate": 9.901799804273148e-06, "loss": 1.348, "step": 9900 }, { "epoch": 0.8855464578141687, "grad_norm": 6.448430061340332, "learning_rate": 9.89997762251436e-06, "loss": 0.9987, "step": 9950 }, { "epoch": 0.8899964400142399, "grad_norm": 5.277775287628174, "learning_rate": 9.898138860577687e-06, "loss": 0.9385, "step": 10000 }, { "epoch": 0.8944464222143111, "grad_norm": 1.809823989868164, "learning_rate": 9.896283524684997e-06, "loss": 1.1597, "step": 10050 }, { "epoch": 0.8988964044143823, "grad_norm": 5.325939655303955, "learning_rate": 9.89444922150802e-06, "loss": 1.1662, "step": 10100 }, { "epoch": 0.9033463866144535, "grad_norm": 3.811429500579834, "learning_rate": 9.892561087757642e-06, "loss": 1.1608, "step": 10150 }, { "epoch": 0.9077963688145247, "grad_norm": 4.000579833984375, "learning_rate": 9.890656398924896e-06, "loss": 1.2019, "step": 10200 }, { "epoch": 0.912246351014596, "grad_norm": 7.725211143493652, "learning_rate": 9.888735161454727e-06, "loss": 1.4009, "step": 10250 }, { "epoch": 0.9166963332146671, "grad_norm": 2.2231249809265137, "learning_rate": 9.886797381848075e-06, "loss": 1.242, "step": 10300 }, { "epoch": 0.9211463154147383, "grad_norm": 19.54480743408203, "learning_rate": 9.88484306666185e-06, "loss": 1.107, "step": 10350 }, { "epoch": 0.9255962976148095, "grad_norm": 3.519115924835205, "learning_rate": 9.882872222508923e-06, "loss": 1.0243, "step": 10400 }, { "epoch": 0.9300462798148807, "grad_norm": 2.8360178470611572, "learning_rate": 9.880884856058087e-06, "loss": 1.0392, "step": 10450 }, { "epoch": 0.934496262014952, "grad_norm": 2.47052001953125, "learning_rate": 9.878880974034043e-06, "loss": 1.2191, "step": 10500 }, { "epoch": 0.9389462442150232, "grad_norm": 10.560032844543457, "learning_rate": 9.876860583217378e-06, "loss": 1.1239, "step": 10550 }, { "epoch": 0.9433962264150944, "grad_norm": 2.5285675525665283, "learning_rate": 9.87482369044454e-06, "loss": 0.9617, "step": 10600 }, { "epoch": 0.9478462086151656, "grad_norm": 1.5266432762145996, "learning_rate": 9.872770302607812e-06, "loss": 0.9498, "step": 10650 }, { "epoch": 0.9522961908152368, "grad_norm": 1.7341094017028809, "learning_rate": 9.870700426655297e-06, "loss": 1.1383, "step": 10700 }, { "epoch": 0.956746173015308, "grad_norm": 1.6931637525558472, "learning_rate": 9.868614069590884e-06, "loss": 0.9525, "step": 10750 }, { "epoch": 0.9611961552153792, "grad_norm": 4.657320499420166, "learning_rate": 9.866511238474235e-06, "loss": 1.2013, "step": 10800 }, { "epoch": 0.9656461374154504, "grad_norm": 14.48027515411377, "learning_rate": 9.864391940420749e-06, "loss": 1.0478, "step": 10850 }, { "epoch": 0.9700961196155216, "grad_norm": 1.3597983121871948, "learning_rate": 9.862256182601554e-06, "loss": 1.2549, "step": 10900 }, { "epoch": 0.9745461018155928, "grad_norm": 1.4422001838684082, "learning_rate": 9.86010397224346e-06, "loss": 0.8965, "step": 10950 }, { "epoch": 0.978996084015664, "grad_norm": 1.2722834348678589, "learning_rate": 9.85793531662896e-06, "loss": 1.3304, "step": 11000 }, { "epoch": 0.9834460662157352, "grad_norm": 5.499887943267822, "learning_rate": 9.855750223096193e-06, "loss": 1.2654, "step": 11050 }, { "epoch": 0.9878960484158064, "grad_norm": 2.06746244430542, "learning_rate": 9.853548699038905e-06, "loss": 1.1647, "step": 11100 }, { "epoch": 0.9923460306158776, "grad_norm": 20.98439598083496, "learning_rate": 9.851330751906457e-06, "loss": 1.4341, "step": 11150 }, { "epoch": 0.9967960128159488, "grad_norm": 1.6883600950241089, "learning_rate": 9.849096389203766e-06, "loss": 1.3922, "step": 11200 }, { "epoch": 1.0, "eval_f1_macro": 0.5731334970008174, "eval_f1_micro": 0.6438154750964896, "eval_loss": 1.4835697412490845, "eval_roc_macro": 0.7859875423851859, "eval_runtime": 17.1862, "eval_samples_per_second": 290.931, "eval_steps_per_second": 9.135, "step": 11236 }, { "epoch": 1.00124599501602, "grad_norm": 1.949639081954956, "learning_rate": 9.846845618491308e-06, "loss": 1.1121, "step": 11250 }, { "epoch": 1.0056959772160912, "grad_norm": 9.548723220825195, "learning_rate": 9.844578447385066e-06, "loss": 1.1523, "step": 11300 }, { "epoch": 1.0101459594161624, "grad_norm": 4.071320056915283, "learning_rate": 9.842294883556531e-06, "loss": 1.2337, "step": 11350 }, { "epoch": 1.0145959416162336, "grad_norm": 1.6248520612716675, "learning_rate": 9.83999493473265e-06, "loss": 1.0516, "step": 11400 }, { "epoch": 1.0190459238163048, "grad_norm": 2.9080331325531006, "learning_rate": 9.837678608695822e-06, "loss": 1.284, "step": 11450 }, { "epoch": 1.023495906016376, "grad_norm": 7.0387349128723145, "learning_rate": 9.835345913283853e-06, "loss": 1.2029, "step": 11500 }, { "epoch": 1.0279458882164472, "grad_norm": 17.00929832458496, "learning_rate": 9.832996856389947e-06, "loss": 1.3472, "step": 11550 }, { "epoch": 1.0323958704165184, "grad_norm": 3.5550198554992676, "learning_rate": 9.830631445962667e-06, "loss": 1.1013, "step": 11600 }, { "epoch": 1.0368458526165896, "grad_norm": 3.269357442855835, "learning_rate": 9.828249690005911e-06, "loss": 1.0927, "step": 11650 }, { "epoch": 1.0412958348166608, "grad_norm": 1.844076156616211, "learning_rate": 9.825851596578886e-06, "loss": 1.1606, "step": 11700 }, { "epoch": 1.045745817016732, "grad_norm": 2.7527997493743896, "learning_rate": 9.823437173796081e-06, "loss": 1.1469, "step": 11750 }, { "epoch": 1.0501957992168032, "grad_norm": 2.3174569606781006, "learning_rate": 9.821006429827243e-06, "loss": 1.2882, "step": 11800 }, { "epoch": 1.0546457814168744, "grad_norm": 4.85365629196167, "learning_rate": 9.818559372897338e-06, "loss": 1.1565, "step": 11850 }, { "epoch": 1.0590957636169456, "grad_norm": 54.91732406616211, "learning_rate": 9.816096011286534e-06, "loss": 1.165, "step": 11900 }, { "epoch": 1.0635457458170168, "grad_norm": 6.512928009033203, "learning_rate": 9.813616353330172e-06, "loss": 1.1781, "step": 11950 }, { "epoch": 1.067995728017088, "grad_norm": 14.093378067016602, "learning_rate": 9.811120407418732e-06, "loss": 0.9137, "step": 12000 }, { "epoch": 1.0724457102171592, "grad_norm": 24.66497230529785, "learning_rate": 9.80860818199781e-06, "loss": 1.2257, "step": 12050 }, { "epoch": 1.0768956924172304, "grad_norm": 11.153156280517578, "learning_rate": 9.806079685568085e-06, "loss": 1.2621, "step": 12100 }, { "epoch": 1.0813456746173016, "grad_norm": 3.1181159019470215, "learning_rate": 9.803534926685295e-06, "loss": 1.2942, "step": 12150 }, { "epoch": 1.0857956568173728, "grad_norm": 2.6894562244415283, "learning_rate": 9.800973913960206e-06, "loss": 1.1701, "step": 12200 }, { "epoch": 1.090245639017444, "grad_norm": 6.137502670288086, "learning_rate": 9.79839665605858e-06, "loss": 1.2406, "step": 12250 }, { "epoch": 1.0946956212175152, "grad_norm": 14.08859634399414, "learning_rate": 9.795803161701148e-06, "loss": 1.3194, "step": 12300 }, { "epoch": 1.0991456034175864, "grad_norm": 2.133207082748413, "learning_rate": 9.793193439663585e-06, "loss": 1.1323, "step": 12350 }, { "epoch": 1.1035955856176576, "grad_norm": 4.284803867340088, "learning_rate": 9.790567498776473e-06, "loss": 1.1972, "step": 12400 }, { "epoch": 1.1080455678177288, "grad_norm": 12.784284591674805, "learning_rate": 9.787925347925273e-06, "loss": 1.1307, "step": 12450 }, { "epoch": 1.1124955500178, "grad_norm": 2.8994858264923096, "learning_rate": 9.785266996050298e-06, "loss": 1.2852, "step": 12500 }, { "epoch": 1.1169455322178712, "grad_norm": 3.0852248668670654, "learning_rate": 9.782592452146682e-06, "loss": 0.9786, "step": 12550 }, { "epoch": 1.1213955144179424, "grad_norm": 6.462624549865723, "learning_rate": 9.779901725264343e-06, "loss": 1.0721, "step": 12600 }, { "epoch": 1.1258454966180136, "grad_norm": 4.060140132904053, "learning_rate": 9.777194824507965e-06, "loss": 1.055, "step": 12650 }, { "epoch": 1.1302954788180848, "grad_norm": 75.92053985595703, "learning_rate": 9.774471759036956e-06, "loss": 1.3518, "step": 12700 }, { "epoch": 1.134745461018156, "grad_norm": 2.528672456741333, "learning_rate": 9.771732538065419e-06, "loss": 1.0876, "step": 12750 }, { "epoch": 1.1391954432182272, "grad_norm": 2.1993253231048584, "learning_rate": 9.768977170862125e-06, "loss": 1.1564, "step": 12800 }, { "epoch": 1.1436454254182984, "grad_norm": 31.24458885192871, "learning_rate": 9.76626125491385e-06, "loss": 1.0644, "step": 12850 }, { "epoch": 1.1480954076183696, "grad_norm": 1.644692063331604, "learning_rate": 9.763473945730217e-06, "loss": 1.5248, "step": 12900 }, { "epoch": 1.1525453898184408, "grad_norm": 3.0403051376342773, "learning_rate": 9.760670518259634e-06, "loss": 1.1142, "step": 12950 }, { "epoch": 1.156995372018512, "grad_norm": 2.0886764526367188, "learning_rate": 9.75785098198813e-06, "loss": 1.0546, "step": 13000 }, { "epoch": 1.1614453542185832, "grad_norm": 2.114189386367798, "learning_rate": 9.755015346456243e-06, "loss": 1.3302, "step": 13050 }, { "epoch": 1.1658953364186544, "grad_norm": 2.0161049365997314, "learning_rate": 9.752163621258983e-06, "loss": 0.9879, "step": 13100 }, { "epoch": 1.1703453186187256, "grad_norm": 18.574996948242188, "learning_rate": 9.749295816045805e-06, "loss": 1.2368, "step": 13150 }, { "epoch": 1.1747953008187968, "grad_norm": 4.07729959487915, "learning_rate": 9.746411940520576e-06, "loss": 1.002, "step": 13200 }, { "epoch": 1.179245283018868, "grad_norm": 10.727592468261719, "learning_rate": 9.743512004441533e-06, "loss": 1.4668, "step": 13250 }, { "epoch": 1.1836952652189392, "grad_norm": 7.894482135772705, "learning_rate": 9.740596017621265e-06, "loss": 0.9369, "step": 13300 }, { "epoch": 1.1881452474190104, "grad_norm": 3.6572136878967285, "learning_rate": 9.737663989926674e-06, "loss": 1.074, "step": 13350 }, { "epoch": 1.1925952296190816, "grad_norm": 3.9508605003356934, "learning_rate": 9.73471593127893e-06, "loss": 1.0137, "step": 13400 }, { "epoch": 1.1970452118191528, "grad_norm": 1.306636095046997, "learning_rate": 9.73175185165346e-06, "loss": 0.9727, "step": 13450 }, { "epoch": 1.201495194019224, "grad_norm": 3.1539437770843506, "learning_rate": 9.728771761079889e-06, "loss": 1.098, "step": 13500 }, { "epoch": 1.2059451762192952, "grad_norm": 1.5145061016082764, "learning_rate": 9.725775669642028e-06, "loss": 1.3266, "step": 13550 }, { "epoch": 1.2103951584193664, "grad_norm": 3.1356329917907715, "learning_rate": 9.722763587477827e-06, "loss": 1.2718, "step": 13600 }, { "epoch": 1.2148451406194376, "grad_norm": 3.084242343902588, "learning_rate": 9.719735524779344e-06, "loss": 1.0294, "step": 13650 }, { "epoch": 1.2192951228195088, "grad_norm": 3.5138604640960693, "learning_rate": 9.71669149179271e-06, "loss": 0.9782, "step": 13700 }, { "epoch": 1.22374510501958, "grad_norm": 4.40930700302124, "learning_rate": 9.713631498818097e-06, "loss": 1.0357, "step": 13750 }, { "epoch": 1.2281950872196512, "grad_norm": 3.1267247200012207, "learning_rate": 9.71055555620968e-06, "loss": 0.9359, "step": 13800 }, { "epoch": 1.2326450694197224, "grad_norm": 3.8787832260131836, "learning_rate": 9.707463674375605e-06, "loss": 1.3417, "step": 13850 }, { "epoch": 1.2370950516197936, "grad_norm": 3.748028039932251, "learning_rate": 9.70435586377795e-06, "loss": 1.2548, "step": 13900 }, { "epoch": 1.2415450338198648, "grad_norm": 1.7193983793258667, "learning_rate": 9.701232134932693e-06, "loss": 1.179, "step": 13950 }, { "epoch": 1.245995016019936, "grad_norm": 1.3606804609298706, "learning_rate": 9.698092498409671e-06, "loss": 1.5612, "step": 14000 }, { "epoch": 1.250444998220007, "grad_norm": 2.073108673095703, "learning_rate": 9.694936964832553e-06, "loss": 1.0429, "step": 14050 }, { "epoch": 1.2548949804200782, "grad_norm": 39.97510528564453, "learning_rate": 9.691765544878797e-06, "loss": 1.1086, "step": 14100 }, { "epoch": 1.2593449626201494, "grad_norm": 5.577615737915039, "learning_rate": 9.688578249279618e-06, "loss": 1.2737, "step": 14150 }, { "epoch": 1.2637949448202206, "grad_norm": 4.211680889129639, "learning_rate": 9.68537508881995e-06, "loss": 1.0693, "step": 14200 }, { "epoch": 1.2682449270202918, "grad_norm": 36.70176315307617, "learning_rate": 9.682156074338405e-06, "loss": 1.2561, "step": 14250 }, { "epoch": 1.272694909220363, "grad_norm": 1.8267853260040283, "learning_rate": 9.678921216727243e-06, "loss": 1.215, "step": 14300 }, { "epoch": 1.2771448914204342, "grad_norm": 2.5578699111938477, "learning_rate": 9.675670526932335e-06, "loss": 1.2996, "step": 14350 }, { "epoch": 1.2815948736205054, "grad_norm": 1.339385747909546, "learning_rate": 9.672404015953124e-06, "loss": 1.1302, "step": 14400 }, { "epoch": 1.2860448558205766, "grad_norm": 2.060532331466675, "learning_rate": 9.66912169484258e-06, "loss": 1.4199, "step": 14450 }, { "epoch": 1.2904948380206478, "grad_norm": 103.55339050292969, "learning_rate": 9.66582357470718e-06, "loss": 1.0826, "step": 14500 }, { "epoch": 1.294944820220719, "grad_norm": 2.465893268585205, "learning_rate": 9.662509666706855e-06, "loss": 1.0528, "step": 14550 }, { "epoch": 1.2993948024207902, "grad_norm": 4.356234073638916, "learning_rate": 9.65917998205496e-06, "loss": 0.9369, "step": 14600 }, { "epoch": 1.3038447846208614, "grad_norm": 3.971658945083618, "learning_rate": 9.655834532018233e-06, "loss": 1.0668, "step": 14650 }, { "epoch": 1.3082947668209326, "grad_norm": 127.91210174560547, "learning_rate": 9.652473327916756e-06, "loss": 1.061, "step": 14700 }, { "epoch": 1.3127447490210038, "grad_norm": 3.902003288269043, "learning_rate": 9.649096381123921e-06, "loss": 1.3372, "step": 14750 }, { "epoch": 1.317194731221075, "grad_norm": 3.8342809677124023, "learning_rate": 9.645703703066389e-06, "loss": 1.4228, "step": 14800 }, { "epoch": 1.3216447134211462, "grad_norm": 24.404268264770508, "learning_rate": 9.642295305224049e-06, "loss": 1.0666, "step": 14850 }, { "epoch": 1.3260946956212174, "grad_norm": 1.8024643659591675, "learning_rate": 9.638871199129983e-06, "loss": 1.0447, "step": 14900 }, { "epoch": 1.3305446778212886, "grad_norm": 2.399761915206909, "learning_rate": 9.635431396370425e-06, "loss": 1.4002, "step": 14950 }, { "epoch": 1.3349946600213598, "grad_norm": 2.1221187114715576, "learning_rate": 9.631975908584723e-06, "loss": 1.1807, "step": 15000 }, { "epoch": 1.339444642221431, "grad_norm": 2.58046293258667, "learning_rate": 9.628504747465299e-06, "loss": 1.2697, "step": 15050 }, { "epoch": 1.3438946244215022, "grad_norm": 2.4955642223358154, "learning_rate": 9.625017924757607e-06, "loss": 1.3565, "step": 15100 }, { "epoch": 1.3483446066215734, "grad_norm": 2.9939377307891846, "learning_rate": 9.621515452260097e-06, "loss": 1.3135, "step": 15150 }, { "epoch": 1.3527945888216446, "grad_norm": 4.295444488525391, "learning_rate": 9.617997341824175e-06, "loss": 1.2504, "step": 15200 }, { "epoch": 1.3572445710217158, "grad_norm": 2.983476161956787, "learning_rate": 9.614463605354156e-06, "loss": 1.3785, "step": 15250 }, { "epoch": 1.361694553221787, "grad_norm": 7.761312961578369, "learning_rate": 9.61091425480724e-06, "loss": 1.3405, "step": 15300 }, { "epoch": 1.3661445354218582, "grad_norm": 57.313018798828125, "learning_rate": 9.607349302193447e-06, "loss": 1.0072, "step": 15350 }, { "epoch": 1.3705945176219294, "grad_norm": 3.6994099617004395, "learning_rate": 9.603768759575601e-06, "loss": 1.291, "step": 15400 }, { "epoch": 1.3750444998220006, "grad_norm": 1.5268616676330566, "learning_rate": 9.600172639069274e-06, "loss": 1.066, "step": 15450 }, { "epoch": 1.3794944820220718, "grad_norm": 21.69536590576172, "learning_rate": 9.596560952842749e-06, "loss": 1.2371, "step": 15500 }, { "epoch": 1.383944464222143, "grad_norm": 1.630044937133789, "learning_rate": 9.59293371311698e-06, "loss": 0.9359, "step": 15550 }, { "epoch": 1.3883944464222142, "grad_norm": 14.258574485778809, "learning_rate": 9.589290932165551e-06, "loss": 1.0232, "step": 15600 }, { "epoch": 1.3928444286222854, "grad_norm": 2.067307472229004, "learning_rate": 9.58563262231463e-06, "loss": 1.4709, "step": 15650 }, { "epoch": 1.3972944108223566, "grad_norm": 1.6463223695755005, "learning_rate": 9.581958795942931e-06, "loss": 1.0904, "step": 15700 }, { "epoch": 1.4017443930224278, "grad_norm": 1.8261125087738037, "learning_rate": 9.578269465481675e-06, "loss": 1.4242, "step": 15750 }, { "epoch": 1.406194375222499, "grad_norm": 3.1252987384796143, "learning_rate": 9.574564643414544e-06, "loss": 0.9622, "step": 15800 }, { "epoch": 1.4106443574225702, "grad_norm": 5.265848636627197, "learning_rate": 9.570844342277634e-06, "loss": 0.9704, "step": 15850 }, { "epoch": 1.4150943396226414, "grad_norm": 39.59652328491211, "learning_rate": 9.567108574659426e-06, "loss": 1.2369, "step": 15900 }, { "epoch": 1.4195443218227126, "grad_norm": 22.356395721435547, "learning_rate": 9.563357353200726e-06, "loss": 0.9315, "step": 15950 }, { "epoch": 1.4239943040227838, "grad_norm": 3.433993339538574, "learning_rate": 9.559590690594642e-06, "loss": 1.4935, "step": 16000 }, { "epoch": 1.428444286222855, "grad_norm": 7.673436164855957, "learning_rate": 9.555808599586522e-06, "loss": 1.0998, "step": 16050 }, { "epoch": 1.4328942684229262, "grad_norm": 32.12130355834961, "learning_rate": 9.552011092973925e-06, "loss": 1.1462, "step": 16100 }, { "epoch": 1.4373442506229974, "grad_norm": 3.335659980773926, "learning_rate": 9.548198183606567e-06, "loss": 1.048, "step": 16150 }, { "epoch": 1.4417942328230686, "grad_norm": 4.108005046844482, "learning_rate": 9.544369884386289e-06, "loss": 1.3477, "step": 16200 }, { "epoch": 1.4462442150231398, "grad_norm": 4.166143417358398, "learning_rate": 9.540526208267001e-06, "loss": 1.149, "step": 16250 }, { "epoch": 1.450694197223211, "grad_norm": 3.79103422164917, "learning_rate": 9.536667168254648e-06, "loss": 1.1809, "step": 16300 }, { "epoch": 1.4551441794232822, "grad_norm": 2.854372501373291, "learning_rate": 9.53279277740716e-06, "loss": 0.992, "step": 16350 }, { "epoch": 1.4595941616233534, "grad_norm": 34.19712829589844, "learning_rate": 9.528903048834412e-06, "loss": 1.0187, "step": 16400 }, { "epoch": 1.4640441438234246, "grad_norm": 5.673674583435059, "learning_rate": 9.524997995698178e-06, "loss": 1.0937, "step": 16450 }, { "epoch": 1.4684941260234958, "grad_norm": 22.42845344543457, "learning_rate": 9.521077631212081e-06, "loss": 1.0844, "step": 16500 }, { "epoch": 1.472944108223567, "grad_norm": 3.2464911937713623, "learning_rate": 9.517141968641562e-06, "loss": 0.9007, "step": 16550 }, { "epoch": 1.4773940904236382, "grad_norm": 4.14447021484375, "learning_rate": 9.513191021303816e-06, "loss": 0.9134, "step": 16600 }, { "epoch": 1.4818440726237094, "grad_norm": 3.0078744888305664, "learning_rate": 9.509224802567769e-06, "loss": 1.285, "step": 16650 }, { "epoch": 1.4862940548237806, "grad_norm": 50.43156814575195, "learning_rate": 9.505243325854013e-06, "loss": 1.3349, "step": 16700 }, { "epoch": 1.4907440370238518, "grad_norm": 8.508861541748047, "learning_rate": 9.50124660463477e-06, "loss": 1.1369, "step": 16750 }, { "epoch": 1.495194019223923, "grad_norm": 3.2918832302093506, "learning_rate": 9.497234652433847e-06, "loss": 1.2296, "step": 16800 }, { "epoch": 1.4996440014239942, "grad_norm": 2.637911319732666, "learning_rate": 9.493207482826588e-06, "loss": 1.0119, "step": 16850 }, { "epoch": 1.5040939836240654, "grad_norm": 1.2078408002853394, "learning_rate": 9.48916510943983e-06, "loss": 1.3002, "step": 16900 }, { "epoch": 1.5085439658241366, "grad_norm": 4.45991325378418, "learning_rate": 9.48510754595185e-06, "loss": 1.3907, "step": 16950 }, { "epoch": 1.5129939480242078, "grad_norm": 4.222795486450195, "learning_rate": 9.481034806092332e-06, "loss": 1.2051, "step": 17000 }, { "epoch": 1.517443930224279, "grad_norm": 1.4514007568359375, "learning_rate": 9.476946903642307e-06, "loss": 1.0817, "step": 17050 }, { "epoch": 1.5218939124243502, "grad_norm": 2.8987677097320557, "learning_rate": 9.472843852434115e-06, "loss": 1.1361, "step": 17100 }, { "epoch": 1.5263438946244214, "grad_norm": 15.554654121398926, "learning_rate": 9.468725666351351e-06, "loss": 1.3591, "step": 17150 }, { "epoch": 1.5307938768244926, "grad_norm": 2.6242129802703857, "learning_rate": 9.464592359328826e-06, "loss": 1.1506, "step": 17200 }, { "epoch": 1.5352438590245638, "grad_norm": 13.537214279174805, "learning_rate": 9.460443945352516e-06, "loss": 1.2651, "step": 17250 }, { "epoch": 1.539693841224635, "grad_norm": 4.483321189880371, "learning_rate": 9.456280438459514e-06, "loss": 1.1459, "step": 17300 }, { "epoch": 1.5441438234247062, "grad_norm": 2.2444071769714355, "learning_rate": 9.452101852737983e-06, "loss": 1.081, "step": 17350 }, { "epoch": 1.5485938056247774, "grad_norm": 43.67666244506836, "learning_rate": 9.44790820232711e-06, "loss": 1.2404, "step": 17400 }, { "epoch": 1.5530437878248486, "grad_norm": 10.265786170959473, "learning_rate": 9.443699501417053e-06, "loss": 1.1605, "step": 17450 }, { "epoch": 1.5574937700249198, "grad_norm": 4.091914176940918, "learning_rate": 9.439475764248902e-06, "loss": 1.2294, "step": 17500 }, { "epoch": 1.561943752224991, "grad_norm": 6.255279064178467, "learning_rate": 9.435237005114622e-06, "loss": 1.1792, "step": 17550 }, { "epoch": 1.5663937344250622, "grad_norm": 4.586801052093506, "learning_rate": 9.43098323835701e-06, "loss": 0.7805, "step": 17600 }, { "epoch": 1.5708437166251334, "grad_norm": 3.979323148727417, "learning_rate": 9.426714478369644e-06, "loss": 1.1521, "step": 17650 }, { "epoch": 1.5752936988252046, "grad_norm": 18.58481788635254, "learning_rate": 9.422430739596832e-06, "loss": 1.1825, "step": 17700 }, { "epoch": 1.5797436810252758, "grad_norm": 3.5666186809539795, "learning_rate": 9.418132036533571e-06, "loss": 1.0548, "step": 17750 }, { "epoch": 1.584193663225347, "grad_norm": 2.110865592956543, "learning_rate": 9.413818383725492e-06, "loss": 0.9357, "step": 17800 }, { "epoch": 1.5886436454254182, "grad_norm": 4.222817897796631, "learning_rate": 9.40948979576881e-06, "loss": 1.098, "step": 17850 }, { "epoch": 1.5930936276254895, "grad_norm": 5.590696334838867, "learning_rate": 9.405146287310276e-06, "loss": 1.3779, "step": 17900 }, { "epoch": 1.5975436098255607, "grad_norm": 17.699539184570312, "learning_rate": 9.40078787304713e-06, "loss": 1.4143, "step": 17950 }, { "epoch": 1.6019935920256319, "grad_norm": 3.137864589691162, "learning_rate": 9.396414567727052e-06, "loss": 1.191, "step": 18000 }, { "epoch": 1.606443574225703, "grad_norm": 1.2940500974655151, "learning_rate": 9.3920263861481e-06, "loss": 0.9338, "step": 18050 }, { "epoch": 1.6108935564257743, "grad_norm": 8.280607223510742, "learning_rate": 9.387623343158676e-06, "loss": 1.0956, "step": 18100 }, { "epoch": 1.6153435386258455, "grad_norm": 4.7189154624938965, "learning_rate": 9.383205453657467e-06, "loss": 1.0332, "step": 18150 }, { "epoch": 1.6197935208259167, "grad_norm": 9.871176719665527, "learning_rate": 9.378772732593401e-06, "loss": 1.1304, "step": 18200 }, { "epoch": 1.6242435030259879, "grad_norm": 5.7673773765563965, "learning_rate": 9.374325194965583e-06, "loss": 1.2543, "step": 18250 }, { "epoch": 1.628693485226059, "grad_norm": 2.566429853439331, "learning_rate": 9.369862855823265e-06, "loss": 1.1047, "step": 18300 }, { "epoch": 1.6331434674261303, "grad_norm": 26.976482391357422, "learning_rate": 9.365385730265768e-06, "loss": 1.2725, "step": 18350 }, { "epoch": 1.6375934496262015, "grad_norm": 2.1873958110809326, "learning_rate": 9.36089383344246e-06, "loss": 1.1866, "step": 18400 }, { "epoch": 1.6420434318262727, "grad_norm": 3.5106539726257324, "learning_rate": 9.356387180552688e-06, "loss": 0.8293, "step": 18450 }, { "epoch": 1.6464934140263439, "grad_norm": 4.386513710021973, "learning_rate": 9.35186578684572e-06, "loss": 1.1414, "step": 18500 }, { "epoch": 1.650943396226415, "grad_norm": 1.6417053937911987, "learning_rate": 9.347329667620715e-06, "loss": 0.8623, "step": 18550 }, { "epoch": 1.6553933784264863, "grad_norm": 4.800745487213135, "learning_rate": 9.342778838226652e-06, "loss": 1.2382, "step": 18600 }, { "epoch": 1.6598433606265575, "grad_norm": 4.2037811279296875, "learning_rate": 9.33821331406229e-06, "loss": 1.1082, "step": 18650 }, { "epoch": 1.6642933428266287, "grad_norm": 4.587414264678955, "learning_rate": 9.333633110576104e-06, "loss": 1.007, "step": 18700 }, { "epoch": 1.6687433250266999, "grad_norm": 0.9835273027420044, "learning_rate": 9.329038243266247e-06, "loss": 1.2081, "step": 18750 }, { "epoch": 1.673193307226771, "grad_norm": 1.7699389457702637, "learning_rate": 9.324428727680486e-06, "loss": 0.9018, "step": 18800 }, { "epoch": 1.6776432894268423, "grad_norm": 4.841673374176025, "learning_rate": 9.319804579416156e-06, "loss": 0.9412, "step": 18850 }, { "epoch": 1.6820932716269135, "grad_norm": 3.841601610183716, "learning_rate": 9.315165814120106e-06, "loss": 1.0312, "step": 18900 }, { "epoch": 1.6865432538269847, "grad_norm": 1.6013894081115723, "learning_rate": 9.310512447488639e-06, "loss": 1.2405, "step": 18950 }, { "epoch": 1.6909932360270559, "grad_norm": 3.2987258434295654, "learning_rate": 9.305937997148593e-06, "loss": 1.1916, "step": 19000 }, { "epoch": 1.695443218227127, "grad_norm": 25.56344223022461, "learning_rate": 9.301255766373573e-06, "loss": 1.2343, "step": 19050 }, { "epoch": 1.6998932004271983, "grad_norm": 1.9605740308761597, "learning_rate": 9.29655898133092e-06, "loss": 1.0114, "step": 19100 }, { "epoch": 1.7043431826272695, "grad_norm": 3.098889112472534, "learning_rate": 9.29184765791326e-06, "loss": 1.0922, "step": 19150 }, { "epoch": 1.7087931648273407, "grad_norm": 28.6696720123291, "learning_rate": 9.28712181206242e-06, "loss": 0.9569, "step": 19200 }, { "epoch": 1.7132431470274119, "grad_norm": 12.398526191711426, "learning_rate": 9.282381459769361e-06, "loss": 1.3861, "step": 19250 }, { "epoch": 1.717693129227483, "grad_norm": 2.8596978187561035, "learning_rate": 9.277626617074134e-06, "loss": 1.1212, "step": 19300 }, { "epoch": 1.7221431114275543, "grad_norm": 5.168841361999512, "learning_rate": 9.27285730006582e-06, "loss": 1.1481, "step": 19350 }, { "epoch": 1.7265930936276255, "grad_norm": 6.3637285232543945, "learning_rate": 9.268073524882477e-06, "loss": 1.0321, "step": 19400 }, { "epoch": 1.7310430758276967, "grad_norm": 2.3252317905426025, "learning_rate": 9.263275307711088e-06, "loss": 1.181, "step": 19450 }, { "epoch": 1.7354930580277679, "grad_norm": 2.9167275428771973, "learning_rate": 9.258462664787497e-06, "loss": 1.3372, "step": 19500 }, { "epoch": 1.739943040227839, "grad_norm": 1.907272458076477, "learning_rate": 9.25363561239637e-06, "loss": 0.9545, "step": 19550 }, { "epoch": 1.7443930224279103, "grad_norm": 1.4615880250930786, "learning_rate": 9.248794166871122e-06, "loss": 0.849, "step": 19600 }, { "epoch": 1.7488430046279815, "grad_norm": 6.986257553100586, "learning_rate": 9.243938344593878e-06, "loss": 1.2813, "step": 19650 }, { "epoch": 1.7532929868280527, "grad_norm": 2.2245941162109375, "learning_rate": 9.239068161995404e-06, "loss": 1.1946, "step": 19700 }, { "epoch": 1.7577429690281239, "grad_norm": 50.05204391479492, "learning_rate": 9.234183635555059e-06, "loss": 1.4131, "step": 19750 }, { "epoch": 1.762192951228195, "grad_norm": 2.8868045806884766, "learning_rate": 9.22928478180074e-06, "loss": 0.9141, "step": 19800 }, { "epoch": 1.7666429334282663, "grad_norm": 20.075448989868164, "learning_rate": 9.22437161730882e-06, "loss": 0.8691, "step": 19850 }, { "epoch": 1.7710929156283375, "grad_norm": 2.335299253463745, "learning_rate": 9.219444158704102e-06, "loss": 1.2345, "step": 19900 }, { "epoch": 1.7755428978284087, "grad_norm": 3.9631264209747314, "learning_rate": 9.214502422659747e-06, "loss": 1.2498, "step": 19950 }, { "epoch": 1.7799928800284799, "grad_norm": 2.0723419189453125, "learning_rate": 9.209546425897236e-06, "loss": 0.8745, "step": 20000 }, { "epoch": 1.784442862228551, "grad_norm": 2.13045597076416, "learning_rate": 9.204576185186296e-06, "loss": 1.3863, "step": 20050 }, { "epoch": 1.7888928444286223, "grad_norm": 1.6986653804779053, "learning_rate": 9.19959171734486e-06, "loss": 0.9166, "step": 20100 }, { "epoch": 1.7933428266286935, "grad_norm": 4.065128326416016, "learning_rate": 9.194593039238995e-06, "loss": 1.2152, "step": 20150 }, { "epoch": 1.7977928088287647, "grad_norm": 4.584660053253174, "learning_rate": 9.189580167782854e-06, "loss": 1.5208, "step": 20200 }, { "epoch": 1.8022427910288359, "grad_norm": 11.462759017944336, "learning_rate": 9.184553119938618e-06, "loss": 1.0526, "step": 20250 }, { "epoch": 1.806692773228907, "grad_norm": 6.428556442260742, "learning_rate": 9.179511912716437e-06, "loss": 1.0469, "step": 20300 }, { "epoch": 1.8111427554289783, "grad_norm": 1.5907330513000488, "learning_rate": 9.174456563174366e-06, "loss": 1.1875, "step": 20350 }, { "epoch": 1.8155927376290495, "grad_norm": 1.8544707298278809, "learning_rate": 9.169387088418324e-06, "loss": 1.3646, "step": 20400 }, { "epoch": 1.8200427198291207, "grad_norm": 5.369518280029297, "learning_rate": 9.164303505602018e-06, "loss": 0.8809, "step": 20450 }, { "epoch": 1.824492702029192, "grad_norm": 41.83477783203125, "learning_rate": 9.159205831926896e-06, "loss": 1.4172, "step": 20500 }, { "epoch": 1.828942684229263, "grad_norm": 10.984286308288574, "learning_rate": 9.154094084642084e-06, "loss": 1.1946, "step": 20550 }, { "epoch": 1.8333926664293343, "grad_norm": 7.0459980964660645, "learning_rate": 9.148968281044333e-06, "loss": 0.9949, "step": 20600 }, { "epoch": 1.8378426486294055, "grad_norm": 3.9733259677886963, "learning_rate": 9.14382843847795e-06, "loss": 1.3764, "step": 20650 }, { "epoch": 1.8422926308294767, "grad_norm": 38.754398345947266, "learning_rate": 9.138674574334754e-06, "loss": 1.2117, "step": 20700 }, { "epoch": 1.846742613029548, "grad_norm": 6.152956485748291, "learning_rate": 9.133506706054001e-06, "loss": 1.2517, "step": 20750 }, { "epoch": 1.851192595229619, "grad_norm": 16.012182235717773, "learning_rate": 9.12832485112234e-06, "loss": 1.1585, "step": 20800 }, { "epoch": 1.8556425774296903, "grad_norm": 6.728699684143066, "learning_rate": 9.123129027073745e-06, "loss": 1.1423, "step": 20850 }, { "epoch": 1.8600925596297615, "grad_norm": 5.004445552825928, "learning_rate": 9.117919251489455e-06, "loss": 1.5617, "step": 20900 }, { "epoch": 1.8645425418298327, "grad_norm": 1.446129560470581, "learning_rate": 9.112695541997919e-06, "loss": 0.9359, "step": 20950 }, { "epoch": 1.868992524029904, "grad_norm": 2.032081365585327, "learning_rate": 9.107457916274736e-06, "loss": 1.2204, "step": 21000 }, { "epoch": 1.873442506229975, "grad_norm": 2.8808462619781494, "learning_rate": 9.102206392042592e-06, "loss": 1.0414, "step": 21050 }, { "epoch": 1.8778924884300463, "grad_norm": 3.854820728302002, "learning_rate": 9.096940987071199e-06, "loss": 1.0371, "step": 21100 }, { "epoch": 1.8823424706301175, "grad_norm": 19.898290634155273, "learning_rate": 9.091767440276301e-06, "loss": 1.0004, "step": 21150 }, { "epoch": 1.8867924528301887, "grad_norm": 15.854068756103516, "learning_rate": 9.086474604049189e-06, "loss": 1.3197, "step": 21200 }, { "epoch": 1.89124243503026, "grad_norm": 1.3529242277145386, "learning_rate": 9.081167940314877e-06, "loss": 1.1424, "step": 21250 }, { "epoch": 1.895692417230331, "grad_norm": 2.192044258117676, "learning_rate": 9.07584746702966e-06, "loss": 0.9573, "step": 21300 }, { "epoch": 1.9001423994304023, "grad_norm": 3.725179433822632, "learning_rate": 9.070513202196554e-06, "loss": 1.0583, "step": 21350 }, { "epoch": 1.9045923816304735, "grad_norm": 161.891357421875, "learning_rate": 9.06516516386525e-06, "loss": 1.2946, "step": 21400 }, { "epoch": 1.9090423638305447, "grad_norm": 2.681293487548828, "learning_rate": 9.059803370132035e-06, "loss": 1.0395, "step": 21450 }, { "epoch": 1.913492346030616, "grad_norm": 1.9469075202941895, "learning_rate": 9.05442783913975e-06, "loss": 1.0148, "step": 21500 }, { "epoch": 1.917942328230687, "grad_norm": 3.344614267349243, "learning_rate": 9.049038589077713e-06, "loss": 1.2876, "step": 21550 }, { "epoch": 1.9223923104307583, "grad_norm": 1.1296032667160034, "learning_rate": 9.043635638181665e-06, "loss": 1.2043, "step": 21600 }, { "epoch": 1.9268422926308295, "grad_norm": 6.035279273986816, "learning_rate": 9.038219004733708e-06, "loss": 1.0606, "step": 21650 }, { "epoch": 1.9312922748309007, "grad_norm": 2.4318339824676514, "learning_rate": 9.032788707062242e-06, "loss": 1.1416, "step": 21700 }, { "epoch": 1.935742257030972, "grad_norm": 3.6626288890838623, "learning_rate": 9.027344763541898e-06, "loss": 1.2975, "step": 21750 }, { "epoch": 1.9401922392310431, "grad_norm": 1.704512596130371, "learning_rate": 9.02188719259349e-06, "loss": 1.1551, "step": 21800 }, { "epoch": 1.9446422214311143, "grad_norm": 5.014428615570068, "learning_rate": 9.016416012683937e-06, "loss": 1.1037, "step": 21850 }, { "epoch": 1.9490922036311855, "grad_norm": 1.1153799295425415, "learning_rate": 9.01093124232621e-06, "loss": 1.0453, "step": 21900 }, { "epoch": 1.9535421858312567, "grad_norm": 3.569368839263916, "learning_rate": 9.00543290007926e-06, "loss": 1.0537, "step": 21950 }, { "epoch": 1.957992168031328, "grad_norm": 1.7606276273727417, "learning_rate": 9.000031375160225e-06, "loss": 1.5965, "step": 22000 }, { "epoch": 1.9624421502313991, "grad_norm": 1.7233045101165771, "learning_rate": 8.994506215504933e-06, "loss": 1.0973, "step": 22050 }, { "epoch": 1.9668921324314703, "grad_norm": 2.966580867767334, "learning_rate": 8.9889675395382e-06, "loss": 0.8992, "step": 22100 }, { "epoch": 1.9713421146315415, "grad_norm": 5.9786810874938965, "learning_rate": 8.98341536600138e-06, "loss": 1.0543, "step": 22150 }, { "epoch": 1.9757920968316127, "grad_norm": 2.1468379497528076, "learning_rate": 8.977849713681506e-06, "loss": 1.2327, "step": 22200 }, { "epoch": 1.980242079031684, "grad_norm": 19.30150604248047, "learning_rate": 8.972270601411214e-06, "loss": 1.2989, "step": 22250 }, { "epoch": 1.9846920612317551, "grad_norm": 27.76581573486328, "learning_rate": 8.96667804806869e-06, "loss": 1.5043, "step": 22300 }, { "epoch": 1.9891420434318263, "grad_norm": 74.53713989257812, "learning_rate": 8.961072072577593e-06, "loss": 1.057, "step": 22350 }, { "epoch": 1.9935920256318975, "grad_norm": 5.978003978729248, "learning_rate": 8.955452693907008e-06, "loss": 1.0906, "step": 22400 }, { "epoch": 1.9980420078319687, "grad_norm": 2.036816358566284, "learning_rate": 8.949819931071364e-06, "loss": 1.4813, "step": 22450 }, { "epoch": 2.0, "eval_f1_macro": 0.5637987433580627, "eval_f1_micro": 0.6360079979135878, "eval_loss": 1.4815376996994019, "eval_roc_macro": 0.7903839789930128, "eval_runtime": 16.784, "eval_samples_per_second": 297.902, "eval_steps_per_second": 9.354, "step": 22472 }, { "epoch": 2.00249199003204, "grad_norm": 2.155324697494507, "learning_rate": 8.944173803130389e-06, "loss": 1.1606, "step": 22500 }, { "epoch": 2.006941972232111, "grad_norm": 4.189097881317139, "learning_rate": 8.938514329189024e-06, "loss": 0.9744, "step": 22550 }, { "epoch": 2.0113919544321823, "grad_norm": 1.295159101486206, "learning_rate": 8.932841528397373e-06, "loss": 1.1001, "step": 22600 }, { "epoch": 2.0158419366322535, "grad_norm": 1.9589343070983887, "learning_rate": 8.927155419950638e-06, "loss": 0.9577, "step": 22650 }, { "epoch": 2.0202919188323247, "grad_norm": 5.343440532684326, "learning_rate": 8.921456023089046e-06, "loss": 1.0358, "step": 22700 }, { "epoch": 2.024741901032396, "grad_norm": 30.961210250854492, "learning_rate": 8.91574335709779e-06, "loss": 1.1267, "step": 22750 }, { "epoch": 2.029191883232467, "grad_norm": 3.0702064037323, "learning_rate": 8.910017441306963e-06, "loss": 1.177, "step": 22800 }, { "epoch": 2.0336418654325383, "grad_norm": 3.1723406314849854, "learning_rate": 8.904278295091488e-06, "loss": 1.0468, "step": 22850 }, { "epoch": 2.0380918476326095, "grad_norm": 13.87281608581543, "learning_rate": 8.898525937871064e-06, "loss": 0.7976, "step": 22900 }, { "epoch": 2.0425418298326807, "grad_norm": 5.048751354217529, "learning_rate": 8.892760389110083e-06, "loss": 1.0209, "step": 22950 }, { "epoch": 2.046991812032752, "grad_norm": 5.139971733093262, "learning_rate": 8.886981668317579e-06, "loss": 1.3838, "step": 23000 }, { "epoch": 2.051441794232823, "grad_norm": 132.7314453125, "learning_rate": 8.881189795047154e-06, "loss": 1.1292, "step": 23050 }, { "epoch": 2.0558917764328943, "grad_norm": 3.473353385925293, "learning_rate": 8.875384788896916e-06, "loss": 1.1318, "step": 23100 }, { "epoch": 2.0603417586329655, "grad_norm": 4.366340160369873, "learning_rate": 8.869566669509415e-06, "loss": 1.3773, "step": 23150 }, { "epoch": 2.0647917408330367, "grad_norm": 2.5309484004974365, "learning_rate": 8.863735456571562e-06, "loss": 0.8047, "step": 23200 }, { "epoch": 2.069241723033108, "grad_norm": 5.370298385620117, "learning_rate": 8.857891169814584e-06, "loss": 1.1495, "step": 23250 }, { "epoch": 2.073691705233179, "grad_norm": 32.654052734375, "learning_rate": 8.85203382901394e-06, "loss": 1.2006, "step": 23300 }, { "epoch": 2.0781416874332503, "grad_norm": 11.880526542663574, "learning_rate": 8.84616345398926e-06, "loss": 1.1651, "step": 23350 }, { "epoch": 2.0825916696333215, "grad_norm": 62.83907699584961, "learning_rate": 8.840280064604283e-06, "loss": 1.2286, "step": 23400 }, { "epoch": 2.0870416518333927, "grad_norm": 17.57111358642578, "learning_rate": 8.834383680766778e-06, "loss": 0.9153, "step": 23450 }, { "epoch": 2.091491634033464, "grad_norm": 6.901736259460449, "learning_rate": 8.828474322428493e-06, "loss": 1.1368, "step": 23500 }, { "epoch": 2.095941616233535, "grad_norm": 3.7496025562286377, "learning_rate": 8.82255200958507e-06, "loss": 0.9267, "step": 23550 }, { "epoch": 2.1003915984336063, "grad_norm": 6.058346271514893, "learning_rate": 8.816616762275988e-06, "loss": 1.0274, "step": 23600 }, { "epoch": 2.1048415806336775, "grad_norm": 2.10307240486145, "learning_rate": 8.810668600584494e-06, "loss": 1.1931, "step": 23650 }, { "epoch": 2.1092915628337487, "grad_norm": 6.615518569946289, "learning_rate": 8.804707544637531e-06, "loss": 1.0985, "step": 23700 }, { "epoch": 2.11374154503382, "grad_norm": 8.819474220275879, "learning_rate": 8.79873361460568e-06, "loss": 1.0349, "step": 23750 }, { "epoch": 2.118191527233891, "grad_norm": 5.8756890296936035, "learning_rate": 8.792746830703069e-06, "loss": 1.386, "step": 23800 }, { "epoch": 2.1226415094339623, "grad_norm": 5.690371513366699, "learning_rate": 8.78674721318734e-06, "loss": 1.0267, "step": 23850 }, { "epoch": 2.1270914916340335, "grad_norm": 6.222828388214111, "learning_rate": 8.780734782359544e-06, "loss": 1.0879, "step": 23900 }, { "epoch": 2.1315414738341047, "grad_norm": 5.530284404754639, "learning_rate": 8.774709558564094e-06, "loss": 1.1169, "step": 23950 }, { "epoch": 2.135991456034176, "grad_norm": 3.2359983921051025, "learning_rate": 8.768671562188695e-06, "loss": 1.1373, "step": 24000 }, { "epoch": 2.140441438234247, "grad_norm": 40.97042465209961, "learning_rate": 8.762620813664268e-06, "loss": 0.9672, "step": 24050 }, { "epoch": 2.1448914204343184, "grad_norm": 5.081694602966309, "learning_rate": 8.756557333464882e-06, "loss": 0.9978, "step": 24100 }, { "epoch": 2.1493414026343896, "grad_norm": 32.206390380859375, "learning_rate": 8.750481142107687e-06, "loss": 1.1574, "step": 24150 }, { "epoch": 2.1537913848344608, "grad_norm": 25.25119972229004, "learning_rate": 8.744392260152845e-06, "loss": 1.296, "step": 24200 }, { "epoch": 2.158241367034532, "grad_norm": 12.113503456115723, "learning_rate": 8.738412863274928e-06, "loss": 1.392, "step": 24250 }, { "epoch": 2.162691349234603, "grad_norm": 14.096972465515137, "learning_rate": 8.732298914761341e-06, "loss": 1.1154, "step": 24300 }, { "epoch": 2.1671413314346744, "grad_norm": 5.593836784362793, "learning_rate": 8.72617233717377e-06, "loss": 1.0403, "step": 24350 }, { "epoch": 2.1715913136347456, "grad_norm": 5.61565637588501, "learning_rate": 8.720033151242869e-06, "loss": 1.1062, "step": 24400 }, { "epoch": 2.1760412958348168, "grad_norm": 42.833309173583984, "learning_rate": 8.713881377741955e-06, "loss": 0.933, "step": 24450 }, { "epoch": 2.180491278034888, "grad_norm": 5.178569793701172, "learning_rate": 8.707717037486942e-06, "loss": 1.1012, "step": 24500 }, { "epoch": 2.184941260234959, "grad_norm": 11.145857810974121, "learning_rate": 8.701540151336264e-06, "loss": 1.3169, "step": 24550 }, { "epoch": 2.1893912424350304, "grad_norm": 23.287050247192383, "learning_rate": 8.695350740190803e-06, "loss": 1.0103, "step": 24600 }, { "epoch": 2.1938412246351016, "grad_norm": 10.678844451904297, "learning_rate": 8.689148824993827e-06, "loss": 1.3111, "step": 24650 }, { "epoch": 2.1982912068351728, "grad_norm": 3.5322773456573486, "learning_rate": 8.682934426730914e-06, "loss": 1.008, "step": 24700 }, { "epoch": 2.202741189035244, "grad_norm": 5.218996047973633, "learning_rate": 8.67670756642988e-06, "loss": 1.2264, "step": 24750 }, { "epoch": 2.207191171235315, "grad_norm": 63.2972412109375, "learning_rate": 8.670468265160708e-06, "loss": 1.0576, "step": 24800 }, { "epoch": 2.2116411534353864, "grad_norm": 2.3555707931518555, "learning_rate": 8.664216544035482e-06, "loss": 0.9713, "step": 24850 }, { "epoch": 2.2160911356354576, "grad_norm": 3.9515628814697266, "learning_rate": 8.657952424208304e-06, "loss": 0.9629, "step": 24900 }, { "epoch": 2.2205411178355288, "grad_norm": 2.62445330619812, "learning_rate": 8.651675926875239e-06, "loss": 1.1797, "step": 24950 }, { "epoch": 2.2249911000356, "grad_norm": 1.7551193237304688, "learning_rate": 8.645387073274225e-06, "loss": 0.9877, "step": 25000 }, { "epoch": 2.229441082235671, "grad_norm": 8.436429023742676, "learning_rate": 8.639085884685016e-06, "loss": 1.0168, "step": 25050 }, { "epoch": 2.2338910644357424, "grad_norm": 3.9038734436035156, "learning_rate": 8.632772382429106e-06, "loss": 1.4004, "step": 25100 }, { "epoch": 2.2383410466358136, "grad_norm": 9.511496543884277, "learning_rate": 8.626446587869648e-06, "loss": 1.1553, "step": 25150 }, { "epoch": 2.2427910288358848, "grad_norm": 23.454593658447266, "learning_rate": 8.620108522411395e-06, "loss": 1.0411, "step": 25200 }, { "epoch": 2.247241011035956, "grad_norm": 3.236924171447754, "learning_rate": 8.613758207500616e-06, "loss": 1.2447, "step": 25250 }, { "epoch": 2.251690993236027, "grad_norm": 5.9351115226745605, "learning_rate": 8.607395664625034e-06, "loss": 1.132, "step": 25300 }, { "epoch": 2.2561409754360984, "grad_norm": 38.95446014404297, "learning_rate": 8.601020915313746e-06, "loss": 1.2323, "step": 25350 }, { "epoch": 2.2605909576361696, "grad_norm": 90.74901580810547, "learning_rate": 8.594633981137151e-06, "loss": 1.1448, "step": 25400 }, { "epoch": 2.2650409398362408, "grad_norm": 7.584736347198486, "learning_rate": 8.58823488370688e-06, "loss": 1.4164, "step": 25450 }, { "epoch": 2.269490922036312, "grad_norm": 119.72415161132812, "learning_rate": 8.58182364467572e-06, "loss": 1.1254, "step": 25500 }, { "epoch": 2.273940904236383, "grad_norm": 177.30332946777344, "learning_rate": 8.575400285737541e-06, "loss": 1.1962, "step": 25550 }, { "epoch": 2.2783908864364544, "grad_norm": 8.911840438842773, "learning_rate": 8.568964828627228e-06, "loss": 1.142, "step": 25600 }, { "epoch": 2.2828408686365256, "grad_norm": 4.965555667877197, "learning_rate": 8.562517295120597e-06, "loss": 1.1061, "step": 25650 }, { "epoch": 2.287290850836597, "grad_norm": 21.19535255432129, "learning_rate": 8.55605770703433e-06, "loss": 1.0081, "step": 25700 }, { "epoch": 2.291740833036668, "grad_norm": 11.012398719787598, "learning_rate": 8.549586086225898e-06, "loss": 0.9854, "step": 25750 }, { "epoch": 2.296190815236739, "grad_norm": 1.656829833984375, "learning_rate": 8.543102454593487e-06, "loss": 0.9566, "step": 25800 }, { "epoch": 2.3006407974368104, "grad_norm": 5.494473457336426, "learning_rate": 8.536606834075929e-06, "loss": 1.2099, "step": 25850 }, { "epoch": 2.3050907796368816, "grad_norm": 8.97497844696045, "learning_rate": 8.530099246652615e-06, "loss": 0.9954, "step": 25900 }, { "epoch": 2.309540761836953, "grad_norm": 3.492748975753784, "learning_rate": 8.523579714343434e-06, "loss": 1.0522, "step": 25950 }, { "epoch": 2.313990744037024, "grad_norm": 37.55433654785156, "learning_rate": 8.51704825920869e-06, "loss": 0.9149, "step": 26000 }, { "epoch": 2.318440726237095, "grad_norm": 14.30866813659668, "learning_rate": 8.510504903349036e-06, "loss": 1.2345, "step": 26050 }, { "epoch": 2.3228907084371664, "grad_norm": 4.3899970054626465, "learning_rate": 8.503949668905387e-06, "loss": 1.0823, "step": 26100 }, { "epoch": 2.3273406906372376, "grad_norm": 4.052544116973877, "learning_rate": 8.497382578058856e-06, "loss": 1.2713, "step": 26150 }, { "epoch": 2.331790672837309, "grad_norm": 99.42903900146484, "learning_rate": 8.490803653030674e-06, "loss": 1.1796, "step": 26200 }, { "epoch": 2.33624065503738, "grad_norm": 42.8563232421875, "learning_rate": 8.484212916082118e-06, "loss": 1.496, "step": 26250 }, { "epoch": 2.340690637237451, "grad_norm": 14.94514274597168, "learning_rate": 8.477610389514429e-06, "loss": 1.1291, "step": 26300 }, { "epoch": 2.3451406194375224, "grad_norm": 5.753387928009033, "learning_rate": 8.470996095668742e-06, "loss": 1.1266, "step": 26350 }, { "epoch": 2.3495906016375936, "grad_norm": 5.291345119476318, "learning_rate": 8.46437005692601e-06, "loss": 1.1334, "step": 26400 }, { "epoch": 2.354040583837665, "grad_norm": 8.006234169006348, "learning_rate": 8.45786516566637e-06, "loss": 1.1703, "step": 26450 }, { "epoch": 2.358490566037736, "grad_norm": 4.2896575927734375, "learning_rate": 8.45121593821125e-06, "loss": 1.0734, "step": 26500 }, { "epoch": 2.362940548237807, "grad_norm": 11.514307022094727, "learning_rate": 8.444555032789705e-06, "loss": 0.9845, "step": 26550 }, { "epoch": 2.3673905304378784, "grad_norm": 8.433670043945312, "learning_rate": 8.437882471940405e-06, "loss": 1.269, "step": 26600 }, { "epoch": 2.3718405126379496, "grad_norm": 70.60860443115234, "learning_rate": 8.431332075971145e-06, "loss": 1.0962, "step": 26650 }, { "epoch": 2.376290494838021, "grad_norm": 5.992774963378906, "learning_rate": 8.424636504022777e-06, "loss": 0.9536, "step": 26700 }, { "epoch": 2.380740477038092, "grad_norm": 5.762092590332031, "learning_rate": 8.417929344045485e-06, "loss": 0.9755, "step": 26750 }, { "epoch": 2.385190459238163, "grad_norm": 6.791254997253418, "learning_rate": 8.411210618734453e-06, "loss": 1.0588, "step": 26800 }, { "epoch": 2.3896404414382344, "grad_norm": 1.9339035749435425, "learning_rate": 8.404480350824005e-06, "loss": 0.9508, "step": 26850 }, { "epoch": 2.3940904236383056, "grad_norm": 5.775173664093018, "learning_rate": 8.397738563087515e-06, "loss": 1.4055, "step": 26900 }, { "epoch": 2.398540405838377, "grad_norm": 18.85881996154785, "learning_rate": 8.390985278337341e-06, "loss": 0.9507, "step": 26950 }, { "epoch": 2.402990388038448, "grad_norm": 8.913670539855957, "learning_rate": 8.384220519424744e-06, "loss": 1.0135, "step": 27000 }, { "epoch": 2.407440370238519, "grad_norm": 5.081998825073242, "learning_rate": 8.377444309239809e-06, "loss": 1.0391, "step": 27050 }, { "epoch": 2.4118903524385904, "grad_norm": 7.047842979431152, "learning_rate": 8.370656670711368e-06, "loss": 1.1259, "step": 27100 }, { "epoch": 2.4163403346386616, "grad_norm": 16.793575286865234, "learning_rate": 8.363857626806926e-06, "loss": 1.0253, "step": 27150 }, { "epoch": 2.420790316838733, "grad_norm": 28.434322357177734, "learning_rate": 8.35704720053258e-06, "loss": 1.1931, "step": 27200 }, { "epoch": 2.425240299038804, "grad_norm": 4.692121505737305, "learning_rate": 8.350225414932942e-06, "loss": 0.9322, "step": 27250 }, { "epoch": 2.429690281238875, "grad_norm": 16.48238754272461, "learning_rate": 8.343392293091056e-06, "loss": 0.8793, "step": 27300 }, { "epoch": 2.4341402634389464, "grad_norm": 4.453526020050049, "learning_rate": 8.336547858128331e-06, "loss": 1.0611, "step": 27350 }, { "epoch": 2.4385902456390176, "grad_norm": 6.173256874084473, "learning_rate": 8.329692133204454e-06, "loss": 0.9212, "step": 27400 }, { "epoch": 2.443040227839089, "grad_norm": 36.94075012207031, "learning_rate": 8.322825141517316e-06, "loss": 1.2478, "step": 27450 }, { "epoch": 2.44749021003916, "grad_norm": 6.299712181091309, "learning_rate": 8.315946906302928e-06, "loss": 1.2644, "step": 27500 }, { "epoch": 2.451940192239231, "grad_norm": 4.783332824707031, "learning_rate": 8.30905745083535e-06, "loss": 1.156, "step": 27550 }, { "epoch": 2.4563901744393024, "grad_norm": 122.3803939819336, "learning_rate": 8.302156798426603e-06, "loss": 1.2043, "step": 27600 }, { "epoch": 2.4608401566393736, "grad_norm": 8.773785591125488, "learning_rate": 8.295244972426602e-06, "loss": 0.9245, "step": 27650 }, { "epoch": 2.465290138839445, "grad_norm": 3.724982500076294, "learning_rate": 8.288321996223065e-06, "loss": 1.2747, "step": 27700 }, { "epoch": 2.469740121039516, "grad_norm": 7.775646686553955, "learning_rate": 8.281387893241442e-06, "loss": 1.2704, "step": 27750 }, { "epoch": 2.474190103239587, "grad_norm": 7.251370906829834, "learning_rate": 8.274442686944833e-06, "loss": 1.0481, "step": 27800 }, { "epoch": 2.4786400854396584, "grad_norm": 9.903209686279297, "learning_rate": 8.267486400833904e-06, "loss": 1.0447, "step": 27850 }, { "epoch": 2.4830900676397296, "grad_norm": 4.922275066375732, "learning_rate": 8.26051905844682e-06, "loss": 1.2688, "step": 27900 }, { "epoch": 2.487540049839801, "grad_norm": 22.75454330444336, "learning_rate": 8.25354068335915e-06, "loss": 0.9519, "step": 27950 }, { "epoch": 2.491990032039872, "grad_norm": 5.216652870178223, "learning_rate": 8.246551299183803e-06, "loss": 1.2772, "step": 28000 }, { "epoch": 2.496440014239943, "grad_norm": 4.686745643615723, "learning_rate": 8.239550929570927e-06, "loss": 1.0244, "step": 28050 }, { "epoch": 2.500889996440014, "grad_norm": 24.949731826782227, "learning_rate": 8.232539598207855e-06, "loss": 1.2868, "step": 28100 }, { "epoch": 2.5053399786400856, "grad_norm": 4.0583577156066895, "learning_rate": 8.225517328819006e-06, "loss": 1.1708, "step": 28150 }, { "epoch": 2.5097899608401564, "grad_norm": 9.95898723602295, "learning_rate": 8.218484145165806e-06, "loss": 0.9592, "step": 28200 }, { "epoch": 2.514239943040228, "grad_norm": 4.198066711425781, "learning_rate": 8.21144007104662e-06, "loss": 1.093, "step": 28250 }, { "epoch": 2.5186899252402988, "grad_norm": 4.381742477416992, "learning_rate": 8.204385130296655e-06, "loss": 1.2448, "step": 28300 }, { "epoch": 2.5231399074403704, "grad_norm": 5.279024124145508, "learning_rate": 8.197319346787893e-06, "loss": 1.1951, "step": 28350 }, { "epoch": 2.527589889640441, "grad_norm": 56.91624069213867, "learning_rate": 8.190242744429009e-06, "loss": 0.7142, "step": 28400 }, { "epoch": 2.532039871840513, "grad_norm": 4.616907119750977, "learning_rate": 8.183155347165275e-06, "loss": 0.8593, "step": 28450 }, { "epoch": 2.5364898540405836, "grad_norm": 3.0933165550231934, "learning_rate": 8.176057178978497e-06, "loss": 1.4901, "step": 28500 }, { "epoch": 2.5409398362406552, "grad_norm": 9.415891647338867, "learning_rate": 8.168948263886928e-06, "loss": 1.0598, "step": 28550 }, { "epoch": 2.545389818440726, "grad_norm": 4.536644458770752, "learning_rate": 8.161828625945183e-06, "loss": 1.2457, "step": 28600 }, { "epoch": 2.5498398006407976, "grad_norm": 116.62373352050781, "learning_rate": 8.154698289244158e-06, "loss": 0.9189, "step": 28650 }, { "epoch": 2.5542897828408684, "grad_norm": 120.8448715209961, "learning_rate": 8.147557277910956e-06, "loss": 1.0832, "step": 28700 }, { "epoch": 2.55873976504094, "grad_norm": 134.3202362060547, "learning_rate": 8.140405616108796e-06, "loss": 1.1851, "step": 28750 }, { "epoch": 2.563189747241011, "grad_norm": 5.872665882110596, "learning_rate": 8.133243328036934e-06, "loss": 1.0069, "step": 28800 }, { "epoch": 2.5676397294410824, "grad_norm": 8.058123588562012, "learning_rate": 8.126070437930589e-06, "loss": 0.8563, "step": 28850 }, { "epoch": 2.572089711641153, "grad_norm": 65.59220123291016, "learning_rate": 8.118886970060847e-06, "loss": 1.2164, "step": 28900 }, { "epoch": 2.576539693841225, "grad_norm": 32.869144439697266, "learning_rate": 8.111692948734593e-06, "loss": 1.4324, "step": 28950 }, { "epoch": 2.5809896760412956, "grad_norm": 4.6152167320251465, "learning_rate": 8.104488398294415e-06, "loss": 0.9489, "step": 29000 }, { "epoch": 2.5854396582413672, "grad_norm": 7.358272075653076, "learning_rate": 8.097273343118533e-06, "loss": 1.2895, "step": 29050 }, { "epoch": 2.589889640441438, "grad_norm": 59.257080078125, "learning_rate": 8.090047807620712e-06, "loss": 1.0207, "step": 29100 }, { "epoch": 2.5943396226415096, "grad_norm": 3.397690773010254, "learning_rate": 8.082811816250178e-06, "loss": 0.9065, "step": 29150 }, { "epoch": 2.5987896048415804, "grad_norm": 34.30668640136719, "learning_rate": 8.075565393491536e-06, "loss": 1.4361, "step": 29200 }, { "epoch": 2.603239587041652, "grad_norm": 6.047588348388672, "learning_rate": 8.068308563864694e-06, "loss": 1.1302, "step": 29250 }, { "epoch": 2.607689569241723, "grad_norm": 111.73908996582031, "learning_rate": 8.061041351924764e-06, "loss": 1.1882, "step": 29300 }, { "epoch": 2.6121395514417944, "grad_norm": 52.087554931640625, "learning_rate": 8.053763782261999e-06, "loss": 1.4243, "step": 29350 }, { "epoch": 2.616589533641865, "grad_norm": 40.76045608520508, "learning_rate": 8.046475879501692e-06, "loss": 1.2167, "step": 29400 }, { "epoch": 2.621039515841937, "grad_norm": 9.153675079345703, "learning_rate": 8.039177668304106e-06, "loss": 1.0961, "step": 29450 }, { "epoch": 2.6254894980420076, "grad_norm": 4.274999141693115, "learning_rate": 8.03186917336438e-06, "loss": 1.0813, "step": 29500 }, { "epoch": 2.6299394802420792, "grad_norm": 2.6339595317840576, "learning_rate": 8.024550419412453e-06, "loss": 0.8781, "step": 29550 }, { "epoch": 2.63438946244215, "grad_norm": 4.687946796417236, "learning_rate": 8.017221431212977e-06, "loss": 1.1897, "step": 29600 }, { "epoch": 2.6388394446422216, "grad_norm": 4.757798194885254, "learning_rate": 8.009882233565236e-06, "loss": 1.0778, "step": 29650 }, { "epoch": 2.6432894268422924, "grad_norm": 23.445499420166016, "learning_rate": 8.002532851303058e-06, "loss": 1.1461, "step": 29700 }, { "epoch": 2.647739409042364, "grad_norm": 3.199381113052368, "learning_rate": 7.99517330929473e-06, "loss": 1.2998, "step": 29750 }, { "epoch": 2.652189391242435, "grad_norm": 5.970043659210205, "learning_rate": 7.987803632442925e-06, "loss": 1.0217, "step": 29800 }, { "epoch": 2.6566393734425064, "grad_norm": 5.606378078460693, "learning_rate": 7.980423845684602e-06, "loss": 0.9616, "step": 29850 }, { "epoch": 2.661089355642577, "grad_norm": 120.21475982666016, "learning_rate": 7.97303397399093e-06, "loss": 1.028, "step": 29900 }, { "epoch": 2.665539337842649, "grad_norm": 5.092627048492432, "learning_rate": 7.965634042367206e-06, "loss": 1.0255, "step": 29950 }, { "epoch": 2.6699893200427196, "grad_norm": 6.353029727935791, "learning_rate": 7.958224075852764e-06, "loss": 1.1038, "step": 30000 }, { "epoch": 2.6744393022427912, "grad_norm": 4.838529109954834, "learning_rate": 7.950804099520897e-06, "loss": 0.9106, "step": 30050 }, { "epoch": 2.678889284442862, "grad_norm": 14.908506393432617, "learning_rate": 7.943374138478765e-06, "loss": 1.3749, "step": 30100 }, { "epoch": 2.6833392666429337, "grad_norm": 7.021056175231934, "learning_rate": 7.935934217867314e-06, "loss": 1.1395, "step": 30150 }, { "epoch": 2.6877892488430044, "grad_norm": 4.955234050750732, "learning_rate": 7.928484362861191e-06, "loss": 1.0474, "step": 30200 }, { "epoch": 2.692239231043076, "grad_norm": 4.593790054321289, "learning_rate": 7.921024598668657e-06, "loss": 1.1164, "step": 30250 }, { "epoch": 2.696689213243147, "grad_norm": 17.162233352661133, "learning_rate": 7.913554950531507e-06, "loss": 1.2881, "step": 30300 }, { "epoch": 2.7011391954432185, "grad_norm": 13.859789848327637, "learning_rate": 7.90607544372498e-06, "loss": 1.1034, "step": 30350 }, { "epoch": 2.705589177643289, "grad_norm": 16.67905616760254, "learning_rate": 7.898586103557665e-06, "loss": 1.2444, "step": 30400 }, { "epoch": 2.710039159843361, "grad_norm": 5.408065319061279, "learning_rate": 7.891086955371436e-06, "loss": 0.9214, "step": 30450 }, { "epoch": 2.7144891420434316, "grad_norm": 18.90802764892578, "learning_rate": 7.883578024541348e-06, "loss": 0.9308, "step": 30500 }, { "epoch": 2.7189391242435033, "grad_norm": 7.50139856338501, "learning_rate": 7.87605933647556e-06, "loss": 0.9778, "step": 30550 }, { "epoch": 2.723389106443574, "grad_norm": 2.602687120437622, "learning_rate": 7.868530916615247e-06, "loss": 0.9832, "step": 30600 }, { "epoch": 2.7278390886436457, "grad_norm": 5.7807183265686035, "learning_rate": 7.860992790434514e-06, "loss": 1.0695, "step": 30650 }, { "epoch": 2.7322890708437164, "grad_norm": 12.97180461883545, "learning_rate": 7.853444983440307e-06, "loss": 1.3614, "step": 30700 }, { "epoch": 2.736739053043788, "grad_norm": 4.461914539337158, "learning_rate": 7.845887521172334e-06, "loss": 0.9692, "step": 30750 }, { "epoch": 2.741189035243859, "grad_norm": 9.48783016204834, "learning_rate": 7.838320429202965e-06, "loss": 0.9439, "step": 30800 }, { "epoch": 2.7456390174439305, "grad_norm": 2.8514506816864014, "learning_rate": 7.830743733137166e-06, "loss": 1.2092, "step": 30850 }, { "epoch": 2.750088999644001, "grad_norm": 6.755616664886475, "learning_rate": 7.82315745861239e-06, "loss": 0.9959, "step": 30900 }, { "epoch": 2.754538981844073, "grad_norm": 3.672438859939575, "learning_rate": 7.815561631298508e-06, "loss": 0.9792, "step": 30950 }, { "epoch": 2.7589889640441436, "grad_norm": 70.43602752685547, "learning_rate": 7.807956276897712e-06, "loss": 1.0361, "step": 31000 }, { "epoch": 2.7634389462442153, "grad_norm": 6.686511039733887, "learning_rate": 7.80034142114443e-06, "loss": 0.9733, "step": 31050 }, { "epoch": 2.767888928444286, "grad_norm": 8.275307655334473, "learning_rate": 7.792717089805246e-06, "loss": 1.1822, "step": 31100 }, { "epoch": 2.7723389106443577, "grad_norm": 23.298620223999023, "learning_rate": 7.785083308678795e-06, "loss": 1.0775, "step": 31150 }, { "epoch": 2.7767888928444284, "grad_norm": 9.375283241271973, "learning_rate": 7.777440103595701e-06, "loss": 0.985, "step": 31200 }, { "epoch": 2.7812388750445, "grad_norm": 6.555306911468506, "learning_rate": 7.769787500418469e-06, "loss": 0.8715, "step": 31250 }, { "epoch": 2.785688857244571, "grad_norm": 6.745306491851807, "learning_rate": 7.762125525041404e-06, "loss": 0.9865, "step": 31300 }, { "epoch": 2.7901388394446425, "grad_norm": 4.08693265914917, "learning_rate": 7.754454203390525e-06, "loss": 1.2418, "step": 31350 }, { "epoch": 2.7945888216447132, "grad_norm": 12.843364715576172, "learning_rate": 7.746773561423478e-06, "loss": 0.7343, "step": 31400 }, { "epoch": 2.799038803844785, "grad_norm": 2.835442543029785, "learning_rate": 7.739083625129441e-06, "loss": 1.0608, "step": 31450 }, { "epoch": 2.8034887860448556, "grad_norm": 5.7765631675720215, "learning_rate": 7.731384420529051e-06, "loss": 0.9831, "step": 31500 }, { "epoch": 2.8079387682449273, "grad_norm": 266.4544372558594, "learning_rate": 7.723675973674299e-06, "loss": 1.2846, "step": 31550 }, { "epoch": 2.812388750444998, "grad_norm": 4.718108177185059, "learning_rate": 7.71595831064845e-06, "loss": 1.1094, "step": 31600 }, { "epoch": 2.8168387326450697, "grad_norm": 52.2277717590332, "learning_rate": 7.708231457565953e-06, "loss": 0.9962, "step": 31650 }, { "epoch": 2.8212887148451404, "grad_norm": 2.205535650253296, "learning_rate": 7.700495440572363e-06, "loss": 1.1389, "step": 31700 }, { "epoch": 2.825738697045212, "grad_norm": 5.4852776527404785, "learning_rate": 7.692750285844226e-06, "loss": 1.2493, "step": 31750 }, { "epoch": 2.830188679245283, "grad_norm": 12.645532608032227, "learning_rate": 7.684996019589024e-06, "loss": 1.1035, "step": 31800 }, { "epoch": 2.8346386614453545, "grad_norm": 56.84285354614258, "learning_rate": 7.677232668045062e-06, "loss": 1.2681, "step": 31850 }, { "epoch": 2.8390886436454252, "grad_norm": 5.458531856536865, "learning_rate": 7.66946025748139e-06, "loss": 1.0134, "step": 31900 }, { "epoch": 2.843538625845497, "grad_norm": 10.682284355163574, "learning_rate": 7.661678814197705e-06, "loss": 1.0421, "step": 31950 }, { "epoch": 2.8479886080455676, "grad_norm": 2.0057425498962402, "learning_rate": 7.653888364524276e-06, "loss": 1.2504, "step": 32000 }, { "epoch": 2.8524385902456393, "grad_norm": 5.69174337387085, "learning_rate": 7.646088934821848e-06, "loss": 1.1247, "step": 32050 }, { "epoch": 2.85688857244571, "grad_norm": 8.440912246704102, "learning_rate": 7.638280551481539e-06, "loss": 1.0693, "step": 32100 }, { "epoch": 2.8613385546457817, "grad_norm": 6.582889080047607, "learning_rate": 7.63046324092478e-06, "loss": 1.1101, "step": 32150 }, { "epoch": 2.8657885368458524, "grad_norm": 57.74811553955078, "learning_rate": 7.622637029603197e-06, "loss": 1.1735, "step": 32200 }, { "epoch": 2.870238519045924, "grad_norm": 145.7239990234375, "learning_rate": 7.6148019439985446e-06, "loss": 1.0937, "step": 32250 }, { "epoch": 2.874688501245995, "grad_norm": 2.1090431213378906, "learning_rate": 7.606958010622594e-06, "loss": 1.1936, "step": 32300 }, { "epoch": 2.8791384834460665, "grad_norm": 174.78880310058594, "learning_rate": 7.599105256017062e-06, "loss": 1.0315, "step": 32350 }, { "epoch": 2.8835884656461372, "grad_norm": 10.177800178527832, "learning_rate": 7.591243706753515e-06, "loss": 1.0523, "step": 32400 }, { "epoch": 2.888038447846209, "grad_norm": 2.842582941055298, "learning_rate": 7.583373389433272e-06, "loss": 1.3135, "step": 32450 }, { "epoch": 2.8924884300462796, "grad_norm": 5.6900224685668945, "learning_rate": 7.575494330687327e-06, "loss": 1.1948, "step": 32500 }, { "epoch": 2.8969384122463513, "grad_norm": 5.440971851348877, "learning_rate": 7.567606557176248e-06, "loss": 0.9205, "step": 32550 }, { "epoch": 2.901388394446422, "grad_norm": 4.601685047149658, "learning_rate": 7.559710095590094e-06, "loss": 0.8829, "step": 32600 }, { "epoch": 2.9058383766464932, "grad_norm": 14.143527030944824, "learning_rate": 7.551804972648321e-06, "loss": 1.1196, "step": 32650 }, { "epoch": 2.9102883588465644, "grad_norm": 9.098328590393066, "learning_rate": 7.543891215099692e-06, "loss": 1.2866, "step": 32700 }, { "epoch": 2.9147383410466357, "grad_norm": 6.828986644744873, "learning_rate": 7.536127381213123e-06, "loss": 1.0689, "step": 32750 }, { "epoch": 2.919188323246707, "grad_norm": 5.287408828735352, "learning_rate": 7.528196606171394e-06, "loss": 1.3178, "step": 32800 }, { "epoch": 2.923638305446778, "grad_norm": 13.490480422973633, "learning_rate": 7.520257276407033e-06, "loss": 0.9586, "step": 32850 }, { "epoch": 2.9280882876468493, "grad_norm": 45.76041793823242, "learning_rate": 7.512309418784553e-06, "loss": 0.9261, "step": 32900 }, { "epoch": 2.9325382698469205, "grad_norm": 209.08949279785156, "learning_rate": 7.504353060197321e-06, "loss": 1.175, "step": 32950 }, { "epoch": 2.9369882520469917, "grad_norm": 8.27961254119873, "learning_rate": 7.496388227567465e-06, "loss": 1.02, "step": 33000 }, { "epoch": 2.941438234247063, "grad_norm": 5.50449275970459, "learning_rate": 7.488414947845793e-06, "loss": 1.5827, "step": 33050 }, { "epoch": 2.945888216447134, "grad_norm": 3.9725236892700195, "learning_rate": 7.480433248011694e-06, "loss": 1.0297, "step": 33100 }, { "epoch": 2.9503381986472053, "grad_norm": 6.791203022003174, "learning_rate": 7.472443155073043e-06, "loss": 1.0961, "step": 33150 }, { "epoch": 2.9547881808472765, "grad_norm": 7.256369113922119, "learning_rate": 7.46444469606612e-06, "loss": 1.1382, "step": 33200 }, { "epoch": 2.9592381630473477, "grad_norm": 12.464081764221191, "learning_rate": 7.456437898055517e-06, "loss": 1.3095, "step": 33250 }, { "epoch": 2.963688145247419, "grad_norm": 47.463340759277344, "learning_rate": 7.448422788134031e-06, "loss": 1.0221, "step": 33300 }, { "epoch": 2.96813812744749, "grad_norm": 5.142919540405273, "learning_rate": 7.440559942332215e-06, "loss": 1.0761, "step": 33350 }, { "epoch": 2.9725881096475613, "grad_norm": 12.167074203491211, "learning_rate": 7.432528454866374e-06, "loss": 0.9054, "step": 33400 }, { "epoch": 2.9770380918476325, "grad_norm": 3.7981958389282227, "learning_rate": 7.424488736392641e-06, "loss": 1.0385, "step": 33450 }, { "epoch": 2.9814880740477037, "grad_norm": 7.430481433868408, "learning_rate": 7.416440814115211e-06, "loss": 1.6092, "step": 33500 }, { "epoch": 2.985938056247775, "grad_norm": 10.22213363647461, "learning_rate": 7.408384715266045e-06, "loss": 1.0588, "step": 33550 }, { "epoch": 2.990388038447846, "grad_norm": 4.126297473907471, "learning_rate": 7.400320467104767e-06, "loss": 1.1279, "step": 33600 }, { "epoch": 2.9948380206479173, "grad_norm": 10.618496894836426, "learning_rate": 7.392248096918576e-06, "loss": 1.3174, "step": 33650 }, { "epoch": 2.9992880028479885, "grad_norm": 7.148495674133301, "learning_rate": 7.384167632022156e-06, "loss": 1.0492, "step": 33700 }, { "epoch": 3.0, "eval_f1_macro": 0.5710135980875565, "eval_f1_micro": 0.6377365040791529, "eval_loss": 1.5430234670639038, "eval_roc_macro": 0.7930276463834468, "eval_runtime": 16.7403, "eval_samples_per_second": 298.68, "eval_steps_per_second": 9.379, "step": 33708 } ], "logging_steps": 50, "max_steps": 89888, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }