{ "best_global_step": 6505, "best_metric": 0.19649724662303925, "best_model_checkpoint": "/media/user/Expansion1/snowflake-arctic-embed-xs-refusal/checkpoint-6505", "epoch": 5.0, "eval_steps": 500, "global_step": 32525, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07686395080707148, "grad_norm": 2.155881881713867, "learning_rate": 4.923289777094543e-05, "loss": 0.3511, "num_input_tokens_seen": 512000, "step": 500, "train_runtime": 5.3031, "train_tokens_per_second": 96546.489 }, { "epoch": 0.15372790161414296, "grad_norm": 0.6236560344696045, "learning_rate": 4.846425826287471e-05, "loss": 0.2593, "num_input_tokens_seen": 1024000, "step": 1000, "train_runtime": 10.3436, "train_tokens_per_second": 98998.124 }, { "epoch": 0.23059185242121444, "grad_norm": 3.8176653385162354, "learning_rate": 4.7695618754804e-05, "loss": 0.2546, "num_input_tokens_seen": 1536000, "step": 1500, "train_runtime": 15.3626, "train_tokens_per_second": 99983.128 }, { "epoch": 0.3074558032282859, "grad_norm": 2.538367986679077, "learning_rate": 4.692697924673328e-05, "loss": 0.2377, "num_input_tokens_seen": 2048000, "step": 2000, "train_runtime": 20.351, "train_tokens_per_second": 100634.117 }, { "epoch": 0.3843197540353574, "grad_norm": 3.922593832015991, "learning_rate": 4.615833973866257e-05, "loss": 0.2411, "num_input_tokens_seen": 2560000, "step": 2500, "train_runtime": 25.363, "train_tokens_per_second": 100934.428 }, { "epoch": 0.4611837048424289, "grad_norm": 0.728330135345459, "learning_rate": 4.5389700230591855e-05, "loss": 0.2278, "num_input_tokens_seen": 3072000, "step": 3000, "train_runtime": 30.387, "train_tokens_per_second": 101095.729 }, { "epoch": 0.5380476556495004, "grad_norm": 0.5299816131591797, "learning_rate": 4.462106072252114e-05, "loss": 0.2422, "num_input_tokens_seen": 3584000, "step": 3500, "train_runtime": 35.3784, "train_tokens_per_second": 101304.819 }, { "epoch": 0.6149116064565718, "grad_norm": 9.030647277832031, "learning_rate": 4.3852421214450424e-05, "loss": 0.2269, "num_input_tokens_seen": 4096000, "step": 4000, "train_runtime": 40.3905, "train_tokens_per_second": 101409.866 }, { "epoch": 0.6917755572636434, "grad_norm": 1.8069450855255127, "learning_rate": 4.308378170637971e-05, "loss": 0.2378, "num_input_tokens_seen": 4608000, "step": 4500, "train_runtime": 45.4094, "train_tokens_per_second": 101476.838 }, { "epoch": 0.7686395080707148, "grad_norm": 2.4024786949157715, "learning_rate": 4.231514219830899e-05, "loss": 0.2129, "num_input_tokens_seen": 5120000, "step": 5000, "train_runtime": 50.4114, "train_tokens_per_second": 101564.317 }, { "epoch": 0.8455034588777863, "grad_norm": 0.8753976821899414, "learning_rate": 4.1546502690238284e-05, "loss": 0.2354, "num_input_tokens_seen": 5632000, "step": 5500, "train_runtime": 55.4722, "train_tokens_per_second": 101528.336 }, { "epoch": 0.9223674096848578, "grad_norm": 4.247986316680908, "learning_rate": 4.077786318216756e-05, "loss": 0.237, "num_input_tokens_seen": 6144000, "step": 6000, "train_runtime": 60.5329, "train_tokens_per_second": 101498.514 }, { "epoch": 0.9992313604919293, "grad_norm": 7.1489973068237305, "learning_rate": 4.000922367409685e-05, "loss": 0.2286, "num_input_tokens_seen": 6656000, "step": 6500, "train_runtime": 65.5361, "train_tokens_per_second": 101562.357 }, { "epoch": 1.0, "eval_accuracy": 0.919369715603382, "eval_loss": 0.19649724662303925, "eval_runtime": 5.1167, "eval_samples_per_second": 2542.665, "eval_steps_per_second": 317.98, "num_input_tokens_seen": 6661120, "step": 6505 }, { "epoch": 1.0760953112990008, "grad_norm": 10.231003761291504, "learning_rate": 3.9240584166026136e-05, "loss": 0.1751, "num_input_tokens_seen": 7168000, "step": 7000, "train_runtime": 75.9746, "train_tokens_per_second": 94347.276 }, { "epoch": 1.1529592621060722, "grad_norm": 0.037300433963537216, "learning_rate": 3.847194465795542e-05, "loss": 0.1739, "num_input_tokens_seen": 7680000, "step": 7500, "train_runtime": 81.0147, "train_tokens_per_second": 94797.624 }, { "epoch": 1.2298232129131437, "grad_norm": 10.88604736328125, "learning_rate": 3.7703305149884705e-05, "loss": 0.1923, "num_input_tokens_seen": 8192000, "step": 8000, "train_runtime": 86.0728, "train_tokens_per_second": 95175.201 }, { "epoch": 1.3066871637202153, "grad_norm": 14.25737190246582, "learning_rate": 3.693466564181399e-05, "loss": 0.1809, "num_input_tokens_seen": 8704000, "step": 8500, "train_runtime": 91.0822, "train_tokens_per_second": 95562.051 }, { "epoch": 1.3835511145272867, "grad_norm": 0.3044818639755249, "learning_rate": 3.6166026133743274e-05, "loss": 0.1922, "num_input_tokens_seen": 9216000, "step": 9000, "train_runtime": 96.1257, "train_tokens_per_second": 95874.452 }, { "epoch": 1.4604150653343582, "grad_norm": 15.291511535644531, "learning_rate": 3.5397386625672565e-05, "loss": 0.171, "num_input_tokens_seen": 9728000, "step": 9500, "train_runtime": 101.1575, "train_tokens_per_second": 96166.85 }, { "epoch": 1.5372790161414298, "grad_norm": 12.030097007751465, "learning_rate": 3.462874711760184e-05, "loss": 0.1779, "num_input_tokens_seen": 10240000, "step": 10000, "train_runtime": 106.2268, "train_tokens_per_second": 96397.497 }, { "epoch": 1.614142966948501, "grad_norm": 0.17212723195552826, "learning_rate": 3.386010760953113e-05, "loss": 0.176, "num_input_tokens_seen": 10752000, "step": 10500, "train_runtime": 111.2959, "train_tokens_per_second": 96607.35 }, { "epoch": 1.6910069177555727, "grad_norm": 35.843482971191406, "learning_rate": 3.309146810146042e-05, "loss": 0.1879, "num_input_tokens_seen": 11264000, "step": 11000, "train_runtime": 116.3574, "train_tokens_per_second": 96805.219 }, { "epoch": 1.767870868562644, "grad_norm": 0.055776312947273254, "learning_rate": 3.23228285933897e-05, "loss": 0.1749, "num_input_tokens_seen": 11776000, "step": 11500, "train_runtime": 121.386, "train_tokens_per_second": 97012.845 }, { "epoch": 1.8447348193697155, "grad_norm": 0.48420748114585876, "learning_rate": 3.1554189085318986e-05, "loss": 0.1716, "num_input_tokens_seen": 12288000, "step": 12000, "train_runtime": 126.4198, "train_tokens_per_second": 97200.001 }, { "epoch": 1.9215987701767872, "grad_norm": 0.3808608949184418, "learning_rate": 3.078554957724827e-05, "loss": 0.1819, "num_input_tokens_seen": 12800000, "step": 12500, "train_runtime": 131.456, "train_tokens_per_second": 97370.996 }, { "epoch": 1.9984627209838586, "grad_norm": 0.15483863651752472, "learning_rate": 3.0016910069177555e-05, "loss": 0.1718, "num_input_tokens_seen": 13312000, "step": 13000, "train_runtime": 136.5089, "train_tokens_per_second": 97517.416 }, { "epoch": 2.0, "eval_accuracy": 0.9259031514219831, "eval_loss": 0.27698734402656555, "eval_runtime": 5.1343, "eval_samples_per_second": 2533.949, "eval_steps_per_second": 316.89, "num_input_tokens_seen": 13322240, "step": 13010 }, { "epoch": 2.07532667179093, "grad_norm": 0.06390306353569031, "learning_rate": 2.9248270561106846e-05, "loss": 0.1384, "num_input_tokens_seen": 13824000, "step": 13500, "train_runtime": 146.9662, "train_tokens_per_second": 94062.42 }, { "epoch": 2.1521906225980016, "grad_norm": 0.09185440093278885, "learning_rate": 2.8479631053036127e-05, "loss": 0.123, "num_input_tokens_seen": 14336000, "step": 14000, "train_runtime": 152.0064, "train_tokens_per_second": 94311.832 }, { "epoch": 2.229054573405073, "grad_norm": 0.05354034900665283, "learning_rate": 2.7710991544965414e-05, "loss": 0.1265, "num_input_tokens_seen": 14848000, "step": 14500, "train_runtime": 157.0491, "train_tokens_per_second": 94543.682 }, { "epoch": 2.3059185242121445, "grad_norm": 6.2592926025390625, "learning_rate": 2.6942352036894695e-05, "loss": 0.1214, "num_input_tokens_seen": 15360000, "step": 15000, "train_runtime": 162.0991, "train_tokens_per_second": 94756.854 }, { "epoch": 2.382782475019216, "grad_norm": 0.3284030854701996, "learning_rate": 2.6173712528823986e-05, "loss": 0.1298, "num_input_tokens_seen": 15872000, "step": 15500, "train_runtime": 167.141, "train_tokens_per_second": 94961.759 }, { "epoch": 2.4596464258262873, "grad_norm": 0.2101190984249115, "learning_rate": 2.5405073020753267e-05, "loss": 0.1279, "num_input_tokens_seen": 16384000, "step": 16000, "train_runtime": 172.1787, "train_tokens_per_second": 95156.958 }, { "epoch": 2.536510376633359, "grad_norm": 7.672014236450195, "learning_rate": 2.463643351268255e-05, "loss": 0.1435, "num_input_tokens_seen": 16896000, "step": 16500, "train_runtime": 177.2341, "train_tokens_per_second": 95331.566 }, { "epoch": 2.6133743274404306, "grad_norm": 37.9052734375, "learning_rate": 2.3867794004611836e-05, "loss": 0.123, "num_input_tokens_seen": 17408000, "step": 17000, "train_runtime": 182.2827, "train_tokens_per_second": 95500.03 }, { "epoch": 2.690238278247502, "grad_norm": 0.08578933030366898, "learning_rate": 2.3099154496541124e-05, "loss": 0.1289, "num_input_tokens_seen": 17920000, "step": 17500, "train_runtime": 187.3355, "train_tokens_per_second": 95657.272 }, { "epoch": 2.7671022290545735, "grad_norm": 0.08860859274864197, "learning_rate": 2.2330514988470408e-05, "loss": 0.1296, "num_input_tokens_seen": 18432000, "step": 18000, "train_runtime": 192.3781, "train_tokens_per_second": 95811.329 }, { "epoch": 2.8439661798616447, "grad_norm": 0.41104796528816223, "learning_rate": 2.1561875480399692e-05, "loss": 0.1124, "num_input_tokens_seen": 18944000, "step": 18500, "train_runtime": 197.7595, "train_tokens_per_second": 95793.134 }, { "epoch": 2.9208301306687163, "grad_norm": 37.97283172607422, "learning_rate": 2.079323597232898e-05, "loss": 0.1382, "num_input_tokens_seen": 19456000, "step": 19000, "train_runtime": 203.2967, "train_tokens_per_second": 95702.474 }, { "epoch": 2.997694081475788, "grad_norm": 0.0325402170419693, "learning_rate": 2.0024596464258264e-05, "loss": 0.1388, "num_input_tokens_seen": 19968000, "step": 19500, "train_runtime": 208.8029, "train_tokens_per_second": 95630.843 }, { "epoch": 3.0, "eval_accuracy": 0.9287471176018447, "eval_loss": 0.31825903058052063, "eval_runtime": 5.4038, "eval_samples_per_second": 2407.562, "eval_steps_per_second": 301.084, "num_input_tokens_seen": 19983360, "step": 19515 }, { "epoch": 3.074558032282859, "grad_norm": 0.05180477350950241, "learning_rate": 1.925595695618755e-05, "loss": 0.0823, "num_input_tokens_seen": 20480000, "step": 20000, "train_runtime": 219.9739, "train_tokens_per_second": 93101.956 }, { "epoch": 3.151421983089931, "grad_norm": 0.005111335311084986, "learning_rate": 1.8487317448116833e-05, "loss": 0.0716, "num_input_tokens_seen": 20992000, "step": 20500, "train_runtime": 225.3567, "train_tokens_per_second": 93150.099 }, { "epoch": 3.2282859338970025, "grad_norm": 0.012623129412531853, "learning_rate": 1.771867794004612e-05, "loss": 0.0915, "num_input_tokens_seen": 21504000, "step": 21000, "train_runtime": 230.6352, "train_tokens_per_second": 93238.15 }, { "epoch": 3.3051498847040737, "grad_norm": 10.89956283569336, "learning_rate": 1.6950038431975405e-05, "loss": 0.0783, "num_input_tokens_seen": 22016000, "step": 21500, "train_runtime": 236.059, "train_tokens_per_second": 93264.831 }, { "epoch": 3.3820138355111453, "grad_norm": 0.010125258006155491, "learning_rate": 1.618139892390469e-05, "loss": 0.0882, "num_input_tokens_seen": 22528000, "step": 22000, "train_runtime": 241.5242, "train_tokens_per_second": 93274.31 }, { "epoch": 3.458877786318217, "grad_norm": 0.04097803309559822, "learning_rate": 1.5412759415833973e-05, "loss": 0.095, "num_input_tokens_seen": 23040000, "step": 22500, "train_runtime": 247.0361, "train_tokens_per_second": 93265.737 }, { "epoch": 3.535741737125288, "grad_norm": 0.3172767758369446, "learning_rate": 1.464411990776326e-05, "loss": 0.0847, "num_input_tokens_seen": 23552000, "step": 23000, "train_runtime": 252.366, "train_tokens_per_second": 93324.783 }, { "epoch": 3.61260568793236, "grad_norm": 0.0049354820512235165, "learning_rate": 1.3875480399692545e-05, "loss": 0.0758, "num_input_tokens_seen": 24064000, "step": 23500, "train_runtime": 257.7685, "train_tokens_per_second": 93355.1 }, { "epoch": 3.689469638739431, "grad_norm": 0.09353843331336975, "learning_rate": 1.310684089162183e-05, "loss": 0.0812, "num_input_tokens_seen": 24576000, "step": 24000, "train_runtime": 263.0331, "train_tokens_per_second": 93433.102 }, { "epoch": 3.7663335895465027, "grad_norm": 13.593195915222168, "learning_rate": 1.2338201383551116e-05, "loss": 0.0882, "num_input_tokens_seen": 25088000, "step": 24500, "train_runtime": 268.3188, "train_tokens_per_second": 93500.701 }, { "epoch": 3.8431975403535743, "grad_norm": 0.12645399570465088, "learning_rate": 1.15695618754804e-05, "loss": 0.0862, "num_input_tokens_seen": 25600000, "step": 25000, "train_runtime": 273.6907, "train_tokens_per_second": 93536.259 }, { "epoch": 3.9200614911606455, "grad_norm": 0.015061162412166595, "learning_rate": 1.0800922367409686e-05, "loss": 0.0825, "num_input_tokens_seen": 26112000, "step": 25500, "train_runtime": 279.0408, "train_tokens_per_second": 93577.699 }, { "epoch": 3.996925441967717, "grad_norm": 0.020776506513357162, "learning_rate": 1.003228285933897e-05, "loss": 0.0772, "num_input_tokens_seen": 26624000, "step": 26000, "train_runtime": 284.4194, "train_tokens_per_second": 93608.25 }, { "epoch": 4.0, "eval_accuracy": 0.9270561106840891, "eval_loss": 0.43025368452072144, "eval_runtime": 5.3244, "eval_samples_per_second": 2443.487, "eval_steps_per_second": 305.577, "num_input_tokens_seen": 26644480, "step": 26020 }, { "epoch": 4.073789392774788, "grad_norm": 0.09127756953239441, "learning_rate": 9.263643351268256e-06, "loss": 0.0615, "num_input_tokens_seen": 27136000, "step": 26500, "train_runtime": 295.1257, "train_tokens_per_second": 91947.265 }, { "epoch": 4.15065334358186, "grad_norm": 0.11556842923164368, "learning_rate": 8.49500384319754e-06, "loss": 0.0444, "num_input_tokens_seen": 27648000, "step": 27000, "train_runtime": 300.149, "train_tokens_per_second": 92114.259 }, { "epoch": 4.227517294388932, "grad_norm": 0.09005430340766907, "learning_rate": 7.726364335126826e-06, "loss": 0.0434, "num_input_tokens_seen": 28160000, "step": 27500, "train_runtime": 305.2327, "train_tokens_per_second": 92257.495 }, { "epoch": 4.304381245196003, "grad_norm": 0.004569609649479389, "learning_rate": 6.9577248270561115e-06, "loss": 0.0352, "num_input_tokens_seen": 28672000, "step": 28000, "train_runtime": 310.2962, "train_tokens_per_second": 92402.026 }, { "epoch": 4.381245196003075, "grad_norm": 1.524936318397522, "learning_rate": 6.189085318985397e-06, "loss": 0.06, "num_input_tokens_seen": 29184000, "step": 28500, "train_runtime": 315.365, "train_tokens_per_second": 92540.388 }, { "epoch": 4.458109146810146, "grad_norm": 0.014427268877625465, "learning_rate": 5.420445810914681e-06, "loss": 0.0492, "num_input_tokens_seen": 29696000, "step": 29000, "train_runtime": 320.4274, "train_tokens_per_second": 92676.208 }, { "epoch": 4.534973097617217, "grad_norm": 0.07355033606290817, "learning_rate": 4.651806302843966e-06, "loss": 0.0572, "num_input_tokens_seen": 30208000, "step": 29500, "train_runtime": 325.4966, "train_tokens_per_second": 92805.883 }, { "epoch": 4.611837048424289, "grad_norm": 7.176478385925293, "learning_rate": 3.883166794773251e-06, "loss": 0.0442, "num_input_tokens_seen": 30720000, "step": 30000, "train_runtime": 330.568, "train_tokens_per_second": 92930.968 }, { "epoch": 4.688700999231361, "grad_norm": 0.030576860532164574, "learning_rate": 3.114527286702537e-06, "loss": 0.0465, "num_input_tokens_seen": 31232000, "step": 30500, "train_runtime": 335.6493, "train_tokens_per_second": 93049.488 }, { "epoch": 4.765564950038432, "grad_norm": 0.005597515497356653, "learning_rate": 2.345887778631822e-06, "loss": 0.0487, "num_input_tokens_seen": 31744000, "step": 31000, "train_runtime": 340.8069, "train_tokens_per_second": 93143.641 }, { "epoch": 4.842428900845503, "grad_norm": 0.021089155226945877, "learning_rate": 1.5772482705611067e-06, "loss": 0.0557, "num_input_tokens_seen": 32256000, "step": 31500, "train_runtime": 346.0097, "train_tokens_per_second": 93222.824 }, { "epoch": 4.919292851652575, "grad_norm": 0.023516027256846428, "learning_rate": 8.086087624903922e-07, "loss": 0.0399, "num_input_tokens_seen": 32768000, "step": 32000, "train_runtime": 351.2595, "train_tokens_per_second": 93287.165 }, { "epoch": 4.996156802459646, "grad_norm": 0.007581554353237152, "learning_rate": 3.996925441967718e-08, "loss": 0.0532, "num_input_tokens_seen": 33280000, "step": 32500, "train_runtime": 356.5514, "train_tokens_per_second": 93338.566 }, { "epoch": 5.0, "eval_accuracy": 0.9264411990776326, "eval_loss": 0.4613732397556305, "eval_runtime": 5.2997, "eval_samples_per_second": 2454.857, "eval_steps_per_second": 306.999, "num_input_tokens_seen": 33305600, "step": 32525 }, { "epoch": 5.0, "num_input_tokens_seen": 33305600, "step": 32525, "total_flos": 2157435918643200.0, "train_loss": 0.1371641572881533, "train_runtime": 362.4532, "train_samples_per_second": 717.886, "train_steps_per_second": 89.736 } ], "logging_steps": 500, "max_steps": 32525, "num_input_tokens_seen": 33305600, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2157435918643200.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }