{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 30.125, "learning_rate": 3.1746031746031746e-06, "loss": 2.5254, "step": 10 }, { "epoch": 0.032, "grad_norm": 20.25, "learning_rate": 6.349206349206349e-06, "loss": 2.5254, "step": 20 }, { "epoch": 0.048, "grad_norm": 14.8125, "learning_rate": 9.523809523809525e-06, "loss": 2.0496, "step": 30 }, { "epoch": 0.064, "grad_norm": 10.9375, "learning_rate": 1.2698412698412699e-05, "loss": 1.4853, "step": 40 }, { "epoch": 0.08, "grad_norm": 5.625, "learning_rate": 1.5873015873015872e-05, "loss": 0.8731, "step": 50 }, { "epoch": 0.096, "grad_norm": 3.109375, "learning_rate": 1.904761904761905e-05, "loss": 0.6086, "step": 60 }, { "epoch": 0.112, "grad_norm": 3.015625, "learning_rate": 1.9750889679715305e-05, "loss": 0.5745, "step": 70 }, { "epoch": 0.128, "grad_norm": 2.375, "learning_rate": 1.939501779359431e-05, "loss": 0.5121, "step": 80 }, { "epoch": 0.144, "grad_norm": 3.28125, "learning_rate": 1.903914590747331e-05, "loss": 0.4875, "step": 90 }, { "epoch": 0.16, "grad_norm": 2.703125, "learning_rate": 1.8683274021352315e-05, "loss": 0.4997, "step": 100 }, { "epoch": 0.176, "grad_norm": 2.203125, "learning_rate": 1.832740213523132e-05, "loss": 0.5004, "step": 110 }, { "epoch": 0.192, "grad_norm": 2.109375, "learning_rate": 1.7971530249110324e-05, "loss": 0.5119, "step": 120 }, { "epoch": 0.208, "grad_norm": 2.453125, "learning_rate": 1.7615658362989325e-05, "loss": 0.5088, "step": 130 }, { "epoch": 0.224, "grad_norm": 2.171875, "learning_rate": 1.725978647686833e-05, "loss": 0.4842, "step": 140 }, { "epoch": 0.24, "grad_norm": 3.65625, "learning_rate": 1.690391459074733e-05, "loss": 0.5193, "step": 150 }, { "epoch": 0.256, "grad_norm": 2.609375, "learning_rate": 1.6548042704626336e-05, "loss": 0.4984, "step": 160 }, { "epoch": 0.272, "grad_norm": 2.28125, "learning_rate": 1.619217081850534e-05, "loss": 0.5011, "step": 170 }, { "epoch": 0.288, "grad_norm": 3.46875, "learning_rate": 1.583629893238434e-05, "loss": 0.5493, "step": 180 }, { "epoch": 0.304, "grad_norm": 2.46875, "learning_rate": 1.5480427046263346e-05, "loss": 0.4869, "step": 190 }, { "epoch": 0.32, "grad_norm": 2.734375, "learning_rate": 1.5124555160142349e-05, "loss": 0.4902, "step": 200 }, { "epoch": 0.336, "grad_norm": 1.921875, "learning_rate": 1.4768683274021354e-05, "loss": 0.5013, "step": 210 }, { "epoch": 0.352, "grad_norm": 2.125, "learning_rate": 1.4412811387900356e-05, "loss": 0.5061, "step": 220 }, { "epoch": 0.368, "grad_norm": 2.265625, "learning_rate": 1.4056939501779361e-05, "loss": 0.4956, "step": 230 }, { "epoch": 0.384, "grad_norm": 1.6015625, "learning_rate": 1.3701067615658364e-05, "loss": 0.4665, "step": 240 }, { "epoch": 0.4, "grad_norm": 2.234375, "learning_rate": 1.3345195729537369e-05, "loss": 0.4605, "step": 250 }, { "epoch": 0.416, "grad_norm": 2.1875, "learning_rate": 1.298932384341637e-05, "loss": 0.4728, "step": 260 }, { "epoch": 0.432, "grad_norm": 2.46875, "learning_rate": 1.2633451957295374e-05, "loss": 0.5115, "step": 270 }, { "epoch": 0.448, "grad_norm": 2.765625, "learning_rate": 1.2277580071174377e-05, "loss": 0.4873, "step": 280 }, { "epoch": 0.464, "grad_norm": 2.140625, "learning_rate": 1.1921708185053382e-05, "loss": 0.5266, "step": 290 }, { "epoch": 0.48, "grad_norm": 2.078125, "learning_rate": 1.1565836298932385e-05, "loss": 0.5175, "step": 300 }, { "epoch": 0.496, "grad_norm": 2.328125, "learning_rate": 1.120996441281139e-05, "loss": 0.4702, "step": 310 }, { "epoch": 0.512, "grad_norm": 2.734375, "learning_rate": 1.0854092526690392e-05, "loss": 0.5071, "step": 320 }, { "epoch": 0.528, "grad_norm": 2.03125, "learning_rate": 1.0498220640569397e-05, "loss": 0.5155, "step": 330 }, { "epoch": 0.544, "grad_norm": 2.53125, "learning_rate": 1.01423487544484e-05, "loss": 0.4964, "step": 340 }, { "epoch": 0.56, "grad_norm": 2.984375, "learning_rate": 9.786476868327403e-06, "loss": 0.4867, "step": 350 }, { "epoch": 0.576, "grad_norm": 2.828125, "learning_rate": 9.430604982206405e-06, "loss": 0.545, "step": 360 }, { "epoch": 0.592, "grad_norm": 2.546875, "learning_rate": 9.07473309608541e-06, "loss": 0.4832, "step": 370 }, { "epoch": 0.608, "grad_norm": 2.6875, "learning_rate": 8.718861209964413e-06, "loss": 0.4826, "step": 380 }, { "epoch": 0.624, "grad_norm": 1.765625, "learning_rate": 8.362989323843418e-06, "loss": 0.4652, "step": 390 }, { "epoch": 0.64, "grad_norm": 2.359375, "learning_rate": 8.00711743772242e-06, "loss": 0.5069, "step": 400 }, { "epoch": 0.656, "grad_norm": 2.453125, "learning_rate": 7.651245551601423e-06, "loss": 0.5131, "step": 410 }, { "epoch": 0.672, "grad_norm": 2.3125, "learning_rate": 7.295373665480427e-06, "loss": 0.4937, "step": 420 }, { "epoch": 0.688, "grad_norm": 2.109375, "learning_rate": 6.939501779359431e-06, "loss": 0.4878, "step": 430 }, { "epoch": 0.704, "grad_norm": 3.546875, "learning_rate": 6.5836298932384346e-06, "loss": 0.4821, "step": 440 }, { "epoch": 0.72, "grad_norm": 2.046875, "learning_rate": 6.227758007117438e-06, "loss": 0.4486, "step": 450 }, { "epoch": 0.736, "grad_norm": 2.71875, "learning_rate": 5.871886120996442e-06, "loss": 0.5022, "step": 460 }, { "epoch": 0.752, "grad_norm": 2.53125, "learning_rate": 5.516014234875445e-06, "loss": 0.4977, "step": 470 }, { "epoch": 0.768, "grad_norm": 2.6875, "learning_rate": 5.160142348754449e-06, "loss": 0.5076, "step": 480 }, { "epoch": 0.784, "grad_norm": 2.546875, "learning_rate": 4.8042704626334524e-06, "loss": 0.451, "step": 490 }, { "epoch": 0.8, "grad_norm": 1.734375, "learning_rate": 4.448398576512456e-06, "loss": 0.4576, "step": 500 }, { "epoch": 0.816, "grad_norm": 2.765625, "learning_rate": 4.09252669039146e-06, "loss": 0.4914, "step": 510 }, { "epoch": 0.832, "grad_norm": 2.59375, "learning_rate": 3.7366548042704632e-06, "loss": 0.4938, "step": 520 }, { "epoch": 0.848, "grad_norm": 2.625, "learning_rate": 3.3807829181494666e-06, "loss": 0.5218, "step": 530 }, { "epoch": 0.864, "grad_norm": 1.8515625, "learning_rate": 3.0249110320284703e-06, "loss": 0.4694, "step": 540 }, { "epoch": 0.88, "grad_norm": 2.375, "learning_rate": 2.669039145907473e-06, "loss": 0.5102, "step": 550 }, { "epoch": 0.896, "grad_norm": 2.375, "learning_rate": 2.313167259786477e-06, "loss": 0.506, "step": 560 }, { "epoch": 0.912, "grad_norm": 2.875, "learning_rate": 1.9572953736654807e-06, "loss": 0.4982, "step": 570 }, { "epoch": 0.928, "grad_norm": 1.9140625, "learning_rate": 1.6014234875444842e-06, "loss": 0.5107, "step": 580 }, { "epoch": 0.944, "grad_norm": 2.546875, "learning_rate": 1.2455516014234877e-06, "loss": 0.4857, "step": 590 }, { "epoch": 0.96, "grad_norm": 1.8359375, "learning_rate": 8.896797153024913e-07, "loss": 0.4821, "step": 600 }, { "epoch": 0.976, "grad_norm": 2.609375, "learning_rate": 5.338078291814947e-07, "loss": 0.5166, "step": 610 }, { "epoch": 0.992, "grad_norm": 2.578125, "learning_rate": 1.7793594306049826e-07, "loss": 0.5116, "step": 620 }, { "epoch": 1.0, "step": 625, "total_flos": 9900164319805440.0, "train_loss": 0.6107198246002197, "train_runtime": 287.0781, "train_samples_per_second": 34.834, "train_steps_per_second": 2.177 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9900164319805440.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }