{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 625,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 30.125,
      "learning_rate": 3.1746031746031746e-06,
      "loss": 2.5254,
      "step": 10
    },
    {
      "epoch": 0.032,
      "grad_norm": 20.25,
      "learning_rate": 6.349206349206349e-06,
      "loss": 2.5254,
      "step": 20
    },
    {
      "epoch": 0.048,
      "grad_norm": 14.8125,
      "learning_rate": 9.523809523809525e-06,
      "loss": 2.0496,
      "step": 30
    },
    {
      "epoch": 0.064,
      "grad_norm": 10.9375,
      "learning_rate": 1.2698412698412699e-05,
      "loss": 1.4853,
      "step": 40
    },
    {
      "epoch": 0.08,
      "grad_norm": 5.625,
      "learning_rate": 1.5873015873015872e-05,
      "loss": 0.8731,
      "step": 50
    },
    {
      "epoch": 0.096,
      "grad_norm": 3.109375,
      "learning_rate": 1.904761904761905e-05,
      "loss": 0.6086,
      "step": 60
    },
    {
      "epoch": 0.112,
      "grad_norm": 3.015625,
      "learning_rate": 1.9750889679715305e-05,
      "loss": 0.5745,
      "step": 70
    },
    {
      "epoch": 0.128,
      "grad_norm": 2.375,
      "learning_rate": 1.939501779359431e-05,
      "loss": 0.5121,
      "step": 80
    },
    {
      "epoch": 0.144,
      "grad_norm": 3.28125,
      "learning_rate": 1.903914590747331e-05,
      "loss": 0.4875,
      "step": 90
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.703125,
      "learning_rate": 1.8683274021352315e-05,
      "loss": 0.4997,
      "step": 100
    },
    {
      "epoch": 0.176,
      "grad_norm": 2.203125,
      "learning_rate": 1.832740213523132e-05,
      "loss": 0.5004,
      "step": 110
    },
    {
      "epoch": 0.192,
      "grad_norm": 2.109375,
      "learning_rate": 1.7971530249110324e-05,
      "loss": 0.5119,
      "step": 120
    },
    {
      "epoch": 0.208,
      "grad_norm": 2.453125,
      "learning_rate": 1.7615658362989325e-05,
      "loss": 0.5088,
      "step": 130
    },
    {
      "epoch": 0.224,
      "grad_norm": 2.171875,
      "learning_rate": 1.725978647686833e-05,
      "loss": 0.4842,
      "step": 140
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.65625,
      "learning_rate": 1.690391459074733e-05,
      "loss": 0.5193,
      "step": 150
    },
    {
      "epoch": 0.256,
      "grad_norm": 2.609375,
      "learning_rate": 1.6548042704626336e-05,
      "loss": 0.4984,
      "step": 160
    },
    {
      "epoch": 0.272,
      "grad_norm": 2.28125,
      "learning_rate": 1.619217081850534e-05,
      "loss": 0.5011,
      "step": 170
    },
    {
      "epoch": 0.288,
      "grad_norm": 3.46875,
      "learning_rate": 1.583629893238434e-05,
      "loss": 0.5493,
      "step": 180
    },
    {
      "epoch": 0.304,
      "grad_norm": 2.46875,
      "learning_rate": 1.5480427046263346e-05,
      "loss": 0.4869,
      "step": 190
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.734375,
      "learning_rate": 1.5124555160142349e-05,
      "loss": 0.4902,
      "step": 200
    },
    {
      "epoch": 0.336,
      "grad_norm": 1.921875,
      "learning_rate": 1.4768683274021354e-05,
      "loss": 0.5013,
      "step": 210
    },
    {
      "epoch": 0.352,
      "grad_norm": 2.125,
      "learning_rate": 1.4412811387900356e-05,
      "loss": 0.5061,
      "step": 220
    },
    {
      "epoch": 0.368,
      "grad_norm": 2.265625,
      "learning_rate": 1.4056939501779361e-05,
      "loss": 0.4956,
      "step": 230
    },
    {
      "epoch": 0.384,
      "grad_norm": 1.6015625,
      "learning_rate": 1.3701067615658364e-05,
      "loss": 0.4665,
      "step": 240
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.234375,
      "learning_rate": 1.3345195729537369e-05,
      "loss": 0.4605,
      "step": 250
    },
    {
      "epoch": 0.416,
      "grad_norm": 2.1875,
      "learning_rate": 1.298932384341637e-05,
      "loss": 0.4728,
      "step": 260
    },
    {
      "epoch": 0.432,
      "grad_norm": 2.46875,
      "learning_rate": 1.2633451957295374e-05,
      "loss": 0.5115,
      "step": 270
    },
    {
      "epoch": 0.448,
      "grad_norm": 2.765625,
      "learning_rate": 1.2277580071174377e-05,
      "loss": 0.4873,
      "step": 280
    },
    {
      "epoch": 0.464,
      "grad_norm": 2.140625,
      "learning_rate": 1.1921708185053382e-05,
      "loss": 0.5266,
      "step": 290
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.078125,
      "learning_rate": 1.1565836298932385e-05,
      "loss": 0.5175,
      "step": 300
    },
    {
      "epoch": 0.496,
      "grad_norm": 2.328125,
      "learning_rate": 1.120996441281139e-05,
      "loss": 0.4702,
      "step": 310
    },
    {
      "epoch": 0.512,
      "grad_norm": 2.734375,
      "learning_rate": 1.0854092526690392e-05,
      "loss": 0.5071,
      "step": 320
    },
    {
      "epoch": 0.528,
      "grad_norm": 2.03125,
      "learning_rate": 1.0498220640569397e-05,
      "loss": 0.5155,
      "step": 330
    },
    {
      "epoch": 0.544,
      "grad_norm": 2.53125,
      "learning_rate": 1.01423487544484e-05,
      "loss": 0.4964,
      "step": 340
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.984375,
      "learning_rate": 9.786476868327403e-06,
      "loss": 0.4867,
      "step": 350
    },
    {
      "epoch": 0.576,
      "grad_norm": 2.828125,
      "learning_rate": 9.430604982206405e-06,
      "loss": 0.545,
      "step": 360
    },
    {
      "epoch": 0.592,
      "grad_norm": 2.546875,
      "learning_rate": 9.07473309608541e-06,
      "loss": 0.4832,
      "step": 370
    },
    {
      "epoch": 0.608,
      "grad_norm": 2.6875,
      "learning_rate": 8.718861209964413e-06,
      "loss": 0.4826,
      "step": 380
    },
    {
      "epoch": 0.624,
      "grad_norm": 1.765625,
      "learning_rate": 8.362989323843418e-06,
      "loss": 0.4652,
      "step": 390
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.359375,
      "learning_rate": 8.00711743772242e-06,
      "loss": 0.5069,
      "step": 400
    },
    {
      "epoch": 0.656,
      "grad_norm": 2.453125,
      "learning_rate": 7.651245551601423e-06,
      "loss": 0.5131,
      "step": 410
    },
    {
      "epoch": 0.672,
      "grad_norm": 2.3125,
      "learning_rate": 7.295373665480427e-06,
      "loss": 0.4937,
      "step": 420
    },
    {
      "epoch": 0.688,
      "grad_norm": 2.109375,
      "learning_rate": 6.939501779359431e-06,
      "loss": 0.4878,
      "step": 430
    },
    {
      "epoch": 0.704,
      "grad_norm": 3.546875,
      "learning_rate": 6.5836298932384346e-06,
      "loss": 0.4821,
      "step": 440
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.046875,
      "learning_rate": 6.227758007117438e-06,
      "loss": 0.4486,
      "step": 450
    },
    {
      "epoch": 0.736,
      "grad_norm": 2.71875,
      "learning_rate": 5.871886120996442e-06,
      "loss": 0.5022,
      "step": 460
    },
    {
      "epoch": 0.752,
      "grad_norm": 2.53125,
      "learning_rate": 5.516014234875445e-06,
      "loss": 0.4977,
      "step": 470
    },
    {
      "epoch": 0.768,
      "grad_norm": 2.6875,
      "learning_rate": 5.160142348754449e-06,
      "loss": 0.5076,
      "step": 480
    },
    {
      "epoch": 0.784,
      "grad_norm": 2.546875,
      "learning_rate": 4.8042704626334524e-06,
      "loss": 0.451,
      "step": 490
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.734375,
      "learning_rate": 4.448398576512456e-06,
      "loss": 0.4576,
      "step": 500
    },
    {
      "epoch": 0.816,
      "grad_norm": 2.765625,
      "learning_rate": 4.09252669039146e-06,
      "loss": 0.4914,
      "step": 510
    },
    {
      "epoch": 0.832,
      "grad_norm": 2.59375,
      "learning_rate": 3.7366548042704632e-06,
      "loss": 0.4938,
      "step": 520
    },
    {
      "epoch": 0.848,
      "grad_norm": 2.625,
      "learning_rate": 3.3807829181494666e-06,
      "loss": 0.5218,
      "step": 530
    },
    {
      "epoch": 0.864,
      "grad_norm": 1.8515625,
      "learning_rate": 3.0249110320284703e-06,
      "loss": 0.4694,
      "step": 540
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.375,
      "learning_rate": 2.669039145907473e-06,
      "loss": 0.5102,
      "step": 550
    },
    {
      "epoch": 0.896,
      "grad_norm": 2.375,
      "learning_rate": 2.313167259786477e-06,
      "loss": 0.506,
      "step": 560
    },
    {
      "epoch": 0.912,
      "grad_norm": 2.875,
      "learning_rate": 1.9572953736654807e-06,
      "loss": 0.4982,
      "step": 570
    },
    {
      "epoch": 0.928,
      "grad_norm": 1.9140625,
      "learning_rate": 1.6014234875444842e-06,
      "loss": 0.5107,
      "step": 580
    },
    {
      "epoch": 0.944,
      "grad_norm": 2.546875,
      "learning_rate": 1.2455516014234877e-06,
      "loss": 0.4857,
      "step": 590
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.8359375,
      "learning_rate": 8.896797153024913e-07,
      "loss": 0.4821,
      "step": 600
    },
    {
      "epoch": 0.976,
      "grad_norm": 2.609375,
      "learning_rate": 5.338078291814947e-07,
      "loss": 0.5166,
      "step": 610
    },
    {
      "epoch": 0.992,
      "grad_norm": 2.578125,
      "learning_rate": 1.7793594306049826e-07,
      "loss": 0.5116,
      "step": 620
    },
    {
      "epoch": 1.0,
      "step": 625,
      "total_flos": 9900164319805440.0,
      "train_loss": 0.6107198246002197,
      "train_runtime": 287.0781,
      "train_samples_per_second": 34.834,
      "train_steps_per_second": 2.177
    }
  ],
  "logging_steps": 10,
  "max_steps": 625,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 9900164319805440.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}