{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9348387096774193, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04838709677419355, "grad_norm": 1.3511691093444824, "learning_rate": 3.151125401929261e-05, "loss": 0.678, "step": 50 }, { "epoch": 0.0967741935483871, "grad_norm": 1.436481237411499, "learning_rate": 6.366559485530547e-05, "loss": 0.4574, "step": 100 }, { "epoch": 0.0967741935483871, "eval_loss": 0.4402719736099243, "eval_runtime": 41.5704, "eval_samples_per_second": 4.811, "eval_steps_per_second": 2.406, "step": 100 }, { "epoch": 0.14516129032258066, "grad_norm": 1.0493075847625732, "learning_rate": 9.581993569131833e-05, "loss": 0.4469, "step": 150 }, { "epoch": 0.1935483870967742, "grad_norm": 0.8483838438987732, "learning_rate": 0.00012797427652733122, "loss": 0.4438, "step": 200 }, { "epoch": 0.1935483870967742, "eval_loss": 0.4417407512664795, "eval_runtime": 41.5811, "eval_samples_per_second": 4.81, "eval_steps_per_second": 2.405, "step": 200 }, { "epoch": 0.24193548387096775, "grad_norm": 0.9024653434753418, "learning_rate": 0.00016012861736334406, "loss": 0.4456, "step": 250 }, { "epoch": 0.2903225806451613, "grad_norm": 1.418324589729309, "learning_rate": 0.00019228295819935694, "loss": 0.4257, "step": 300 }, { "epoch": 0.2903225806451613, "eval_loss": 0.4612005650997162, "eval_runtime": 41.5922, "eval_samples_per_second": 4.809, "eval_steps_per_second": 2.404, "step": 300 }, { "epoch": 0.3387096774193548, "grad_norm": 1.0883433818817139, "learning_rate": 0.0001999085358179519, "loss": 0.4772, "step": 350 }, { "epoch": 0.3870967741935484, "grad_norm": 1.501890778541565, "learning_rate": 0.00019950981468803888, "loss": 0.4794, "step": 400 }, { "epoch": 0.3870967741935484, "eval_loss": 0.45909491181373596, "eval_runtime": 41.5808, "eval_samples_per_second": 4.81, "eval_steps_per_second": 2.405, "step": 400 }, { "epoch": 0.43548387096774194, "grad_norm": 0.9890356063842773, "learning_rate": 0.00019879597690223086, "loss": 0.4974, "step": 450 }, { "epoch": 0.4838709677419355, "grad_norm": 0.9810131192207336, "learning_rate": 0.00019776928296293779, "loss": 0.5155, "step": 500 }, { "epoch": 0.4838709677419355, "eval_loss": 0.4590587615966797, "eval_runtime": 41.5834, "eval_samples_per_second": 4.81, "eval_steps_per_second": 2.405, "step": 500 }, { "epoch": 0.532258064516129, "grad_norm": 0.9346289038658142, "learning_rate": 0.0001964329840907734, "loss": 0.4596, "step": 550 }, { "epoch": 0.5806451612903226, "grad_norm": 1.4689308404922485, "learning_rate": 0.0001947913119289498, "loss": 0.4729, "step": 600 }, { "epoch": 0.5806451612903226, "eval_loss": 0.44996699690818787, "eval_runtime": 41.5738, "eval_samples_per_second": 4.811, "eval_steps_per_second": 2.405, "step": 600 }, { "epoch": 0.6290322580645161, "grad_norm": 0.922006368637085, "learning_rate": 0.00019284946514297897, "loss": 0.4412, "step": 650 }, { "epoch": 0.6774193548387096, "grad_norm": 1.1207324266433716, "learning_rate": 0.00019061359295811437, "loss": 0.4879, "step": 700 }, { "epoch": 0.6774193548387096, "eval_loss": 0.4418741464614868, "eval_runtime": 41.5961, "eval_samples_per_second": 4.808, "eval_steps_per_second": 2.404, "step": 700 }, { "epoch": 0.7258064516129032, "grad_norm": 1.4568829536437988, "learning_rate": 0.00018809077568666602, "loss": 0.4746, "step": 750 }, { "epoch": 0.7741935483870968, "grad_norm": 1.242003321647644, "learning_rate": 0.00018528900230685191, "loss": 0.4441, "step": 800 }, { "epoch": 0.7741935483870968, "eval_loss": 0.431382954120636, "eval_runtime": 41.5321, "eval_samples_per_second": 4.816, "eval_steps_per_second": 2.408, "step": 800 }, { "epoch": 0.8225806451612904, "grad_norm": 0.9119803309440613, "learning_rate": 0.00018221714516418672, "loss": 0.4537, "step": 850 }, { "epoch": 0.8709677419354839, "grad_norm": 0.964235246181488, "learning_rate": 0.00017888493187552139, "loss": 0.4421, "step": 900 }, { "epoch": 0.8709677419354839, "eval_loss": 0.4243137240409851, "eval_runtime": 41.5455, "eval_samples_per_second": 4.814, "eval_steps_per_second": 2.407, "step": 900 }, { "epoch": 0.9193548387096774, "grad_norm": 0.8462353348731995, "learning_rate": 0.00017530291452470345, "loss": 0.465, "step": 950 }, { "epoch": 0.967741935483871, "grad_norm": 1.1114662885665894, "learning_rate": 0.0001714824362474075, "loss": 0.4533, "step": 1000 }, { "epoch": 0.967741935483871, "eval_loss": 0.41407376527786255, "eval_runtime": 41.5658, "eval_samples_per_second": 4.812, "eval_steps_per_second": 2.406, "step": 1000 }, { "epoch": 1.0154838709677418, "grad_norm": 1.0085160732269287, "learning_rate": 0.00016743559531094947, "loss": 0.3982, "step": 1050 }, { "epoch": 1.0638709677419356, "grad_norm": 0.7745186686515808, "learning_rate": 0.0001631752068028344, "loss": 0.3314, "step": 1100 }, { "epoch": 1.0638709677419356, "eval_loss": 0.4301433265209198, "eval_runtime": 41.5627, "eval_samples_per_second": 4.812, "eval_steps_per_second": 2.406, "step": 1100 }, { "epoch": 1.112258064516129, "grad_norm": 1.0404142141342163, "learning_rate": 0.00015871476204935706, "loss": 0.3153, "step": 1150 }, { "epoch": 1.1606451612903226, "grad_norm": 0.9000236988067627, "learning_rate": 0.0001540683858927647, "loss": 0.3233, "step": 1200 }, { "epoch": 1.1606451612903226, "eval_loss": 0.4346558451652527, "eval_runtime": 41.5749, "eval_samples_per_second": 4.811, "eval_steps_per_second": 2.405, "step": 1200 }, { "epoch": 1.209032258064516, "grad_norm": 0.9963415265083313, "learning_rate": 0.00014925079196227153, "loss": 0.3442, "step": 1250 }, { "epoch": 1.2574193548387096, "grad_norm": 1.0009533166885376, "learning_rate": 0.000144277236080568, "loss": 0.3322, "step": 1300 }, { "epoch": 1.2574193548387096, "eval_loss": 0.4213526248931885, "eval_runtime": 41.5673, "eval_samples_per_second": 4.811, "eval_steps_per_second": 2.406, "step": 1300 }, { "epoch": 1.305806451612903, "grad_norm": 0.8589375615119934, "learning_rate": 0.00013916346795337162, "loss": 0.3397, "step": 1350 }, { "epoch": 1.3541935483870968, "grad_norm": 1.0944445133209229, "learning_rate": 0.00013392568129500418, "loss": 0.3184, "step": 1400 }, { "epoch": 1.3541935483870968, "eval_loss": 0.4043886661529541, "eval_runtime": 41.5726, "eval_samples_per_second": 4.811, "eval_steps_per_second": 2.405, "step": 1400 }, { "epoch": 1.4025806451612903, "grad_norm": 0.8919832110404968, "learning_rate": 0.00012858046254793118, "loss": 0.3144, "step": 1450 }, { "epoch": 1.4509677419354838, "grad_norm": 0.9278978109359741, "learning_rate": 0.00012314473835865362, "loss": 0.3048, "step": 1500 }, { "epoch": 1.4509677419354838, "eval_loss": 0.4060533046722412, "eval_runtime": 41.5601, "eval_samples_per_second": 4.812, "eval_steps_per_second": 2.406, "step": 1500 }, { "epoch": 1.4993548387096773, "grad_norm": 0.8267381191253662, "learning_rate": 0.00011763572197627924, "loss": 0.312, "step": 1550 }, { "epoch": 1.547741935483871, "grad_norm": 0.6980671286582947, "learning_rate": 0.00011207085874351245, "loss": 0.296, "step": 1600 }, { "epoch": 1.547741935483871, "eval_loss": 0.40329769253730774, "eval_runtime": 41.551, "eval_samples_per_second": 4.813, "eval_steps_per_second": 2.407, "step": 1600 }, { "epoch": 1.5961290322580646, "grad_norm": 0.7019182443618774, "learning_rate": 0.0001064677708526759, "loss": 0.3025, "step": 1650 }, { "epoch": 1.644516129032258, "grad_norm": 0.9012386202812195, "learning_rate": 0.00010084420154170429, "loss": 0.3019, "step": 1700 }, { "epoch": 1.644516129032258, "eval_loss": 0.3987695276737213, "eval_runtime": 41.5774, "eval_samples_per_second": 4.81, "eval_steps_per_second": 2.405, "step": 1700 }, { "epoch": 1.6929032258064516, "grad_norm": 0.7666453123092651, "learning_rate": 9.521795890682426e-05, "loss": 0.3176, "step": 1750 }, { "epoch": 1.741290322580645, "grad_norm": 0.8256189823150635, "learning_rate": 8.960685950984856e-05, "loss": 0.3072, "step": 1800 }, { "epoch": 1.741290322580645, "eval_loss": 0.3912976086139679, "eval_runtime": 41.5631, "eval_samples_per_second": 4.812, "eval_steps_per_second": 2.406, "step": 1800 }, { "epoch": 1.7896774193548386, "grad_norm": 0.9326472282409668, "learning_rate": 8.402867195866182e-05, "loss": 0.3021, "step": 1850 }, { "epoch": 1.838064516129032, "grad_norm": 0.8051843047142029, "learning_rate": 7.8501060639562e-05, "loss": 0.2801, "step": 1900 }, { "epoch": 1.838064516129032, "eval_loss": 0.3875766098499298, "eval_runtime": 41.5674, "eval_samples_per_second": 4.811, "eval_steps_per_second": 2.406, "step": 1900 }, { "epoch": 1.8864516129032258, "grad_norm": 1.0912061929702759, "learning_rate": 7.304152977963892e-05, "loss": 0.312, "step": 1950 }, { "epoch": 1.9348387096774193, "grad_norm": 0.9892815351486206, "learning_rate": 6.76673680163272e-05, "loss": 0.3227, "step": 2000 }, { "epoch": 1.9348387096774193, "eval_loss": 0.37876129150390625, "eval_runtime": 41.5682, "eval_samples_per_second": 4.811, "eval_steps_per_second": 2.406, "step": 2000 } ], "logging_steps": 50, "max_steps": 3102, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.318892586822533e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }