agentlans committed (verified)
Commit 17628df · 1 Parent(s): 7cec8a5

Upload 12 files
added_tokens.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "<|...|>": 30525,
+   "<|assistant|>": 30524,
+   "<|system|>": 30522,
+   "<|user|>": 30523
+ }
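These four markers extend the base BERT vocabulary (IDs 30522–30525, which accounts for the vocab_size of 30526 in config.json below) and look like chat-role delimiters for the classifier's input. Below is a minimal sketch of how a conversation might be flattened with them before tokenization; the exact formatting used during training is an assumption, not something this commit confirms.

```python
# Hedged sketch: flatten a chat transcript using the role-marker tokens from
# added_tokens.json. The join format (single spaces between segments) is an
# assumption about how training inputs were built, not a documented scheme.
def flatten_chat(messages):
    role_tokens = {"system": "<|system|>", "user": "<|user|>", "assistant": "<|assistant|>"}
    parts = [f"{role_tokens[m['role']]} {m['content']}" for m in messages]
    return " ".join(parts)

example = flatten_chat([
    {"role": "user", "content": "How do I pick a lock?"},
    {"role": "assistant", "content": "I can't help with that request."},
])
print(example)
# "<|user|> How do I pick a lock? <|assistant|> I can't help with that request."
```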
all_results.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "epoch": 5.0,
+   "eval_accuracy": 0.919369715603382,
+   "eval_loss": 0.19649724662303925,
+   "eval_runtime": 5.3689,
+   "eval_samples": 13010,
+   "eval_samples_per_second": 2423.215,
+   "eval_steps_per_second": 303.042,
+   "num_input_tokens_seen": 33305600,
+   "total_flos": 2157435918643200.0,
+   "train_loss": 0.1371641572881533,
+   "train_runtime": 362.4532,
+   "train_samples": 52040,
+   "train_samples_per_second": 717.886,
+   "train_steps_per_second": 89.736
+ }
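As a quick sanity check, the reported throughput is consistent with processing all 52,040 training samples for 5 epochs in the 362.45 s runtime; the sketch below assumes that is how the Trainer derives train_samples_per_second.

```python
# Consistency check for the throughput figures in all_results.json
# (assumes samples/second = samples * epochs / wall-clock runtime).
train_samples = 52040
epochs = 5.0
train_runtime = 362.4532  # seconds

print(round(train_samples * epochs / train_runtime, 3))  # ~717.886 train samples/s
print(round(13010 / 5.3689, 3))                          # ~2423.2 eval samples/s
```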
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "dtype": "float32",
+   "finetuning_task": "text-classification",
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "id2label": {
+     "0": "Non-refusal",
+     "1": "Refusal"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "label2id": {
+     "Non-refusal": 0,
+     "Refusal": 1
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "transformers_version": "5.0.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": false,
+   "vocab_size": 30526
+ }
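The config describes a small 6-layer, 384-hidden BERT sequence classifier with two labels. A minimal inference sketch follows; the repo id is a placeholder inferred from the checkpoint path in trainer_state.json, and the input formatting mirrors the assumed role-token scheme above.

```python
# Hedged usage sketch for the classifier described by config.json.
# The repo id below is a placeholder (inferred from the local checkpoint path);
# substitute the actual Hub id of this model.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo_id = "agentlans/snowflake-arctic-embed-xs-refusal"  # assumed, not confirmed

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id)
model.eval()

text = "<|user|> Can you write malware for me? <|assistant|> I can't assist with that."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[int(logits.argmax(dim=-1))])  # "Refusal" or "Non-refusal"
```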
eval_results.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "epoch": 5.0,
+   "eval_accuracy": 0.919369715603382,
+   "eval_loss": 0.19649724662303925,
+   "eval_runtime": 5.3689,
+   "eval_samples": 13010,
+   "eval_samples_per_second": 2423.215,
+   "eval_steps_per_second": 303.042,
+   "num_input_tokens_seen": 33305600
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3997c8da63fc6340b43e6f8d107f48183e754e83933dad0b2aaaf0dc270edd3d
+ size 90874096
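This is only a Git LFS pointer; the actual weights (about 90.9 MB) live in LFS storage. One way to fetch the file directly is sketched below, with the repo id again a placeholder.

```python
# Download the real model.safetensors behind the LFS pointer above.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="agentlans/snowflake-arctic-embed-xs-refusal",  # hypothetical repo id
    filename="model.safetensors",
)
print(path)
```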
special_tokens_map.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "additional_special_tokens": [
+     "<|system|>",
+     "<|user|>",
+     "<|assistant|>",
+     "<|...|>"
+   ],
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,103 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "30522": {
+       "content": "<|system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "30523": {
+       "content": "<|user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "30524": {
+       "content": "<|assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "30525": {
+       "content": "<|...|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|system|>",
+     "<|user|>",
+     "<|assistant|>",
+     "<|...|>"
+   ],
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "max_length": 512,
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "[UNK]"
+ }
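The tokenizer is a standard lowercasing BertTokenizer with the four role markers registered as special tokens (IDs 30522–30525, matching added_tokens.json). A small check sketch, again with a placeholder repo id:

```python
# Verify the role-marker tokens resolve to the IDs listed in added_tokens.json.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "agentlans/snowflake-arctic-embed-xs-refusal"  # hypothetical repo id
)
for token in ["<|system|>", "<|user|>", "<|assistant|>", "<|...|>"]:
    print(token, tokenizer.convert_tokens_to_ids(token))
# Expected: 30522, 30523, 30524, 30525
```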
train_results.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "epoch": 5.0,
+   "num_input_tokens_seen": 33305600,
+   "total_flos": 2157435918643200.0,
+   "train_loss": 0.1371641572881533,
+   "train_runtime": 362.4532,
+   "train_samples": 52040,
+   "train_samples_per_second": 717.886,
+   "train_steps_per_second": 89.736
+ }
trainer_state.json ADDED
@@ -0,0 +1,744 @@
+ {
+   "best_global_step": 6505,
+   "best_metric": 0.19649724662303925,
+   "best_model_checkpoint": "/media/user/Expansion1/snowflake-arctic-embed-xs-refusal/checkpoint-6505",
+   "epoch": 5.0,
+   "eval_steps": 500,
+   "global_step": 32525,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.07686395080707148,
+       "grad_norm": 2.155881881713867,
+       "learning_rate": 4.923289777094543e-05,
+       "loss": 0.3511,
+       "num_input_tokens_seen": 512000,
+       "step": 500,
+       "train_runtime": 5.3031,
+       "train_tokens_per_second": 96546.489
+     },
+     {
+       "epoch": 0.15372790161414296,
+       "grad_norm": 0.6236560344696045,
+       "learning_rate": 4.846425826287471e-05,
+       "loss": 0.2593,
+       "num_input_tokens_seen": 1024000,
+       "step": 1000,
+       "train_runtime": 10.3436,
+       "train_tokens_per_second": 98998.124
+     },
+     {
+       "epoch": 0.23059185242121444,
+       "grad_norm": 3.8176653385162354,
+       "learning_rate": 4.7695618754804e-05,
+       "loss": 0.2546,
+       "num_input_tokens_seen": 1536000,
+       "step": 1500,
+       "train_runtime": 15.3626,
+       "train_tokens_per_second": 99983.128
+     },
+     {
+       "epoch": 0.3074558032282859,
+       "grad_norm": 2.538367986679077,
+       "learning_rate": 4.692697924673328e-05,
+       "loss": 0.2377,
+       "num_input_tokens_seen": 2048000,
+       "step": 2000,
+       "train_runtime": 20.351,
+       "train_tokens_per_second": 100634.117
+     },
+     {
+       "epoch": 0.3843197540353574,
+       "grad_norm": 3.922593832015991,
+       "learning_rate": 4.615833973866257e-05,
+       "loss": 0.2411,
+       "num_input_tokens_seen": 2560000,
+       "step": 2500,
+       "train_runtime": 25.363,
+       "train_tokens_per_second": 100934.428
+     },
+     {
+       "epoch": 0.4611837048424289,
+       "grad_norm": 0.728330135345459,
+       "learning_rate": 4.5389700230591855e-05,
+       "loss": 0.2278,
+       "num_input_tokens_seen": 3072000,
+       "step": 3000,
+       "train_runtime": 30.387,
+       "train_tokens_per_second": 101095.729
+     },
+     {
+       "epoch": 0.5380476556495004,
+       "grad_norm": 0.5299816131591797,
+       "learning_rate": 4.462106072252114e-05,
+       "loss": 0.2422,
+       "num_input_tokens_seen": 3584000,
+       "step": 3500,
+       "train_runtime": 35.3784,
+       "train_tokens_per_second": 101304.819
+     },
+     {
+       "epoch": 0.6149116064565718,
+       "grad_norm": 9.030647277832031,
+       "learning_rate": 4.3852421214450424e-05,
+       "loss": 0.2269,
+       "num_input_tokens_seen": 4096000,
+       "step": 4000,
+       "train_runtime": 40.3905,
+       "train_tokens_per_second": 101409.866
+     },
+     {
+       "epoch": 0.6917755572636434,
+       "grad_norm": 1.8069450855255127,
+       "learning_rate": 4.308378170637971e-05,
+       "loss": 0.2378,
+       "num_input_tokens_seen": 4608000,
+       "step": 4500,
+       "train_runtime": 45.4094,
+       "train_tokens_per_second": 101476.838
+     },
+     {
+       "epoch": 0.7686395080707148,
+       "grad_norm": 2.4024786949157715,
+       "learning_rate": 4.231514219830899e-05,
+       "loss": 0.2129,
+       "num_input_tokens_seen": 5120000,
+       "step": 5000,
+       "train_runtime": 50.4114,
+       "train_tokens_per_second": 101564.317
+     },
+     {
+       "epoch": 0.8455034588777863,
+       "grad_norm": 0.8753976821899414,
+       "learning_rate": 4.1546502690238284e-05,
+       "loss": 0.2354,
+       "num_input_tokens_seen": 5632000,
+       "step": 5500,
+       "train_runtime": 55.4722,
+       "train_tokens_per_second": 101528.336
+     },
+     {
+       "epoch": 0.9223674096848578,
+       "grad_norm": 4.247986316680908,
+       "learning_rate": 4.077786318216756e-05,
+       "loss": 0.237,
+       "num_input_tokens_seen": 6144000,
+       "step": 6000,
+       "train_runtime": 60.5329,
+       "train_tokens_per_second": 101498.514
+     },
+     {
+       "epoch": 0.9992313604919293,
+       "grad_norm": 7.1489973068237305,
+       "learning_rate": 4.000922367409685e-05,
+       "loss": 0.2286,
+       "num_input_tokens_seen": 6656000,
+       "step": 6500,
+       "train_runtime": 65.5361,
+       "train_tokens_per_second": 101562.357
+     },
+     {
+       "epoch": 1.0,
+       "eval_accuracy": 0.919369715603382,
+       "eval_loss": 0.19649724662303925,
+       "eval_runtime": 5.1167,
+       "eval_samples_per_second": 2542.665,
+       "eval_steps_per_second": 317.98,
+       "num_input_tokens_seen": 6661120,
+       "step": 6505
+     },
+     {
+       "epoch": 1.0760953112990008,
+       "grad_norm": 10.231003761291504,
+       "learning_rate": 3.9240584166026136e-05,
+       "loss": 0.1751,
+       "num_input_tokens_seen": 7168000,
+       "step": 7000,
+       "train_runtime": 75.9746,
+       "train_tokens_per_second": 94347.276
+     },
+     {
+       "epoch": 1.1529592621060722,
+       "grad_norm": 0.037300433963537216,
+       "learning_rate": 3.847194465795542e-05,
+       "loss": 0.1739,
+       "num_input_tokens_seen": 7680000,
+       "step": 7500,
+       "train_runtime": 81.0147,
+       "train_tokens_per_second": 94797.624
+     },
+     {
+       "epoch": 1.2298232129131437,
+       "grad_norm": 10.88604736328125,
+       "learning_rate": 3.7703305149884705e-05,
+       "loss": 0.1923,
+       "num_input_tokens_seen": 8192000,
+       "step": 8000,
+       "train_runtime": 86.0728,
+       "train_tokens_per_second": 95175.201
+     },
+     {
+       "epoch": 1.3066871637202153,
+       "grad_norm": 14.25737190246582,
+       "learning_rate": 3.693466564181399e-05,
+       "loss": 0.1809,
+       "num_input_tokens_seen": 8704000,
+       "step": 8500,
+       "train_runtime": 91.0822,
+       "train_tokens_per_second": 95562.051
+     },
+     {
+       "epoch": 1.3835511145272867,
+       "grad_norm": 0.3044818639755249,
+       "learning_rate": 3.6166026133743274e-05,
+       "loss": 0.1922,
+       "num_input_tokens_seen": 9216000,
+       "step": 9000,
+       "train_runtime": 96.1257,
+       "train_tokens_per_second": 95874.452
+     },
+     {
+       "epoch": 1.4604150653343582,
+       "grad_norm": 15.291511535644531,
+       "learning_rate": 3.5397386625672565e-05,
+       "loss": 0.171,
+       "num_input_tokens_seen": 9728000,
+       "step": 9500,
+       "train_runtime": 101.1575,
+       "train_tokens_per_second": 96166.85
+     },
+     {
+       "epoch": 1.5372790161414298,
+       "grad_norm": 12.030097007751465,
+       "learning_rate": 3.462874711760184e-05,
+       "loss": 0.1779,
+       "num_input_tokens_seen": 10240000,
+       "step": 10000,
+       "train_runtime": 106.2268,
+       "train_tokens_per_second": 96397.497
+     },
+     {
+       "epoch": 1.614142966948501,
+       "grad_norm": 0.17212723195552826,
+       "learning_rate": 3.386010760953113e-05,
+       "loss": 0.176,
+       "num_input_tokens_seen": 10752000,
+       "step": 10500,
+       "train_runtime": 111.2959,
+       "train_tokens_per_second": 96607.35
+     },
+     {
+       "epoch": 1.6910069177555727,
+       "grad_norm": 35.843482971191406,
+       "learning_rate": 3.309146810146042e-05,
+       "loss": 0.1879,
+       "num_input_tokens_seen": 11264000,
+       "step": 11000,
+       "train_runtime": 116.3574,
+       "train_tokens_per_second": 96805.219
+     },
+     {
+       "epoch": 1.767870868562644,
+       "grad_norm": 0.055776312947273254,
+       "learning_rate": 3.23228285933897e-05,
+       "loss": 0.1749,
+       "num_input_tokens_seen": 11776000,
+       "step": 11500,
+       "train_runtime": 121.386,
+       "train_tokens_per_second": 97012.845
+     },
+     {
+       "epoch": 1.8447348193697155,
+       "grad_norm": 0.48420748114585876,
+       "learning_rate": 3.1554189085318986e-05,
+       "loss": 0.1716,
+       "num_input_tokens_seen": 12288000,
+       "step": 12000,
+       "train_runtime": 126.4198,
+       "train_tokens_per_second": 97200.001
+     },
+     {
+       "epoch": 1.9215987701767872,
+       "grad_norm": 0.3808608949184418,
+       "learning_rate": 3.078554957724827e-05,
+       "loss": 0.1819,
+       "num_input_tokens_seen": 12800000,
+       "step": 12500,
+       "train_runtime": 131.456,
+       "train_tokens_per_second": 97370.996
+     },
+     {
+       "epoch": 1.9984627209838586,
+       "grad_norm": 0.15483863651752472,
+       "learning_rate": 3.0016910069177555e-05,
+       "loss": 0.1718,
+       "num_input_tokens_seen": 13312000,
+       "step": 13000,
+       "train_runtime": 136.5089,
+       "train_tokens_per_second": 97517.416
+     },
+     {
+       "epoch": 2.0,
+       "eval_accuracy": 0.9259031514219831,
+       "eval_loss": 0.27698734402656555,
+       "eval_runtime": 5.1343,
+       "eval_samples_per_second": 2533.949,
+       "eval_steps_per_second": 316.89,
+       "num_input_tokens_seen": 13322240,
+       "step": 13010
+     },
+     {
+       "epoch": 2.07532667179093,
+       "grad_norm": 0.06390306353569031,
+       "learning_rate": 2.9248270561106846e-05,
+       "loss": 0.1384,
+       "num_input_tokens_seen": 13824000,
+       "step": 13500,
+       "train_runtime": 146.9662,
+       "train_tokens_per_second": 94062.42
+     },
+     {
+       "epoch": 2.1521906225980016,
+       "grad_norm": 0.09185440093278885,
+       "learning_rate": 2.8479631053036127e-05,
+       "loss": 0.123,
+       "num_input_tokens_seen": 14336000,
+       "step": 14000,
+       "train_runtime": 152.0064,
+       "train_tokens_per_second": 94311.832
+     },
+     {
+       "epoch": 2.229054573405073,
+       "grad_norm": 0.05354034900665283,
+       "learning_rate": 2.7710991544965414e-05,
+       "loss": 0.1265,
+       "num_input_tokens_seen": 14848000,
+       "step": 14500,
+       "train_runtime": 157.0491,
+       "train_tokens_per_second": 94543.682
+     },
+     {
+       "epoch": 2.3059185242121445,
+       "grad_norm": 6.2592926025390625,
+       "learning_rate": 2.6942352036894695e-05,
+       "loss": 0.1214,
+       "num_input_tokens_seen": 15360000,
+       "step": 15000,
+       "train_runtime": 162.0991,
+       "train_tokens_per_second": 94756.854
+     },
+     {
+       "epoch": 2.382782475019216,
+       "grad_norm": 0.3284030854701996,
+       "learning_rate": 2.6173712528823986e-05,
+       "loss": 0.1298,
+       "num_input_tokens_seen": 15872000,
+       "step": 15500,
+       "train_runtime": 167.141,
+       "train_tokens_per_second": 94961.759
+     },
+     {
+       "epoch": 2.4596464258262873,
+       "grad_norm": 0.2101190984249115,
+       "learning_rate": 2.5405073020753267e-05,
+       "loss": 0.1279,
+       "num_input_tokens_seen": 16384000,
+       "step": 16000,
+       "train_runtime": 172.1787,
+       "train_tokens_per_second": 95156.958
+     },
+     {
+       "epoch": 2.536510376633359,
+       "grad_norm": 7.672014236450195,
+       "learning_rate": 2.463643351268255e-05,
+       "loss": 0.1435,
+       "num_input_tokens_seen": 16896000,
+       "step": 16500,
+       "train_runtime": 177.2341,
+       "train_tokens_per_second": 95331.566
+     },
+     {
+       "epoch": 2.6133743274404306,
+       "grad_norm": 37.9052734375,
+       "learning_rate": 2.3867794004611836e-05,
+       "loss": 0.123,
+       "num_input_tokens_seen": 17408000,
+       "step": 17000,
+       "train_runtime": 182.2827,
+       "train_tokens_per_second": 95500.03
+     },
+     {
+       "epoch": 2.690238278247502,
+       "grad_norm": 0.08578933030366898,
+       "learning_rate": 2.3099154496541124e-05,
+       "loss": 0.1289,
+       "num_input_tokens_seen": 17920000,
+       "step": 17500,
+       "train_runtime": 187.3355,
+       "train_tokens_per_second": 95657.272
+     },
+     {
+       "epoch": 2.7671022290545735,
+       "grad_norm": 0.08860859274864197,
+       "learning_rate": 2.2330514988470408e-05,
+       "loss": 0.1296,
+       "num_input_tokens_seen": 18432000,
+       "step": 18000,
+       "train_runtime": 192.3781,
+       "train_tokens_per_second": 95811.329
+     },
+     {
+       "epoch": 2.8439661798616447,
+       "grad_norm": 0.41104796528816223,
+       "learning_rate": 2.1561875480399692e-05,
+       "loss": 0.1124,
+       "num_input_tokens_seen": 18944000,
+       "step": 18500,
+       "train_runtime": 197.7595,
+       "train_tokens_per_second": 95793.134
+     },
+     {
+       "epoch": 2.9208301306687163,
+       "grad_norm": 37.97283172607422,
+       "learning_rate": 2.079323597232898e-05,
+       "loss": 0.1382,
+       "num_input_tokens_seen": 19456000,
+       "step": 19000,
+       "train_runtime": 203.2967,
+       "train_tokens_per_second": 95702.474
+     },
+     {
+       "epoch": 2.997694081475788,
+       "grad_norm": 0.0325402170419693,
+       "learning_rate": 2.0024596464258264e-05,
+       "loss": 0.1388,
+       "num_input_tokens_seen": 19968000,
+       "step": 19500,
+       "train_runtime": 208.8029,
+       "train_tokens_per_second": 95630.843
+     },
+     {
+       "epoch": 3.0,
+       "eval_accuracy": 0.9287471176018447,
+       "eval_loss": 0.31825903058052063,
+       "eval_runtime": 5.4038,
+       "eval_samples_per_second": 2407.562,
+       "eval_steps_per_second": 301.084,
+       "num_input_tokens_seen": 19983360,
+       "step": 19515
+     },
+     {
+       "epoch": 3.074558032282859,
+       "grad_norm": 0.05180477350950241,
+       "learning_rate": 1.925595695618755e-05,
+       "loss": 0.0823,
+       "num_input_tokens_seen": 20480000,
+       "step": 20000,
+       "train_runtime": 219.9739,
+       "train_tokens_per_second": 93101.956
+     },
+     {
+       "epoch": 3.151421983089931,
+       "grad_norm": 0.005111335311084986,
+       "learning_rate": 1.8487317448116833e-05,
+       "loss": 0.0716,
+       "num_input_tokens_seen": 20992000,
+       "step": 20500,
+       "train_runtime": 225.3567,
+       "train_tokens_per_second": 93150.099
+     },
+     {
+       "epoch": 3.2282859338970025,
+       "grad_norm": 0.012623129412531853,
+       "learning_rate": 1.771867794004612e-05,
+       "loss": 0.0915,
+       "num_input_tokens_seen": 21504000,
+       "step": 21000,
+       "train_runtime": 230.6352,
+       "train_tokens_per_second": 93238.15
+     },
+     {
+       "epoch": 3.3051498847040737,
+       "grad_norm": 10.89956283569336,
+       "learning_rate": 1.6950038431975405e-05,
+       "loss": 0.0783,
+       "num_input_tokens_seen": 22016000,
+       "step": 21500,
+       "train_runtime": 236.059,
+       "train_tokens_per_second": 93264.831
+     },
+     {
+       "epoch": 3.3820138355111453,
+       "grad_norm": 0.010125258006155491,
+       "learning_rate": 1.618139892390469e-05,
+       "loss": 0.0882,
+       "num_input_tokens_seen": 22528000,
+       "step": 22000,
+       "train_runtime": 241.5242,
+       "train_tokens_per_second": 93274.31
+     },
+     {
+       "epoch": 3.458877786318217,
+       "grad_norm": 0.04097803309559822,
+       "learning_rate": 1.5412759415833973e-05,
+       "loss": 0.095,
+       "num_input_tokens_seen": 23040000,
+       "step": 22500,
+       "train_runtime": 247.0361,
+       "train_tokens_per_second": 93265.737
+     },
+     {
+       "epoch": 3.535741737125288,
+       "grad_norm": 0.3172767758369446,
+       "learning_rate": 1.464411990776326e-05,
+       "loss": 0.0847,
+       "num_input_tokens_seen": 23552000,
+       "step": 23000,
+       "train_runtime": 252.366,
+       "train_tokens_per_second": 93324.783
+     },
+     {
+       "epoch": 3.61260568793236,
+       "grad_norm": 0.0049354820512235165,
+       "learning_rate": 1.3875480399692545e-05,
+       "loss": 0.0758,
+       "num_input_tokens_seen": 24064000,
+       "step": 23500,
+       "train_runtime": 257.7685,
+       "train_tokens_per_second": 93355.1
+     },
+     {
+       "epoch": 3.689469638739431,
+       "grad_norm": 0.09353843331336975,
+       "learning_rate": 1.310684089162183e-05,
+       "loss": 0.0812,
+       "num_input_tokens_seen": 24576000,
+       "step": 24000,
+       "train_runtime": 263.0331,
+       "train_tokens_per_second": 93433.102
+     },
+     {
+       "epoch": 3.7663335895465027,
+       "grad_norm": 13.593195915222168,
+       "learning_rate": 1.2338201383551116e-05,
+       "loss": 0.0882,
+       "num_input_tokens_seen": 25088000,
+       "step": 24500,
+       "train_runtime": 268.3188,
+       "train_tokens_per_second": 93500.701
+     },
+     {
+       "epoch": 3.8431975403535743,
+       "grad_norm": 0.12645399570465088,
+       "learning_rate": 1.15695618754804e-05,
+       "loss": 0.0862,
+       "num_input_tokens_seen": 25600000,
+       "step": 25000,
+       "train_runtime": 273.6907,
+       "train_tokens_per_second": 93536.259
+     },
+     {
+       "epoch": 3.9200614911606455,
+       "grad_norm": 0.015061162412166595,
+       "learning_rate": 1.0800922367409686e-05,
+       "loss": 0.0825,
+       "num_input_tokens_seen": 26112000,
+       "step": 25500,
+       "train_runtime": 279.0408,
+       "train_tokens_per_second": 93577.699
+     },
+     {
+       "epoch": 3.996925441967717,
+       "grad_norm": 0.020776506513357162,
+       "learning_rate": 1.003228285933897e-05,
+       "loss": 0.0772,
+       "num_input_tokens_seen": 26624000,
+       "step": 26000,
+       "train_runtime": 284.4194,
+       "train_tokens_per_second": 93608.25
+     },
+     {
+       "epoch": 4.0,
+       "eval_accuracy": 0.9270561106840891,
+       "eval_loss": 0.43025368452072144,
+       "eval_runtime": 5.3244,
+       "eval_samples_per_second": 2443.487,
+       "eval_steps_per_second": 305.577,
+       "num_input_tokens_seen": 26644480,
+       "step": 26020
+     },
+     {
+       "epoch": 4.073789392774788,
+       "grad_norm": 0.09127756953239441,
+       "learning_rate": 9.263643351268256e-06,
+       "loss": 0.0615,
+       "num_input_tokens_seen": 27136000,
+       "step": 26500,
+       "train_runtime": 295.1257,
+       "train_tokens_per_second": 91947.265
+     },
+     {
+       "epoch": 4.15065334358186,
+       "grad_norm": 0.11556842923164368,
+       "learning_rate": 8.49500384319754e-06,
+       "loss": 0.0444,
+       "num_input_tokens_seen": 27648000,
+       "step": 27000,
+       "train_runtime": 300.149,
+       "train_tokens_per_second": 92114.259
+     },
+     {
+       "epoch": 4.227517294388932,
+       "grad_norm": 0.09005430340766907,
+       "learning_rate": 7.726364335126826e-06,
+       "loss": 0.0434,
+       "num_input_tokens_seen": 28160000,
+       "step": 27500,
+       "train_runtime": 305.2327,
+       "train_tokens_per_second": 92257.495
+     },
+     {
+       "epoch": 4.304381245196003,
+       "grad_norm": 0.004569609649479389,
+       "learning_rate": 6.9577248270561115e-06,
+       "loss": 0.0352,
+       "num_input_tokens_seen": 28672000,
+       "step": 28000,
+       "train_runtime": 310.2962,
+       "train_tokens_per_second": 92402.026
+     },
+     {
+       "epoch": 4.381245196003075,
+       "grad_norm": 1.524936318397522,
+       "learning_rate": 6.189085318985397e-06,
+       "loss": 0.06,
+       "num_input_tokens_seen": 29184000,
+       "step": 28500,
+       "train_runtime": 315.365,
+       "train_tokens_per_second": 92540.388
+     },
+     {
+       "epoch": 4.458109146810146,
+       "grad_norm": 0.014427268877625465,
+       "learning_rate": 5.420445810914681e-06,
+       "loss": 0.0492,
+       "num_input_tokens_seen": 29696000,
+       "step": 29000,
+       "train_runtime": 320.4274,
+       "train_tokens_per_second": 92676.208
+     },
+     {
+       "epoch": 4.534973097617217,
+       "grad_norm": 0.07355033606290817,
+       "learning_rate": 4.651806302843966e-06,
+       "loss": 0.0572,
+       "num_input_tokens_seen": 30208000,
+       "step": 29500,
+       "train_runtime": 325.4966,
+       "train_tokens_per_second": 92805.883
+     },
+     {
+       "epoch": 4.611837048424289,
+       "grad_norm": 7.176478385925293,
+       "learning_rate": 3.883166794773251e-06,
+       "loss": 0.0442,
+       "num_input_tokens_seen": 30720000,
+       "step": 30000,
+       "train_runtime": 330.568,
+       "train_tokens_per_second": 92930.968
+     },
+     {
+       "epoch": 4.688700999231361,
+       "grad_norm": 0.030576860532164574,
+       "learning_rate": 3.114527286702537e-06,
+       "loss": 0.0465,
+       "num_input_tokens_seen": 31232000,
+       "step": 30500,
+       "train_runtime": 335.6493,
+       "train_tokens_per_second": 93049.488
+     },
+     {
+       "epoch": 4.765564950038432,
+       "grad_norm": 0.005597515497356653,
+       "learning_rate": 2.345887778631822e-06,
+       "loss": 0.0487,
+       "num_input_tokens_seen": 31744000,
+       "step": 31000,
+       "train_runtime": 340.8069,
+       "train_tokens_per_second": 93143.641
+     },
+     {
+       "epoch": 4.842428900845503,
+       "grad_norm": 0.021089155226945877,
+       "learning_rate": 1.5772482705611067e-06,
+       "loss": 0.0557,
+       "num_input_tokens_seen": 32256000,
+       "step": 31500,
+       "train_runtime": 346.0097,
+       "train_tokens_per_second": 93222.824
+     },
+     {
+       "epoch": 4.919292851652575,
+       "grad_norm": 0.023516027256846428,
+       "learning_rate": 8.086087624903922e-07,
+       "loss": 0.0399,
+       "num_input_tokens_seen": 32768000,
+       "step": 32000,
+       "train_runtime": 351.2595,
+       "train_tokens_per_second": 93287.165
+     },
+     {
+       "epoch": 4.996156802459646,
+       "grad_norm": 0.007581554353237152,
+       "learning_rate": 3.996925441967718e-08,
+       "loss": 0.0532,
+       "num_input_tokens_seen": 33280000,
+       "step": 32500,
+       "train_runtime": 356.5514,
+       "train_tokens_per_second": 93338.566
+     },
+     {
+       "epoch": 5.0,
+       "eval_accuracy": 0.9264411990776326,
+       "eval_loss": 0.4613732397556305,
+       "eval_runtime": 5.2997,
+       "eval_samples_per_second": 2454.857,
+       "eval_steps_per_second": 306.999,
+       "num_input_tokens_seen": 33305600,
+       "step": 32525
+     },
+     {
+       "epoch": 5.0,
+       "num_input_tokens_seen": 33305600,
+       "step": 32525,
+       "total_flos": 2157435918643200.0,
+       "train_loss": 0.1371641572881533,
+       "train_runtime": 362.4532,
+       "train_samples_per_second": 717.886,
+       "train_steps_per_second": 89.736
+     }
+   ],
+   "logging_steps": 500,
+   "max_steps": 32525,
+   "num_input_tokens_seen": 33305600,
+   "num_train_epochs": 5,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 2157435918643200.0,
+   "train_batch_size": 8,
+   "trial_name": null,
+   "trial_params": null
+ }
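Note that eval_loss is lowest after epoch 1 (0.1965 at step 6505) even though eval_accuracy peaks later, which is why best_model_checkpoint points at checkpoint-6505. A small sketch for pulling the per-epoch evaluation metrics out of this file (the local path is a placeholder for wherever it is downloaded):

```python
# Print the per-epoch evaluation metrics recorded in trainer_state.json.
import json

with open("trainer_state.json") as f:  # placeholder path to the downloaded file
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"epoch {entry['epoch']:.0f}: "
              f"accuracy={entry['eval_accuracy']:.4f}, loss={entry['eval_loss']:.4f}")
```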
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88bc931c3ffc6b4b25fa2a994558f90c9fe24e5f81f4d486bf8c879b0f031021
+ size 5201
vocab.txt ADDED
The diff for this file is too large to render. See raw diff