j05hr3d committed on
Commit
d8a90cb
·
verified ·
1 Parent(s): fcdeb9e

Model save

Browse files
Files changed (3) hide show
  1. README.md +11 -10
  2. adapter_model.safetensors +1 -1
  3. trainer_state.json +116 -101
README.md CHANGED
@@ -19,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) on the None dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.7595
23
 
24
  ## Model description
25
 
@@ -53,15 +53,16 @@ The following hyperparameters were used during training:
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:------:|:----:|:---------------:|
56
- | 0.8984 | 0.3361 | 20 | 0.8835 |
57
- | 0.8801 | 0.6723 | 40 | 0.8205 |
58
- | 0.7978 | 1.0 | 60 | 0.7939 |
59
- | 0.7223 | 1.3361 | 80 | 0.7812 |
60
- | 0.7371 | 1.6723 | 100 | 0.7724 |
61
- | 0.6279 | 2.0 | 120 | 0.7595 |
62
- | 0.5718 | 2.3361 | 140 | 0.7720 |
63
- | 0.5764 | 2.6723 | 160 | 0.7730 |
64
- | 0.5958 | 3.0 | 180 | 0.7697 |
 
65
 
66
 
67
  ### Framework versions
 
19
 
20
  This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) on the None dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.8553
23
 
24
  ## Model description
25
 
 
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:------:|:----:|:---------------:|
56
+ | 0.9584 | 0.2974 | 20 | 0.9885 |
57
+ | 0.8218 | 0.5948 | 40 | 0.9364 |
58
+ | 0.7618 | 0.8922 | 60 | 0.9027 |
59
+ | 0.6981 | 1.1784 | 80 | 0.8842 |
60
+ | 0.7566 | 1.4758 | 100 | 0.8678 |
61
+ | 0.6459 | 1.7732 | 120 | 0.8576 |
62
+ | 0.6026 | 2.0595 | 140 | 0.8553 |
63
+ | 0.6211 | 2.3569 | 160 | 0.8604 |
64
+ | 0.4883 | 2.6543 | 180 | 0.8642 |
65
+ | 0.5752 | 2.9517 | 200 | 0.8622 |
66
 
67
 
68
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80d5fc62d3c85e08b599cc189cf02a4b46700a939b122459f92d43dde1d94652
3
  size 323014168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5848642e6ab97926e3049fd764b2faf34d7700eeb44b0b98b684add03fb39355
3
  size 323014168
trainer_state.json CHANGED
@@ -1,169 +1,184 @@
1
  {
2
- "best_global_step": 120,
3
- "best_metric": 0.7594600915908813,
4
- "best_model_checkpoint": "j05hr3d/SFT-Qwen2.5-Coder-7B_v1/checkpoint-120",
5
- "epoch": 3.0,
6
  "eval_steps": 20,
7
- "global_step": 180,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.33613445378151263,
14
- "grad_norm": 0.4482860267162323,
15
- "learning_rate": 9.252873563218392e-05,
16
- "loss": 0.8984,
17
  "step": 20
18
  },
19
  {
20
- "epoch": 0.33613445378151263,
21
- "eval_loss": 0.8834584355354309,
22
- "eval_runtime": 16.2682,
23
- "eval_samples_per_second": 3.381,
24
- "eval_steps_per_second": 0.43,
25
  "step": 20
26
  },
27
  {
28
- "epoch": 0.6722689075630253,
29
- "grad_norm": 0.31251004338264465,
30
- "learning_rate": 8.103448275862069e-05,
31
- "loss": 0.8801,
32
  "step": 40
33
  },
34
  {
35
- "epoch": 0.6722689075630253,
36
- "eval_loss": 0.8205494284629822,
37
- "eval_runtime": 14.3181,
38
- "eval_samples_per_second": 3.841,
39
- "eval_steps_per_second": 0.489,
40
  "step": 40
41
  },
42
  {
43
- "epoch": 1.0,
44
- "grad_norm": 0.3656582236289978,
45
- "learning_rate": 6.954022988505747e-05,
46
- "loss": 0.7978,
47
  "step": 60
48
  },
49
  {
50
- "epoch": 1.0,
51
- "eval_loss": 0.7939355969429016,
52
- "eval_runtime": 14.3226,
53
- "eval_samples_per_second": 3.84,
54
- "eval_steps_per_second": 0.489,
55
  "step": 60
56
  },
57
  {
58
- "epoch": 1.3361344537815127,
59
- "grad_norm": 0.5589117407798767,
60
- "learning_rate": 5.8045977011494254e-05,
61
- "loss": 0.7223,
62
  "step": 80
63
  },
64
  {
65
- "epoch": 1.3361344537815127,
66
- "eval_loss": 0.7812108993530273,
67
- "eval_runtime": 14.3201,
68
- "eval_samples_per_second": 3.841,
69
- "eval_steps_per_second": 0.489,
70
  "step": 80
71
  },
72
  {
73
- "epoch": 1.6722689075630253,
74
- "grad_norm": 0.5989664793014526,
75
- "learning_rate": 4.655172413793104e-05,
76
- "loss": 0.7371,
77
  "step": 100
78
  },
79
  {
80
- "epoch": 1.6722689075630253,
81
- "eval_loss": 0.7724329233169556,
82
- "eval_runtime": 14.3105,
83
- "eval_samples_per_second": 3.843,
84
- "eval_steps_per_second": 0.489,
85
  "step": 100
86
  },
87
  {
88
- "epoch": 2.0,
89
- "grad_norm": 1.47295343875885,
90
- "learning_rate": 3.505747126436782e-05,
91
- "loss": 0.6279,
92
  "step": 120
93
  },
94
  {
95
- "epoch": 2.0,
96
- "eval_loss": 0.7594600915908813,
97
- "eval_runtime": 14.3138,
98
- "eval_samples_per_second": 3.842,
99
- "eval_steps_per_second": 0.489,
100
  "step": 120
101
  },
102
  {
103
- "epoch": 2.3361344537815127,
104
- "grad_norm": 0.5978106260299683,
105
- "learning_rate": 2.3563218390804597e-05,
106
- "loss": 0.5718,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 2.3361344537815127,
111
- "eval_loss": 0.7720052599906921,
112
- "eval_runtime": 14.3148,
113
- "eval_samples_per_second": 3.842,
114
- "eval_steps_per_second": 0.489,
115
  "step": 140
116
  },
117
  {
118
- "epoch": 2.6722689075630255,
119
- "grad_norm": 0.5382282137870789,
120
- "learning_rate": 1.206896551724138e-05,
121
- "loss": 0.5764,
122
  "step": 160
123
  },
124
  {
125
- "epoch": 2.6722689075630255,
126
- "eval_loss": 0.772979736328125,
127
- "eval_runtime": 14.3279,
128
- "eval_samples_per_second": 3.839,
129
- "eval_steps_per_second": 0.489,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 3.0,
134
- "grad_norm": 0.9302071332931519,
135
- "learning_rate": 5.747126436781609e-07,
136
- "loss": 0.5958,
137
  "step": 180
138
  },
139
  {
140
- "epoch": 3.0,
141
- "eval_loss": 0.7697007656097412,
142
- "eval_runtime": 14.3234,
143
- "eval_samples_per_second": 3.84,
144
- "eval_steps_per_second": 0.489,
145
  "step": 180
146
  },
147
  {
148
- "epoch": 3.0,
149
- "step": 180,
150
- "total_flos": 7.62172871614802e+16,
151
- "train_loss": 0.7119400660196941,
152
- "train_runtime": 1075.5336,
153
- "train_samples_per_second": 1.325,
154
- "train_steps_per_second": 0.167
155
  },
156
  {
157
- "epoch": 3.0,
158
- "eval_loss": 0.7594600915908813,
159
- "eval_runtime": 14.2724,
160
- "eval_samples_per_second": 3.854,
161
- "eval_steps_per_second": 0.49,
162
- "step": 180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  }
164
  ],
165
  "logging_steps": 20,
166
- "max_steps": 180,
167
  "num_input_tokens_seen": 0,
168
  "num_train_epochs": 3,
169
  "save_steps": 20,
@@ -188,7 +203,7 @@
188
  "attributes": {}
189
  }
190
  },
191
- "total_flos": 7.62172871614802e+16,
192
  "train_batch_size": 2,
193
  "trial_name": null,
194
  "trial_params": null
 
1
  {
2
+ "best_global_step": 140,
3
+ "best_metric": 0.8553072810173035,
4
+ "best_model_checkpoint": "j05hr3d/SFT-Qwen2.5-Coder-7B_v1/checkpoint-140",
5
+ "epoch": 2.9516728624535316,
6
  "eval_steps": 20,
7
+ "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.29739776951672864,
14
+ "grad_norm": 0.3168877959251404,
15
+ "learning_rate": 9.390862944162437e-05,
16
+ "loss": 0.9584,
17
  "step": 20
18
  },
19
  {
20
+ "epoch": 0.29739776951672864,
21
+ "eval_loss": 0.9884762167930603,
22
+ "eval_runtime": 21.8941,
23
+ "eval_samples_per_second": 2.649,
24
+ "eval_steps_per_second": 0.365,
25
  "step": 20
26
  },
27
  {
28
+ "epoch": 0.5947955390334573,
29
+ "grad_norm": 0.43252885341644287,
30
+ "learning_rate": 8.375634517766498e-05,
31
+ "loss": 0.8218,
32
  "step": 40
33
  },
34
  {
35
+ "epoch": 0.5947955390334573,
36
+ "eval_loss": 0.9363918900489807,
37
+ "eval_runtime": 19.8022,
38
+ "eval_samples_per_second": 2.929,
39
+ "eval_steps_per_second": 0.404,
40
  "step": 40
41
  },
42
  {
43
+ "epoch": 0.8921933085501859,
44
+ "grad_norm": 0.420043021440506,
45
+ "learning_rate": 7.360406091370558e-05,
46
+ "loss": 0.7618,
47
  "step": 60
48
  },
49
  {
50
+ "epoch": 0.8921933085501859,
51
+ "eval_loss": 0.9027390480041504,
52
+ "eval_runtime": 19.7842,
53
+ "eval_samples_per_second": 2.932,
54
+ "eval_steps_per_second": 0.404,
55
  "step": 60
56
  },
57
  {
58
+ "epoch": 1.178438661710037,
59
+ "grad_norm": 0.5402843356132507,
60
+ "learning_rate": 6.34517766497462e-05,
61
+ "loss": 0.6981,
62
  "step": 80
63
  },
64
  {
65
+ "epoch": 1.178438661710037,
66
+ "eval_loss": 0.884242594242096,
67
+ "eval_runtime": 19.8075,
68
+ "eval_samples_per_second": 2.928,
69
+ "eval_steps_per_second": 0.404,
70
  "step": 80
71
  },
72
  {
73
+ "epoch": 1.4758364312267658,
74
+ "grad_norm": 0.4523223042488098,
75
+ "learning_rate": 5.329949238578681e-05,
76
+ "loss": 0.7566,
77
  "step": 100
78
  },
79
  {
80
+ "epoch": 1.4758364312267658,
81
+ "eval_loss": 0.8677776455879211,
82
+ "eval_runtime": 19.7865,
83
+ "eval_samples_per_second": 2.931,
84
+ "eval_steps_per_second": 0.404,
85
  "step": 100
86
  },
87
  {
88
+ "epoch": 1.7732342007434945,
89
+ "grad_norm": 0.3251374065876007,
90
+ "learning_rate": 4.3147208121827415e-05,
91
+ "loss": 0.6459,
92
  "step": 120
93
  },
94
  {
95
+ "epoch": 1.7732342007434945,
96
+ "eval_loss": 0.857609212398529,
97
+ "eval_runtime": 19.8121,
98
+ "eval_samples_per_second": 2.927,
99
+ "eval_steps_per_second": 0.404,
100
  "step": 120
101
  },
102
  {
103
+ "epoch": 2.059479553903346,
104
+ "grad_norm": 0.491901695728302,
105
+ "learning_rate": 3.299492385786802e-05,
106
+ "loss": 0.6026,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 2.059479553903346,
111
+ "eval_loss": 0.8553072810173035,
112
+ "eval_runtime": 19.8082,
113
+ "eval_samples_per_second": 2.928,
114
+ "eval_steps_per_second": 0.404,
115
  "step": 140
116
  },
117
  {
118
+ "epoch": 2.356877323420074,
119
+ "grad_norm": 0.7482563257217407,
120
+ "learning_rate": 2.284263959390863e-05,
121
+ "loss": 0.6211,
122
  "step": 160
123
  },
124
  {
125
+ "epoch": 2.356877323420074,
126
+ "eval_loss": 0.8603928685188293,
127
+ "eval_runtime": 19.7923,
128
+ "eval_samples_per_second": 2.93,
129
+ "eval_steps_per_second": 0.404,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 2.654275092936803,
134
+ "grad_norm": 1.1968350410461426,
135
+ "learning_rate": 1.2690355329949238e-05,
136
+ "loss": 0.4883,
137
  "step": 180
138
  },
139
  {
140
+ "epoch": 2.654275092936803,
141
+ "eval_loss": 0.8641771674156189,
142
+ "eval_runtime": 19.7998,
143
+ "eval_samples_per_second": 2.929,
144
+ "eval_steps_per_second": 0.404,
145
  "step": 180
146
  },
147
  {
148
+ "epoch": 2.9516728624535316,
149
+ "grad_norm": 0.8442820310592651,
150
+ "learning_rate": 2.5380710659898476e-06,
151
+ "loss": 0.5752,
152
+ "step": 200
 
 
153
  },
154
  {
155
+ "epoch": 2.9516728624535316,
156
+ "eval_loss": 0.862239420413971,
157
+ "eval_runtime": 19.8149,
158
+ "eval_samples_per_second": 2.927,
159
+ "eval_steps_per_second": 0.404,
160
+ "step": 200
161
+ },
162
+ {
163
+ "epoch": 2.9516728624535316,
164
+ "step": 200,
165
+ "total_flos": 7.855857074425651e+16,
166
+ "train_loss": 0.6929673862457275,
167
+ "train_runtime": 1202.5527,
168
+ "train_samples_per_second": 1.34,
169
+ "train_steps_per_second": 0.17
170
+ },
171
+ {
172
+ "epoch": 2.9516728624535316,
173
+ "eval_loss": 0.8553072810173035,
174
+ "eval_runtime": 19.7428,
175
+ "eval_samples_per_second": 2.938,
176
+ "eval_steps_per_second": 0.405,
177
+ "step": 200
178
  }
179
  ],
180
  "logging_steps": 20,
181
+ "max_steps": 204,
182
  "num_input_tokens_seen": 0,
183
  "num_train_epochs": 3,
184
  "save_steps": 20,
 
203
  "attributes": {}
204
  }
205
  },
206
+ "total_flos": 7.855857074425651e+16,
207
  "train_batch_size": 2,
208
  "trial_name": null,
209
  "trial_params": null