ysn-rfd commited on Feb 12

Commit

52dd370

verified ·

1 Parent(s): 3d3c435

Upload YASIN V2

Browse files

Files changed (18) hide show

checkpoint-8000/config.json +29 -0
checkpoint-8000/generation_config.json +7 -0
checkpoint-8000/model.safetensors +3 -0
checkpoint-8000/optimizer.pt +3 -0
checkpoint-8000/rng_state.pth +3 -0
checkpoint-8000/scaler.pt +3 -0
checkpoint-8000/scheduler.pt +3 -0
checkpoint-8000/trainer_state.json +1154 -0
checkpoint-8000/training_args.bin +3 -0
checkpoint-9000/config.json +29 -0
checkpoint-9000/generation_config.json +7 -0
checkpoint-9000/model.safetensors +3 -0
checkpoint-9000/optimizer.pt +3 -0
checkpoint-9000/rng_state.pth +3 -0
checkpoint-9000/scaler.pt +3 -0
checkpoint-9000/scheduler.pt +3 -0
checkpoint-9000/trainer_state.json +1294 -0
checkpoint-9000/training_args.bin +3 -0

checkpoint-8000/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "YasinForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "head_dim": 48,
+  "hidden_act": "silu",
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 1024,
+  "mlp_bias": false,
+  "model_type": "yasin",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 16,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.1",
+  "use_cache": false,
+  "vocab_size": 50257
+}

checkpoint-8000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.57.1",
+  "use_cache": false
+}

checkpoint-8000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e54a94b4a0394ce9f2c1148d17a5253da67db0bb8efc9f5c5144365c37b342f0
+size 912876512

checkpoint-8000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63e12e5a118b9d112314c2e99ddda40792a9ff0cf6dac2c1949896784d92626d
+size 1825846859

checkpoint-8000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
+size 14645

checkpoint-8000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:022915fa8ac65fae0f13853519b8aa7ee79efd632a399c1f64c6b87bab0b057a
+size 1383

checkpoint-8000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:703fbf281c65b18b081956dbdc91da7ba5d0f7374dd0632e1ca82a90f2073597
+size 1465

checkpoint-8000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1154 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.008,
+  "eval_steps": 500,
+  "global_step": 8000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 5e-05,
+      "grad_norm": 4.426811695098877,
+      "learning_rate": 4.999755000000001e-05,
+      "loss": 6.3755,
+      "step": 50
+    },
+    {
+      "epoch": 0.0001,
+      "grad_norm": 1.7730491161346436,
+      "learning_rate": 4.999505e-05,
+      "loss": 3.3233,
+      "step": 100
+    },
+    {
+      "epoch": 0.00015,
+      "grad_norm": 1.4473446607589722,
+      "learning_rate": 4.999255e-05,
+      "loss": 2.7523,
+      "step": 150
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 1.4645352363586426,
+      "learning_rate": 4.9990050000000004e-05,
+      "loss": 2.4618,
+      "step": 200
+    },
+    {
+      "epoch": 0.00025,
+      "grad_norm": 1.1990822553634644,
+      "learning_rate": 4.998755e-05,
+      "loss": 2.3417,
+      "step": 250
+    },
+    {
+      "epoch": 0.0003,
+      "grad_norm": 3.0459673404693604,
+      "learning_rate": 4.998505e-05,
+      "loss": 2.2581,
+      "step": 300
+    },
+    {
+      "epoch": 0.00035,
+      "grad_norm": 1.2380167245864868,
+      "learning_rate": 4.998255e-05,
+      "loss": 2.1563,
+      "step": 350
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 1.2060939073562622,
+      "learning_rate": 4.998005e-05,
+      "loss": 1.8521,
+      "step": 400
+    },
+    {
+      "epoch": 0.00045,
+      "grad_norm": 0.9207198619842529,
+      "learning_rate": 4.9977550000000004e-05,
+      "loss": 1.9201,
+      "step": 450
+    },
+    {
+      "epoch": 0.0005,
+      "grad_norm": 1.0762739181518555,
+      "learning_rate": 4.997505e-05,
+      "loss": 1.8104,
+      "step": 500
+    },
+    {
+      "epoch": 0.00055,
+      "grad_norm": 1.0367337465286255,
+      "learning_rate": 4.997255e-05,
+      "loss": 1.8756,
+      "step": 550
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 0.9291325211524963,
+      "learning_rate": 4.997005e-05,
+      "loss": 1.7404,
+      "step": 600
+    },
+    {
+      "epoch": 0.00065,
+      "grad_norm": 0.7452297210693359,
+      "learning_rate": 4.996755e-05,
+      "loss": 1.6843,
+      "step": 650
+    },
+    {
+      "epoch": 0.0007,
+      "grad_norm": 0.9317042231559753,
+      "learning_rate": 4.9965050000000004e-05,
+      "loss": 1.7006,
+      "step": 700
+    },
+    {
+      "epoch": 0.00075,
+      "grad_norm": 1.0354136228561401,
+      "learning_rate": 4.9962550000000005e-05,
+      "loss": 1.5652,
+      "step": 750
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.050106406211853,
+      "learning_rate": 4.996005e-05,
+      "loss": 1.5966,
+      "step": 800
+    },
+    {
+      "epoch": 0.00085,
+      "grad_norm": 0.9646019339561462,
+      "learning_rate": 4.995755e-05,
+      "loss": 1.6297,
+      "step": 850
+    },
+    {
+      "epoch": 0.0009,
+      "grad_norm": 0.9152287840843201,
+      "learning_rate": 4.995505e-05,
+      "loss": 1.6328,
+      "step": 900
+    },
+    {
+      "epoch": 0.00095,
+      "grad_norm": 0.9403690099716187,
+      "learning_rate": 4.995255e-05,
+      "loss": 1.6056,
+      "step": 950
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 1.1822874546051025,
+      "learning_rate": 4.9950050000000005e-05,
+      "loss": 1.6016,
+      "step": 1000
+    },
+    {
+      "epoch": 0.00105,
+      "grad_norm": 1.3227542638778687,
+      "learning_rate": 4.9947550000000006e-05,
+      "loss": 1.5448,
+      "step": 1050
+    },
+    {
+      "epoch": 0.0011,
+      "grad_norm": 1.0503350496292114,
+      "learning_rate": 4.994505000000001e-05,
+      "loss": 1.6355,
+      "step": 1100
+    },
+    {
+      "epoch": 0.00115,
+      "grad_norm": 1.1647204160690308,
+      "learning_rate": 4.994255e-05,
+      "loss": 1.4673,
+      "step": 1150
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 0.7281339168548584,
+      "learning_rate": 4.994005e-05,
+      "loss": 1.4641,
+      "step": 1200
+    },
+    {
+      "epoch": 0.00125,
+      "grad_norm": 1.2438446283340454,
+      "learning_rate": 4.9937550000000004e-05,
+      "loss": 1.5177,
+      "step": 1250
+    },
+    {
+      "epoch": 0.0013,
+      "grad_norm": 0.7967873811721802,
+      "learning_rate": 4.993505e-05,
+      "loss": 1.4399,
+      "step": 1300
+    },
+    {
+      "epoch": 0.00135,
+      "grad_norm": 0.8938255310058594,
+      "learning_rate": 4.993255e-05,
+      "loss": 1.4453,
+      "step": 1350
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 1.4659217596054077,
+      "learning_rate": 4.993005e-05,
+      "loss": 1.6517,
+      "step": 1400
+    },
+    {
+      "epoch": 0.00145,
+      "grad_norm": 0.7856793403625488,
+      "learning_rate": 4.992755e-05,
+      "loss": 1.4721,
+      "step": 1450
+    },
+    {
+      "epoch": 0.0015,
+      "grad_norm": 0.6772142648696899,
+      "learning_rate": 4.9925050000000004e-05,
+      "loss": 1.5314,
+      "step": 1500
+    },
+    {
+      "epoch": 0.00155,
+      "grad_norm": 0.7613831758499146,
+      "learning_rate": 4.992255e-05,
+      "loss": 1.4603,
+      "step": 1550
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1216529607772827,
+      "learning_rate": 4.992005e-05,
+      "loss": 1.54,
+      "step": 1600
+    },
+    {
+      "epoch": 0.00165,
+      "grad_norm": 0.7359323501586914,
+      "learning_rate": 4.991755e-05,
+      "loss": 1.5121,
+      "step": 1650
+    },
+    {
+      "epoch": 0.0017,
+      "grad_norm": 0.8343626260757446,
+      "learning_rate": 4.991505e-05,
+      "loss": 1.6107,
+      "step": 1700
+    },
+    {
+      "epoch": 0.00175,
+      "grad_norm": 0.9381140470504761,
+      "learning_rate": 4.9912550000000004e-05,
+      "loss": 1.5059,
+      "step": 1750
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 0.7317638993263245,
+      "learning_rate": 4.9910050000000005e-05,
+      "loss": 1.5621,
+      "step": 1800
+    },
+    {
+      "epoch": 0.00185,
+      "grad_norm": 0.8273525834083557,
+      "learning_rate": 4.990755e-05,
+      "loss": 1.4737,
+      "step": 1850
+    },
+    {
+      "epoch": 0.0019,
+      "grad_norm": 0.7206686735153198,
+      "learning_rate": 4.990505e-05,
+      "loss": 1.5242,
+      "step": 1900
+    },
+    {
+      "epoch": 0.00195,
+      "grad_norm": 0.9094993472099304,
+      "learning_rate": 4.990255e-05,
+      "loss": 1.5336,
+      "step": 1950
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 0.7432169914245605,
+      "learning_rate": 4.9900050000000003e-05,
+      "loss": 1.4913,
+      "step": 2000
+    },
+    {
+      "epoch": 0.00205,
+      "grad_norm": 1.337276816368103,
+      "learning_rate": 4.9897550000000005e-05,
+      "loss": 1.5643,
+      "step": 2050
+    },
+    {
+      "epoch": 0.0021,
+      "grad_norm": 0.6664676666259766,
+      "learning_rate": 4.9895050000000006e-05,
+      "loss": 1.4728,
+      "step": 2100
+    },
+    {
+      "epoch": 0.00215,
+      "grad_norm": 0.7946372628211975,
+      "learning_rate": 4.989255000000001e-05,
+      "loss": 1.4832,
+      "step": 2150
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 0.9259161949157715,
+      "learning_rate": 4.989005e-05,
+      "loss": 1.5674,
+      "step": 2200
+    },
+    {
+      "epoch": 0.00225,
+      "grad_norm": 0.9703247547149658,
+      "learning_rate": 4.988755e-05,
+      "loss": 1.3973,
+      "step": 2250
+    },
+    {
+      "epoch": 0.0023,
+      "grad_norm": 0.7390087842941284,
+      "learning_rate": 4.988505e-05,
+      "loss": 1.3919,
+      "step": 2300
+    },
+    {
+      "epoch": 0.00235,
+      "grad_norm": 0.768277645111084,
+      "learning_rate": 4.988255e-05,
+      "loss": 1.4815,
+      "step": 2350
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.76382976770401,
+      "learning_rate": 4.988005e-05,
+      "loss": 1.4458,
+      "step": 2400
+    },
+    {
+      "epoch": 0.00245,
+      "grad_norm": 0.7780851721763611,
+      "learning_rate": 4.987755e-05,
+      "loss": 1.4382,
+      "step": 2450
+    },
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.7184464335441589,
+      "learning_rate": 4.987505e-05,
+      "loss": 1.521,
+      "step": 2500
+    },
+    {
+      "epoch": 0.00255,
+      "grad_norm": 0.6212390065193176,
+      "learning_rate": 4.9872550000000004e-05,
+      "loss": 1.3889,
+      "step": 2550
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 0.8539580702781677,
+      "learning_rate": 4.987005e-05,
+      "loss": 1.415,
+      "step": 2600
+    },
+    {
+      "epoch": 0.00265,
+      "grad_norm": 0.7129775881767273,
+      "learning_rate": 4.986755e-05,
+      "loss": 1.4102,
+      "step": 2650
+    },
+    {
+      "epoch": 0.0027,
+      "grad_norm": 0.5899195671081543,
+      "learning_rate": 4.986505e-05,
+      "loss": 1.3074,
+      "step": 2700
+    },
+    {
+      "epoch": 0.00275,
+      "grad_norm": 0.6940101981163025,
+      "learning_rate": 4.986255e-05,
+      "loss": 1.3748,
+      "step": 2750
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.6420891880989075,
+      "learning_rate": 4.9860050000000004e-05,
+      "loss": 1.4089,
+      "step": 2800
+    },
+    {
+      "epoch": 0.00285,
+      "grad_norm": 0.8561428189277649,
+      "learning_rate": 4.9857550000000005e-05,
+      "loss": 1.4052,
+      "step": 2850
+    },
+    {
+      "epoch": 0.0029,
+      "grad_norm": 0.6900970935821533,
+      "learning_rate": 4.9855050000000006e-05,
+      "loss": 1.4323,
+      "step": 2900
+    },
+    {
+      "epoch": 0.00295,
+      "grad_norm": 0.8071371912956238,
+      "learning_rate": 4.985255e-05,
+      "loss": 1.3485,
+      "step": 2950
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 0.6493249535560608,
+      "learning_rate": 4.985005e-05,
+      "loss": 1.4321,
+      "step": 3000
+    },
+    {
+      "epoch": 0.00305,
+      "grad_norm": 0.8712514042854309,
+      "learning_rate": 4.9847550000000004e-05,
+      "loss": 1.406,
+      "step": 3050
+    },
+    {
+      "epoch": 0.0031,
+      "grad_norm": 0.8464570045471191,
+      "learning_rate": 4.9845050000000005e-05,
+      "loss": 1.3322,
+      "step": 3100
+    },
+    {
+      "epoch": 0.00315,
+      "grad_norm": 0.7779001593589783,
+      "learning_rate": 4.9842550000000006e-05,
+      "loss": 1.3905,
+      "step": 3150
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8314748406410217,
+      "learning_rate": 4.984005000000001e-05,
+      "loss": 1.4134,
+      "step": 3200
+    },
+    {
+      "epoch": 0.00325,
+      "grad_norm": 0.7180835604667664,
+      "learning_rate": 4.983755e-05,
+      "loss": 1.3629,
+      "step": 3250
+    },
+    {
+      "epoch": 0.0033,
+      "grad_norm": 0.6165933012962341,
+      "learning_rate": 4.9835049999999996e-05,
+      "loss": 1.3833,
+      "step": 3300
+    },
+    {
+      "epoch": 0.00335,
+      "grad_norm": 1.8662049770355225,
+      "learning_rate": 4.983255e-05,
+      "loss": 1.3557,
+      "step": 3350
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 0.9091550707817078,
+      "learning_rate": 4.983005e-05,
+      "loss": 1.4092,
+      "step": 3400
+    },
+    {
+      "epoch": 0.00345,
+      "grad_norm": 0.9908722639083862,
+      "learning_rate": 4.982755e-05,
+      "loss": 1.3465,
+      "step": 3450
+    },
+    {
+      "epoch": 0.0035,
+      "grad_norm": 0.862427830696106,
+      "learning_rate": 4.982505e-05,
+      "loss": 1.3143,
+      "step": 3500
+    },
+    {
+      "epoch": 0.00355,
+      "grad_norm": 0.755211591720581,
+      "learning_rate": 4.982255e-05,
+      "loss": 1.358,
+      "step": 3550
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 0.7872670888900757,
+      "learning_rate": 4.9820050000000004e-05,
+      "loss": 1.3952,
+      "step": 3600
+    },
+    {
+      "epoch": 0.00365,
+      "grad_norm": 1.5124619007110596,
+      "learning_rate": 4.981755e-05,
+      "loss": 1.2929,
+      "step": 3650
+    },
+    {
+      "epoch": 0.0037,
+      "grad_norm": 0.7712079286575317,
+      "learning_rate": 4.981505e-05,
+      "loss": 1.3429,
+      "step": 3700
+    },
+    {
+      "epoch": 0.00375,
+      "grad_norm": 0.7001494765281677,
+      "learning_rate": 4.981255e-05,
+      "loss": 1.272,
+      "step": 3750
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 0.674104630947113,
+      "learning_rate": 4.981005e-05,
+      "loss": 1.3891,
+      "step": 3800
+    },
+    {
+      "epoch": 0.00385,
+      "grad_norm": 1.0494478940963745,
+      "learning_rate": 4.9807550000000004e-05,
+      "loss": 1.333,
+      "step": 3850
+    },
+    {
+      "epoch": 0.0039,
+      "grad_norm": 0.6674365401268005,
+      "learning_rate": 4.9805050000000005e-05,
+      "loss": 1.4418,
+      "step": 3900
+    },
+    {
+      "epoch": 0.00395,
+      "grad_norm": 0.7624682784080505,
+      "learning_rate": 4.9802550000000007e-05,
+      "loss": 1.2837,
+      "step": 3950
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 1.5128511190414429,
+      "learning_rate": 4.980005e-05,
+      "loss": 1.2609,
+      "step": 4000
+    },
+    {
+      "epoch": 0.00405,
+      "grad_norm": 0.5422250628471375,
+      "learning_rate": 4.979755e-05,
+      "loss": 1.3253,
+      "step": 4050
+    },
+    {
+      "epoch": 0.0041,
+      "grad_norm": 0.63419508934021,
+      "learning_rate": 4.9795050000000004e-05,
+      "loss": 1.3369,
+      "step": 4100
+    },
+    {
+      "epoch": 0.00415,
+      "grad_norm": 0.7608025670051575,
+      "learning_rate": 4.9792550000000005e-05,
+      "loss": 1.3186,
+      "step": 4150
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 0.6282067894935608,
+      "learning_rate": 4.9790050000000006e-05,
+      "loss": 1.3755,
+      "step": 4200
+    },
+    {
+      "epoch": 0.00425,
+      "grad_norm": 0.7882742881774902,
+      "learning_rate": 4.978755000000001e-05,
+      "loss": 1.3326,
+      "step": 4250
+    },
+    {
+      "epoch": 0.0043,
+      "grad_norm": 0.6797521710395813,
+      "learning_rate": 4.978505e-05,
+      "loss": 1.2774,
+      "step": 4300
+    },
+    {
+      "epoch": 0.00435,
+      "grad_norm": 0.6129476428031921,
+      "learning_rate": 4.978255e-05,
+      "loss": 1.324,
+      "step": 4350
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.7544896006584167,
+      "learning_rate": 4.978005e-05,
+      "loss": 1.299,
+      "step": 4400
+    },
+    {
+      "epoch": 0.00445,
+      "grad_norm": 0.6218116283416748,
+      "learning_rate": 4.977755e-05,
+      "loss": 1.3084,
+      "step": 4450
+    },
+    {
+      "epoch": 0.0045,
+      "grad_norm": 0.5591554045677185,
+      "learning_rate": 4.977505e-05,
+      "loss": 1.3001,
+      "step": 4500
+    },
+    {
+      "epoch": 0.00455,
+      "grad_norm": 0.854743242263794,
+      "learning_rate": 4.977255e-05,
+      "loss": 1.3357,
+      "step": 4550
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 0.7265657186508179,
+      "learning_rate": 4.977005e-05,
+      "loss": 1.3362,
+      "step": 4600
+    },
+    {
+      "epoch": 0.00465,
+      "grad_norm": 0.8089432120323181,
+      "learning_rate": 4.9767550000000004e-05,
+      "loss": 1.3087,
+      "step": 4650
+    },
+    {
+      "epoch": 0.0047,
+      "grad_norm": 0.8999931216239929,
+      "learning_rate": 4.976505e-05,
+      "loss": 1.3894,
+      "step": 4700
+    },
+    {
+      "epoch": 0.00475,
+      "grad_norm": 0.7705693244934082,
+      "learning_rate": 4.976255e-05,
+      "loss": 1.2585,
+      "step": 4750
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7201912999153137,
+      "learning_rate": 4.976005e-05,
+      "loss": 1.221,
+      "step": 4800
+    },
+    {
+      "epoch": 0.00485,
+      "grad_norm": 0.6199264526367188,
+      "learning_rate": 4.975755e-05,
+      "loss": 1.2878,
+      "step": 4850
+    },
+    {
+      "epoch": 0.0049,
+      "grad_norm": 0.6883070468902588,
+      "learning_rate": 4.9755050000000004e-05,
+      "loss": 1.3032,
+      "step": 4900
+    },
+    {
+      "epoch": 0.00495,
+      "grad_norm": 0.524889349937439,
+      "learning_rate": 4.9752550000000005e-05,
+      "loss": 1.2706,
+      "step": 4950
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.7067034244537354,
+      "learning_rate": 4.9750050000000007e-05,
+      "loss": 1.2424,
+      "step": 5000
+    },
+    {
+      "epoch": 0.00505,
+      "grad_norm": 0.7097471356391907,
+      "learning_rate": 4.974755e-05,
+      "loss": 1.3175,
+      "step": 5050
+    },
+    {
+      "epoch": 0.0051,
+      "grad_norm": 0.6991677284240723,
+      "learning_rate": 4.974505e-05,
+      "loss": 1.2983,
+      "step": 5100
+    },
+    {
+      "epoch": 0.00515,
+      "grad_norm": 1.4025177955627441,
+      "learning_rate": 4.9742550000000004e-05,
+      "loss": 1.2695,
+      "step": 5150
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 0.6075018048286438,
+      "learning_rate": 4.9740050000000005e-05,
+      "loss": 1.2973,
+      "step": 5200
+    },
+    {
+      "epoch": 0.00525,
+      "grad_norm": 0.5344257950782776,
+      "learning_rate": 4.9737550000000006e-05,
+      "loss": 1.2353,
+      "step": 5250
+    },
+    {
+      "epoch": 0.0053,
+      "grad_norm": 0.5963959693908691,
+      "learning_rate": 4.973505e-05,
+      "loss": 1.2359,
+      "step": 5300
+    },
+    {
+      "epoch": 0.00535,
+      "grad_norm": 0.9493517279624939,
+      "learning_rate": 4.973255e-05,
+      "loss": 1.252,
+      "step": 5350
+    },
+    {
+      "epoch": 0.0054,
+      "grad_norm": 0.6466087102890015,
+      "learning_rate": 4.9730050000000003e-05,
+      "loss": 1.2012,
+      "step": 5400
+    },
+    {
+      "epoch": 0.00545,
+      "grad_norm": 0.6407445669174194,
+      "learning_rate": 4.972755e-05,
+      "loss": 1.2615,
+      "step": 5450
+    },
+    {
+      "epoch": 0.0055,
+      "grad_norm": 0.617421567440033,
+      "learning_rate": 4.972505e-05,
+      "loss": 1.2718,
+      "step": 5500
+    },
+    {
+      "epoch": 0.00555,
+      "grad_norm": 0.5649735331535339,
+      "learning_rate": 4.972255e-05,
+      "loss": 1.2439,
+      "step": 5550
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.675645649433136,
+      "learning_rate": 4.972005e-05,
+      "loss": 1.2847,
+      "step": 5600
+    },
+    {
+      "epoch": 0.00565,
+      "grad_norm": 0.6887643337249756,
+      "learning_rate": 4.971755e-05,
+      "loss": 1.2464,
+      "step": 5650
+    },
+    {
+      "epoch": 0.0057,
+      "grad_norm": 0.6572535037994385,
+      "learning_rate": 4.9715050000000004e-05,
+      "loss": 1.2517,
+      "step": 5700
+    },
+    {
+      "epoch": 0.00575,
+      "grad_norm": 0.6076003313064575,
+      "learning_rate": 4.971255e-05,
+      "loss": 1.2183,
+      "step": 5750
+    },
+    {
+      "epoch": 0.0058,
+      "grad_norm": 0.6134760975837708,
+      "learning_rate": 4.971005e-05,
+      "loss": 1.2303,
+      "step": 5800
+    },
+    {
+      "epoch": 0.00585,
+      "grad_norm": 0.6791936159133911,
+      "learning_rate": 4.970755e-05,
+      "loss": 1.2351,
+      "step": 5850
+    },
+    {
+      "epoch": 0.0059,
+      "grad_norm": 0.6307588219642639,
+      "learning_rate": 4.970505e-05,
+      "loss": 1.254,
+      "step": 5900
+    },
+    {
+      "epoch": 0.00595,
+      "grad_norm": 1.1272459030151367,
+      "learning_rate": 4.9702550000000004e-05,
+      "loss": 1.265,
+      "step": 5950
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 0.5415930151939392,
+      "learning_rate": 4.9700050000000005e-05,
+      "loss": 1.2068,
+      "step": 6000
+    },
+    {
+      "epoch": 0.00605,
+      "grad_norm": 0.5450507402420044,
+      "learning_rate": 4.969755000000001e-05,
+      "loss": 1.2408,
+      "step": 6050
+    },
+    {
+      "epoch": 0.0061,
+      "grad_norm": 0.6528737545013428,
+      "learning_rate": 4.969505e-05,
+      "loss": 1.2035,
+      "step": 6100
+    },
+    {
+      "epoch": 0.00615,
+      "grad_norm": 0.6139540672302246,
+      "learning_rate": 4.969255e-05,
+      "loss": 1.2077,
+      "step": 6150
+    },
+    {
+      "epoch": 0.0062,
+      "grad_norm": 0.6503865718841553,
+      "learning_rate": 4.9690050000000004e-05,
+      "loss": 1.2055,
+      "step": 6200
+    },
+    {
+      "epoch": 0.00625,
+      "grad_norm": 0.6888736486434937,
+      "learning_rate": 4.9687550000000005e-05,
+      "loss": 1.2161,
+      "step": 6250
+    },
+    {
+      "epoch": 0.0063,
+      "grad_norm": 1.1457703113555908,
+      "learning_rate": 4.968505e-05,
+      "loss": 1.1939,
+      "step": 6300
+    },
+    {
+      "epoch": 0.00635,
+      "grad_norm": 0.5861665606498718,
+      "learning_rate": 4.968255e-05,
+      "loss": 1.2177,
+      "step": 6350
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8666422963142395,
+      "learning_rate": 4.968005e-05,
+      "loss": 1.1945,
+      "step": 6400
+    },
+    {
+      "epoch": 0.00645,
+      "grad_norm": 0.6094856858253479,
+      "learning_rate": 4.9677550000000003e-05,
+      "loss": 1.2394,
+      "step": 6450
+    },
+    {
+      "epoch": 0.0065,
+      "grad_norm": 0.6510404348373413,
+      "learning_rate": 4.967505e-05,
+      "loss": 1.1764,
+      "step": 6500
+    },
+    {
+      "epoch": 0.00655,
+      "grad_norm": 0.7044185996055603,
+      "learning_rate": 4.967255e-05,
+      "loss": 1.1951,
+      "step": 6550
+    },
+    {
+      "epoch": 0.0066,
+      "grad_norm": 0.6577385663986206,
+      "learning_rate": 4.967005e-05,
+      "loss": 1.2186,
+      "step": 6600
+    },
+    {
+      "epoch": 0.00665,
+      "grad_norm": 0.6468052268028259,
+      "learning_rate": 4.966755e-05,
+      "loss": 1.2491,
+      "step": 6650
+    },
+    {
+      "epoch": 0.0067,
+      "grad_norm": 0.6153103709220886,
+      "learning_rate": 4.966505e-05,
+      "loss": 1.25,
+      "step": 6700
+    },
+    {
+      "epoch": 0.00675,
+      "grad_norm": 0.6066887974739075,
+      "learning_rate": 4.9662550000000004e-05,
+      "loss": 1.1852,
+      "step": 6750
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 0.5735403895378113,
+      "learning_rate": 4.9660050000000006e-05,
+      "loss": 1.2139,
+      "step": 6800
+    },
+    {
+      "epoch": 0.00685,
+      "grad_norm": 0.6247462630271912,
+      "learning_rate": 4.965755e-05,
+      "loss": 1.1597,
+      "step": 6850
+    },
+    {
+      "epoch": 0.0069,
+      "grad_norm": 0.6690845489501953,
+      "learning_rate": 4.965505e-05,
+      "loss": 1.2095,
+      "step": 6900
+    },
+    {
+      "epoch": 0.00695,
+      "grad_norm": 0.5683197975158691,
+      "learning_rate": 4.965255e-05,
+      "loss": 1.3077,
+      "step": 6950
+    },
+    {
+      "epoch": 0.007,
+      "grad_norm": 0.6314153671264648,
+      "learning_rate": 4.9650050000000004e-05,
+      "loss": 1.2556,
+      "step": 7000
+    },
+    {
+      "epoch": 0.00705,
+      "grad_norm": 0.641153872013092,
+      "learning_rate": 4.9647550000000005e-05,
+      "loss": 1.1643,
+      "step": 7050
+    },
+    {
+      "epoch": 0.0071,
+      "grad_norm": 0.6113337278366089,
+      "learning_rate": 4.964505000000001e-05,
+      "loss": 1.1688,
+      "step": 7100
+    },
+    {
+      "epoch": 0.00715,
+      "grad_norm": 0.5102031230926514,
+      "learning_rate": 4.964255e-05,
+      "loss": 1.1835,
+      "step": 7150
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.6926486492156982,
+      "learning_rate": 4.964005e-05,
+      "loss": 1.1704,
+      "step": 7200
+    },
+    {
+      "epoch": 0.00725,
+      "grad_norm": 0.594326376914978,
+      "learning_rate": 4.9637550000000004e-05,
+      "loss": 1.1434,
+      "step": 7250
+    },
+    {
+      "epoch": 0.0073,
+      "grad_norm": 0.5609344840049744,
+      "learning_rate": 4.963505e-05,
+      "loss": 1.2389,
+      "step": 7300
+    },
+    {
+      "epoch": 0.00735,
+      "grad_norm": 0.6464629173278809,
+      "learning_rate": 4.963255e-05,
+      "loss": 1.2097,
+      "step": 7350
+    },
+    {
+      "epoch": 0.0074,
+      "grad_norm": 0.5782963633537292,
+      "learning_rate": 4.963005e-05,
+      "loss": 1.2607,
+      "step": 7400
+    },
+    {
+      "epoch": 0.00745,
+      "grad_norm": 0.7301307320594788,
+      "learning_rate": 4.962755e-05,
+      "loss": 1.1759,
+      "step": 7450
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.5845964550971985,
+      "learning_rate": 4.9625050000000004e-05,
+      "loss": 1.1806,
+      "step": 7500
+    },
+    {
+      "epoch": 0.00755,
+      "grad_norm": 0.618295431137085,
+      "learning_rate": 4.962255e-05,
+      "loss": 1.1339,
+      "step": 7550
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 0.5799853801727295,
+      "learning_rate": 4.962005e-05,
+      "loss": 1.1641,
+      "step": 7600
+    },
+    {
+      "epoch": 0.00765,
+      "grad_norm": 0.5227323174476624,
+      "learning_rate": 4.961755e-05,
+      "loss": 1.2375,
+      "step": 7650
+    },
+    {
+      "epoch": 0.0077,
+      "grad_norm": 0.699111819267273,
+      "learning_rate": 4.961505e-05,
+      "loss": 1.0943,
+      "step": 7700
+    },
+    {
+      "epoch": 0.00775,
+      "grad_norm": 0.5230436325073242,
+      "learning_rate": 4.961255e-05,
+      "loss": 1.1762,
+      "step": 7750
+    },
+    {
+      "epoch": 0.0078,
+      "grad_norm": 0.5776082873344421,
+      "learning_rate": 4.9610050000000005e-05,
+      "loss": 1.18,
+      "step": 7800
+    },
+    {
+      "epoch": 0.00785,
+      "grad_norm": 0.584697425365448,
+      "learning_rate": 4.9607550000000006e-05,
+      "loss": 1.2878,
+      "step": 7850
+    },
+    {
+      "epoch": 0.0079,
+      "grad_norm": 0.66282057762146,
+      "learning_rate": 4.960505e-05,
+      "loss": 1.219,
+      "step": 7900
+    },
+    {
+      "epoch": 0.00795,
+      "grad_norm": 1.2405346632003784,
+      "learning_rate": 4.960255e-05,
+      "loss": 1.0984,
+      "step": 7950
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6882185339927673,
+      "learning_rate": 4.960005e-05,
+      "loss": 1.2291,
+      "step": 8000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 1000000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.49121398734848e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-8000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f1f06799524e8433f4ea5813803d7bd7b4539e7b44d7846bd777c6941f1e37c
+size 5841

checkpoint-9000/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "YasinForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "head_dim": 48,
+  "hidden_act": "silu",
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 1024,
+  "mlp_bias": false,
+  "model_type": "yasin",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 16,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.1",
+  "use_cache": false,
+  "vocab_size": 50257
+}

checkpoint-9000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.57.1",
+  "use_cache": false
+}

checkpoint-9000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f25d4fd6edbfc4cf8153ac40dafe3105e19751bce39ed2b4dd71ee5ca8f07409
+size 912876512

checkpoint-9000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ccf591c09ac9a236d05591e4c5adedeed47e23cba5dec29440a520451a48bb5
+size 1825846859

checkpoint-9000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
+size 14645

checkpoint-9000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b0ffe1c5339266a7640f64d74cbe7720984c034c061903c7cafd92501da2b55
+size 1383

checkpoint-9000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eba484843bfc8f44ba71a7c82b9990f51769097f58eba94a55a38f79aa56cc8a
+size 1465

checkpoint-9000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1294 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.009,
+  "eval_steps": 500,
+  "global_step": 9000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 5e-05,
+      "grad_norm": 4.426811695098877,
+      "learning_rate": 4.999755000000001e-05,
+      "loss": 6.3755,
+      "step": 50
+    },
+    {
+      "epoch": 0.0001,
+      "grad_norm": 1.7730491161346436,
+      "learning_rate": 4.999505e-05,
+      "loss": 3.3233,
+      "step": 100
+    },
+    {
+      "epoch": 0.00015,
+      "grad_norm": 1.4473446607589722,
+      "learning_rate": 4.999255e-05,
+      "loss": 2.7523,
+      "step": 150
+    },
+    {
+      "epoch": 0.0002,
+      "grad_norm": 1.4645352363586426,
+      "learning_rate": 4.9990050000000004e-05,
+      "loss": 2.4618,
+      "step": 200
+    },
+    {
+      "epoch": 0.00025,
+      "grad_norm": 1.1990822553634644,
+      "learning_rate": 4.998755e-05,
+      "loss": 2.3417,
+      "step": 250
+    },
+    {
+      "epoch": 0.0003,
+      "grad_norm": 3.0459673404693604,
+      "learning_rate": 4.998505e-05,
+      "loss": 2.2581,
+      "step": 300
+    },
+    {
+      "epoch": 0.00035,
+      "grad_norm": 1.2380167245864868,
+      "learning_rate": 4.998255e-05,
+      "loss": 2.1563,
+      "step": 350
+    },
+    {
+      "epoch": 0.0004,
+      "grad_norm": 1.2060939073562622,
+      "learning_rate": 4.998005e-05,
+      "loss": 1.8521,
+      "step": 400
+    },
+    {
+      "epoch": 0.00045,
+      "grad_norm": 0.9207198619842529,
+      "learning_rate": 4.9977550000000004e-05,
+      "loss": 1.9201,
+      "step": 450
+    },
+    {
+      "epoch": 0.0005,
+      "grad_norm": 1.0762739181518555,
+      "learning_rate": 4.997505e-05,
+      "loss": 1.8104,
+      "step": 500
+    },
+    {
+      "epoch": 0.00055,
+      "grad_norm": 1.0367337465286255,
+      "learning_rate": 4.997255e-05,
+      "loss": 1.8756,
+      "step": 550
+    },
+    {
+      "epoch": 0.0006,
+      "grad_norm": 0.9291325211524963,
+      "learning_rate": 4.997005e-05,
+      "loss": 1.7404,
+      "step": 600
+    },
+    {
+      "epoch": 0.00065,
+      "grad_norm": 0.7452297210693359,
+      "learning_rate": 4.996755e-05,
+      "loss": 1.6843,
+      "step": 650
+    },
+    {
+      "epoch": 0.0007,
+      "grad_norm": 0.9317042231559753,
+      "learning_rate": 4.9965050000000004e-05,
+      "loss": 1.7006,
+      "step": 700
+    },
+    {
+      "epoch": 0.00075,
+      "grad_norm": 1.0354136228561401,
+      "learning_rate": 4.9962550000000005e-05,
+      "loss": 1.5652,
+      "step": 750
+    },
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.050106406211853,
+      "learning_rate": 4.996005e-05,
+      "loss": 1.5966,
+      "step": 800
+    },
+    {
+      "epoch": 0.00085,
+      "grad_norm": 0.9646019339561462,
+      "learning_rate": 4.995755e-05,
+      "loss": 1.6297,
+      "step": 850
+    },
+    {
+      "epoch": 0.0009,
+      "grad_norm": 0.9152287840843201,
+      "learning_rate": 4.995505e-05,
+      "loss": 1.6328,
+      "step": 900
+    },
+    {
+      "epoch": 0.00095,
+      "grad_norm": 0.9403690099716187,
+      "learning_rate": 4.995255e-05,
+      "loss": 1.6056,
+      "step": 950
+    },
+    {
+      "epoch": 0.001,
+      "grad_norm": 1.1822874546051025,
+      "learning_rate": 4.9950050000000005e-05,
+      "loss": 1.6016,
+      "step": 1000
+    },
+    {
+      "epoch": 0.00105,
+      "grad_norm": 1.3227542638778687,
+      "learning_rate": 4.9947550000000006e-05,
+      "loss": 1.5448,
+      "step": 1050
+    },
+    {
+      "epoch": 0.0011,
+      "grad_norm": 1.0503350496292114,
+      "learning_rate": 4.994505000000001e-05,
+      "loss": 1.6355,
+      "step": 1100
+    },
+    {
+      "epoch": 0.00115,
+      "grad_norm": 1.1647204160690308,
+      "learning_rate": 4.994255e-05,
+      "loss": 1.4673,
+      "step": 1150
+    },
+    {
+      "epoch": 0.0012,
+      "grad_norm": 0.7281339168548584,
+      "learning_rate": 4.994005e-05,
+      "loss": 1.4641,
+      "step": 1200
+    },
+    {
+      "epoch": 0.00125,
+      "grad_norm": 1.2438446283340454,
+      "learning_rate": 4.9937550000000004e-05,
+      "loss": 1.5177,
+      "step": 1250
+    },
+    {
+      "epoch": 0.0013,
+      "grad_norm": 0.7967873811721802,
+      "learning_rate": 4.993505e-05,
+      "loss": 1.4399,
+      "step": 1300
+    },
+    {
+      "epoch": 0.00135,
+      "grad_norm": 0.8938255310058594,
+      "learning_rate": 4.993255e-05,
+      "loss": 1.4453,
+      "step": 1350
+    },
+    {
+      "epoch": 0.0014,
+      "grad_norm": 1.4659217596054077,
+      "learning_rate": 4.993005e-05,
+      "loss": 1.6517,
+      "step": 1400
+    },
+    {
+      "epoch": 0.00145,
+      "grad_norm": 0.7856793403625488,
+      "learning_rate": 4.992755e-05,
+      "loss": 1.4721,
+      "step": 1450
+    },
+    {
+      "epoch": 0.0015,
+      "grad_norm": 0.6772142648696899,
+      "learning_rate": 4.9925050000000004e-05,
+      "loss": 1.5314,
+      "step": 1500
+    },
+    {
+      "epoch": 0.00155,
+      "grad_norm": 0.7613831758499146,
+      "learning_rate": 4.992255e-05,
+      "loss": 1.4603,
+      "step": 1550
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.1216529607772827,
+      "learning_rate": 4.992005e-05,
+      "loss": 1.54,
+      "step": 1600
+    },
+    {
+      "epoch": 0.00165,
+      "grad_norm": 0.7359323501586914,
+      "learning_rate": 4.991755e-05,
+      "loss": 1.5121,
+      "step": 1650
+    },
+    {
+      "epoch": 0.0017,
+      "grad_norm": 0.8343626260757446,
+      "learning_rate": 4.991505e-05,
+      "loss": 1.6107,
+      "step": 1700
+    },
+    {
+      "epoch": 0.00175,
+      "grad_norm": 0.9381140470504761,
+      "learning_rate": 4.9912550000000004e-05,
+      "loss": 1.5059,
+      "step": 1750
+    },
+    {
+      "epoch": 0.0018,
+      "grad_norm": 0.7317638993263245,
+      "learning_rate": 4.9910050000000005e-05,
+      "loss": 1.5621,
+      "step": 1800
+    },
+    {
+      "epoch": 0.00185,
+      "grad_norm": 0.8273525834083557,
+      "learning_rate": 4.990755e-05,
+      "loss": 1.4737,
+      "step": 1850
+    },
+    {
+      "epoch": 0.0019,
+      "grad_norm": 0.7206686735153198,
+      "learning_rate": 4.990505e-05,
+      "loss": 1.5242,
+      "step": 1900
+    },
+    {
+      "epoch": 0.00195,
+      "grad_norm": 0.9094993472099304,
+      "learning_rate": 4.990255e-05,
+      "loss": 1.5336,
+      "step": 1950
+    },
+    {
+      "epoch": 0.002,
+      "grad_norm": 0.7432169914245605,
+      "learning_rate": 4.9900050000000003e-05,
+      "loss": 1.4913,
+      "step": 2000
+    },
+    {
+      "epoch": 0.00205,
+      "grad_norm": 1.337276816368103,
+      "learning_rate": 4.9897550000000005e-05,
+      "loss": 1.5643,
+      "step": 2050
+    },
+    {
+      "epoch": 0.0021,
+      "grad_norm": 0.6664676666259766,
+      "learning_rate": 4.9895050000000006e-05,
+      "loss": 1.4728,
+      "step": 2100
+    },
+    {
+      "epoch": 0.00215,
+      "grad_norm": 0.7946372628211975,
+      "learning_rate": 4.989255000000001e-05,
+      "loss": 1.4832,
+      "step": 2150
+    },
+    {
+      "epoch": 0.0022,
+      "grad_norm": 0.9259161949157715,
+      "learning_rate": 4.989005e-05,
+      "loss": 1.5674,
+      "step": 2200
+    },
+    {
+      "epoch": 0.00225,
+      "grad_norm": 0.9703247547149658,
+      "learning_rate": 4.988755e-05,
+      "loss": 1.3973,
+      "step": 2250
+    },
+    {
+      "epoch": 0.0023,
+      "grad_norm": 0.7390087842941284,
+      "learning_rate": 4.988505e-05,
+      "loss": 1.3919,
+      "step": 2300
+    },
+    {
+      "epoch": 0.00235,
+      "grad_norm": 0.768277645111084,
+      "learning_rate": 4.988255e-05,
+      "loss": 1.4815,
+      "step": 2350
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.76382976770401,
+      "learning_rate": 4.988005e-05,
+      "loss": 1.4458,
+      "step": 2400
+    },
+    {
+      "epoch": 0.00245,
+      "grad_norm": 0.7780851721763611,
+      "learning_rate": 4.987755e-05,
+      "loss": 1.4382,
+      "step": 2450
+    },
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.7184464335441589,
+      "learning_rate": 4.987505e-05,
+      "loss": 1.521,
+      "step": 2500
+    },
+    {
+      "epoch": 0.00255,
+      "grad_norm": 0.6212390065193176,
+      "learning_rate": 4.9872550000000004e-05,
+      "loss": 1.3889,
+      "step": 2550
+    },
+    {
+      "epoch": 0.0026,
+      "grad_norm": 0.8539580702781677,
+      "learning_rate": 4.987005e-05,
+      "loss": 1.415,
+      "step": 2600
+    },
+    {
+      "epoch": 0.00265,
+      "grad_norm": 0.7129775881767273,
+      "learning_rate": 4.986755e-05,
+      "loss": 1.4102,
+      "step": 2650
+    },
+    {
+      "epoch": 0.0027,
+      "grad_norm": 0.5899195671081543,
+      "learning_rate": 4.986505e-05,
+      "loss": 1.3074,
+      "step": 2700
+    },
+    {
+      "epoch": 0.00275,
+      "grad_norm": 0.6940101981163025,
+      "learning_rate": 4.986255e-05,
+      "loss": 1.3748,
+      "step": 2750
+    },
+    {
+      "epoch": 0.0028,
+      "grad_norm": 0.6420891880989075,
+      "learning_rate": 4.9860050000000004e-05,
+      "loss": 1.4089,
+      "step": 2800
+    },
+    {
+      "epoch": 0.00285,
+      "grad_norm": 0.8561428189277649,
+      "learning_rate": 4.9857550000000005e-05,
+      "loss": 1.4052,
+      "step": 2850
+    },
+    {
+      "epoch": 0.0029,
+      "grad_norm": 0.6900970935821533,
+      "learning_rate": 4.9855050000000006e-05,
+      "loss": 1.4323,
+      "step": 2900
+    },
+    {
+      "epoch": 0.00295,
+      "grad_norm": 0.8071371912956238,
+      "learning_rate": 4.985255e-05,
+      "loss": 1.3485,
+      "step": 2950
+    },
+    {
+      "epoch": 0.003,
+      "grad_norm": 0.6493249535560608,
+      "learning_rate": 4.985005e-05,
+      "loss": 1.4321,
+      "step": 3000
+    },
+    {
+      "epoch": 0.00305,
+      "grad_norm": 0.8712514042854309,
+      "learning_rate": 4.9847550000000004e-05,
+      "loss": 1.406,
+      "step": 3050
+    },
+    {
+      "epoch": 0.0031,
+      "grad_norm": 0.8464570045471191,
+      "learning_rate": 4.9845050000000005e-05,
+      "loss": 1.3322,
+      "step": 3100
+    },
+    {
+      "epoch": 0.00315,
+      "grad_norm": 0.7779001593589783,
+      "learning_rate": 4.9842550000000006e-05,
+      "loss": 1.3905,
+      "step": 3150
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.8314748406410217,
+      "learning_rate": 4.984005000000001e-05,
+      "loss": 1.4134,
+      "step": 3200
+    },
+    {
+      "epoch": 0.00325,
+      "grad_norm": 0.7180835604667664,
+      "learning_rate": 4.983755e-05,
+      "loss": 1.3629,
+      "step": 3250
+    },
+    {
+      "epoch": 0.0033,
+      "grad_norm": 0.6165933012962341,
+      "learning_rate": 4.9835049999999996e-05,
+      "loss": 1.3833,
+      "step": 3300
+    },
+    {
+      "epoch": 0.00335,
+      "grad_norm": 1.8662049770355225,
+      "learning_rate": 4.983255e-05,
+      "loss": 1.3557,
+      "step": 3350
+    },
+    {
+      "epoch": 0.0034,
+      "grad_norm": 0.9091550707817078,
+      "learning_rate": 4.983005e-05,
+      "loss": 1.4092,
+      "step": 3400
+    },
+    {
+      "epoch": 0.00345,
+      "grad_norm": 0.9908722639083862,
+      "learning_rate": 4.982755e-05,
+      "loss": 1.3465,
+      "step": 3450
+    },
+    {
+      "epoch": 0.0035,
+      "grad_norm": 0.862427830696106,
+      "learning_rate": 4.982505e-05,
+      "loss": 1.3143,
+      "step": 3500
+    },
+    {
+      "epoch": 0.00355,
+      "grad_norm": 0.755211591720581,
+      "learning_rate": 4.982255e-05,
+      "loss": 1.358,
+      "step": 3550
+    },
+    {
+      "epoch": 0.0036,
+      "grad_norm": 0.7872670888900757,
+      "learning_rate": 4.9820050000000004e-05,
+      "loss": 1.3952,
+      "step": 3600
+    },
+    {
+      "epoch": 0.00365,
+      "grad_norm": 1.5124619007110596,
+      "learning_rate": 4.981755e-05,
+      "loss": 1.2929,
+      "step": 3650
+    },
+    {
+      "epoch": 0.0037,
+      "grad_norm": 0.7712079286575317,
+      "learning_rate": 4.981505e-05,
+      "loss": 1.3429,
+      "step": 3700
+    },
+    {
+      "epoch": 0.00375,
+      "grad_norm": 0.7001494765281677,
+      "learning_rate": 4.981255e-05,
+      "loss": 1.272,
+      "step": 3750
+    },
+    {
+      "epoch": 0.0038,
+      "grad_norm": 0.674104630947113,
+      "learning_rate": 4.981005e-05,
+      "loss": 1.3891,
+      "step": 3800
+    },
+    {
+      "epoch": 0.00385,
+      "grad_norm": 1.0494478940963745,
+      "learning_rate": 4.9807550000000004e-05,
+      "loss": 1.333,
+      "step": 3850
+    },
+    {
+      "epoch": 0.0039,
+      "grad_norm": 0.6674365401268005,
+      "learning_rate": 4.9805050000000005e-05,
+      "loss": 1.4418,
+      "step": 3900
+    },
+    {
+      "epoch": 0.00395,
+      "grad_norm": 0.7624682784080505,
+      "learning_rate": 4.9802550000000007e-05,
+      "loss": 1.2837,
+      "step": 3950
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 1.5128511190414429,
+      "learning_rate": 4.980005e-05,
+      "loss": 1.2609,
+      "step": 4000
+    },
+    {
+      "epoch": 0.00405,
+      "grad_norm": 0.5422250628471375,
+      "learning_rate": 4.979755e-05,
+      "loss": 1.3253,
+      "step": 4050
+    },
+    {
+      "epoch": 0.0041,
+      "grad_norm": 0.63419508934021,
+      "learning_rate": 4.9795050000000004e-05,
+      "loss": 1.3369,
+      "step": 4100
+    },
+    {
+      "epoch": 0.00415,
+      "grad_norm": 0.7608025670051575,
+      "learning_rate": 4.9792550000000005e-05,
+      "loss": 1.3186,
+      "step": 4150
+    },
+    {
+      "epoch": 0.0042,
+      "grad_norm": 0.6282067894935608,
+      "learning_rate": 4.9790050000000006e-05,
+      "loss": 1.3755,
+      "step": 4200
+    },
+    {
+      "epoch": 0.00425,
+      "grad_norm": 0.7882742881774902,
+      "learning_rate": 4.978755000000001e-05,
+      "loss": 1.3326,
+      "step": 4250
+    },
+    {
+      "epoch": 0.0043,
+      "grad_norm": 0.6797521710395813,
+      "learning_rate": 4.978505e-05,
+      "loss": 1.2774,
+      "step": 4300
+    },
+    {
+      "epoch": 0.00435,
+      "grad_norm": 0.6129476428031921,
+      "learning_rate": 4.978255e-05,
+      "loss": 1.324,
+      "step": 4350
+    },
+    {
+      "epoch": 0.0044,
+      "grad_norm": 0.7544896006584167,
+      "learning_rate": 4.978005e-05,
+      "loss": 1.299,
+      "step": 4400
+    },
+    {
+      "epoch": 0.00445,
+      "grad_norm": 0.6218116283416748,
+      "learning_rate": 4.977755e-05,
+      "loss": 1.3084,
+      "step": 4450
+    },
+    {
+      "epoch": 0.0045,
+      "grad_norm": 0.5591554045677185,
+      "learning_rate": 4.977505e-05,
+      "loss": 1.3001,
+      "step": 4500
+    },
+    {
+      "epoch": 0.00455,
+      "grad_norm": 0.854743242263794,
+      "learning_rate": 4.977255e-05,
+      "loss": 1.3357,
+      "step": 4550
+    },
+    {
+      "epoch": 0.0046,
+      "grad_norm": 0.7265657186508179,
+      "learning_rate": 4.977005e-05,
+      "loss": 1.3362,
+      "step": 4600
+    },
+    {
+      "epoch": 0.00465,
+      "grad_norm": 0.8089432120323181,
+      "learning_rate": 4.9767550000000004e-05,
+      "loss": 1.3087,
+      "step": 4650
+    },
+    {
+      "epoch": 0.0047,
+      "grad_norm": 0.8999931216239929,
+      "learning_rate": 4.976505e-05,
+      "loss": 1.3894,
+      "step": 4700
+    },
+    {
+      "epoch": 0.00475,
+      "grad_norm": 0.7705693244934082,
+      "learning_rate": 4.976255e-05,
+      "loss": 1.2585,
+      "step": 4750
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.7201912999153137,
+      "learning_rate": 4.976005e-05,
+      "loss": 1.221,
+      "step": 4800
+    },
+    {
+      "epoch": 0.00485,
+      "grad_norm": 0.6199264526367188,
+      "learning_rate": 4.975755e-05,
+      "loss": 1.2878,
+      "step": 4850
+    },
+    {
+      "epoch": 0.0049,
+      "grad_norm": 0.6883070468902588,
+      "learning_rate": 4.9755050000000004e-05,
+      "loss": 1.3032,
+      "step": 4900
+    },
+    {
+      "epoch": 0.00495,
+      "grad_norm": 0.524889349937439,
+      "learning_rate": 4.9752550000000005e-05,
+      "loss": 1.2706,
+      "step": 4950
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.7067034244537354,
+      "learning_rate": 4.9750050000000007e-05,
+      "loss": 1.2424,
+      "step": 5000
+    },
+    {
+      "epoch": 0.00505,
+      "grad_norm": 0.7097471356391907,
+      "learning_rate": 4.974755e-05,
+      "loss": 1.3175,
+      "step": 5050
+    },
+    {
+      "epoch": 0.0051,
+      "grad_norm": 0.6991677284240723,
+      "learning_rate": 4.974505e-05,
+      "loss": 1.2983,
+      "step": 5100
+    },
+    {
+      "epoch": 0.00515,
+      "grad_norm": 1.4025177955627441,
+      "learning_rate": 4.9742550000000004e-05,
+      "loss": 1.2695,
+      "step": 5150
+    },
+    {
+      "epoch": 0.0052,
+      "grad_norm": 0.6075018048286438,
+      "learning_rate": 4.9740050000000005e-05,
+      "loss": 1.2973,
+      "step": 5200
+    },
+    {
+      "epoch": 0.00525,
+      "grad_norm": 0.5344257950782776,
+      "learning_rate": 4.9737550000000006e-05,
+      "loss": 1.2353,
+      "step": 5250
+    },
+    {
+      "epoch": 0.0053,
+      "grad_norm": 0.5963959693908691,
+      "learning_rate": 4.973505e-05,
+      "loss": 1.2359,
+      "step": 5300
+    },
+    {
+      "epoch": 0.00535,
+      "grad_norm": 0.9493517279624939,
+      "learning_rate": 4.973255e-05,
+      "loss": 1.252,
+      "step": 5350
+    },
+    {
+      "epoch": 0.0054,
+      "grad_norm": 0.6466087102890015,
+      "learning_rate": 4.9730050000000003e-05,
+      "loss": 1.2012,
+      "step": 5400
+    },
+    {
+      "epoch": 0.00545,
+      "grad_norm": 0.6407445669174194,
+      "learning_rate": 4.972755e-05,
+      "loss": 1.2615,
+      "step": 5450
+    },
+    {
+      "epoch": 0.0055,
+      "grad_norm": 0.617421567440033,
+      "learning_rate": 4.972505e-05,
+      "loss": 1.2718,
+      "step": 5500
+    },
+    {
+      "epoch": 0.00555,
+      "grad_norm": 0.5649735331535339,
+      "learning_rate": 4.972255e-05,
+      "loss": 1.2439,
+      "step": 5550
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.675645649433136,
+      "learning_rate": 4.972005e-05,
+      "loss": 1.2847,
+      "step": 5600
+    },
+    {
+      "epoch": 0.00565,
+      "grad_norm": 0.6887643337249756,
+      "learning_rate": 4.971755e-05,
+      "loss": 1.2464,
+      "step": 5650
+    },
+    {
+      "epoch": 0.0057,
+      "grad_norm": 0.6572535037994385,
+      "learning_rate": 4.9715050000000004e-05,
+      "loss": 1.2517,
+      "step": 5700
+    },
+    {
+      "epoch": 0.00575,
+      "grad_norm": 0.6076003313064575,
+      "learning_rate": 4.971255e-05,
+      "loss": 1.2183,
+      "step": 5750
+    },
+    {
+      "epoch": 0.0058,
+      "grad_norm": 0.6134760975837708,
+      "learning_rate": 4.971005e-05,
+      "loss": 1.2303,
+      "step": 5800
+    },
+    {
+      "epoch": 0.00585,
+      "grad_norm": 0.6791936159133911,
+      "learning_rate": 4.970755e-05,
+      "loss": 1.2351,
+      "step": 5850
+    },
+    {
+      "epoch": 0.0059,
+      "grad_norm": 0.6307588219642639,
+      "learning_rate": 4.970505e-05,
+      "loss": 1.254,
+      "step": 5900
+    },
+    {
+      "epoch": 0.00595,
+      "grad_norm": 1.1272459030151367,
+      "learning_rate": 4.9702550000000004e-05,
+      "loss": 1.265,
+      "step": 5950
+    },
+    {
+      "epoch": 0.006,
+      "grad_norm": 0.5415930151939392,
+      "learning_rate": 4.9700050000000005e-05,
+      "loss": 1.2068,
+      "step": 6000
+    },
+    {
+      "epoch": 0.00605,
+      "grad_norm": 0.5450507402420044,
+      "learning_rate": 4.969755000000001e-05,
+      "loss": 1.2408,
+      "step": 6050
+    },
+    {
+      "epoch": 0.0061,
+      "grad_norm": 0.6528737545013428,
+      "learning_rate": 4.969505e-05,
+      "loss": 1.2035,
+      "step": 6100
+    },
+    {
+      "epoch": 0.00615,
+      "grad_norm": 0.6139540672302246,
+      "learning_rate": 4.969255e-05,
+      "loss": 1.2077,
+      "step": 6150
+    },
+    {
+      "epoch": 0.0062,
+      "grad_norm": 0.6503865718841553,
+      "learning_rate": 4.9690050000000004e-05,
+      "loss": 1.2055,
+      "step": 6200
+    },
+    {
+      "epoch": 0.00625,
+      "grad_norm": 0.6888736486434937,
+      "learning_rate": 4.9687550000000005e-05,
+      "loss": 1.2161,
+      "step": 6250
+    },
+    {
+      "epoch": 0.0063,
+      "grad_norm": 1.1457703113555908,
+      "learning_rate": 4.968505e-05,
+      "loss": 1.1939,
+      "step": 6300
+    },
+    {
+      "epoch": 0.00635,
+      "grad_norm": 0.5861665606498718,
+      "learning_rate": 4.968255e-05,
+      "loss": 1.2177,
+      "step": 6350
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.8666422963142395,
+      "learning_rate": 4.968005e-05,
+      "loss": 1.1945,
+      "step": 6400
+    },
+    {
+      "epoch": 0.00645,
+      "grad_norm": 0.6094856858253479,
+      "learning_rate": 4.9677550000000003e-05,
+      "loss": 1.2394,
+      "step": 6450
+    },
+    {
+      "epoch": 0.0065,
+      "grad_norm": 0.6510404348373413,
+      "learning_rate": 4.967505e-05,
+      "loss": 1.1764,
+      "step": 6500
+    },
+    {
+      "epoch": 0.00655,
+      "grad_norm": 0.7044185996055603,
+      "learning_rate": 4.967255e-05,
+      "loss": 1.1951,
+      "step": 6550
+    },
+    {
+      "epoch": 0.0066,
+      "grad_norm": 0.6577385663986206,
+      "learning_rate": 4.967005e-05,
+      "loss": 1.2186,
+      "step": 6600
+    },
+    {
+      "epoch": 0.00665,
+      "grad_norm": 0.6468052268028259,
+      "learning_rate": 4.966755e-05,
+      "loss": 1.2491,
+      "step": 6650
+    },
+    {
+      "epoch": 0.0067,
+      "grad_norm": 0.6153103709220886,
+      "learning_rate": 4.966505e-05,
+      "loss": 1.25,
+      "step": 6700
+    },
+    {
+      "epoch": 0.00675,
+      "grad_norm": 0.6066887974739075,
+      "learning_rate": 4.9662550000000004e-05,
+      "loss": 1.1852,
+      "step": 6750
+    },
+    {
+      "epoch": 0.0068,
+      "grad_norm": 0.5735403895378113,
+      "learning_rate": 4.9660050000000006e-05,
+      "loss": 1.2139,
+      "step": 6800
+    },
+    {
+      "epoch": 0.00685,
+      "grad_norm": 0.6247462630271912,
+      "learning_rate": 4.965755e-05,
+      "loss": 1.1597,
+      "step": 6850
+    },
+    {
+      "epoch": 0.0069,
+      "grad_norm": 0.6690845489501953,
+      "learning_rate": 4.965505e-05,
+      "loss": 1.2095,
+      "step": 6900
+    },
+    {
+      "epoch": 0.00695,
+      "grad_norm": 0.5683197975158691,
+      "learning_rate": 4.965255e-05,
+      "loss": 1.3077,
+      "step": 6950
+    },
+    {
+      "epoch": 0.007,
+      "grad_norm": 0.6314153671264648,
+      "learning_rate": 4.9650050000000004e-05,
+      "loss": 1.2556,
+      "step": 7000
+    },
+    {
+      "epoch": 0.00705,
+      "grad_norm": 0.641153872013092,
+      "learning_rate": 4.9647550000000005e-05,
+      "loss": 1.1643,
+      "step": 7050
+    },
+    {
+      "epoch": 0.0071,
+      "grad_norm": 0.6113337278366089,
+      "learning_rate": 4.964505000000001e-05,
+      "loss": 1.1688,
+      "step": 7100
+    },
+    {
+      "epoch": 0.00715,
+      "grad_norm": 0.5102031230926514,
+      "learning_rate": 4.964255e-05,
+      "loss": 1.1835,
+      "step": 7150
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.6926486492156982,
+      "learning_rate": 4.964005e-05,
+      "loss": 1.1704,
+      "step": 7200
+    },
+    {
+      "epoch": 0.00725,
+      "grad_norm": 0.594326376914978,
+      "learning_rate": 4.9637550000000004e-05,
+      "loss": 1.1434,
+      "step": 7250
+    },
+    {
+      "epoch": 0.0073,
+      "grad_norm": 0.5609344840049744,
+      "learning_rate": 4.963505e-05,
+      "loss": 1.2389,
+      "step": 7300
+    },
+    {
+      "epoch": 0.00735,
+      "grad_norm": 0.6464629173278809,
+      "learning_rate": 4.963255e-05,
+      "loss": 1.2097,
+      "step": 7350
+    },
+    {
+      "epoch": 0.0074,
+      "grad_norm": 0.5782963633537292,
+      "learning_rate": 4.963005e-05,
+      "loss": 1.2607,
+      "step": 7400
+    },
+    {
+      "epoch": 0.00745,
+      "grad_norm": 0.7301307320594788,
+      "learning_rate": 4.962755e-05,
+      "loss": 1.1759,
+      "step": 7450
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.5845964550971985,
+      "learning_rate": 4.9625050000000004e-05,
+      "loss": 1.1806,
+      "step": 7500
+    },
+    {
+      "epoch": 0.00755,
+      "grad_norm": 0.618295431137085,
+      "learning_rate": 4.962255e-05,
+      "loss": 1.1339,
+      "step": 7550
+    },
+    {
+      "epoch": 0.0076,
+      "grad_norm": 0.5799853801727295,
+      "learning_rate": 4.962005e-05,
+      "loss": 1.1641,
+      "step": 7600
+    },
+    {
+      "epoch": 0.00765,
+      "grad_norm": 0.5227323174476624,
+      "learning_rate": 4.961755e-05,
+      "loss": 1.2375,
+      "step": 7650
+    },
+    {
+      "epoch": 0.0077,
+      "grad_norm": 0.699111819267273,
+      "learning_rate": 4.961505e-05,
+      "loss": 1.0943,
+      "step": 7700
+    },
+    {
+      "epoch": 0.00775,
+      "grad_norm": 0.5230436325073242,
+      "learning_rate": 4.961255e-05,
+      "loss": 1.1762,
+      "step": 7750
+    },
+    {
+      "epoch": 0.0078,
+      "grad_norm": 0.5776082873344421,
+      "learning_rate": 4.9610050000000005e-05,
+      "loss": 1.18,
+      "step": 7800
+    },
+    {
+      "epoch": 0.00785,
+      "grad_norm": 0.584697425365448,
+      "learning_rate": 4.9607550000000006e-05,
+      "loss": 1.2878,
+      "step": 7850
+    },
+    {
+      "epoch": 0.0079,
+      "grad_norm": 0.66282057762146,
+      "learning_rate": 4.960505e-05,
+      "loss": 1.219,
+      "step": 7900
+    },
+    {
+      "epoch": 0.00795,
+      "grad_norm": 1.2405346632003784,
+      "learning_rate": 4.960255e-05,
+      "loss": 1.0984,
+      "step": 7950
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.6882185339927673,
+      "learning_rate": 4.960005e-05,
+      "loss": 1.2291,
+      "step": 8000
+    },
+    {
+      "epoch": 0.00805,
+      "grad_norm": 0.5376139879226685,
+      "learning_rate": 4.9597550000000004e-05,
+      "loss": 1.1768,
+      "step": 8050
+    },
+    {
+      "epoch": 0.0081,
+      "grad_norm": 0.5613966584205627,
+      "learning_rate": 4.9595050000000006e-05,
+      "loss": 1.1882,
+      "step": 8100
+    },
+    {
+      "epoch": 0.00815,
+      "grad_norm": 0.5976732969284058,
+      "learning_rate": 4.959255000000001e-05,
+      "loss": 1.1938,
+      "step": 8150
+    },
+    {
+      "epoch": 0.0082,
+      "grad_norm": 0.5053662657737732,
+      "learning_rate": 4.959005e-05,
+      "loss": 1.1335,
+      "step": 8200
+    },
+    {
+      "epoch": 0.00825,
+      "grad_norm": 0.5319002866744995,
+      "learning_rate": 4.958755e-05,
+      "loss": 1.23,
+      "step": 8250
+    },
+    {
+      "epoch": 0.0083,
+      "grad_norm": 0.6441113948822021,
+      "learning_rate": 4.9585050000000004e-05,
+      "loss": 1.095,
+      "step": 8300
+    },
+    {
+      "epoch": 0.00835,
+      "grad_norm": 0.7779256701469421,
+      "learning_rate": 4.958255e-05,
+      "loss": 1.1637,
+      "step": 8350
+    },
+    {
+      "epoch": 0.0084,
+      "grad_norm": 0.6262242794036865,
+      "learning_rate": 4.958005e-05,
+      "loss": 1.1756,
+      "step": 8400
+    },
+    {
+      "epoch": 0.00845,
+      "grad_norm": 0.6683831214904785,
+      "learning_rate": 4.957755e-05,
+      "loss": 1.2142,
+      "step": 8450
+    },
+    {
+      "epoch": 0.0085,
+      "grad_norm": 0.8390682339668274,
+      "learning_rate": 4.957505e-05,
+      "loss": 1.147,
+      "step": 8500
+    },
+    {
+      "epoch": 0.00855,
+      "grad_norm": 0.6620815396308899,
+      "learning_rate": 4.9572550000000004e-05,
+      "loss": 1.1619,
+      "step": 8550
+    },
+    {
+      "epoch": 0.0086,
+      "grad_norm": 0.5890603065490723,
+      "learning_rate": 4.957005e-05,
+      "loss": 1.1076,
+      "step": 8600
+    },
+    {
+      "epoch": 0.00865,
+      "grad_norm": 0.578610897064209,
+      "learning_rate": 4.956755e-05,
+      "loss": 1.0664,
+      "step": 8650
+    },
+    {
+      "epoch": 0.0087,
+      "grad_norm": 0.5939948558807373,
+      "learning_rate": 4.956505e-05,
+      "loss": 1.1613,
+      "step": 8700
+    },
+    {
+      "epoch": 0.00875,
+      "grad_norm": 0.6854498386383057,
+      "learning_rate": 4.956255e-05,
+      "loss": 1.218,
+      "step": 8750
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.8202462196350098,
+      "learning_rate": 4.956005e-05,
+      "loss": 1.2239,
+      "step": 8800
+    },
+    {
+      "epoch": 0.00885,
+      "grad_norm": 0.5740517973899841,
+      "learning_rate": 4.9557550000000005e-05,
+      "loss": 1.1104,
+      "step": 8850
+    },
+    {
+      "epoch": 0.0089,
+      "grad_norm": 0.7225694060325623,
+      "learning_rate": 4.9555050000000006e-05,
+      "loss": 1.1617,
+      "step": 8900
+    },
+    {
+      "epoch": 0.00895,
+      "grad_norm": 0.5652306079864502,
+      "learning_rate": 4.955255e-05,
+      "loss": 1.0913,
+      "step": 8950
+    },
+    {
+      "epoch": 0.009,
+      "grad_norm": 0.9501689672470093,
+      "learning_rate": 4.955005e-05,
+      "loss": 1.1862,
+      "step": 9000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 1000000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.67761573576704e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-9000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f1f06799524e8433f4ea5813803d7bd7b4539e7b44d7846bd777c6941f1e37c
+size 5841