{ "best_global_step": 160, "best_metric": 0.5677427649497986, "best_model_checkpoint": "./results/checkpoint-160", "epoch": 0.9660377358490566, "eval_steps": 20, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0060377358490566035, "grad_norm": 70.5, "learning_rate": 0.0, "loss": 1.6976, "step": 1 }, { "epoch": 0.012075471698113207, "grad_norm": 64.0, "learning_rate": 1.0000000000000002e-06, "loss": 1.7496, "step": 2 }, { "epoch": 0.018113207547169812, "grad_norm": 30.5, "learning_rate": 2.0000000000000003e-06, "loss": 1.5311, "step": 3 }, { "epoch": 0.024150943396226414, "grad_norm": 23.625, "learning_rate": 3e-06, "loss": 1.2349, "step": 4 }, { "epoch": 0.03018867924528302, "grad_norm": 25.75, "learning_rate": 4.000000000000001e-06, "loss": 0.9968, "step": 5 }, { "epoch": 0.036226415094339624, "grad_norm": 12.625, "learning_rate": 5e-06, "loss": 0.9259, "step": 6 }, { "epoch": 0.04226415094339623, "grad_norm": 9.5, "learning_rate": 6e-06, "loss": 0.8499, "step": 7 }, { "epoch": 0.04830188679245283, "grad_norm": 9.375, "learning_rate": 7e-06, "loss": 0.8527, "step": 8 }, { "epoch": 0.05433962264150943, "grad_norm": 8.9375, "learning_rate": 8.000000000000001e-06, "loss": 0.8248, "step": 9 }, { "epoch": 0.06037735849056604, "grad_norm": 8.125, "learning_rate": 9e-06, "loss": 0.8, "step": 10 }, { "epoch": 0.06641509433962264, "grad_norm": 10.75, "learning_rate": 1e-05, "loss": 0.7759, "step": 11 }, { "epoch": 0.07245283018867925, "grad_norm": 12.25, "learning_rate": 9.999762028709558e-06, "loss": 0.8134, "step": 12 }, { "epoch": 0.07849056603773585, "grad_norm": 14.8125, "learning_rate": 9.999048137490364e-06, "loss": 0.8324, "step": 13 }, { "epoch": 0.08452830188679246, "grad_norm": 7.875, "learning_rate": 9.997858394296666e-06, "loss": 0.8162, "step": 14 }, { "epoch": 0.09056603773584905, "grad_norm": 6.625, "learning_rate": 9.99619291237835e-06, "loss": 0.7931, "step": 15 }, { "epoch": 0.09660377358490566, "grad_norm": 6.46875, "learning_rate": 9.994051850270172e-06, "loss": 0.7436, "step": 16 }, { "epoch": 0.10264150943396226, "grad_norm": 5.96875, "learning_rate": 9.991435411776654e-06, "loss": 0.7802, "step": 17 }, { "epoch": 0.10867924528301887, "grad_norm": 5.90625, "learning_rate": 9.988343845952697e-06, "loss": 0.7157, "step": 18 }, { "epoch": 0.11471698113207547, "grad_norm": 5.84375, "learning_rate": 9.984777447079861e-06, "loss": 0.7629, "step": 19 }, { "epoch": 0.12075471698113208, "grad_norm": 6.3125, "learning_rate": 9.980736554638367e-06, "loss": 0.739, "step": 20 }, { "epoch": 0.12075471698113208, "eval_loss": 0.7346939444541931, "eval_runtime": 16.561, "eval_samples_per_second": 16.847, "eval_steps_per_second": 4.227, "step": 20 }, { "epoch": 0.12679245283018867, "grad_norm": 5.59375, "learning_rate": 9.976221553274767e-06, "loss": 0.7613, "step": 21 }, { "epoch": 0.1328301886792453, "grad_norm": 6.1875, "learning_rate": 9.971232872765344e-06, "loss": 0.7267, "step": 22 }, { "epoch": 0.13886792452830188, "grad_norm": 5.375, "learning_rate": 9.96577098797519e-06, "loss": 0.6918, "step": 23 }, { "epoch": 0.1449056603773585, "grad_norm": 5.59375, "learning_rate": 9.959836418813016e-06, "loss": 0.7049, "step": 24 }, { "epoch": 0.1509433962264151, "grad_norm": 5.3125, "learning_rate": 9.953429730181653e-06, "loss": 0.7378, "step": 25 }, { "epoch": 0.1569811320754717, "grad_norm": 5.90625, "learning_rate": 9.94655153192429e-06, "loss": 0.7175, "step": 26 }, { "epoch": 0.1630188679245283, "grad_norm": 5.125, "learning_rate": 9.939202478766408e-06, "loss": 0.6892, "step": 27 }, { "epoch": 0.16905660377358492, "grad_norm": 5.6875, "learning_rate": 9.931383270253475e-06, "loss": 0.7206, "step": 28 }, { "epoch": 0.1750943396226415, "grad_norm": 5.0, "learning_rate": 9.923094650684346e-06, "loss": 0.724, "step": 29 }, { "epoch": 0.1811320754716981, "grad_norm": 5.46875, "learning_rate": 9.914337409040418e-06, "loss": 0.6785, "step": 30 }, { "epoch": 0.18716981132075472, "grad_norm": 4.96875, "learning_rate": 9.905112378910532e-06, "loss": 0.6888, "step": 31 }, { "epoch": 0.1932075471698113, "grad_norm": 5.46875, "learning_rate": 9.895420438411616e-06, "loss": 0.7148, "step": 32 }, { "epoch": 0.19924528301886793, "grad_norm": 5.0, "learning_rate": 9.885262510105102e-06, "loss": 0.6729, "step": 33 }, { "epoch": 0.20528301886792452, "grad_norm": 6.09375, "learning_rate": 9.874639560909118e-06, "loss": 0.6917, "step": 34 }, { "epoch": 0.21132075471698114, "grad_norm": 5.1875, "learning_rate": 9.863552602006435e-06, "loss": 0.6824, "step": 35 }, { "epoch": 0.21735849056603773, "grad_norm": 5.75, "learning_rate": 9.852002688748214e-06, "loss": 0.6918, "step": 36 }, { "epoch": 0.22339622641509435, "grad_norm": 5.71875, "learning_rate": 9.839990920553566e-06, "loss": 0.6715, "step": 37 }, { "epoch": 0.22943396226415094, "grad_norm": 5.09375, "learning_rate": 9.827518440804882e-06, "loss": 0.6812, "step": 38 }, { "epoch": 0.23547169811320753, "grad_norm": 5.4375, "learning_rate": 9.814586436738998e-06, "loss": 0.6536, "step": 39 }, { "epoch": 0.24150943396226415, "grad_norm": 5.0, "learning_rate": 9.801196139334195e-06, "loss": 0.6636, "step": 40 }, { "epoch": 0.24150943396226415, "eval_loss": 0.6721860766410828, "eval_runtime": 15.2402, "eval_samples_per_second": 18.307, "eval_steps_per_second": 4.593, "step": 40 }, { "epoch": 0.24754716981132074, "grad_norm": 4.65625, "learning_rate": 9.787348823193013e-06, "loss": 0.6657, "step": 41 }, { "epoch": 0.25358490566037734, "grad_norm": 5.09375, "learning_rate": 9.77304580642093e-06, "loss": 0.6651, "step": 42 }, { "epoch": 0.259622641509434, "grad_norm": 4.59375, "learning_rate": 9.75828845050089e-06, "loss": 0.6394, "step": 43 }, { "epoch": 0.2656603773584906, "grad_norm": 5.5, "learning_rate": 9.743078160163703e-06, "loss": 0.6836, "step": 44 }, { "epoch": 0.27169811320754716, "grad_norm": 4.96875, "learning_rate": 9.72741638325434e-06, "loss": 0.6808, "step": 45 }, { "epoch": 0.27773584905660376, "grad_norm": 4.96875, "learning_rate": 9.711304610594104e-06, "loss": 0.6823, "step": 46 }, { "epoch": 0.2837735849056604, "grad_norm": 4.75, "learning_rate": 9.694744375838725e-06, "loss": 0.6866, "step": 47 }, { "epoch": 0.289811320754717, "grad_norm": 4.59375, "learning_rate": 9.677737255332381e-06, "loss": 0.6555, "step": 48 }, { "epoch": 0.2958490566037736, "grad_norm": 4.9375, "learning_rate": 9.660284867957637e-06, "loss": 0.6765, "step": 49 }, { "epoch": 0.3018867924528302, "grad_norm": 4.8125, "learning_rate": 9.642388874981348e-06, "loss": 0.6413, "step": 50 }, { "epoch": 0.30792452830188677, "grad_norm": 4.375, "learning_rate": 9.624050979896533e-06, "loss": 0.6349, "step": 51 }, { "epoch": 0.3139622641509434, "grad_norm": 5.0, "learning_rate": 9.605272928260215e-06, "loss": 0.6399, "step": 52 }, { "epoch": 0.32, "grad_norm": 5.0, "learning_rate": 9.586056507527266e-06, "loss": 0.6417, "step": 53 }, { "epoch": 0.3260377358490566, "grad_norm": 5.0, "learning_rate": 9.566403546880262e-06, "loss": 0.6655, "step": 54 }, { "epoch": 0.3320754716981132, "grad_norm": 4.96875, "learning_rate": 9.546315917055362e-06, "loss": 0.6352, "step": 55 }, { "epoch": 0.33811320754716984, "grad_norm": 4.5, "learning_rate": 9.525795530164248e-06, "loss": 0.6421, "step": 56 }, { "epoch": 0.3441509433962264, "grad_norm": 5.125, "learning_rate": 9.504844339512096e-06, "loss": 0.6377, "step": 57 }, { "epoch": 0.350188679245283, "grad_norm": 4.8125, "learning_rate": 9.483464339411658e-06, "loss": 0.6435, "step": 58 }, { "epoch": 0.3562264150943396, "grad_norm": 4.8125, "learning_rate": 9.461657564993419e-06, "loss": 0.6802, "step": 59 }, { "epoch": 0.3622641509433962, "grad_norm": 4.90625, "learning_rate": 9.439426092011877e-06, "loss": 0.6735, "step": 60 }, { "epoch": 0.3622641509433962, "eval_loss": 0.6477010250091553, "eval_runtime": 15.34, "eval_samples_per_second": 18.188, "eval_steps_per_second": 4.563, "step": 60 }, { "epoch": 0.36830188679245285, "grad_norm": 4.71875, "learning_rate": 9.416772036647959e-06, "loss": 0.677, "step": 61 }, { "epoch": 0.37433962264150944, "grad_norm": 4.59375, "learning_rate": 9.393697555307581e-06, "loss": 0.6539, "step": 62 }, { "epoch": 0.38037735849056603, "grad_norm": 4.75, "learning_rate": 9.370204844416381e-06, "loss": 0.6248, "step": 63 }, { "epoch": 0.3864150943396226, "grad_norm": 4.75, "learning_rate": 9.346296140210653e-06, "loss": 0.6026, "step": 64 }, { "epoch": 0.39245283018867927, "grad_norm": 4.75, "learning_rate": 9.321973718524472e-06, "loss": 0.6503, "step": 65 }, { "epoch": 0.39849056603773586, "grad_norm": 4.4375, "learning_rate": 9.29723989457307e-06, "loss": 0.6239, "step": 66 }, { "epoch": 0.40452830188679245, "grad_norm": 4.5625, "learning_rate": 9.272097022732444e-06, "loss": 0.65, "step": 67 }, { "epoch": 0.41056603773584904, "grad_norm": 4.5625, "learning_rate": 9.24654749631526e-06, "loss": 0.6491, "step": 68 }, { "epoch": 0.41660377358490563, "grad_norm": 4.6875, "learning_rate": 9.220593747343028e-06, "loss": 0.6629, "step": 69 }, { "epoch": 0.4226415094339623, "grad_norm": 4.6875, "learning_rate": 9.194238246314599e-06, "loss": 0.6266, "step": 70 }, { "epoch": 0.4286792452830189, "grad_norm": 4.4375, "learning_rate": 9.16748350197101e-06, "loss": 0.6283, "step": 71 }, { "epoch": 0.43471698113207546, "grad_norm": 4.59375, "learning_rate": 9.140332061056678e-06, "loss": 0.6399, "step": 72 }, { "epoch": 0.44075471698113206, "grad_norm": 4.46875, "learning_rate": 9.112786508076972e-06, "loss": 0.6492, "step": 73 }, { "epoch": 0.4467924528301887, "grad_norm": 5.53125, "learning_rate": 9.08484946505221e-06, "loss": 0.6384, "step": 74 }, { "epoch": 0.4528301886792453, "grad_norm": 4.46875, "learning_rate": 9.056523591268064e-06, "loss": 0.6229, "step": 75 }, { "epoch": 0.4588679245283019, "grad_norm": 4.375, "learning_rate": 9.027811583022427e-06, "loss": 0.6438, "step": 76 }, { "epoch": 0.4649056603773585, "grad_norm": 4.4375, "learning_rate": 8.998716173368762e-06, "loss": 0.6374, "step": 77 }, { "epoch": 0.47094339622641507, "grad_norm": 4.15625, "learning_rate": 8.96924013185594e-06, "loss": 0.648, "step": 78 }, { "epoch": 0.4769811320754717, "grad_norm": 4.3125, "learning_rate": 8.939386264264616e-06, "loss": 0.6467, "step": 79 }, { "epoch": 0.4830188679245283, "grad_norm": 4.4375, "learning_rate": 8.90915741234015e-06, "loss": 0.646, "step": 80 }, { "epoch": 0.4830188679245283, "eval_loss": 0.6263449788093567, "eval_runtime": 16.5896, "eval_samples_per_second": 16.818, "eval_steps_per_second": 4.22, "step": 80 }, { "epoch": 0.4890566037735849, "grad_norm": 4.6875, "learning_rate": 8.8785564535221e-06, "loss": 0.6303, "step": 81 }, { "epoch": 0.4950943396226415, "grad_norm": 4.375, "learning_rate": 8.84758630067033e-06, "loss": 0.6112, "step": 82 }, { "epoch": 0.5011320754716981, "grad_norm": 4.75, "learning_rate": 8.816249901787736e-06, "loss": 0.6413, "step": 83 }, { "epoch": 0.5071698113207547, "grad_norm": 4.59375, "learning_rate": 8.78455023973963e-06, "loss": 0.6263, "step": 84 }, { "epoch": 0.5132075471698113, "grad_norm": 4.46875, "learning_rate": 8.752490331969807e-06, "loss": 0.602, "step": 85 }, { "epoch": 0.519245283018868, "grad_norm": 4.28125, "learning_rate": 8.720073230213315e-06, "loss": 0.5658, "step": 86 }, { "epoch": 0.5252830188679245, "grad_norm": 4.46875, "learning_rate": 8.687302020205967e-06, "loss": 0.6368, "step": 87 }, { "epoch": 0.5313207547169811, "grad_norm": 4.3125, "learning_rate": 8.65417982139062e-06, "loss": 0.6336, "step": 88 }, { "epoch": 0.5373584905660377, "grad_norm": 4.5625, "learning_rate": 8.620709786620231e-06, "loss": 0.639, "step": 89 }, { "epoch": 0.5433962264150943, "grad_norm": 4.75, "learning_rate": 8.586895101857747e-06, "loss": 0.6105, "step": 90 }, { "epoch": 0.549433962264151, "grad_norm": 4.28125, "learning_rate": 8.552738985872834e-06, "loss": 0.6211, "step": 91 }, { "epoch": 0.5554716981132075, "grad_norm": 4.5625, "learning_rate": 8.518244689935491e-06, "loss": 0.636, "step": 92 }, { "epoch": 0.5615094339622642, "grad_norm": 4.34375, "learning_rate": 8.483415497506567e-06, "loss": 0.5949, "step": 93 }, { "epoch": 0.5675471698113208, "grad_norm": 4.34375, "learning_rate": 8.448254723925205e-06, "loss": 0.5906, "step": 94 }, { "epoch": 0.5735849056603773, "grad_norm": 5.0625, "learning_rate": 8.412765716093273e-06, "loss": 0.6214, "step": 95 }, { "epoch": 0.579622641509434, "grad_norm": 5.0625, "learning_rate": 8.376951852156764e-06, "loss": 0.5991, "step": 96 }, { "epoch": 0.5856603773584905, "grad_norm": 4.6875, "learning_rate": 8.34081654118425e-06, "loss": 0.6169, "step": 97 }, { "epoch": 0.5916981132075472, "grad_norm": 4.59375, "learning_rate": 8.304363222842358e-06, "loss": 0.6112, "step": 98 }, { "epoch": 0.5977358490566038, "grad_norm": 4.09375, "learning_rate": 8.267595367068375e-06, "loss": 0.5735, "step": 99 }, { "epoch": 0.6037735849056604, "grad_norm": 4.21875, "learning_rate": 8.230516473739934e-06, "loss": 0.6006, "step": 100 }, { "epoch": 0.6037735849056604, "eval_loss": 0.6103448867797852, "eval_runtime": 16.0735, "eval_samples_per_second": 17.358, "eval_steps_per_second": 4.355, "step": 100 }, { "epoch": 0.609811320754717, "grad_norm": 4.4375, "learning_rate": 8.193130072341872e-06, "loss": 0.6219, "step": 101 }, { "epoch": 0.6158490566037735, "grad_norm": 4.03125, "learning_rate": 8.155439721630265e-06, "loss": 0.5929, "step": 102 }, { "epoch": 0.6218867924528302, "grad_norm": 4.40625, "learning_rate": 8.117449009293668e-06, "loss": 0.5974, "step": 103 }, { "epoch": 0.6279245283018868, "grad_norm": 4.3125, "learning_rate": 8.07916155161162e-06, "loss": 0.6064, "step": 104 }, { "epoch": 0.6339622641509434, "grad_norm": 4.34375, "learning_rate": 8.040580993110404e-06, "loss": 0.6119, "step": 105 }, { "epoch": 0.64, "grad_norm": 4.1875, "learning_rate": 8.001711006216138e-06, "loss": 0.5944, "step": 106 }, { "epoch": 0.6460377358490565, "grad_norm": 4.46875, "learning_rate": 7.962555290905198e-06, "loss": 0.603, "step": 107 }, { "epoch": 0.6520754716981132, "grad_norm": 4.21875, "learning_rate": 7.923117574352024e-06, "loss": 0.6147, "step": 108 }, { "epoch": 0.6581132075471698, "grad_norm": 4.46875, "learning_rate": 7.883401610574338e-06, "loss": 0.5746, "step": 109 }, { "epoch": 0.6641509433962264, "grad_norm": 4.28125, "learning_rate": 7.843411180075795e-06, "loss": 0.6098, "step": 110 }, { "epoch": 0.670188679245283, "grad_norm": 4.15625, "learning_rate": 7.803150089486144e-06, "loss": 0.6041, "step": 111 }, { "epoch": 0.6762264150943397, "grad_norm": 4.28125, "learning_rate": 7.76262217119885e-06, "loss": 0.5657, "step": 112 }, { "epoch": 0.6822641509433962, "grad_norm": 4.375, "learning_rate": 7.721831283006323e-06, "loss": 0.6099, "step": 113 }, { "epoch": 0.6883018867924529, "grad_norm": 4.25, "learning_rate": 7.680781307732683e-06, "loss": 0.592, "step": 114 }, { "epoch": 0.6943396226415094, "grad_norm": 4.4375, "learning_rate": 7.639476152864163e-06, "loss": 0.5997, "step": 115 }, { "epoch": 0.700377358490566, "grad_norm": 4.09375, "learning_rate": 7.597919750177168e-06, "loss": 0.5702, "step": 116 }, { "epoch": 0.7064150943396227, "grad_norm": 4.1875, "learning_rate": 7.556116055364008e-06, "loss": 0.5931, "step": 117 }, { "epoch": 0.7124528301886792, "grad_norm": 4.1875, "learning_rate": 7.51406904765636e-06, "loss": 0.5778, "step": 118 }, { "epoch": 0.7184905660377359, "grad_norm": 4.15625, "learning_rate": 7.4717827294464996e-06, "loss": 0.5554, "step": 119 }, { "epoch": 0.7245283018867924, "grad_norm": 4.4375, "learning_rate": 7.4292611259063105e-06, "loss": 0.6095, "step": 120 }, { "epoch": 0.7245283018867924, "eval_loss": 0.5946348905563354, "eval_runtime": 15.9878, "eval_samples_per_second": 17.451, "eval_steps_per_second": 4.378, "step": 120 }, { "epoch": 0.730566037735849, "grad_norm": 4.0625, "learning_rate": 7.3865082846041415e-06, "loss": 0.5897, "step": 121 }, { "epoch": 0.7366037735849057, "grad_norm": 4.125, "learning_rate": 7.343528275119515e-06, "loss": 0.5658, "step": 122 }, { "epoch": 0.7426415094339622, "grad_norm": 4.125, "learning_rate": 7.300325188655762e-06, "loss": 0.5628, "step": 123 }, { "epoch": 0.7486792452830189, "grad_norm": 4.375, "learning_rate": 7.256903137650575e-06, "loss": 0.5996, "step": 124 }, { "epoch": 0.7547169811320755, "grad_norm": 4.3125, "learning_rate": 7.21326625538456e-06, "loss": 0.5815, "step": 125 }, { "epoch": 0.7607547169811321, "grad_norm": 3.96875, "learning_rate": 7.169418695587791e-06, "loss": 0.6098, "step": 126 }, { "epoch": 0.7667924528301887, "grad_norm": 4.03125, "learning_rate": 7.125364632044423e-06, "loss": 0.5752, "step": 127 }, { "epoch": 0.7728301886792452, "grad_norm": 4.28125, "learning_rate": 7.0811082581953935e-06, "loss": 0.6119, "step": 128 }, { "epoch": 0.7788679245283019, "grad_norm": 4.34375, "learning_rate": 7.036653786739264e-06, "loss": 0.6137, "step": 129 }, { "epoch": 0.7849056603773585, "grad_norm": 4.03125, "learning_rate": 6.9920054492312086e-06, "loss": 0.6021, "step": 130 }, { "epoch": 0.7909433962264151, "grad_norm": 4.1875, "learning_rate": 6.947167495680224e-06, "loss": 0.5894, "step": 131 }, { "epoch": 0.7969811320754717, "grad_norm": 4.0625, "learning_rate": 6.902144194144576e-06, "loss": 0.5354, "step": 132 }, { "epoch": 0.8030188679245283, "grad_norm": 3.953125, "learning_rate": 6.8569398303255345e-06, "loss": 0.5578, "step": 133 }, { "epoch": 0.8090566037735849, "grad_norm": 3.953125, "learning_rate": 6.811558707159414e-06, "loss": 0.5529, "step": 134 }, { "epoch": 0.8150943396226416, "grad_norm": 4.3125, "learning_rate": 6.76600514440799e-06, "loss": 0.5796, "step": 135 }, { "epoch": 0.8211320754716981, "grad_norm": 4.1875, "learning_rate": 6.72028347824731e-06, "loss": 0.593, "step": 136 }, { "epoch": 0.8271698113207547, "grad_norm": 4.15625, "learning_rate": 6.674398060854931e-06, "loss": 0.6197, "step": 137 }, { "epoch": 0.8332075471698113, "grad_norm": 4.125, "learning_rate": 6.62835325999565e-06, "loss": 0.5648, "step": 138 }, { "epoch": 0.8392452830188679, "grad_norm": 4.5625, "learning_rate": 6.582153458605738e-06, "loss": 0.5664, "step": 139 }, { "epoch": 0.8452830188679246, "grad_norm": 4.09375, "learning_rate": 6.5358030543757375e-06, "loss": 0.5558, "step": 140 }, { "epoch": 0.8452830188679246, "eval_loss": 0.5809201598167419, "eval_runtime": 15.2762, "eval_samples_per_second": 18.264, "eval_steps_per_second": 4.582, "step": 140 }, { "epoch": 0.8513207547169811, "grad_norm": 4.03125, "learning_rate": 6.489306459331851e-06, "loss": 0.557, "step": 141 }, { "epoch": 0.8573584905660377, "grad_norm": 4.25, "learning_rate": 6.442668099415967e-06, "loss": 0.5499, "step": 142 }, { "epoch": 0.8633962264150944, "grad_norm": 4.03125, "learning_rate": 6.395892414064363e-06, "loss": 0.5886, "step": 143 }, { "epoch": 0.8694339622641509, "grad_norm": 3.984375, "learning_rate": 6.348983855785122e-06, "loss": 0.5767, "step": 144 }, { "epoch": 0.8754716981132076, "grad_norm": 4.1875, "learning_rate": 6.301946889734302e-06, "loss": 0.5833, "step": 145 }, { "epoch": 0.8815094339622641, "grad_norm": 3.875, "learning_rate": 6.254785993290907e-06, "loss": 0.5878, "step": 146 }, { "epoch": 0.8875471698113208, "grad_norm": 3.921875, "learning_rate": 6.20750565563069e-06, "loss": 0.5805, "step": 147 }, { "epoch": 0.8935849056603774, "grad_norm": 4.25, "learning_rate": 6.16011037729884e-06, "loss": 0.583, "step": 148 }, { "epoch": 0.8996226415094339, "grad_norm": 4.25, "learning_rate": 6.112604669781572e-06, "loss": 0.5855, "step": 149 }, { "epoch": 0.9056603773584906, "grad_norm": 3.984375, "learning_rate": 6.064993055076697e-06, "loss": 0.5556, "step": 150 }, { "epoch": 0.9116981132075471, "grad_norm": 3.859375, "learning_rate": 6.0172800652631706e-06, "loss": 0.5548, "step": 151 }, { "epoch": 0.9177358490566038, "grad_norm": 4.21875, "learning_rate": 5.9694702420696935e-06, "loss": 0.5608, "step": 152 }, { "epoch": 0.9237735849056604, "grad_norm": 3.953125, "learning_rate": 5.9215681364423975e-06, "loss": 0.5598, "step": 153 }, { "epoch": 0.929811320754717, "grad_norm": 3.9375, "learning_rate": 5.873578308111636e-06, "loss": 0.5573, "step": 154 }, { "epoch": 0.9358490566037736, "grad_norm": 4.125, "learning_rate": 5.825505325157962e-06, "loss": 0.5751, "step": 155 }, { "epoch": 0.9418867924528301, "grad_norm": 4.21875, "learning_rate": 5.77735376357729e-06, "loss": 0.5725, "step": 156 }, { "epoch": 0.9479245283018868, "grad_norm": 3.75, "learning_rate": 5.729128206845317e-06, "loss": 0.5666, "step": 157 }, { "epoch": 0.9539622641509434, "grad_norm": 4.03125, "learning_rate": 5.680833245481234e-06, "loss": 0.5607, "step": 158 }, { "epoch": 0.96, "grad_norm": 3.953125, "learning_rate": 5.632473476610748e-06, "loss": 0.5663, "step": 159 }, { "epoch": 0.9660377358490566, "grad_norm": 4.0, "learning_rate": 5.584053503528503e-06, "loss": 0.5656, "step": 160 }, { "epoch": 0.9660377358490566, "eval_loss": 0.5677427649497986, "eval_runtime": 15.2985, "eval_samples_per_second": 18.237, "eval_steps_per_second": 4.576, "step": 160 } ], "logging_steps": 1, "max_steps": 332, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.342214917128192e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }