{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.920048019207683, "eval_steps": 100, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01920768307322929, "grad_norm": 9.87369441986084, "learning_rate": 1.1464968152866242e-05, "loss": 1.5396, "step": 10 }, { "epoch": 0.03841536614645858, "grad_norm": 1.4561578035354614, "learning_rate": 2.4203821656050956e-05, "loss": 0.9607, "step": 20 }, { "epoch": 0.057623049219687875, "grad_norm": 0.6916958689689636, "learning_rate": 3.694267515923567e-05, "loss": 0.4612, "step": 30 }, { "epoch": 0.07683073229291716, "grad_norm": 0.39570435881614685, "learning_rate": 4.968152866242039e-05, "loss": 0.17, "step": 40 }, { "epoch": 0.09603841536614646, "grad_norm": 0.3437703251838684, "learning_rate": 6.24203821656051e-05, "loss": 0.116, "step": 50 }, { "epoch": 0.11524609843937575, "grad_norm": 0.32988497614860535, "learning_rate": 7.515923566878981e-05, "loss": 0.1096, "step": 60 }, { "epoch": 0.13445378151260504, "grad_norm": 0.29236501455307007, "learning_rate": 8.789808917197452e-05, "loss": 0.0865, "step": 70 }, { "epoch": 0.15366146458583432, "grad_norm": 0.24205942451953888, "learning_rate": 0.00010063694267515924, "loss": 0.0801, "step": 80 }, { "epoch": 0.17286914765906364, "grad_norm": 0.23881220817565918, "learning_rate": 0.00011337579617834395, "loss": 0.0852, "step": 90 }, { "epoch": 0.19207683073229292, "grad_norm": 0.203415647149086, "learning_rate": 0.00012611464968152866, "loss": 0.0788, "step": 100 }, { "epoch": 0.19207683073229292, "eval_loss": 0.07264828681945801, "eval_runtime": 3651.8287, "eval_samples_per_second": 0.805, "eval_steps_per_second": 0.201, "step": 100 }, { "epoch": 0.2112845138055222, "grad_norm": 0.22012194991111755, "learning_rate": 0.00013885350318471339, "loss": 0.0711, "step": 110 }, { "epoch": 0.2304921968787515, "grad_norm": 0.2833324670791626, "learning_rate": 0.0001515923566878981, "loss": 0.0718, "step": 120 }, { "epoch": 0.24969987995198079, "grad_norm": 0.1488553136587143, "learning_rate": 0.0001643312101910828, "loss": 0.0738, "step": 130 }, { "epoch": 0.2689075630252101, "grad_norm": 0.18909229338169098, "learning_rate": 0.00017707006369426754, "loss": 0.0677, "step": 140 }, { "epoch": 0.28811524609843936, "grad_norm": 0.17976899445056915, "learning_rate": 0.00018980891719745223, "loss": 0.0663, "step": 150 }, { "epoch": 0.30732292917166865, "grad_norm": 0.19813711941242218, "learning_rate": 0.00019999900147630093, "loss": 0.0618, "step": 160 }, { "epoch": 0.32653061224489793, "grad_norm": 0.17767417430877686, "learning_rate": 0.0001999640552405925, "loss": 0.0718, "step": 170 }, { "epoch": 0.3457382953181273, "grad_norm": 0.16566476225852966, "learning_rate": 0.0001998792027591314, "loss": 0.0647, "step": 180 }, { "epoch": 0.36494597839135656, "grad_norm": 0.14328187704086304, "learning_rate": 0.00019974448639383248, "loss": 0.0609, "step": 190 }, { "epoch": 0.38415366146458585, "grad_norm": 0.1612551510334015, "learning_rate": 0.00019955997340075107, "loss": 0.0652, "step": 200 }, { "epoch": 0.38415366146458585, "eval_loss": 0.06363514065742493, "eval_runtime": 3655.3099, "eval_samples_per_second": 0.804, "eval_steps_per_second": 0.201, "step": 200 }, { "epoch": 0.40336134453781514, "grad_norm": 0.1255066841840744, "learning_rate": 0.0001993257558965061, "loss": 0.0627, "step": 210 }, { "epoch": 0.4225690276110444, "grad_norm": 0.16409283876419067, "learning_rate": 0.00019904195081229143, "loss": 0.0599, "step": 220 }, { "epoch": 0.4417767106842737, "grad_norm": 0.16511447727680206, "learning_rate": 0.00019870869983549904, "loss": 0.062, "step": 230 }, { "epoch": 0.460984393757503, "grad_norm": 0.15147803723812103, "learning_rate": 0.00019832616933898267, "loss": 0.0599, "step": 240 }, { "epoch": 0.4801920768307323, "grad_norm": 0.13356028497219086, "learning_rate": 0.00019789455029799763, "loss": 0.0645, "step": 250 }, { "epoch": 0.49939975990396157, "grad_norm": 0.10996223986148834, "learning_rate": 0.00019741405819485785, "loss": 0.0596, "step": 260 }, { "epoch": 0.5186074429771909, "grad_norm": 0.09064004570245743, "learning_rate": 0.00019688493291135804, "loss": 0.0612, "step": 270 }, { "epoch": 0.5378151260504201, "grad_norm": 0.12441632896661758, "learning_rate": 0.00019630743860901472, "loss": 0.0568, "step": 280 }, { "epoch": 0.5570228091236494, "grad_norm": 0.10121817886829376, "learning_rate": 0.0001956818635971858, "loss": 0.057, "step": 290 }, { "epoch": 0.5762304921968787, "grad_norm": 0.1084541603922844, "learning_rate": 0.00019500852018913424, "loss": 0.0604, "step": 300 }, { "epoch": 0.5762304921968787, "eval_loss": 0.058300431817770004, "eval_runtime": 3655.2893, "eval_samples_per_second": 0.804, "eval_steps_per_second": 0.201, "step": 300 }, { "epoch": 0.595438175270108, "grad_norm": 0.11007491499185562, "learning_rate": 0.00019428774454610843, "loss": 0.0578, "step": 310 }, { "epoch": 0.6146458583433373, "grad_norm": 0.10574518889188766, "learning_rate": 0.0001935198965095162, "loss": 0.0538, "step": 320 }, { "epoch": 0.6338535414165666, "grad_norm": 0.09856812655925751, "learning_rate": 0.00019270535942127697, "loss": 0.0624, "step": 330 }, { "epoch": 0.6530612244897959, "grad_norm": 0.11747714132070541, "learning_rate": 0.0001918445399324416, "loss": 0.0535, "step": 340 }, { "epoch": 0.6722689075630253, "grad_norm": 0.12735703587532043, "learning_rate": 0.00019093786780017476, "loss": 0.0597, "step": 350 }, { "epoch": 0.6914765906362546, "grad_norm": 0.11712982505559921, "learning_rate": 0.00018998579567320253, "loss": 0.0587, "step": 360 }, { "epoch": 0.7106842737094838, "grad_norm": 0.1045333668589592, "learning_rate": 0.00018898879886583081, "loss": 0.0527, "step": 370 }, { "epoch": 0.7298919567827131, "grad_norm": 0.09244433045387268, "learning_rate": 0.0001879473751206489, "loss": 0.055, "step": 380 }, { "epoch": 0.7490996398559424, "grad_norm": 0.08246836066246033, "learning_rate": 0.00018686204436003525, "loss": 0.0538, "step": 390 }, { "epoch": 0.7683073229291717, "grad_norm": 0.09854533523321152, "learning_rate": 0.00018573334842659044, "loss": 0.0563, "step": 400 }, { "epoch": 0.7683073229291717, "eval_loss": 0.05585579574108124, "eval_runtime": 2847.8408, "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.258, "step": 400 }, { "epoch": 0.787515006002401, "grad_norm": 0.10713665187358856, "learning_rate": 0.00018456185081262684, "loss": 0.0525, "step": 410 }, { "epoch": 0.8067226890756303, "grad_norm": 0.09907171875238419, "learning_rate": 0.00018334813637884933, "loss": 0.0537, "step": 420 }, { "epoch": 0.8259303721488596, "grad_norm": 0.09218382090330124, "learning_rate": 0.0001820928110623687, "loss": 0.0547, "step": 430 }, { "epoch": 0.8451380552220888, "grad_norm": 0.11424380540847778, "learning_rate": 0.00018079650157419211, "loss": 0.0578, "step": 440 }, { "epoch": 0.8643457382953181, "grad_norm": 0.10797233134508133, "learning_rate": 0.00017945985508634288, "loss": 0.052, "step": 450 }, { "epoch": 0.8835534213685474, "grad_norm": 0.11603976041078568, "learning_rate": 0.00017808353890876508, "loss": 0.0538, "step": 460 }, { "epoch": 0.9027611044417767, "grad_norm": 0.08808846771717072, "learning_rate": 0.00017666824015617427, "loss": 0.053, "step": 470 }, { "epoch": 0.921968787515006, "grad_norm": 0.16436685621738434, "learning_rate": 0.0001752146654050213, "loss": 0.0498, "step": 480 }, { "epoch": 0.9411764705882353, "grad_norm": 0.0981052964925766, "learning_rate": 0.00017372354034073958, "loss": 0.0555, "step": 490 }, { "epoch": 0.9603841536614646, "grad_norm": 0.08799422532320023, "learning_rate": 0.00017219560939545246, "loss": 0.0515, "step": 500 }, { "epoch": 0.9603841536614646, "eval_loss": 0.05503761023283005, "eval_runtime": 2847.8125, "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.258, "step": 500 }, { "epoch": 0.9795918367346939, "grad_norm": 0.07972495257854462, "learning_rate": 0.00017063163537632168, "loss": 0.0528, "step": 510 }, { "epoch": 0.9987995198079231, "grad_norm": 0.08015741407871246, "learning_rate": 0.00016903239908472184, "loss": 0.0521, "step": 520 }, { "epoch": 1.0172869147659063, "grad_norm": 0.08703973144292831, "learning_rate": 0.0001673986989264319, "loss": 0.0429, "step": 530 }, { "epoch": 1.0364945978391356, "grad_norm": 0.08057113736867905, "learning_rate": 0.00016573135051303729, "loss": 0.0452, "step": 540 }, { "epoch": 1.0557022809123648, "grad_norm": 0.13058768212795258, "learning_rate": 0.00016403118625474264, "loss": 0.0444, "step": 550 }, { "epoch": 1.0749099639855944, "grad_norm": 0.10882497578859329, "learning_rate": 0.00016229905494479752, "loss": 0.0476, "step": 560 }, { "epoch": 1.0941176470588236, "grad_norm": 0.09046532213687897, "learning_rate": 0.0001605358213357435, "loss": 0.0473, "step": 570 }, { "epoch": 1.113325330132053, "grad_norm": 0.08155661821365356, "learning_rate": 0.00015874236570769331, "loss": 0.045, "step": 580 }, { "epoch": 1.1325330132052822, "grad_norm": 0.0884336531162262, "learning_rate": 0.000156919583428858, "loss": 0.0431, "step": 590 }, { "epoch": 1.1517406962785115, "grad_norm": 0.09529253095388412, "learning_rate": 0.00015506838450854194, "loss": 0.0491, "step": 600 }, { "epoch": 1.1517406962785115, "eval_loss": 0.05356701835989952, "eval_runtime": 2846.0761, "eval_samples_per_second": 1.033, "eval_steps_per_second": 0.258, "step": 600 }, { "epoch": 1.1709483793517408, "grad_norm": 0.08717475831508636, "learning_rate": 0.00015318969314282764, "loss": 0.0476, "step": 610 }, { "epoch": 1.19015606242497, "grad_norm": 0.082962766289711, "learning_rate": 0.0001512844472531787, "loss": 0.0457, "step": 620 }, { "epoch": 1.2093637454981994, "grad_norm": 0.1103208139538765, "learning_rate": 0.00014935359801818978, "loss": 0.0435, "step": 630 }, { "epoch": 1.2285714285714286, "grad_norm": 0.08975227177143097, "learning_rate": 0.0001473981093987187, "loss": 0.045, "step": 640 }, { "epoch": 1.247779111644658, "grad_norm": 0.08823322504758835, "learning_rate": 0.00014541895765663644, "loss": 0.0437, "step": 650 }, { "epoch": 1.2669867947178872, "grad_norm": 0.1031484454870224, "learning_rate": 0.00014341713086743672, "loss": 0.0432, "step": 660 }, { "epoch": 1.2861944777911165, "grad_norm": 0.07141754776239395, "learning_rate": 0.00014139362842694679, "loss": 0.0469, "step": 670 }, { "epoch": 1.3054021608643458, "grad_norm": 0.07835739850997925, "learning_rate": 0.00013934946055238763, "loss": 0.0421, "step": 680 }, { "epoch": 1.324609843937575, "grad_norm": 0.09041093289852142, "learning_rate": 0.00013728564777803088, "loss": 0.0428, "step": 690 }, { "epoch": 1.3438175270108044, "grad_norm": 0.07876908034086227, "learning_rate": 0.00013520322044570547, "loss": 0.0449, "step": 700 }, { "epoch": 1.3438175270108044, "eval_loss": 0.052141353487968445, "eval_runtime": 2848.9852, "eval_samples_per_second": 1.032, "eval_steps_per_second": 0.258, "step": 700 }, { "epoch": 1.3630252100840337, "grad_norm": 0.08498098701238632, "learning_rate": 0.0001331032181904078, "loss": 0.0404, "step": 710 }, { "epoch": 1.382232893157263, "grad_norm": 0.08044544607400894, "learning_rate": 0.00013098668942127239, "loss": 0.046, "step": 720 }, { "epoch": 1.4014405762304922, "grad_norm": 0.09615248441696167, "learning_rate": 0.000128854690798162, "loss": 0.044, "step": 730 }, { "epoch": 1.4206482593037215, "grad_norm": 0.13253851234912872, "learning_rate": 0.00012670828670413884, "loss": 0.045, "step": 740 }, { "epoch": 1.4398559423769508, "grad_norm": 0.12733927369117737, "learning_rate": 0.00012454854871407994, "loss": 0.0457, "step": 750 }, { "epoch": 1.45906362545018, "grad_norm": 0.1194474846124649, "learning_rate": 0.00012237655505970203, "loss": 0.0449, "step": 760 }, { "epoch": 1.4782713085234094, "grad_norm": 0.12221734970808029, "learning_rate": 0.00012019339009126306, "loss": 0.0479, "step": 770 }, { "epoch": 1.4974789915966387, "grad_norm": 0.11174425482749939, "learning_rate": 0.00011800014373620922, "loss": 0.045, "step": 780 }, { "epoch": 1.516686674669868, "grad_norm": 0.10973796993494034, "learning_rate": 0.00011579791095503733, "loss": 0.0444, "step": 790 }, { "epoch": 1.5358943577430972, "grad_norm": 0.07216602563858032, "learning_rate": 0.00011358779119464467, "loss": 0.0457, "step": 800 }, { "epoch": 1.5358943577430972, "eval_loss": 0.05131237953901291, "eval_runtime": 2857.2895, "eval_samples_per_second": 1.029, "eval_steps_per_second": 0.257, "step": 800 }, { "epoch": 1.5551020408163265, "grad_norm": 0.09344726800918579, "learning_rate": 0.00011137088783943927, "loss": 0.0444, "step": 810 }, { "epoch": 1.5743097238895558, "grad_norm": 0.09161848574876785, "learning_rate": 0.0001091483076604838, "loss": 0.0445, "step": 820 }, { "epoch": 1.593517406962785, "grad_norm": 0.08971661329269409, "learning_rate": 0.00010692116026294938, "loss": 0.0448, "step": 830 }, { "epoch": 1.6127250900360144, "grad_norm": 0.08268587291240692, "learning_rate": 0.00010469055753215395, "loss": 0.0427, "step": 840 }, { "epoch": 1.6319327731092437, "grad_norm": 0.08552444726228714, "learning_rate": 0.00010245761307846268, "loss": 0.0409, "step": 850 }, { "epoch": 1.651140456182473, "grad_norm": 0.12000453472137451, "learning_rate": 0.00010022344168132698, "loss": 0.0401, "step": 860 }, { "epoch": 1.6703481392557022, "grad_norm": 0.09046022593975067, "learning_rate": 9.798915873273976e-05, "loss": 0.0428, "step": 870 }, { "epoch": 1.6895558223289315, "grad_norm": 0.07934489101171494, "learning_rate": 9.57558796803852e-05, "loss": 0.0452, "step": 880 }, { "epoch": 1.7087635054021608, "grad_norm": 0.09740308672189713, "learning_rate": 9.352471947076027e-05, "loss": 0.0432, "step": 890 }, { "epoch": 1.72797118847539, "grad_norm": 0.06899864971637726, "learning_rate": 9.129679199254694e-05, "loss": 0.0434, "step": 900 }, { "epoch": 1.72797118847539, "eval_loss": 0.05020257458090782, "eval_runtime": 2853.3923, "eval_samples_per_second": 1.03, "eval_steps_per_second": 0.258, "step": 900 }, { "epoch": 1.7471788715486194, "grad_norm": 0.08942172676324844, "learning_rate": 8.907320952051225e-05, "loss": 0.0436, "step": 910 }, { "epoch": 1.7663865546218487, "grad_norm": 0.10566150397062302, "learning_rate": 8.685508216021389e-05, "loss": 0.042, "step": 920 }, { "epoch": 1.785594237695078, "grad_norm": 0.09011435508728027, "learning_rate": 8.464351729378927e-05, "loss": 0.0406, "step": 930 }, { "epoch": 1.8048019207683073, "grad_norm": 0.11913520842790604, "learning_rate": 8.24396190271038e-05, "loss": 0.0411, "step": 940 }, { "epoch": 1.8240096038415365, "grad_norm": 0.1245686337351799, "learning_rate": 8.0244487638535e-05, "loss": 0.0362, "step": 950 }, { "epoch": 1.8432172869147658, "grad_norm": 0.09471370279788971, "learning_rate": 7.805921902966748e-05, "loss": 0.0408, "step": 960 }, { "epoch": 1.8624249699879951, "grad_norm": 0.08700928837060928, "learning_rate": 7.588490417817278e-05, "loss": 0.0454, "step": 970 }, { "epoch": 1.8816326530612244, "grad_norm": 0.08810342848300934, "learning_rate": 7.372262859314773e-05, "loss": 0.0423, "step": 980 }, { "epoch": 1.9008403361344537, "grad_norm": 0.10621917247772217, "learning_rate": 7.15734717731829e-05, "loss": 0.0425, "step": 990 }, { "epoch": 1.920048019207683, "grad_norm": 0.08340129256248474, "learning_rate": 6.943850666743143e-05, "loss": 0.0443, "step": 1000 }, { "epoch": 1.920048019207683, "eval_loss": 0.049211401492357254, "eval_runtime": 2851.0358, "eval_samples_per_second": 1.031, "eval_steps_per_second": 0.258, "step": 1000 } ], "logging_steps": 10, "max_steps": 1563, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.018239420275098e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }