{ "best_global_step": 3000, "best_metric": 1.1457551717758179, "best_model_checkpoint": "/workspace/woodcode_2/checkpoint-3000", "epoch": 0.6942837306179125, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023142791020597086, "grad_norm": 1.0788178443908691, "learning_rate": 6.923076923076923e-06, "loss": 2.405, "step": 10 }, { "epoch": 0.004628558204119417, "grad_norm": 0.4521988034248352, "learning_rate": 1.4615384615384617e-05, "loss": 2.2017, "step": 20 }, { "epoch": 0.006942837306179125, "grad_norm": 0.41120657324790955, "learning_rate": 2.230769230769231e-05, "loss": 1.9363, "step": 30 }, { "epoch": 0.009257116408238834, "grad_norm": 0.28640317916870117, "learning_rate": 3e-05, "loss": 1.748, "step": 40 }, { "epoch": 0.011571395510298541, "grad_norm": 0.26409590244293213, "learning_rate": 3.769230769230769e-05, "loss": 1.6238, "step": 50 }, { "epoch": 0.01388567461235825, "grad_norm": 0.26786214113235474, "learning_rate": 4.538461538461539e-05, "loss": 1.5538, "step": 60 }, { "epoch": 0.01619995371441796, "grad_norm": 0.2931516766548157, "learning_rate": 5.3076923076923076e-05, "loss": 1.4984, "step": 70 }, { "epoch": 0.01851423281647767, "grad_norm": 0.29799872636795044, "learning_rate": 6.0769230769230765e-05, "loss": 1.4658, "step": 80 }, { "epoch": 0.020828511918537376, "grad_norm": 0.29269853234291077, "learning_rate": 6.846153846153847e-05, "loss": 1.4265, "step": 90 }, { "epoch": 0.023142791020597082, "grad_norm": 0.3255228102207184, "learning_rate": 7.615384615384616e-05, "loss": 1.4194, "step": 100 }, { "epoch": 0.025457070122656793, "grad_norm": 0.3054940104484558, "learning_rate": 8.384615384615386e-05, "loss": 1.3852, "step": 110 }, { "epoch": 0.0277713492247165, "grad_norm": 0.2838137149810791, "learning_rate": 9.153846153846155e-05, "loss": 1.3885, "step": 120 }, { "epoch": 0.03008562832677621, "grad_norm": 0.2860707938671112, "learning_rate": 9.923076923076923e-05, "loss": 1.3693, "step": 130 }, { "epoch": 0.03239990742883592, "grad_norm": 0.2725190818309784, "learning_rate": 9.999886214268966e-05, "loss": 1.3606, "step": 140 }, { "epoch": 0.03471418653089563, "grad_norm": 0.2650243937969208, "learning_rate": 9.999492887526629e-05, "loss": 1.3414, "step": 150 }, { "epoch": 0.03702846563295534, "grad_norm": 0.2538904845714569, "learning_rate": 9.998818637106816e-05, "loss": 1.3495, "step": 160 }, { "epoch": 0.03934274473501504, "grad_norm": 0.25812944769859314, "learning_rate": 9.99786350089595e-05, "loss": 1.3419, "step": 170 }, { "epoch": 0.04165702383707475, "grad_norm": 0.23412390053272247, "learning_rate": 9.996627532563551e-05, "loss": 1.3314, "step": 180 }, { "epoch": 0.04397130293913446, "grad_norm": 0.256795197725296, "learning_rate": 9.995110801559215e-05, "loss": 1.3326, "step": 190 }, { "epoch": 0.046285582041194165, "grad_norm": 0.24870288372039795, "learning_rate": 9.993313393108719e-05, "loss": 1.328, "step": 200 }, { "epoch": 0.048599861143253875, "grad_norm": 0.2265051007270813, "learning_rate": 9.991235408209221e-05, "loss": 1.3271, "step": 210 }, { "epoch": 0.050914140245313586, "grad_norm": 0.23642291128635406, "learning_rate": 9.988876963623597e-05, "loss": 1.3268, "step": 220 }, { "epoch": 0.053228419347373296, "grad_norm": 0.2279822677373886, "learning_rate": 9.986238191873874e-05, "loss": 1.3058, "step": 230 }, { "epoch": 0.055542698449433, "grad_norm": 0.23465260863304138, "learning_rate": 9.983319241233782e-05, "loss": 1.3057, "step": 240 }, { "epoch": 0.05785697755149271, "grad_norm": 0.22599244117736816, "learning_rate": 9.980120275720424e-05, "loss": 1.313, "step": 250 }, { "epoch": 0.06017125665355242, "grad_norm": 0.23743176460266113, "learning_rate": 9.976641475085067e-05, "loss": 1.3004, "step": 260 }, { "epoch": 0.06248553575561213, "grad_norm": 0.22628776729106903, "learning_rate": 9.972883034803025e-05, "loss": 1.3059, "step": 270 }, { "epoch": 0.06479981485767183, "grad_norm": 0.24011753499507904, "learning_rate": 9.968845166062692e-05, "loss": 1.2905, "step": 280 }, { "epoch": 0.06711409395973154, "grad_norm": 0.231357604265213, "learning_rate": 9.96452809575367e-05, "loss": 1.2971, "step": 290 }, { "epoch": 0.06942837306179125, "grad_norm": 0.23093485832214355, "learning_rate": 9.959932066454008e-05, "loss": 1.2977, "step": 300 }, { "epoch": 0.07174265216385096, "grad_norm": 0.22505411505699158, "learning_rate": 9.955057336416597e-05, "loss": 1.2746, "step": 310 }, { "epoch": 0.07405693126591067, "grad_norm": 0.2227935642004013, "learning_rate": 9.949904179554632e-05, "loss": 1.273, "step": 320 }, { "epoch": 0.07637121036797037, "grad_norm": 0.23704655468463898, "learning_rate": 9.944472885426235e-05, "loss": 1.2909, "step": 330 }, { "epoch": 0.07868548947003008, "grad_norm": 0.2225590944290161, "learning_rate": 9.938763759218185e-05, "loss": 1.2846, "step": 340 }, { "epoch": 0.08099976857208979, "grad_norm": 0.24287723004817963, "learning_rate": 9.932777121728763e-05, "loss": 1.2989, "step": 350 }, { "epoch": 0.0833140476741495, "grad_norm": 0.22559230029582977, "learning_rate": 9.926513309349732e-05, "loss": 1.2803, "step": 360 }, { "epoch": 0.08562832677620921, "grad_norm": 0.22119103372097015, "learning_rate": 9.919972674047429e-05, "loss": 1.269, "step": 370 }, { "epoch": 0.08794260587826892, "grad_norm": 0.2336379736661911, "learning_rate": 9.913155583342994e-05, "loss": 1.2775, "step": 380 }, { "epoch": 0.09025688498032863, "grad_norm": 0.2086753100156784, "learning_rate": 9.906062420291715e-05, "loss": 1.2868, "step": 390 }, { "epoch": 0.09257116408238833, "grad_norm": 0.24568480253219604, "learning_rate": 9.898693583461507e-05, "loss": 1.2746, "step": 400 }, { "epoch": 0.09488544318444804, "grad_norm": 0.22321555018424988, "learning_rate": 9.891049486910511e-05, "loss": 1.2682, "step": 410 }, { "epoch": 0.09719972228650775, "grad_norm": 0.22601205110549927, "learning_rate": 9.883130560163837e-05, "loss": 1.27, "step": 420 }, { "epoch": 0.09951400138856746, "grad_norm": 0.20481973886489868, "learning_rate": 9.874937248189415e-05, "loss": 1.275, "step": 430 }, { "epoch": 0.10182828049062717, "grad_norm": 0.2164992243051529, "learning_rate": 9.866470011373008e-05, "loss": 1.2661, "step": 440 }, { "epoch": 0.10414255959268688, "grad_norm": 0.20576460659503937, "learning_rate": 9.857729325492329e-05, "loss": 1.2626, "step": 450 }, { "epoch": 0.10645683869474659, "grad_norm": 0.22202594578266144, "learning_rate": 9.848715681690317e-05, "loss": 1.2488, "step": 460 }, { "epoch": 0.10877111779680629, "grad_norm": 0.20930485427379608, "learning_rate": 9.839429586447533e-05, "loss": 1.2623, "step": 470 }, { "epoch": 0.111085396898866, "grad_norm": 0.23361071944236755, "learning_rate": 9.829871561553702e-05, "loss": 1.2546, "step": 480 }, { "epoch": 0.11339967600092571, "grad_norm": 0.211343452334404, "learning_rate": 9.820042144078397e-05, "loss": 1.2538, "step": 490 }, { "epoch": 0.11571395510298542, "grad_norm": 0.20587043464183807, "learning_rate": 9.809941886340854e-05, "loss": 1.2719, "step": 500 }, { "epoch": 0.11571395510298542, "eval_loss": 1.2470530271530151, "eval_runtime": 23.9969, "eval_samples_per_second": 16.002, "eval_steps_per_second": 0.5, "step": 500 }, { "epoch": 0.11802823420504513, "grad_norm": 0.20054572820663452, "learning_rate": 9.799571355878947e-05, "loss": 1.2563, "step": 510 }, { "epoch": 0.12034251330710484, "grad_norm": 0.21044516563415527, "learning_rate": 9.788931135417287e-05, "loss": 1.2517, "step": 520 }, { "epoch": 0.12265679240916455, "grad_norm": 0.21391618251800537, "learning_rate": 9.778021822834485e-05, "loss": 1.2491, "step": 530 }, { "epoch": 0.12497107151122426, "grad_norm": 0.2132970243692398, "learning_rate": 9.766844031129552e-05, "loss": 1.2472, "step": 540 }, { "epoch": 0.12728535061328397, "grad_norm": 0.2169645130634308, "learning_rate": 9.755398388387462e-05, "loss": 1.2596, "step": 550 }, { "epoch": 0.12959962971534367, "grad_norm": 0.20134727656841278, "learning_rate": 9.743685537743856e-05, "loss": 1.257, "step": 560 }, { "epoch": 0.1319139088174034, "grad_norm": 0.21121706068515778, "learning_rate": 9.731706137348898e-05, "loss": 1.2616, "step": 570 }, { "epoch": 0.1342281879194631, "grad_norm": 0.21253220736980438, "learning_rate": 9.7194608603303e-05, "loss": 1.2355, "step": 580 }, { "epoch": 0.13654246702152278, "grad_norm": 0.22279760241508484, "learning_rate": 9.706950394755501e-05, "loss": 1.256, "step": 590 }, { "epoch": 0.1388567461235825, "grad_norm": 0.191938579082489, "learning_rate": 9.694175443592993e-05, "loss": 1.2408, "step": 600 }, { "epoch": 0.1411710252256422, "grad_norm": 0.2211560308933258, "learning_rate": 9.681136724672835e-05, "loss": 1.2563, "step": 610 }, { "epoch": 0.14348530432770193, "grad_norm": 0.2078508883714676, "learning_rate": 9.667834970646307e-05, "loss": 1.2323, "step": 620 }, { "epoch": 0.14579958342976163, "grad_norm": 0.21866025030612946, "learning_rate": 9.65427092894475e-05, "loss": 1.261, "step": 630 }, { "epoch": 0.14811386253182135, "grad_norm": 0.20903170108795166, "learning_rate": 9.640445361737556e-05, "loss": 1.2476, "step": 640 }, { "epoch": 0.15042814163388105, "grad_norm": 0.20698365569114685, "learning_rate": 9.626359045889355e-05, "loss": 1.2354, "step": 650 }, { "epoch": 0.15274242073594074, "grad_norm": 0.21057769656181335, "learning_rate": 9.612012772916353e-05, "loss": 1.2527, "step": 660 }, { "epoch": 0.15505669983800047, "grad_norm": 0.2073555439710617, "learning_rate": 9.597407348941865e-05, "loss": 1.2338, "step": 670 }, { "epoch": 0.15737097894006016, "grad_norm": 0.20362691581249237, "learning_rate": 9.582543594651005e-05, "loss": 1.2548, "step": 680 }, { "epoch": 0.1596852580421199, "grad_norm": 0.18878686428070068, "learning_rate": 9.56742234524459e-05, "loss": 1.2399, "step": 690 }, { "epoch": 0.16199953714417958, "grad_norm": 0.21003399789333344, "learning_rate": 9.552044450392189e-05, "loss": 1.2366, "step": 700 }, { "epoch": 0.1643138162462393, "grad_norm": 0.21605387330055237, "learning_rate": 9.536410774184396e-05, "loss": 1.2419, "step": 710 }, { "epoch": 0.166628095348299, "grad_norm": 0.21591876447200775, "learning_rate": 9.520522195084274e-05, "loss": 1.2412, "step": 720 }, { "epoch": 0.1689423744503587, "grad_norm": 0.2058115005493164, "learning_rate": 9.504379605877979e-05, "loss": 1.233, "step": 730 }, { "epoch": 0.17125665355241843, "grad_norm": 0.224104642868042, "learning_rate": 9.487983913624615e-05, "loss": 1.2272, "step": 740 }, { "epoch": 0.17357093265447812, "grad_norm": 0.20306575298309326, "learning_rate": 9.471336039605255e-05, "loss": 1.2278, "step": 750 }, { "epoch": 0.17588521175653785, "grad_norm": 0.1998828798532486, "learning_rate": 9.454436919271169e-05, "loss": 1.2344, "step": 760 }, { "epoch": 0.17819949085859754, "grad_norm": 0.1913456916809082, "learning_rate": 9.437287502191274e-05, "loss": 1.2376, "step": 770 }, { "epoch": 0.18051376996065727, "grad_norm": 0.20067718625068665, "learning_rate": 9.419888751998767e-05, "loss": 1.2586, "step": 780 }, { "epoch": 0.18282804906271696, "grad_norm": 0.1913948804140091, "learning_rate": 9.402241646336977e-05, "loss": 1.2414, "step": 790 }, { "epoch": 0.18514232816477666, "grad_norm": 0.20469442009925842, "learning_rate": 9.38434717680444e-05, "loss": 1.2395, "step": 800 }, { "epoch": 0.18745660726683638, "grad_norm": 0.20488658547401428, "learning_rate": 9.366206348899177e-05, "loss": 1.2259, "step": 810 }, { "epoch": 0.18977088636889608, "grad_norm": 0.21545757353305817, "learning_rate": 9.347820181962185e-05, "loss": 1.2267, "step": 820 }, { "epoch": 0.1920851654709558, "grad_norm": 0.20461086928844452, "learning_rate": 9.329189709120174e-05, "loss": 1.2482, "step": 830 }, { "epoch": 0.1943994445730155, "grad_norm": 0.21476367115974426, "learning_rate": 9.310315977227509e-05, "loss": 1.2321, "step": 840 }, { "epoch": 0.19671372367507522, "grad_norm": 0.20833474397659302, "learning_rate": 9.291200046807382e-05, "loss": 1.22, "step": 850 }, { "epoch": 0.19902800277713492, "grad_norm": 0.19986377656459808, "learning_rate": 9.27184299199223e-05, "loss": 1.2423, "step": 860 }, { "epoch": 0.20134228187919462, "grad_norm": 0.22088144719600677, "learning_rate": 9.252245900463373e-05, "loss": 1.232, "step": 870 }, { "epoch": 0.20365656098125434, "grad_norm": 0.20136182010173798, "learning_rate": 9.2324098733899e-05, "loss": 1.2229, "step": 880 }, { "epoch": 0.20597084008331404, "grad_norm": 0.1938410848379135, "learning_rate": 9.212336025366788e-05, "loss": 1.2227, "step": 890 }, { "epoch": 0.20828511918537376, "grad_norm": 0.202079638838768, "learning_rate": 9.19202548435228e-05, "loss": 1.2197, "step": 900 }, { "epoch": 0.21059939828743346, "grad_norm": 0.20484083890914917, "learning_rate": 9.1714793916045e-05, "loss": 1.2089, "step": 910 }, { "epoch": 0.21291367738949318, "grad_norm": 0.21105819940567017, "learning_rate": 9.150698901617327e-05, "loss": 1.2315, "step": 920 }, { "epoch": 0.21522795649155288, "grad_norm": 0.19875198602676392, "learning_rate": 9.129685182055519e-05, "loss": 1.2233, "step": 930 }, { "epoch": 0.21754223559361258, "grad_norm": 0.201791912317276, "learning_rate": 9.10843941368911e-05, "loss": 1.2324, "step": 940 }, { "epoch": 0.2198565146956723, "grad_norm": 0.20584046840667725, "learning_rate": 9.086962790327056e-05, "loss": 1.2167, "step": 950 }, { "epoch": 0.222170793797732, "grad_norm": 0.19981129467487335, "learning_rate": 9.065256518750154e-05, "loss": 1.2178, "step": 960 }, { "epoch": 0.22448507289979172, "grad_norm": 0.19994951784610748, "learning_rate": 9.043321818643233e-05, "loss": 1.2158, "step": 970 }, { "epoch": 0.22679935200185142, "grad_norm": 0.2023075968027115, "learning_rate": 9.021159922526623e-05, "loss": 1.2353, "step": 980 }, { "epoch": 0.22911363110391114, "grad_norm": 0.1981421411037445, "learning_rate": 8.998772075686896e-05, "loss": 1.2396, "step": 990 }, { "epoch": 0.23142791020597084, "grad_norm": 0.2052128165960312, "learning_rate": 8.976159536106894e-05, "loss": 1.2137, "step": 1000 }, { "epoch": 0.23142791020597084, "eval_loss": 1.208183765411377, "eval_runtime": 21.6303, "eval_samples_per_second": 17.753, "eval_steps_per_second": 0.555, "step": 1000 }, { "epoch": 0.23374218930803053, "grad_norm": 0.20763066411018372, "learning_rate": 8.953323574395037e-05, "loss": 1.2247, "step": 1010 }, { "epoch": 0.23605646841009026, "grad_norm": 0.19439862668514252, "learning_rate": 8.930265473713938e-05, "loss": 1.2239, "step": 1020 }, { "epoch": 0.23837074751214996, "grad_norm": 0.188704714179039, "learning_rate": 8.90698652970829e-05, "loss": 1.2331, "step": 1030 }, { "epoch": 0.24068502661420968, "grad_norm": 0.2066233903169632, "learning_rate": 8.883488050432074e-05, "loss": 1.2178, "step": 1040 }, { "epoch": 0.24299930571626938, "grad_norm": 0.20683979988098145, "learning_rate": 8.859771356275046e-05, "loss": 1.2222, "step": 1050 }, { "epoch": 0.2453135848183291, "grad_norm": 0.21290378272533417, "learning_rate": 8.835837779888557e-05, "loss": 1.2162, "step": 1060 }, { "epoch": 0.2476278639203888, "grad_norm": 0.19746707379817963, "learning_rate": 8.811688666110662e-05, "loss": 1.2239, "step": 1070 }, { "epoch": 0.24994214302244852, "grad_norm": 0.19365474581718445, "learning_rate": 8.787325371890558e-05, "loss": 1.2187, "step": 1080 }, { "epoch": 0.2522564221245082, "grad_norm": 0.20299233496189117, "learning_rate": 8.76274926621233e-05, "loss": 1.2075, "step": 1090 }, { "epoch": 0.25457070122656794, "grad_norm": 0.20049187541007996, "learning_rate": 8.737961730018034e-05, "loss": 1.2114, "step": 1100 }, { "epoch": 0.2568849803286276, "grad_norm": 0.19873455166816711, "learning_rate": 8.712964156130099e-05, "loss": 1.2247, "step": 1110 }, { "epoch": 0.25919925943068733, "grad_norm": 0.1992412507534027, "learning_rate": 8.687757949173063e-05, "loss": 1.2164, "step": 1120 }, { "epoch": 0.26151353853274706, "grad_norm": 0.2137262374162674, "learning_rate": 8.662344525494644e-05, "loss": 1.2083, "step": 1130 }, { "epoch": 0.2638278176348068, "grad_norm": 0.20104128122329712, "learning_rate": 8.636725313086162e-05, "loss": 1.2125, "step": 1140 }, { "epoch": 0.26614209673686645, "grad_norm": 0.2062898725271225, "learning_rate": 8.610901751502292e-05, "loss": 1.235, "step": 1150 }, { "epoch": 0.2684563758389262, "grad_norm": 0.20116354525089264, "learning_rate": 8.584875291780178e-05, "loss": 1.217, "step": 1160 }, { "epoch": 0.2707706549409859, "grad_norm": 0.20894746482372284, "learning_rate": 8.558647396357901e-05, "loss": 1.2173, "step": 1170 }, { "epoch": 0.27308493404304557, "grad_norm": 0.19359129667282104, "learning_rate": 8.532219538992301e-05, "loss": 1.2082, "step": 1180 }, { "epoch": 0.2753992131451053, "grad_norm": 0.1946392059326172, "learning_rate": 8.505593204676162e-05, "loss": 1.2161, "step": 1190 }, { "epoch": 0.277713492247165, "grad_norm": 0.2131495177745819, "learning_rate": 8.478769889554781e-05, "loss": 1.2046, "step": 1200 }, { "epoch": 0.28002777134922474, "grad_norm": 0.21192453801631927, "learning_rate": 8.451751100841887e-05, "loss": 1.2174, "step": 1210 }, { "epoch": 0.2823420504512844, "grad_norm": 0.1986854523420334, "learning_rate": 8.424538356734957e-05, "loss": 1.2124, "step": 1220 }, { "epoch": 0.28465632955334413, "grad_norm": 0.19923637807369232, "learning_rate": 8.397133186329903e-05, "loss": 1.2168, "step": 1230 }, { "epoch": 0.28697060865540386, "grad_norm": 0.19468043744564056, "learning_rate": 8.36953712953516e-05, "loss": 1.2067, "step": 1240 }, { "epoch": 0.2892848877574635, "grad_norm": 0.19150374829769135, "learning_rate": 8.34175173698515e-05, "loss": 1.2118, "step": 1250 }, { "epoch": 0.29159916685952325, "grad_norm": 0.19914792478084564, "learning_rate": 8.31377856995315e-05, "loss": 1.2018, "step": 1260 }, { "epoch": 0.293913445961583, "grad_norm": 0.19311580061912537, "learning_rate": 8.285619200263567e-05, "loss": 1.2001, "step": 1270 }, { "epoch": 0.2962277250636427, "grad_norm": 0.20415401458740234, "learning_rate": 8.257275210203622e-05, "loss": 1.2156, "step": 1280 }, { "epoch": 0.29854200416570237, "grad_norm": 0.1939728707075119, "learning_rate": 8.228748192434428e-05, "loss": 1.2035, "step": 1290 }, { "epoch": 0.3008562832677621, "grad_norm": 0.1993534117937088, "learning_rate": 8.200039749901511e-05, "loss": 1.1971, "step": 1300 }, { "epoch": 0.3031705623698218, "grad_norm": 0.19424191117286682, "learning_rate": 8.171151495744727e-05, "loss": 1.1923, "step": 1310 }, { "epoch": 0.3054848414718815, "grad_norm": 0.19882912933826447, "learning_rate": 8.142085053207629e-05, "loss": 1.1998, "step": 1320 }, { "epoch": 0.3077991205739412, "grad_norm": 0.1941244751214981, "learning_rate": 8.112842055546252e-05, "loss": 1.2152, "step": 1330 }, { "epoch": 0.31011339967600093, "grad_norm": 0.20408713817596436, "learning_rate": 8.083424145937339e-05, "loss": 1.2202, "step": 1340 }, { "epoch": 0.31242767877806066, "grad_norm": 0.19065722823143005, "learning_rate": 8.053832977386015e-05, "loss": 1.2123, "step": 1350 }, { "epoch": 0.3147419578801203, "grad_norm": 0.20365293323993683, "learning_rate": 8.024070212632892e-05, "loss": 1.1972, "step": 1360 }, { "epoch": 0.31705623698218005, "grad_norm": 0.20200444757938385, "learning_rate": 7.994137524060656e-05, "loss": 1.202, "step": 1370 }, { "epoch": 0.3193705160842398, "grad_norm": 0.19926463067531586, "learning_rate": 7.964036593600084e-05, "loss": 1.1989, "step": 1380 }, { "epoch": 0.32168479518629944, "grad_norm": 0.19380785524845123, "learning_rate": 7.933769112635534e-05, "loss": 1.203, "step": 1390 }, { "epoch": 0.32399907428835917, "grad_norm": 0.19268542528152466, "learning_rate": 7.903336781909911e-05, "loss": 1.2019, "step": 1400 }, { "epoch": 0.3263133533904189, "grad_norm": 0.20773714780807495, "learning_rate": 7.872741311429103e-05, "loss": 1.1995, "step": 1410 }, { "epoch": 0.3286276324924786, "grad_norm": 0.19505122303962708, "learning_rate": 7.841984420365888e-05, "loss": 1.2028, "step": 1420 }, { "epoch": 0.3309419115945383, "grad_norm": 0.19330574572086334, "learning_rate": 7.811067836963337e-05, "loss": 1.2002, "step": 1430 }, { "epoch": 0.333256190696598, "grad_norm": 0.21044421195983887, "learning_rate": 7.779993298437704e-05, "loss": 1.1985, "step": 1440 }, { "epoch": 0.33557046979865773, "grad_norm": 0.20081642270088196, "learning_rate": 7.74876255088081e-05, "loss": 1.2131, "step": 1450 }, { "epoch": 0.3378847489007174, "grad_norm": 0.1973022222518921, "learning_rate": 7.71737734916193e-05, "loss": 1.1997, "step": 1460 }, { "epoch": 0.3401990280027771, "grad_norm": 0.19213716685771942, "learning_rate": 7.685839456829183e-05, "loss": 1.201, "step": 1470 }, { "epoch": 0.34251330710483685, "grad_norm": 0.19389280676841736, "learning_rate": 7.65415064601044e-05, "loss": 1.2078, "step": 1480 }, { "epoch": 0.3448275862068966, "grad_norm": 0.20220617949962616, "learning_rate": 7.622312697313754e-05, "loss": 1.2013, "step": 1490 }, { "epoch": 0.34714186530895624, "grad_norm": 0.2051166296005249, "learning_rate": 7.59032739972729e-05, "loss": 1.2183, "step": 1500 }, { "epoch": 0.34714186530895624, "eval_loss": 1.1873364448547363, "eval_runtime": 21.6444, "eval_samples_per_second": 17.741, "eval_steps_per_second": 0.554, "step": 1500 }, { "epoch": 0.34945614441101597, "grad_norm": 0.19153615832328796, "learning_rate": 7.558196550518818e-05, "loss": 1.1948, "step": 1510 }, { "epoch": 0.3517704235130757, "grad_norm": 0.1992039531469345, "learning_rate": 7.525921955134713e-05, "loss": 1.1868, "step": 1520 }, { "epoch": 0.35408470261513536, "grad_norm": 0.20605571568012238, "learning_rate": 7.493505427098517e-05, "loss": 1.199, "step": 1530 }, { "epoch": 0.3563989817171951, "grad_norm": 0.17926311492919922, "learning_rate": 7.460948787909017e-05, "loss": 1.194, "step": 1540 }, { "epoch": 0.3587132608192548, "grad_norm": 0.20658712089061737, "learning_rate": 7.428253866937918e-05, "loss": 1.2012, "step": 1550 }, { "epoch": 0.36102753992131453, "grad_norm": 0.21082770824432373, "learning_rate": 7.395422501327036e-05, "loss": 1.2004, "step": 1560 }, { "epoch": 0.3633418190233742, "grad_norm": 0.20247185230255127, "learning_rate": 7.362456535885066e-05, "loss": 1.1878, "step": 1570 }, { "epoch": 0.3656560981254339, "grad_norm": 0.20155729353427887, "learning_rate": 7.329357822983929e-05, "loss": 1.1796, "step": 1580 }, { "epoch": 0.36797037722749365, "grad_norm": 0.1960991472005844, "learning_rate": 7.296128222454686e-05, "loss": 1.2043, "step": 1590 }, { "epoch": 0.3702846563295533, "grad_norm": 0.19188149273395538, "learning_rate": 7.262769601483024e-05, "loss": 1.2037, "step": 1600 }, { "epoch": 0.37259893543161304, "grad_norm": 0.2052951157093048, "learning_rate": 7.229283834504351e-05, "loss": 1.1985, "step": 1610 }, { "epoch": 0.37491321453367277, "grad_norm": 0.18684880435466766, "learning_rate": 7.195672803098463e-05, "loss": 1.2023, "step": 1620 }, { "epoch": 0.3772274936357325, "grad_norm": 0.20104870200157166, "learning_rate": 7.161938395883815e-05, "loss": 1.1892, "step": 1630 }, { "epoch": 0.37954177273779216, "grad_norm": 0.19793595373630524, "learning_rate": 7.128082508411406e-05, "loss": 1.1992, "step": 1640 }, { "epoch": 0.3818560518398519, "grad_norm": 0.20280171930789948, "learning_rate": 7.094107043058264e-05, "loss": 1.2076, "step": 1650 }, { "epoch": 0.3841703309419116, "grad_norm": 0.20379236340522766, "learning_rate": 7.060013908920548e-05, "loss": 1.1987, "step": 1660 }, { "epoch": 0.3864846100439713, "grad_norm": 0.19275911152362823, "learning_rate": 7.025805021706276e-05, "loss": 1.1983, "step": 1670 }, { "epoch": 0.388798889146031, "grad_norm": 0.20220735669136047, "learning_rate": 6.991482303627685e-05, "loss": 1.1992, "step": 1680 }, { "epoch": 0.3911131682480907, "grad_norm": 0.2047668844461441, "learning_rate": 6.957047683293215e-05, "loss": 1.2086, "step": 1690 }, { "epoch": 0.39342744735015045, "grad_norm": 0.19045311212539673, "learning_rate": 6.922503095599142e-05, "loss": 1.1926, "step": 1700 }, { "epoch": 0.3957417264522101, "grad_norm": 0.2014586180448532, "learning_rate": 6.887850481620858e-05, "loss": 1.1973, "step": 1710 }, { "epoch": 0.39805600555426984, "grad_norm": 0.18599851429462433, "learning_rate": 6.853091788503802e-05, "loss": 1.1956, "step": 1720 }, { "epoch": 0.40037028465632957, "grad_norm": 0.2029285877943039, "learning_rate": 6.818228969354037e-05, "loss": 1.2114, "step": 1730 }, { "epoch": 0.40268456375838924, "grad_norm": 0.19286784529685974, "learning_rate": 6.783263983128519e-05, "loss": 1.1761, "step": 1740 }, { "epoch": 0.40499884286044896, "grad_norm": 0.19630247354507446, "learning_rate": 6.748198794525016e-05, "loss": 1.188, "step": 1750 }, { "epoch": 0.4073131219625087, "grad_norm": 0.19817174971103668, "learning_rate": 6.71303537387171e-05, "loss": 1.1885, "step": 1760 }, { "epoch": 0.4096274010645684, "grad_norm": 0.19006091356277466, "learning_rate": 6.677775697016484e-05, "loss": 1.1915, "step": 1770 }, { "epoch": 0.4119416801666281, "grad_norm": 0.1849374771118164, "learning_rate": 6.642421745215901e-05, "loss": 1.1853, "step": 1780 }, { "epoch": 0.4142559592686878, "grad_norm": 0.2032414823770523, "learning_rate": 6.606975505023873e-05, "loss": 1.197, "step": 1790 }, { "epoch": 0.4165702383707475, "grad_norm": 0.1908976286649704, "learning_rate": 6.571438968180035e-05, "loss": 1.1937, "step": 1800 }, { "epoch": 0.4188845174728072, "grad_norm": 0.19852004945278168, "learning_rate": 6.535814131497833e-05, "loss": 1.1837, "step": 1810 }, { "epoch": 0.4211987965748669, "grad_norm": 0.19003674387931824, "learning_rate": 6.50010299675232e-05, "loss": 1.1959, "step": 1820 }, { "epoch": 0.42351307567692664, "grad_norm": 0.2054755687713623, "learning_rate": 6.46430757056767e-05, "loss": 1.1943, "step": 1830 }, { "epoch": 0.42582735477898637, "grad_norm": 0.19895458221435547, "learning_rate": 6.428429864304432e-05, "loss": 1.1871, "step": 1840 }, { "epoch": 0.42814163388104604, "grad_norm": 0.19693517684936523, "learning_rate": 6.39247189394651e-05, "loss": 1.185, "step": 1850 }, { "epoch": 0.43045591298310576, "grad_norm": 0.19280746579170227, "learning_rate": 6.356435679987882e-05, "loss": 1.1817, "step": 1860 }, { "epoch": 0.4327701920851655, "grad_norm": 0.19104084372520447, "learning_rate": 6.320323247319064e-05, "loss": 1.186, "step": 1870 }, { "epoch": 0.43508447118722515, "grad_norm": 0.19598130881786346, "learning_rate": 6.28413662511334e-05, "loss": 1.1946, "step": 1880 }, { "epoch": 0.4373987502892849, "grad_norm": 0.2072417438030243, "learning_rate": 6.247877846712734e-05, "loss": 1.1921, "step": 1890 }, { "epoch": 0.4397130293913446, "grad_norm": 0.19743064045906067, "learning_rate": 6.211548949513756e-05, "loss": 1.1825, "step": 1900 }, { "epoch": 0.4420273084934043, "grad_norm": 0.19049686193466187, "learning_rate": 6.175151974852923e-05, "loss": 1.1893, "step": 1910 }, { "epoch": 0.444341587595464, "grad_norm": 0.18704815208911896, "learning_rate": 6.138688967892055e-05, "loss": 1.1851, "step": 1920 }, { "epoch": 0.4466558666975237, "grad_norm": 0.2007189691066742, "learning_rate": 6.102161977503358e-05, "loss": 1.1791, "step": 1930 }, { "epoch": 0.44897014579958344, "grad_norm": 0.19694744050502777, "learning_rate": 6.065573056154289e-05, "loss": 1.1797, "step": 1940 }, { "epoch": 0.4512844249016431, "grad_norm": 0.1945074051618576, "learning_rate": 6.028924259792235e-05, "loss": 1.1842, "step": 1950 }, { "epoch": 0.45359870400370284, "grad_norm": 0.19543199241161346, "learning_rate": 5.9922176477289874e-05, "loss": 1.1897, "step": 1960 }, { "epoch": 0.45591298310576256, "grad_norm": 0.20255213975906372, "learning_rate": 5.9554552825250264e-05, "loss": 1.1912, "step": 1970 }, { "epoch": 0.4582272622078223, "grad_norm": 0.19501863420009613, "learning_rate": 5.918639229873624e-05, "loss": 1.1821, "step": 1980 }, { "epoch": 0.46054154130988195, "grad_norm": 0.19646863639354706, "learning_rate": 5.881771558484774e-05, "loss": 1.1756, "step": 1990 }, { "epoch": 0.4628558204119417, "grad_norm": 0.2014242708683014, "learning_rate": 5.844854339968952e-05, "loss": 1.1853, "step": 2000 }, { "epoch": 0.4628558204119417, "eval_loss": 1.1698839664459229, "eval_runtime": 21.5892, "eval_samples_per_second": 17.787, "eval_steps_per_second": 0.556, "step": 2000 }, { "epoch": 0.4651700995140014, "grad_norm": 0.19205763936042786, "learning_rate": 5.8078896487207015e-05, "loss": 1.1883, "step": 2010 }, { "epoch": 0.46748437861606107, "grad_norm": 0.19423869252204895, "learning_rate": 5.770879561802087e-05, "loss": 1.1777, "step": 2020 }, { "epoch": 0.4697986577181208, "grad_norm": 0.19925445318222046, "learning_rate": 5.7338261588259726e-05, "loss": 1.1843, "step": 2030 }, { "epoch": 0.4721129368201805, "grad_norm": 0.18574309349060059, "learning_rate": 5.696731521839167e-05, "loss": 1.1763, "step": 2040 }, { "epoch": 0.47442721592224024, "grad_norm": 0.19058012962341309, "learning_rate": 5.6595977352054407e-05, "loss": 1.1797, "step": 2050 }, { "epoch": 0.4767414950242999, "grad_norm": 0.1849735677242279, "learning_rate": 5.6224268854883996e-05, "loss": 1.1808, "step": 2060 }, { "epoch": 0.47905577412635963, "grad_norm": 0.1923811137676239, "learning_rate": 5.585221061334236e-05, "loss": 1.1744, "step": 2070 }, { "epoch": 0.48137005322841936, "grad_norm": 0.1943860650062561, "learning_rate": 5.547982353354376e-05, "loss": 1.1833, "step": 2080 }, { "epoch": 0.4836843323304791, "grad_norm": 0.20127460360527039, "learning_rate": 5.510712854008001e-05, "loss": 1.1798, "step": 2090 }, { "epoch": 0.48599861143253875, "grad_norm": 0.18425202369689941, "learning_rate": 5.473414657484468e-05, "loss": 1.1969, "step": 2100 }, { "epoch": 0.4883128905345985, "grad_norm": 0.19612173736095428, "learning_rate": 5.436089859585648e-05, "loss": 1.1707, "step": 2110 }, { "epoch": 0.4906271696366582, "grad_norm": 0.18944087624549866, "learning_rate": 5.3987405576081505e-05, "loss": 1.1822, "step": 2120 }, { "epoch": 0.49294144873871787, "grad_norm": 0.19573846459388733, "learning_rate": 5.361368850225479e-05, "loss": 1.1831, "step": 2130 }, { "epoch": 0.4952557278407776, "grad_norm": 0.18912994861602783, "learning_rate": 5.32397683737011e-05, "loss": 1.1859, "step": 2140 }, { "epoch": 0.4975700069428373, "grad_norm": 0.19357813894748688, "learning_rate": 5.286566620115493e-05, "loss": 1.1701, "step": 2150 }, { "epoch": 0.49988428604489704, "grad_norm": 0.18788059055805206, "learning_rate": 5.249140300557985e-05, "loss": 1.1764, "step": 2160 }, { "epoch": 0.5021985651469567, "grad_norm": 0.19492246210575104, "learning_rate": 5.211699981698747e-05, "loss": 1.1898, "step": 2170 }, { "epoch": 0.5045128442490164, "grad_norm": 0.21048106253147125, "learning_rate": 5.17424776732556e-05, "loss": 1.1768, "step": 2180 }, { "epoch": 0.5068271233510762, "grad_norm": 0.1978602409362793, "learning_rate": 5.1367857618946194e-05, "loss": 1.1791, "step": 2190 }, { "epoch": 0.5091414024531359, "grad_norm": 0.19546453654766083, "learning_rate": 5.09931607041229e-05, "loss": 1.1821, "step": 2200 }, { "epoch": 0.5114556815551956, "grad_norm": 0.19739992916584015, "learning_rate": 5.0618407983168146e-05, "loss": 1.1754, "step": 2210 }, { "epoch": 0.5137699606572552, "grad_norm": 0.19072087109088898, "learning_rate": 5.0243620513600145e-05, "loss": 1.1826, "step": 2220 }, { "epoch": 0.5160842397593149, "grad_norm": 0.1789073944091797, "learning_rate": 4.9868819354889625e-05, "loss": 1.1731, "step": 2230 }, { "epoch": 0.5183985188613747, "grad_norm": 0.19865523278713226, "learning_rate": 4.9494025567276544e-05, "loss": 1.1796, "step": 2240 }, { "epoch": 0.5207127979634344, "grad_norm": 0.1872965544462204, "learning_rate": 4.9119260210586695e-05, "loss": 1.176, "step": 2250 }, { "epoch": 0.5230270770654941, "grad_norm": 0.1958765685558319, "learning_rate": 4.874454434304824e-05, "loss": 1.1712, "step": 2260 }, { "epoch": 0.5253413561675538, "grad_norm": 0.19132095575332642, "learning_rate": 4.8369899020108626e-05, "loss": 1.1786, "step": 2270 }, { "epoch": 0.5276556352696136, "grad_norm": 0.19474317133426666, "learning_rate": 4.7995345293251284e-05, "loss": 1.1869, "step": 2280 }, { "epoch": 0.5299699143716732, "grad_norm": 0.19309870898723602, "learning_rate": 4.762090420881289e-05, "loss": 1.1802, "step": 2290 }, { "epoch": 0.5322841934737329, "grad_norm": 0.2047063410282135, "learning_rate": 4.7246596806800636e-05, "loss": 1.1689, "step": 2300 }, { "epoch": 0.5345984725757926, "grad_norm": 0.19408148527145386, "learning_rate": 4.687244411971009e-05, "loss": 1.1715, "step": 2310 }, { "epoch": 0.5369127516778524, "grad_norm": 0.21102771162986755, "learning_rate": 4.649846717134327e-05, "loss": 1.1868, "step": 2320 }, { "epoch": 0.5392270307799121, "grad_norm": 0.20618434250354767, "learning_rate": 4.612468697562741e-05, "loss": 1.1688, "step": 2330 }, { "epoch": 0.5415413098819718, "grad_norm": 0.19679012894630432, "learning_rate": 4.575112453543408e-05, "loss": 1.1758, "step": 2340 }, { "epoch": 0.5438555889840315, "grad_norm": 0.20675049722194672, "learning_rate": 4.537780084139913e-05, "loss": 1.1605, "step": 2350 }, { "epoch": 0.5461698680860911, "grad_norm": 0.18863654136657715, "learning_rate": 4.500473687074309e-05, "loss": 1.1742, "step": 2360 }, { "epoch": 0.5484841471881509, "grad_norm": 0.19098520278930664, "learning_rate": 4.463195358609258e-05, "loss": 1.1652, "step": 2370 }, { "epoch": 0.5507984262902106, "grad_norm": 0.19034633040428162, "learning_rate": 4.4259471934302324e-05, "loss": 1.1716, "step": 2380 }, { "epoch": 0.5531127053922703, "grad_norm": 0.19565701484680176, "learning_rate": 4.388731284527816e-05, "loss": 1.1503, "step": 2390 }, { "epoch": 0.55542698449433, "grad_norm": 0.19067049026489258, "learning_rate": 4.351549723080097e-05, "loss": 1.1772, "step": 2400 }, { "epoch": 0.5577412635963898, "grad_norm": 0.19726891815662384, "learning_rate": 4.3144045983351735e-05, "loss": 1.187, "step": 2410 }, { "epoch": 0.5600555426984495, "grad_norm": 0.19251461327075958, "learning_rate": 4.277297997493737e-05, "loss": 1.1734, "step": 2420 }, { "epoch": 0.5623698218005091, "grad_norm": 0.19542944431304932, "learning_rate": 4.2402320055918154e-05, "loss": 1.1717, "step": 2430 }, { "epoch": 0.5646841009025688, "grad_norm": 0.19372211396694183, "learning_rate": 4.203208705383594e-05, "loss": 1.1859, "step": 2440 }, { "epoch": 0.5669983800046285, "grad_norm": 0.19102297723293304, "learning_rate": 4.1662301772243996e-05, "loss": 1.1609, "step": 2450 }, { "epoch": 0.5693126591066883, "grad_norm": 0.1865842044353485, "learning_rate": 4.129298498953792e-05, "loss": 1.1898, "step": 2460 }, { "epoch": 0.571626938208748, "grad_norm": 0.19476434588432312, "learning_rate": 4.0924157457788226e-05, "loss": 1.1726, "step": 2470 }, { "epoch": 0.5739412173108077, "grad_norm": 0.19208824634552002, "learning_rate": 4.055583990157416e-05, "loss": 1.1777, "step": 2480 }, { "epoch": 0.5762554964128674, "grad_norm": 0.1908976435661316, "learning_rate": 4.01880530168192e-05, "loss": 1.1668, "step": 2490 }, { "epoch": 0.578569775514927, "grad_norm": 0.19089365005493164, "learning_rate": 3.982081746962826e-05, "loss": 1.1794, "step": 2500 }, { "epoch": 0.578569775514927, "eval_loss": 1.1556445360183716, "eval_runtime": 21.5393, "eval_samples_per_second": 17.828, "eval_steps_per_second": 0.557, "step": 2500 }, { "epoch": 0.5808840546169868, "grad_norm": 0.21013890206813812, "learning_rate": 3.94541538951262e-05, "loss": 1.157, "step": 2510 }, { "epoch": 0.5831983337190465, "grad_norm": 0.20332397520542145, "learning_rate": 3.908808289629865e-05, "loss": 1.1709, "step": 2520 }, { "epoch": 0.5855126128211062, "grad_norm": 0.19428518414497375, "learning_rate": 3.8722625042834025e-05, "loss": 1.1783, "step": 2530 }, { "epoch": 0.587826891923166, "grad_norm": 0.19970852136611938, "learning_rate": 3.835780086996794e-05, "loss": 1.1687, "step": 2540 }, { "epoch": 0.5901411710252257, "grad_norm": 0.18801788985729218, "learning_rate": 3.7993630877329124e-05, "loss": 1.1715, "step": 2550 }, { "epoch": 0.5924554501272854, "grad_norm": 0.2128693163394928, "learning_rate": 3.763013552778774e-05, "loss": 1.179, "step": 2560 }, { "epoch": 0.594769729229345, "grad_norm": 0.19203241169452667, "learning_rate": 3.726733524630535e-05, "loss": 1.1838, "step": 2570 }, { "epoch": 0.5970840083314047, "grad_norm": 0.20480811595916748, "learning_rate": 3.690525041878743e-05, "loss": 1.1616, "step": 2580 }, { "epoch": 0.5993982874334645, "grad_norm": 0.19616341590881348, "learning_rate": 3.6543901390937754e-05, "loss": 1.1416, "step": 2590 }, { "epoch": 0.6017125665355242, "grad_norm": 0.1999153196811676, "learning_rate": 3.6183308467115175e-05, "loss": 1.1659, "step": 2600 }, { "epoch": 0.6040268456375839, "grad_norm": 0.19980020821094513, "learning_rate": 3.582349190919275e-05, "loss": 1.1657, "step": 2610 }, { "epoch": 0.6063411247396436, "grad_norm": 0.19510309398174286, "learning_rate": 3.546447193541922e-05, "loss": 1.1701, "step": 2620 }, { "epoch": 0.6086554038417034, "grad_norm": 0.18956266343593597, "learning_rate": 3.510626871928287e-05, "loss": 1.1663, "step": 2630 }, { "epoch": 0.610969682943763, "grad_norm": 0.18637120723724365, "learning_rate": 3.474890238837806e-05, "loss": 1.1731, "step": 2640 }, { "epoch": 0.6132839620458227, "grad_norm": 0.19002896547317505, "learning_rate": 3.439239302327417e-05, "loss": 1.1683, "step": 2650 }, { "epoch": 0.6155982411478824, "grad_norm": 0.19537580013275146, "learning_rate": 3.403676065638735e-05, "loss": 1.1652, "step": 2660 }, { "epoch": 0.6179125202499421, "grad_norm": 0.1950923502445221, "learning_rate": 3.368202527085476e-05, "loss": 1.1778, "step": 2670 }, { "epoch": 0.6202267993520019, "grad_norm": 0.19736339151859283, "learning_rate": 3.332820679941186e-05, "loss": 1.179, "step": 2680 }, { "epoch": 0.6225410784540616, "grad_norm": 0.19073107838630676, "learning_rate": 3.297532512327231e-05, "loss": 1.162, "step": 2690 }, { "epoch": 0.6248553575561213, "grad_norm": 0.1941593438386917, "learning_rate": 3.262340007101076e-05, "loss": 1.1592, "step": 2700 }, { "epoch": 0.6271696366581809, "grad_norm": 0.1990540772676468, "learning_rate": 3.227245141744882e-05, "loss": 1.1571, "step": 2710 }, { "epoch": 0.6294839157602407, "grad_norm": 0.19624970853328705, "learning_rate": 3.192249888254381e-05, "loss": 1.1582, "step": 2720 }, { "epoch": 0.6317981948623004, "grad_norm": 0.18591170012950897, "learning_rate": 3.157356213028072e-05, "loss": 1.1518, "step": 2730 }, { "epoch": 0.6341124739643601, "grad_norm": 0.1997700184583664, "learning_rate": 3.122566076756724e-05, "loss": 1.1689, "step": 2740 }, { "epoch": 0.6364267530664198, "grad_norm": 0.18985426425933838, "learning_rate": 3.087881434313212e-05, "loss": 1.1693, "step": 2750 }, { "epoch": 0.6387410321684796, "grad_norm": 0.19050361216068268, "learning_rate": 3.053304234642661e-05, "loss": 1.1651, "step": 2760 }, { "epoch": 0.6410553112705393, "grad_norm": 0.19750134646892548, "learning_rate": 3.0188364206529467e-05, "loss": 1.1657, "step": 2770 }, { "epoch": 0.6433695903725989, "grad_norm": 0.18156161904335022, "learning_rate": 2.9844799291055083e-05, "loss": 1.1792, "step": 2780 }, { "epoch": 0.6456838694746586, "grad_norm": 0.19130638241767883, "learning_rate": 2.950236690506537e-05, "loss": 1.1623, "step": 2790 }, { "epoch": 0.6479981485767183, "grad_norm": 0.20145711302757263, "learning_rate": 2.916108628998484e-05, "loss": 1.162, "step": 2800 }, { "epoch": 0.6503124276787781, "grad_norm": 0.18573534488677979, "learning_rate": 2.8820976622519558e-05, "loss": 1.1724, "step": 2810 }, { "epoch": 0.6526267067808378, "grad_norm": 0.19041724503040314, "learning_rate": 2.84820570135795e-05, "loss": 1.1567, "step": 2820 }, { "epoch": 0.6549409858828975, "grad_norm": 0.19331230223178864, "learning_rate": 2.8144346507204728e-05, "loss": 1.1722, "step": 2830 }, { "epoch": 0.6572552649849572, "grad_norm": 0.19323979318141937, "learning_rate": 2.7807864079495306e-05, "loss": 1.1637, "step": 2840 }, { "epoch": 0.6595695440870168, "grad_norm": 0.18545861542224884, "learning_rate": 2.7472628637545082e-05, "loss": 1.1634, "step": 2850 }, { "epoch": 0.6618838231890766, "grad_norm": 0.19878439605236053, "learning_rate": 2.7138659018379144e-05, "loss": 1.169, "step": 2860 }, { "epoch": 0.6641981022911363, "grad_norm": 0.19122658669948578, "learning_rate": 2.680597398789554e-05, "loss": 1.1779, "step": 2870 }, { "epoch": 0.666512381393196, "grad_norm": 0.19803395867347717, "learning_rate": 2.647459223981064e-05, "loss": 1.1523, "step": 2880 }, { "epoch": 0.6688266604952557, "grad_norm": 0.19722239673137665, "learning_rate": 2.614453239460884e-05, "loss": 1.1596, "step": 2890 }, { "epoch": 0.6711409395973155, "grad_norm": 0.2012769877910614, "learning_rate": 2.581581299849627e-05, "loss": 1.1675, "step": 2900 }, { "epoch": 0.6734552186993752, "grad_norm": 0.19577769935131073, "learning_rate": 2.5488452522358585e-05, "loss": 1.167, "step": 2910 }, { "epoch": 0.6757694978014348, "grad_norm": 0.19656306505203247, "learning_rate": 2.5162469360723208e-05, "loss": 1.1737, "step": 2920 }, { "epoch": 0.6780837769034945, "grad_norm": 0.19946037232875824, "learning_rate": 2.4837881830725584e-05, "loss": 1.1509, "step": 2930 }, { "epoch": 0.6803980560055543, "grad_norm": 0.1928076148033142, "learning_rate": 2.451470817108007e-05, "loss": 1.1595, "step": 2940 }, { "epoch": 0.682712335107614, "grad_norm": 0.19065019488334656, "learning_rate": 2.4192966541054977e-05, "loss": 1.1651, "step": 2950 }, { "epoch": 0.6850266142096737, "grad_norm": 0.198676198720932, "learning_rate": 2.387267501945233e-05, "loss": 1.1487, "step": 2960 }, { "epoch": 0.6873408933117334, "grad_norm": 0.19842711091041565, "learning_rate": 2.3553851603591837e-05, "loss": 1.1606, "step": 2970 }, { "epoch": 0.6896551724137931, "grad_norm": 0.20439574122428894, "learning_rate": 2.3236514208299796e-05, "loss": 1.1464, "step": 2980 }, { "epoch": 0.6919694515158528, "grad_norm": 0.18908947706222534, "learning_rate": 2.2920680664902304e-05, "loss": 1.1608, "step": 2990 }, { "epoch": 0.6942837306179125, "grad_norm": 0.20464125275611877, "learning_rate": 2.260636872022339e-05, "loss": 1.1482, "step": 3000 }, { "epoch": 0.6942837306179125, "eval_loss": 1.1457551717758179, "eval_runtime": 21.6541, "eval_samples_per_second": 17.733, "eval_steps_per_second": 0.554, "step": 3000 } ], "logging_steps": 10, "max_steps": 4321, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.914805497032868e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }