{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9953917050691244, "eval_steps": 11, "global_step": 108, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009216589861751152, "grad_norm": 0.006341594582436306, "learning_rate": 1e-05, "loss": 2.1763, "step": 1 }, { "epoch": 0.009216589861751152, "eval_loss": 2.30206036567688, "eval_runtime": 298.4125, "eval_samples_per_second": 0.67, "eval_steps_per_second": 0.168, "step": 1 }, { "epoch": 0.018433179723502304, "grad_norm": 0.00662962718817515, "learning_rate": 2e-05, "loss": 2.2913, "step": 2 }, { "epoch": 0.027649769585253458, "grad_norm": 0.006614257518340679, "learning_rate": 3e-05, "loss": 2.2847, "step": 3 }, { "epoch": 0.03686635944700461, "grad_norm": 0.006316978099001957, "learning_rate": 4e-05, "loss": 2.3138, "step": 4 }, { "epoch": 0.04608294930875576, "grad_norm": 0.0070544986053885, "learning_rate": 5e-05, "loss": 2.3318, "step": 5 }, { "epoch": 0.055299539170506916, "grad_norm": 0.008002977658788222, "learning_rate": 4.9994186045291893e-05, "loss": 2.2078, "step": 6 }, { "epoch": 0.06451612903225806, "grad_norm": 0.007212018075618458, "learning_rate": 4.997674958949867e-05, "loss": 2.2557, "step": 7 }, { "epoch": 0.07373271889400922, "grad_norm": 0.007831376791308034, "learning_rate": 4.994770685258262e-05, "loss": 2.3081, "step": 8 }, { "epoch": 0.08294930875576037, "grad_norm": 0.00760097824895368, "learning_rate": 4.990708485104885e-05, "loss": 2.2963, "step": 9 }, { "epoch": 0.09216589861751152, "grad_norm": 0.007915801461137613, "learning_rate": 4.98549213728137e-05, "loss": 2.2277, "step": 10 }, { "epoch": 0.10138248847926268, "grad_norm": 0.007012530503323877, "learning_rate": 4.979126494205316e-05, "loss": 2.129, "step": 11 }, { "epoch": 0.10138248847926268, "eval_loss": 2.2997171878814697, "eval_runtime": 299.6373, "eval_samples_per_second": 0.667, "eval_steps_per_second": 0.167, "step": 11 }, { "epoch": 0.11059907834101383, "grad_norm": 0.00836311483494543, "learning_rate": 4.971617477406406e-05, "loss": 2.3068, "step": 12 }, { "epoch": 0.11981566820276497, "grad_norm": 0.0075642072813259425, "learning_rate": 4.962972072018013e-05, "loss": 2.2696, "step": 13 }, { "epoch": 0.12903225806451613, "grad_norm": 0.0068806732246105545, "learning_rate": 4.953198320279395e-05, "loss": 2.2845, "step": 14 }, { "epoch": 0.1382488479262673, "grad_norm": 0.007091340280889235, "learning_rate": 4.942305314054541e-05, "loss": 2.2229, "step": 15 }, { "epoch": 0.14746543778801843, "grad_norm": 0.008450159443273913, "learning_rate": 4.930303186374623e-05, "loss": 2.2657, "step": 16 }, { "epoch": 0.15668202764976957, "grad_norm": 0.0110972336191957, "learning_rate": 4.9172031020119195e-05, "loss": 2.2469, "step": 17 }, { "epoch": 0.16589861751152074, "grad_norm": 0.008621833737082517, "learning_rate": 4.903017247093975e-05, "loss": 2.1416, "step": 18 }, { "epoch": 0.17511520737327188, "grad_norm": 0.008033079208987498, "learning_rate": 4.887758817767666e-05, "loss": 2.3261, "step": 19 }, { "epoch": 0.18433179723502305, "grad_norm": 0.008247041641800389, "learning_rate": 4.871442007923718e-05, "loss": 2.3567, "step": 20 }, { "epoch": 0.1935483870967742, "grad_norm": 0.007706304349781975, "learning_rate": 4.8540819959930763e-05, "loss": 2.3219, "step": 21 }, { "epoch": 0.20276497695852536, "grad_norm": 0.008292878995189728, "learning_rate": 4.835694930827436e-05, "loss": 2.2385, "step": 22 }, { "epoch": 0.20276497695852536, "eval_loss": 2.2945244312286377, "eval_runtime": 300.2994, "eval_samples_per_second": 0.666, "eval_steps_per_second": 0.167, "step": 22 }, { "epoch": 0.2119815668202765, "grad_norm": 0.007781760795523874, "learning_rate": 4.816297916677052e-05, "loss": 2.4205, "step": 23 }, { "epoch": 0.22119815668202766, "grad_norm": 0.008415507314097811, "learning_rate": 4.7959089972798024e-05, "loss": 2.3182, "step": 24 }, { "epoch": 0.2304147465437788, "grad_norm": 0.007464479910584146, "learning_rate": 4.774547139076316e-05, "loss": 2.212, "step": 25 }, { "epoch": 0.23963133640552994, "grad_norm": 0.007629040350009522, "learning_rate": 4.752232213566764e-05, "loss": 2.3292, "step": 26 }, { "epoch": 0.2488479262672811, "grad_norm": 0.008001890858994876, "learning_rate": 4.728984978825742e-05, "loss": 2.3232, "step": 27 }, { "epoch": 0.25806451612903225, "grad_norm": 0.008724134793465777, "learning_rate": 4.704827060192432e-05, "loss": 2.2294, "step": 28 }, { "epoch": 0.2672811059907834, "grad_norm": 0.008184829559116022, "learning_rate": 4.6797809301539954e-05, "loss": 2.3264, "step": 29 }, { "epoch": 0.2764976958525346, "grad_norm": 0.007816772316149866, "learning_rate": 4.6538698874409394e-05, "loss": 2.2281, "step": 30 }, { "epoch": 0.2857142857142857, "grad_norm": 0.008186865385054075, "learning_rate": 4.627118035353867e-05, "loss": 2.2632, "step": 31 }, { "epoch": 0.29493087557603687, "grad_norm": 0.0073235358541247275, "learning_rate": 4.5995502593418004e-05, "loss": 2.2011, "step": 32 }, { "epoch": 0.30414746543778803, "grad_norm": 0.008111983255255069, "learning_rate": 4.571192203852923e-05, "loss": 2.233, "step": 33 }, { "epoch": 0.30414746543778803, "eval_loss": 2.2906479835510254, "eval_runtime": 300.3069, "eval_samples_per_second": 0.666, "eval_steps_per_second": 0.166, "step": 33 }, { "epoch": 0.31336405529953915, "grad_norm": 0.008172142123019982, "learning_rate": 4.5420702484792696e-05, "loss": 2.2851, "step": 34 }, { "epoch": 0.3225806451612903, "grad_norm": 0.008358411625521056, "learning_rate": 4.512211483417568e-05, "loss": 2.2361, "step": 35 }, { "epoch": 0.3317972350230415, "grad_norm": 0.008247340701498377, "learning_rate": 4.4816436842690526e-05, "loss": 2.3118, "step": 36 }, { "epoch": 0.34101382488479265, "grad_norm": 0.008886280096838773, "learning_rate": 4.450395286201688e-05, "loss": 2.4705, "step": 37 }, { "epoch": 0.35023041474654376, "grad_norm": 0.008132056446891323, "learning_rate": 4.418495357498843e-05, "loss": 2.2029, "step": 38 }, { "epoch": 0.35944700460829493, "grad_norm": 0.008471972341265172, "learning_rate": 4.385973572519026e-05, "loss": 2.2272, "step": 39 }, { "epoch": 0.3686635944700461, "grad_norm": 0.008767283831299696, "learning_rate": 4.35286018409182e-05, "loss": 2.3438, "step": 40 }, { "epoch": 0.3778801843317972, "grad_norm": 0.007929397656175006, "learning_rate": 4.3191859953757144e-05, "loss": 2.2881, "step": 41 }, { "epoch": 0.3870967741935484, "grad_norm": 0.008850609936339099, "learning_rate": 4.284982331203992e-05, "loss": 2.3688, "step": 42 }, { "epoch": 0.39631336405529954, "grad_norm": 0.00904494117075059, "learning_rate": 4.250281008945348e-05, "loss": 2.1962, "step": 43 }, { "epoch": 0.4055299539170507, "grad_norm": 0.008403892033503044, "learning_rate": 4.215114308906331e-05, "loss": 2.0907, "step": 44 }, { "epoch": 0.4055299539170507, "eval_loss": 2.2874112129211426, "eval_runtime": 299.7452, "eval_samples_per_second": 0.667, "eval_steps_per_second": 0.167, "step": 44 }, { "epoch": 0.4147465437788018, "grad_norm": 0.007920093840314358, "learning_rate": 4.179514944303142e-05, "loss": 2.249, "step": 45 }, { "epoch": 0.423963133640553, "grad_norm": 0.009447655699077405, "learning_rate": 4.143516030830731e-05, "loss": 2.2897, "step": 46 }, { "epoch": 0.43317972350230416, "grad_norm": 0.007857153673086726, "learning_rate": 4.107151055857495e-05, "loss": 2.2608, "step": 47 }, { "epoch": 0.4423963133640553, "grad_norm": 0.00863602117629485, "learning_rate": 4.070453847274226e-05, "loss": 2.3242, "step": 48 }, { "epoch": 0.45161290322580644, "grad_norm": 0.008191134199884613, "learning_rate": 4.033458542026302e-05, "loss": 2.3718, "step": 49 }, { "epoch": 0.4608294930875576, "grad_norm": 0.008347248773219445, "learning_rate": 3.996199554358383e-05, "loss": 2.3253, "step": 50 }, { "epoch": 0.4700460829493088, "grad_norm": 0.010404718994051891, "learning_rate": 3.958711543801153e-05, "loss": 2.2691, "step": 51 }, { "epoch": 0.4792626728110599, "grad_norm": 0.008780416106665991, "learning_rate": 3.921029382929888e-05, "loss": 2.2702, "step": 52 }, { "epoch": 0.48847926267281105, "grad_norm": 0.00909949892760408, "learning_rate": 3.8831881249248455e-05, "loss": 2.2613, "step": 53 }, { "epoch": 0.4976958525345622, "grad_norm": 0.01131101207742842, "learning_rate": 3.8452229709636475e-05, "loss": 2.2507, "step": 54 }, { "epoch": 0.5069124423963134, "grad_norm": 0.008814062646835924, "learning_rate": 3.807169237475994e-05, "loss": 2.2263, "step": 55 }, { "epoch": 0.5069124423963134, "eval_loss": 2.284841299057007, "eval_runtime": 299.7258, "eval_samples_per_second": 0.667, "eval_steps_per_second": 0.167, "step": 55 }, { "epoch": 0.5161290322580645, "grad_norm": 0.008960193210924948, "learning_rate": 3.7690623232911574e-05, "loss": 2.2619, "step": 56 }, { "epoch": 0.5253456221198156, "grad_norm": 0.009696833454332903, "learning_rate": 3.7309376767088426e-05, "loss": 2.2314, "step": 57 }, { "epoch": 0.5345622119815668, "grad_norm": 0.008269457274961622, "learning_rate": 3.692830762524007e-05, "loss": 2.2283, "step": 58 }, { "epoch": 0.543778801843318, "grad_norm": 0.009440471774573658, "learning_rate": 3.6547770290363525e-05, "loss": 2.3451, "step": 59 }, { "epoch": 0.5529953917050692, "grad_norm": 0.009118335835494016, "learning_rate": 3.616811875075155e-05, "loss": 2.1855, "step": 60 }, { "epoch": 0.5622119815668203, "grad_norm": 0.008982096822143406, "learning_rate": 3.5789706170701124e-05, "loss": 2.2363, "step": 61 }, { "epoch": 0.5714285714285714, "grad_norm": 0.008981128940771322, "learning_rate": 3.541288456198848e-05, "loss": 2.2096, "step": 62 }, { "epoch": 0.5806451612903226, "grad_norm": 0.00823027956711987, "learning_rate": 3.503800445641618e-05, "loss": 2.1916, "step": 63 }, { "epoch": 0.5898617511520737, "grad_norm": 0.009392240543833648, "learning_rate": 3.466541457973699e-05, "loss": 2.2957, "step": 64 }, { "epoch": 0.5990783410138248, "grad_norm": 0.009432065183986524, "learning_rate": 3.429546152725775e-05, "loss": 2.3128, "step": 65 }, { "epoch": 0.6082949308755761, "grad_norm": 0.009271418872949072, "learning_rate": 3.392848944142506e-05, "loss": 2.2703, "step": 66 }, { "epoch": 0.6082949308755761, "eval_loss": 2.2827680110931396, "eval_runtime": 296.9216, "eval_samples_per_second": 0.674, "eval_steps_per_second": 0.168, "step": 66 }, { "epoch": 0.6175115207373272, "grad_norm": 0.008766658618155636, "learning_rate": 3.3564839691692696e-05, "loss": 2.3799, "step": 67 }, { "epoch": 0.6267281105990783, "grad_norm": 0.010555438904098486, "learning_rate": 3.320485055696859e-05, "loss": 2.2963, "step": 68 }, { "epoch": 0.6359447004608295, "grad_norm": 0.008351179786003738, "learning_rate": 3.28488569109367e-05, "loss": 2.2472, "step": 69 }, { "epoch": 0.6451612903225806, "grad_norm": 0.009454242134158948, "learning_rate": 3.249718991054653e-05, "loss": 2.1851, "step": 70 }, { "epoch": 0.6543778801843319, "grad_norm": 0.00936435356820808, "learning_rate": 3.2150176687960096e-05, "loss": 2.2002, "step": 71 }, { "epoch": 0.663594470046083, "grad_norm": 0.009182061036270048, "learning_rate": 3.180814004624286e-05, "loss": 2.3132, "step": 72 }, { "epoch": 0.6728110599078341, "grad_norm": 0.009478558474893393, "learning_rate": 3.14713981590818e-05, "loss": 2.2784, "step": 73 }, { "epoch": 0.6820276497695853, "grad_norm": 0.009454952382160403, "learning_rate": 3.114026427480974e-05, "loss": 2.1714, "step": 74 }, { "epoch": 0.6912442396313364, "grad_norm": 0.010080456572333344, "learning_rate": 3.0815046425011576e-05, "loss": 2.228, "step": 75 }, { "epoch": 0.7004608294930875, "grad_norm": 0.010278477169796688, "learning_rate": 3.049604713798313e-05, "loss": 2.3568, "step": 76 }, { "epoch": 0.7096774193548387, "grad_norm": 0.009596875441546693, "learning_rate": 3.0183563157309474e-05, "loss": 2.4101, "step": 77 }, { "epoch": 0.7096774193548387, "eval_loss": 2.281269073486328, "eval_runtime": 297.7271, "eval_samples_per_second": 0.672, "eval_steps_per_second": 0.168, "step": 77 }, { "epoch": 0.7188940092165899, "grad_norm": 0.011054919506702275, "learning_rate": 2.987788516582432e-05, "loss": 2.3545, "step": 78 }, { "epoch": 0.728110599078341, "grad_norm": 0.01038525789770179, "learning_rate": 2.957929751520731e-05, "loss": 2.2726, "step": 79 }, { "epoch": 0.7373271889400922, "grad_norm": 0.010353326350272914, "learning_rate": 2.9288077961470773e-05, "loss": 2.1881, "step": 80 }, { "epoch": 0.7465437788018433, "grad_norm": 0.009073291327148403, "learning_rate": 2.9004497406582e-05, "loss": 2.1402, "step": 81 }, { "epoch": 0.7557603686635944, "grad_norm": 0.009968014397025708, "learning_rate": 2.8728819646461336e-05, "loss": 2.2383, "step": 82 }, { "epoch": 0.7649769585253456, "grad_norm": 0.009620205315061477, "learning_rate": 2.8461301125590613e-05, "loss": 2.2639, "step": 83 }, { "epoch": 0.7741935483870968, "grad_norm": 0.00977544354748898, "learning_rate": 2.8202190698460053e-05, "loss": 2.265, "step": 84 }, { "epoch": 0.783410138248848, "grad_norm": 0.010307755448269416, "learning_rate": 2.7951729398075695e-05, "loss": 2.2436, "step": 85 }, { "epoch": 0.7926267281105991, "grad_norm": 0.009422457517611288, "learning_rate": 2.7710150211742585e-05, "loss": 2.25, "step": 86 }, { "epoch": 0.8018433179723502, "grad_norm": 0.009754638407058579, "learning_rate": 2.7477677864332375e-05, "loss": 2.2758, "step": 87 }, { "epoch": 0.8110599078341014, "grad_norm": 0.009561231269697724, "learning_rate": 2.7254528609236844e-05, "loss": 2.2473, "step": 88 }, { "epoch": 0.8110599078341014, "eval_loss": 2.2800304889678955, "eval_runtime": 296.664, "eval_samples_per_second": 0.674, "eval_steps_per_second": 0.169, "step": 88 }, { "epoch": 0.8202764976958525, "grad_norm": 0.01112849185110947, "learning_rate": 2.7040910027201976e-05, "loss": 2.3045, "step": 89 }, { "epoch": 0.8294930875576036, "grad_norm": 0.010506389237890506, "learning_rate": 2.683702083322948e-05, "loss": 2.2741, "step": 90 }, { "epoch": 0.8387096774193549, "grad_norm": 0.009768468870786754, "learning_rate": 2.6643050691725647e-05, "loss": 2.2703, "step": 91 }, { "epoch": 0.847926267281106, "grad_norm": 0.010141192796013403, "learning_rate": 2.6459180040069244e-05, "loss": 2.2548, "step": 92 }, { "epoch": 0.8571428571428571, "grad_norm": 0.010229279382807752, "learning_rate": 2.6285579920762826e-05, "loss": 2.2488, "step": 93 }, { "epoch": 0.8663594470046083, "grad_norm": 0.010307954340086967, "learning_rate": 2.6122411822323345e-05, "loss": 2.275, "step": 94 }, { "epoch": 0.8755760368663594, "grad_norm": 0.011123883009152893, "learning_rate": 2.5969827529060266e-05, "loss": 2.124, "step": 95 }, { "epoch": 0.8847926267281107, "grad_norm": 0.009608681495586172, "learning_rate": 2.5827968979880805e-05, "loss": 2.3229, "step": 96 }, { "epoch": 0.8940092165898618, "grad_norm": 0.009782962640744528, "learning_rate": 2.5696968136253768e-05, "loss": 2.2763, "step": 97 }, { "epoch": 0.9032258064516129, "grad_norm": 0.009605554717833914, "learning_rate": 2.5576946859454592e-05, "loss": 2.2603, "step": 98 }, { "epoch": 0.9124423963133641, "grad_norm": 0.009456935936538857, "learning_rate": 2.5468016797206052e-05, "loss": 2.1912, "step": 99 }, { "epoch": 0.9124423963133641, "eval_loss": 2.27895188331604, "eval_runtime": 291.7402, "eval_samples_per_second": 0.686, "eval_steps_per_second": 0.171, "step": 99 }, { "epoch": 0.9216589861751152, "grad_norm": 0.010595387456993979, "learning_rate": 2.5370279279819864e-05, "loss": 2.2735, "step": 100 }, { "epoch": 0.9308755760368663, "grad_norm": 0.010469741198338179, "learning_rate": 2.528382522593594e-05, "loss": 2.2018, "step": 101 }, { "epoch": 0.9400921658986175, "grad_norm": 0.01162265413802059, "learning_rate": 2.5208735057946846e-05, "loss": 2.3279, "step": 102 }, { "epoch": 0.9493087557603687, "grad_norm": 0.0109629214839692, "learning_rate": 2.5145078627186302e-05, "loss": 2.2708, "step": 103 }, { "epoch": 0.9585253456221198, "grad_norm": 0.010413150712905432, "learning_rate": 2.5092915148951153e-05, "loss": 2.3774, "step": 104 }, { "epoch": 0.967741935483871, "grad_norm": 0.010610525090597078, "learning_rate": 2.5052293147417382e-05, "loss": 2.231, "step": 105 }, { "epoch": 0.9769585253456221, "grad_norm": 0.010306572358828672, "learning_rate": 2.5023250410501333e-05, "loss": 2.4008, "step": 106 }, { "epoch": 0.9861751152073732, "grad_norm": 0.010261194384190835, "learning_rate": 2.5005813954708107e-05, "loss": 2.3173, "step": 107 }, { "epoch": 0.9953917050691244, "grad_norm": 0.010312253992880615, "learning_rate": 2.5e-05, "loss": 2.3206, "step": 108 } ], "logging_steps": 1, "max_steps": 108, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 27, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.037172357247795e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }